From 653d21ed11074dcb12e4485167dacba51e6e7663 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Thu, 20 Dec 2018 17:59:19 -0500 Subject: [PATCH 01/32] First take at exhaustive TAAT --- include/pisa/query/queries.hpp | 3 +- .../query/algorithm/exhaustive_taat_query.hpp | 103 ++++++++++++++++++ src/queries.cpp | 2 + 3 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 include/query/algorithm/exhaustive_taat_query.hpp diff --git a/include/pisa/query/queries.hpp b/include/pisa/query/queries.hpp index 6da30f2ab..bdb81a17e 100644 --- a/include/pisa/query/queries.hpp +++ b/include/pisa/query/queries.hpp @@ -60,4 +60,5 @@ term_freq_vec query_freqs(term_id_vec terms) { #include "algorithm/or_query.hpp" #include "algorithm/ranked_and_query.hpp" #include "algorithm/ranked_or_query.hpp" -#include "algorithm/wand_query.hpp" \ No newline at end of file +#include "algorithm/wand_query.hpp" +#include "algorithm/exhaustive_taat_query.hpp" diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp new file mode 100644 index 000000000..7c6154427 --- /dev/null +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -0,0 +1,103 @@ +#pragma once + +#include "topk_queue.hpp" + +namespace pisa { + +// TODO: These are functions common to query processing in general. +// They should be moved out of this file. +namespace query { + +template +struct Scored_Cursor { + Cursor unscored_cursor; + std::function score_function; + + [[gnu::always_inline]] [[nodiscard]] constexpr auto docid() const -> uint64_t { + return unscored_cursor.docid(); + } + + [[gnu::always_inline]] [[nodiscard]] constexpr auto freq() -> uint64_t { + return unscored_cursor.freq(); + } + + [[gnu::always_inline]] [[nodiscard]] constexpr auto score() -> float { + return score_function(docid(), freq()); + } + + [[gnu::always_inline]] constexpr void next() { unscored_cursor.next(); } +}; + +template +[[nodiscard]] auto scored_cursors(Index const& index, WandType const &wdata, term_id_vec terms) +{ + // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more + // complex refactoring I want to avoid for now. + using scorer_type = bm25; + using cursor_type = Scored_Cursor; + + auto query_term_freqs = query_freqs(terms); + std::vector cursors; + cursors.reserve(query_term_freqs.size()); + + for (auto term : query_term_freqs) { + auto list = index[term.first]; + uint64_t num_docs = index.num_docs(); + auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); + cursors.push_back({std::move(list), [q_weight, &wdata](auto docid, auto freq) { + float norm_len = wdata.norm_len(docid); + return q_weight * scorer_type::doc_term_weight(freq, norm_len); + }}); + } + return cursors; +} + +} // namespace query + +template +struct exhaustive_taat_query { + exhaustive_taat_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} + + uint64_t operator()(term_id_vec terms) { + return taat(query::scored_cursors(m_index, m_wdata, terms)); + } + + // TODO(michal): I think this should be eventually the `operator()` + template + uint64_t taat(std::vector cursors) { + m_topk.clear(); + if (cursors.empty()) { + return 0; + } + std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); + for (auto &cursor : cursors) { + for (; cursor.docid() < m_accumulators.size(); cursor.next()) { + m_accumulators[cursor.docid()] += cursor.score(); + } + } + for (uint64_t docid = 0u; docid < m_accumulators.size(); ++docid) { + m_topk.insert(m_accumulators[docid], docid); + } + + m_topk.finalize(); + return m_topk.topk().size(); + } + + std::vector> const &topk() const { return m_topk.topk(); } + + private: + Index const & m_index; + WandType const & m_wdata; + topk_queue m_topk; + std::vector m_accumulators; +}; + +template +[[nodiscard]] auto make_exhaustive_taat_query(Index const & index, + WandType const &wdata, + uint64_t k) { + return exhaustive_taat_query(index, wdata, k); +} + +}; // namespace pisa diff --git a/src/queries.cpp b/src/queries.cpp index e5b8f320a..e21c5eba4 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -134,6 +134,8 @@ void perftest(const std::string &index_filename, query_fun = [&](term_id_vec query) { return maxscore_query(wdata, k)(index, query); }; + } else if (t == "exhaustive_taat" && wand_data_filename) { + query_fun = pisa::make_exhaustive_taat_query(index, wdata, k); } else { logger() << "Unsupported query type: " << t << std::endl; break; From 8f0b54a79200ca634b5fd7b9275140b65d5ccc0a Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 22 Dec 2018 11:31:11 -0500 Subject: [PATCH 02/32] Fetch an entire block at a time for TAAT --- CMakeLists.txt | 5 +++++ include/pisa/block_posting_list.hpp | 13 +++++++++++++ include/pisa/freq_index.hpp | 8 ++++++++ include/query/algorithm/exhaustive_taat_query.hpp | 15 +++++++++++++-- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da1c80382..3f6cbb6cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,11 @@ list(APPEND LCOV_REMOVE_PATTERNS "'${PROJECT_SOURCE_DIR}/external/*'") if (UNIX) +<<<<<<< HEAD +======= + # C++14 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") +>>>>>>> Fetch an entire block at a time for TAAT # For hardware popcount and other special instructions set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") diff --git a/include/pisa/block_posting_list.hpp b/include/pisa/block_posting_list.hpp index ac29c70f3..e7dce90c9 100644 --- a/include/pisa/block_posting_list.hpp +++ b/include/pisa/block_posting_list.hpp @@ -156,6 +156,19 @@ namespace pisa { } } + [[nodiscard]] std::pair, std::vector> next_block() + { + // TODO: For now only, gotta be changed. + if (m_pos_in_block != 0) throw std::runtime_error("Oops."); + + auto block = std::make_pair(std::move(m_docs_buf), std::move(m_freqs_buf)); + m_docs_buf.resize(BlockCodec::block_size); + m_freqs_buf.resize(BlockCodec::block_size); + m_pos_in_block = m_cur_block_size - 1; + next(); + return block; + } + uint64_t docid() const { return m_cur_docid; diff --git a/include/pisa/freq_index.hpp b/include/pisa/freq_index.hpp index 3aaf0bed6..2d87860a6 100644 --- a/include/pisa/freq_index.hpp +++ b/include/pisa/freq_index.hpp @@ -103,6 +103,14 @@ namespace pisa { m_cur_docid = val.second; } + [[nodiscard]] std::pair, std::vector> next_block() + { + auto block = std::make_pair, std::vector>( + {m_cur_docid}, {m_freqs_enum.move(m_cur_pos).second}); + next(); + return block; + } + uint64_t docid() const { return m_cur_docid; diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 7c6154427..93635fdb6 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -26,6 +26,14 @@ struct Scored_Cursor { } [[gnu::always_inline]] constexpr void next() { unscored_cursor.next(); } + [[nodiscard]] auto next_block() -> std::pair, std::vector> { + auto [documents, frequencies] = unscored_cursor.next_block(); + std::vector scores(documents.size()); + for (uint32_t idx = 0; idx < documents.size(); ++idx) { + scores[idx] = score_function(documents[idx], frequencies[idx]); + } + return std::make_pair(std::move(documents), std::move(scores)); + } }; template @@ -72,8 +80,11 @@ struct exhaustive_taat_query { } std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); for (auto &cursor : cursors) { - for (; cursor.docid() < m_accumulators.size(); cursor.next()) { - m_accumulators[cursor.docid()] += cursor.score(); + while (cursor.docid() < m_accumulators.size()) { + auto [documents, scores] = cursor.next_block(); + for (uint32_t idx = 0; idx < documents.size(); ++idx) { + m_accumulators[documents[idx]] += scores[idx]; + } } } for (uint64_t docid = 0u; docid < m_accumulators.size(); ++docid) { From 9f4e32554800889b1f3999fb442b92ebf524a453 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 22 Dec 2018 12:04:07 -0500 Subject: [PATCH 03/32] Add prefetching for TAAT --- include/query/algorithm/exhaustive_taat_query.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 93635fdb6..cd5f7f962 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -1,5 +1,6 @@ #pragma once +#include "util/intrinsics.hpp" #include "topk_queue.hpp" namespace pisa { @@ -83,6 +84,7 @@ struct exhaustive_taat_query { while (cursor.docid() < m_accumulators.size()) { auto [documents, scores] = cursor.next_block(); for (uint32_t idx = 0; idx < documents.size(); ++idx) { + intrinsics::prefetch(&m_accumulators[documents[idx + 3]]); m_accumulators[documents[idx]] += scores[idx]; } } From c28e5d21ecaaabfcdfa31fa3e51f073d5380f6aa Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 22 Dec 2018 15:46:06 -0500 Subject: [PATCH 04/32] Return buffer references from posting lists instead of moved buffers. --- include/pisa/block_posting_list.hpp | 24 ++++-- include/pisa/freq_index.hpp | 9 +-- include/pisa/util/util.hpp | 4 + .../query/algorithm/exhaustive_taat_query.hpp | 76 ++++++++----------- 4 files changed, 53 insertions(+), 60 deletions(-) diff --git a/include/pisa/block_posting_list.hpp b/include/pisa/block_posting_list.hpp index e7dce90c9..f2f13315e 100644 --- a/include/pisa/block_posting_list.hpp +++ b/include/pisa/block_posting_list.hpp @@ -82,6 +82,8 @@ namespace pisa { class document_enumerator { public: + using enumerator_category = ds2i::block_enumerator_tag; + document_enumerator(uint8_t const* data, uint64_t universe, size_t term_id = 0) : m_n(0) // just to silence warnings @@ -156,17 +158,23 @@ namespace pisa { } } - [[nodiscard]] std::pair, std::vector> next_block() - { - // TODO: For now only, gotta be changed. - if (m_pos_in_block != 0) throw std::runtime_error("Oops."); + // TODO(michal): I recommend using some view, like gsl::span or something + // instead of a reference to a vector. + [[nodiscard]] auto document_buffer() -> std::vector const & { + return m_docs_buf; + } - auto block = std::make_pair(std::move(m_docs_buf), std::move(m_freqs_buf)); - m_docs_buf.resize(BlockCodec::block_size); - m_freqs_buf.resize(BlockCodec::block_size); + [[nodiscard]] auto frequency_buffer() -> std::vector const & { + if (!m_freqs_decoded) { + decode_freqs_block(); + } + return m_freqs_buf; + } + + void next_block() + { m_pos_in_block = m_cur_block_size - 1; next(); - return block; } uint64_t docid() const diff --git a/include/pisa/freq_index.hpp b/include/pisa/freq_index.hpp index 2d87860a6..38804f615 100644 --- a/include/pisa/freq_index.hpp +++ b/include/pisa/freq_index.hpp @@ -76,6 +76,7 @@ namespace pisa { class document_enumerator { public: + using enumerator_category = ds2i::input_enumerator_tag; void reset() { m_cur_pos = 0; @@ -103,14 +104,6 @@ namespace pisa { m_cur_docid = val.second; } - [[nodiscard]] std::pair, std::vector> next_block() - { - auto block = std::make_pair, std::vector>( - {m_cur_docid}, {m_freqs_enum.move(m_cur_pos).second}); - next(); - return block; - } - uint64_t docid() const { return m_cur_docid; diff --git a/include/pisa/util/util.hpp b/include/pisa/util/util.hpp index 6281152ea..01223ac59 100644 --- a/include/pisa/util/util.hpp +++ b/include/pisa/util/util.hpp @@ -265,4 +265,8 @@ namespace pisa { bool first; }; + // TODO(michal): We should extract it in a better place, couldn't find anything better quickly. + struct input_enumerator_tag {}; + struct block_enumerator_tag : public input_enumerator_tag {}; + } diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index cd5f7f962..5cf5e3d76 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -9,56 +9,32 @@ namespace pisa { // They should be moved out of this file. namespace query { -template -struct Scored_Cursor { - Cursor unscored_cursor; - std::function score_function; - - [[gnu::always_inline]] [[nodiscard]] constexpr auto docid() const -> uint64_t { - return unscored_cursor.docid(); - } - - [[gnu::always_inline]] [[nodiscard]] constexpr auto freq() -> uint64_t { - return unscored_cursor.freq(); - } - - [[gnu::always_inline]] [[nodiscard]] constexpr auto score() -> float { - return score_function(docid(), freq()); - } - - [[gnu::always_inline]] constexpr void next() { unscored_cursor.next(); } - [[nodiscard]] auto next_block() -> std::pair, std::vector> { - auto [documents, frequencies] = unscored_cursor.next_block(); - std::vector scores(documents.size()); - for (uint32_t idx = 0; idx < documents.size(); ++idx) { - scores[idx] = score_function(documents[idx], frequencies[idx]); - } - return std::make_pair(std::move(documents), std::move(scores)); - } -}; - template -[[nodiscard]] auto scored_cursors(Index const& index, WandType const &wdata, term_id_vec terms) +[[nodiscard]] auto cursors_with_scores(Index const& index, WandType const &wdata, term_id_vec terms) { // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more // complex refactoring I want to avoid for now. - using scorer_type = bm25; - using cursor_type = Scored_Cursor; + using scorer_type = bm25; + using cursor_type = typename Index::document_enumerator; + using score_function_type = std::function; auto query_term_freqs = query_freqs(terms); std::vector cursors; + std::vector score_functions; cursors.reserve(query_term_freqs.size()); + score_functions.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { auto list = index[term.first]; uint64_t num_docs = index.num_docs(); auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); - cursors.push_back({std::move(list), [q_weight, &wdata](auto docid, auto freq) { - float norm_len = wdata.norm_len(docid); - return q_weight * scorer_type::doc_term_weight(freq, norm_len); - }}); + cursors.push_back(std::move(list)); + score_functions.push_back([q_weight, &wdata](auto docid, auto freq) { + float norm_len = wdata.norm_len(docid); + return q_weight * scorer_type::doc_term_weight(freq, norm_len); + }); } - return cursors; + return std::make_pair(cursors, score_functions); } } // namespace query @@ -69,24 +45,36 @@ struct exhaustive_taat_query { : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} uint64_t operator()(term_id_vec terms) { - return taat(query::scored_cursors(m_index, m_wdata, terms)); + auto cws = query::cursors_with_scores(m_index, m_wdata, terms); + return taat(std::move(cws.first), std::move(cws.second)); } + using score_function_type = std::function; // TODO(michal): I think this should be eventually the `operator()` template - uint64_t taat(std::vector cursors) { + uint64_t taat(std::vector cursors, std::vector score_functions) { m_topk.clear(); if (cursors.empty()) { return 0; } std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); - for (auto &cursor : cursors) { - while (cursor.docid() < m_accumulators.size()) { - auto [documents, scores] = cursor.next_block(); - for (uint32_t idx = 0; idx < documents.size(); ++idx) { - intrinsics::prefetch(&m_accumulators[documents[idx + 3]]); - m_accumulators[documents[idx]] += scores[idx]; + for (uint32_t term = 0; term < cursors.size(); ++term) { + auto &cursor = cursors[term]; + auto &score_function = score_functions[term]; + if constexpr (std::is_same_v) { + while (cursor.docid() < m_accumulators.size()) { + auto const &documents = cursor.document_buffer(); + auto const &freqs = cursor.frequency_buffer(); + for (uint32_t idx = 0; idx < documents.size(); ++idx) { + intrinsics::prefetch(&m_accumulators[documents[idx + 3]]); + m_accumulators[documents[idx]] += + score_function(documents[idx], freqs[idx] + 1); + } + cursor.next_block(); } + } else { + // TODO(michal): when no blocks } } for (uint64_t docid = 0u; docid < m_accumulators.size(); ++docid) { From 5d5519b81715bb57681c3a401da879fa4ad33074 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 22 Dec 2018 16:53:02 -0500 Subject: [PATCH 05/32] Blocked accumulator array for TAAT --- .../query/algorithm/exhaustive_taat_query.hpp | 50 ++++++++++++++++--- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 5cf5e3d76..55314f58a 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -39,10 +39,21 @@ template } // namespace query -template +template struct exhaustive_taat_query { exhaustive_taat_query(Index const &index, WandType const &wdata, uint64_t k) - : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} + : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) + { + static_assert(accumulator_block_size >= 0, "must be non-negative"); + if constexpr (accumulator_block_size > 0) { + m_acc_block_count = + (m_accumulators.size() + accumulator_block_size - 1) / accumulator_block_size; + m_accumulators_max.resize(m_acc_block_count); + } else { + m_acc_block_count = 1; + m_accumulators_max.push_back(std::numeric_limits::max()); + } + } uint64_t operator()(term_id_vec terms) { auto cws = query::cursors_with_scores(m_index, m_wdata, terms); @@ -50,6 +61,24 @@ struct exhaustive_taat_query { } using score_function_type = std::function; + void init() + { + std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); + } + + void aggregate() + { + for (uint64_t block = 0u; block < m_acc_block_count; ++block) { + if (not m_topk.would_enter(m_accumulators_max[block])) { + break; + } + uint64_t end = std::max(accumulator_block_size * (block + 1), m_accumulators.size()); + for (uint64_t docid = accumulator_block_size * block; docid < end; ++docid) { + m_topk.insert(m_accumulators[docid], docid); + } + } + } + // TODO(michal): I think this should be eventually the `operator()` template uint64_t taat(std::vector cursors, std::vector score_functions) { @@ -57,7 +86,7 @@ struct exhaustive_taat_query { if (cursors.empty()) { return 0; } - std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); + init(); for (uint32_t term = 0; term < cursors.size(); ++term) { auto &cursor = cursors[term]; auto &score_function = score_functions[term]; @@ -68,8 +97,13 @@ struct exhaustive_taat_query { auto const &freqs = cursor.frequency_buffer(); for (uint32_t idx = 0; idx < documents.size(); ++idx) { intrinsics::prefetch(&m_accumulators[documents[idx + 3]]); - m_accumulators[documents[idx]] += - score_function(documents[idx], freqs[idx] + 1); + auto& accumulator = m_accumulators[documents[idx]]; + accumulator += score_function(documents[idx], freqs[idx] + 1); + if constexpr (accumulator_block_size > 1) { + auto block = documents[idx] / accumulator_block_size; + m_accumulators_max[block] = + std::max(m_accumulators_max[block], accumulator); + } } cursor.next_block(); } @@ -77,9 +111,7 @@ struct exhaustive_taat_query { // TODO(michal): when no blocks } } - for (uint64_t docid = 0u; docid < m_accumulators.size(); ++docid) { - m_topk.insert(m_accumulators[docid], docid); - } + aggregate(); m_topk.finalize(); return m_topk.topk().size(); @@ -92,6 +124,8 @@ struct exhaustive_taat_query { WandType const & m_wdata; topk_queue m_topk; std::vector m_accumulators; + size_t m_acc_block_count{}; + std::vector m_accumulators_max{}; }; template From 886e5808c319a66ebfd02f052a4d9f9c0b03ab1c Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Wed, 26 Dec 2018 19:57:20 -0500 Subject: [PATCH 06/32] TAAT maxscore and lazy accumulator --- include/pisa/query/queries.hpp | 1 + .../query/algorithm/exhaustive_taat_query.hpp | 160 +++++++++++------- .../query/algorithm/maxscore_taat_query.hpp | 152 +++++++++++++++++ src/queries.cpp | 7 +- 4 files changed, 257 insertions(+), 63 deletions(-) create mode 100644 include/query/algorithm/maxscore_taat_query.hpp diff --git a/include/pisa/query/queries.hpp b/include/pisa/query/queries.hpp index bdb81a17e..839302c4c 100644 --- a/include/pisa/query/queries.hpp +++ b/include/pisa/query/queries.hpp @@ -62,3 +62,4 @@ term_freq_vec query_freqs(term_id_vec terms) { #include "algorithm/ranked_or_query.hpp" #include "algorithm/wand_query.hpp" #include "algorithm/exhaustive_taat_query.hpp" +#include "algorithm/maxscore_taat_query.hpp" diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 55314f58a..7d3527a32 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -5,6 +5,8 @@ namespace pisa { +using score_function_type = std::function; + // TODO: These are functions common to query processing in general. // They should be moved out of this file. namespace query { @@ -16,7 +18,6 @@ template // complex refactoring I want to avoid for now. using scorer_type = bm25; using cursor_type = typename Index::document_enumerator; - using score_function_type = std::function; auto query_term_freqs = query_freqs(terms); std::vector cursors; @@ -38,46 +39,105 @@ template } } // namespace query +template +struct Lazy_Accumulator { + static_assert(std::is_integral_v && std::is_unsigned_v, + "must be unsigned number"); + constexpr static auto descriptor_size = sizeof(Descriptor); + constexpr static auto counters_in_descriptor = descriptor_size / counter_bit_size; + constexpr static auto mask = (1u << counter_bit_size) - 1; + constexpr static auto cycle = (1u << counter_bit_size); + + struct Block { + Descriptor descriptor{}; + std::array accumulators{}; + + [[nodiscard]] auto counter(int pos) -> int { + return (descriptor >> (pos * counter_bit_size)) & mask; + } + }; + + Lazy_Accumulator(std::size_t size) + : m_size(size), + m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) {} + + void init() { + if (m_counter == 0) { + auto first = reinterpret_cast(&m_accumulators.front()); + auto last = + std::next(reinterpret_cast(&m_accumulators.back()), sizeof(Block)); + std::fill(first, last, std::byte{}); + } + } -template -struct exhaustive_taat_query { - exhaustive_taat_query(Index const &index, WandType const &wdata, uint64_t k) - : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) - { - static_assert(accumulator_block_size >= 0, "must be non-negative"); - if constexpr (accumulator_block_size > 0) { - m_acc_block_count = - (m_accumulators.size() + accumulator_block_size - 1) / accumulator_block_size; - m_accumulators_max.resize(m_acc_block_count); - } else { - m_acc_block_count = 1; - m_accumulators_max.push_back(std::numeric_limits::max()); + float &operator[](std::ptrdiff_t document) { + auto block = document / counters_in_descriptor; + auto pos_in_block = document % counters_in_descriptor; + if (m_accumulators[block].counter(pos_in_block) < m_counter) { + m_accumulators[block].descriptor ^= (mask << pos_in_block * counter_bit_size); + m_accumulators[block].accumulators[pos_in_block] = 0; } + return m_accumulators[block].accumulators[pos_in_block]; } - uint64_t operator()(term_id_vec terms) { - auto cws = query::cursors_with_scores(m_index, m_wdata, terms); - return taat(std::move(cws.first), std::move(cws.second)); + void aggregate(topk_queue &topk) { + uint64_t docid = 0u; + for (auto const &block : m_accumulators) { + for (auto const &score : block.accumulators) { + topk.insert(score, docid++); + } + }; + m_counter = (m_counter + 1) % cycle; } - using score_function_type = std::function; - void init() - { - std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); + [[nodiscard]] auto size() noexcept -> std::size_t { return m_size; } + + private: + std::size_t m_size; + std::vector m_accumulators; + int m_counter{}; +}; + +struct Simple_Accumulator : public std::vector { + Simple_Accumulator(std::ptrdiff_t size) : std::vector(size) {} + void init() { std::fill(begin(), end(), 0.0); } + void aggregate(topk_queue &topk) { + uint64_t docid = 0u; + std::for_each(begin(), end(), [&](auto score) { topk.insert(score, docid++); }); } +}; - void aggregate() - { - for (uint64_t block = 0u; block < m_acc_block_count; ++block) { - if (not m_topk.would_enter(m_accumulators_max[block])) { - break; +struct Taat_Traversal { + template + void static traverse_term(Cursor &cursor, score_function_type score, Acc &acc) { + if constexpr (std::is_same_v) { + while (cursor.docid() < acc.size()) { + auto const &documents = cursor.document_buffer(); + auto const &freqs = cursor.frequency_buffer(); + for (uint32_t idx = 0; idx < documents.size(); ++idx) { + acc[documents[idx]] = score(documents[idx], freqs[idx]); + } + cursor.next_block(); } - uint64_t end = std::max(accumulator_block_size * (block + 1), m_accumulators.size()); - for (uint64_t docid = accumulator_block_size * block; docid < end; ++docid) { - m_topk.insert(m_accumulators[docid], docid); + } else { + for (; cursor.docid() < acc.size(); cursor.next()) { + acc[cursor.docid()] = score(cursor.docid(), cursor.freq()); } } } +}; + +template +class exhaustive_taat_query { + public: + exhaustive_taat_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} + + uint64_t operator()(term_id_vec terms) { + auto cws = query::cursors_with_scores(m_index, m_wdata, terms); + return taat(std::move(cws.first), std::move(cws.second)); + } // TODO(michal): I think this should be eventually the `operator()` template @@ -86,33 +146,11 @@ struct exhaustive_taat_query { if (cursors.empty()) { return 0; } - init(); + m_accumulators.init(); for (uint32_t term = 0; term < cursors.size(); ++term) { - auto &cursor = cursors[term]; - auto &score_function = score_functions[term]; - if constexpr (std::is_same_v) { - while (cursor.docid() < m_accumulators.size()) { - auto const &documents = cursor.document_buffer(); - auto const &freqs = cursor.frequency_buffer(); - for (uint32_t idx = 0; idx < documents.size(); ++idx) { - intrinsics::prefetch(&m_accumulators[documents[idx + 3]]); - auto& accumulator = m_accumulators[documents[idx]]; - accumulator += score_function(documents[idx], freqs[idx] + 1); - if constexpr (accumulator_block_size > 1) { - auto block = documents[idx] / accumulator_block_size; - m_accumulators_max[block] = - std::max(m_accumulators_max[block], accumulator); - } - } - cursor.next_block(); - } - } else { - // TODO(michal): when no blocks - } + Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); } - aggregate(); - + m_accumulators.aggregate(m_topk); m_topk.finalize(); return m_topk.topk().size(); } @@ -120,19 +158,17 @@ struct exhaustive_taat_query { std::vector> const &topk() const { return m_topk.topk(); } private: - Index const & m_index; - WandType const & m_wdata; - topk_queue m_topk; - std::vector m_accumulators; - size_t m_acc_block_count{}; - std::vector m_accumulators_max{}; + Index const & m_index; + WandType const & m_wdata; + topk_queue m_topk; + Acc m_accumulators; }; -template +template [[nodiscard]] auto make_exhaustive_taat_query(Index const & index, WandType const &wdata, uint64_t k) { - return exhaustive_taat_query(index, wdata, k); + return exhaustive_taat_query(index, wdata, k); } }; // namespace pisa diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/query/algorithm/maxscore_taat_query.hpp new file mode 100644 index 000000000..313da9592 --- /dev/null +++ b/include/query/algorithm/maxscore_taat_query.hpp @@ -0,0 +1,152 @@ +#pragma once + +#include "exhaustive_taat_query.hpp" +#include "topk_queue.hpp" +#include "util/intrinsics.hpp" + +namespace pisa { + +template +[[nodiscard]] auto max_weights(Index const& index, WandType const &wdata, term_id_vec terms) +{ + // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more + // complex refactoring I want to avoid for now. + using scorer_type = bm25; + using cursor_type = typename Index::document_enumerator; + using score_function_type = std::function; + + auto query_term_freqs = query_freqs(terms); + std::vector max_weights; + max_weights.reserve(query_term_freqs.size()); + + for (auto term : query_term_freqs) { + auto list = index[term.first]; + auto q_weight = scorer_type::query_term_weight(term.second, list.size(), index.num_docs()); + max_weights.push_back(q_weight * wdata.max_term_weight(term.first)); + } + return max_weights; +} + +template +std::vector sort_permutation(Container const &container, Function sort_function) { + std::vector p(container.size()); + std::iota(p.begin(), p.end(), 0); + std::sort(p.begin(), p.end(), [&](std::size_t i, std::size_t j) { + return sort_function(container[i], container[j]); + }); + return p; +} + +template +void apply_permutation(Container &container, const std::vector &p) { + std::vector done(container.size()); + for (std::size_t i = 0; i < container.size(); ++i) { + if (done[i]) { + continue; + } + done[i] = true; + std::size_t prev_j = i; + std::size_t j = p[i]; + while (i != j) { + std::swap(container[prev_j], container[j]); + done[j] = true; + prev_j = j; + j = p[j]; + } + } +} + +template +void sort(Container &key_container, Function sort_function, Containers &... containers) { + auto permutation = sort_permutation(key_container, sort_function); + (apply_permutation(containers, permutation), ...); +} + +template +class maxscore_taat_query { + public: + maxscore_taat_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} + + uint64_t operator()(term_id_vec terms) { + auto cws = query::cursors_with_scores(m_index, m_wdata, terms); + return maxscore_taat( + std::move(cws.first), std::move(cws.second), max_weights(m_index, m_wdata, terms)); + } + + template + void traverse_with_lookups(Cursor &cursor, score_function_type score) { + if constexpr (std::is_same_v) { + while (cursor.docid() < m_accumulators.size()) { + auto const &documents = cursor.document_buffer(); + auto const &freqs = cursor.frequency_buffer(); + for (uint32_t idx = 0; idx < documents.size(); ++idx) { + auto& accumulator = m_accumulators[documents[idx]]; + if (accumulator > 0) { + accumulator = score(documents[idx], freqs[idx]); + } + } + cursor.next_block(); + } + } else { + for (; cursor.docid() < m_accumulators.size(); cursor.next()) { + auto &accumulator = m_accumulators[cursor.docid()]; + if (accumulator > 0) { + accumulator = score(cursor.docid(), cursor.freq()); + } + } + } + } + + // TODO(michal): I think this should be eventually the `operator()` + template + uint64_t maxscore_taat(std::vector cursors, + std::vector score_functions, + std::vector max_weights) { + if (cursors.empty()) { + m_topk.clear(); + return 0; + } + sort(max_weights, [](auto lhs, auto rhs) { return lhs < rhs; }, cursors, score_functions); + + float essential_sum = 0; + float nonessential_sum = std::accumulate(max_weights.begin(), max_weights.end(), 0.0); + m_accumulators.init(); + uint32_t term = 0; + for (; term < cursors.size(); ++term) { + essential_sum += max_weights[term]; + nonessential_sum -= max_weights[term]; + Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); + m_topk.clear(); + m_accumulators.aggregate(m_topk); + if (not m_topk.would_enter(nonessential_sum)) { + break; + } + } + + for (; term < cursors.size(); ++term) { + traverse_with_lookups(cursors[term], score_functions[term]); + } + + m_topk.clear(); + m_accumulators.aggregate(m_topk); + m_topk.finalize(); + return m_topk.topk().size(); + } + + std::vector> const &topk() const { return m_topk.topk(); } + + private: + Index const & m_index; + WandType const & m_wdata; + topk_queue m_topk; + Acc m_accumulators; +}; + +template +[[nodiscard]] auto make_maxscore_taat_query(Index const &index, WandType const &wdata, uint64_t k) { + return maxscore_taat_query(index, wdata, k); +} + +}; // namespace pisa diff --git a/src/queries.cpp b/src/queries.cpp index e21c5eba4..350924796 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -135,7 +135,12 @@ void perftest(const std::string &index_filename, return maxscore_query(wdata, k)(index, query); }; } else if (t == "exhaustive_taat" && wand_data_filename) { - query_fun = pisa::make_exhaustive_taat_query(index, wdata, k); + query_fun = pisa::make_exhaustive_taat_query(index, wdata, k); + } else if (t == "exhaustive_taat_lazy" && wand_data_filename) { + query_fun = + pisa::make_exhaustive_taat_query>(index, wdata, k); + } else if (t == "maxscore_taat" && wand_data_filename) { + query_fun = pisa::make_maxscore_taat_query(index, wdata, k); } else { logger() << "Unsupported query type: " << t << std::endl; break; From b0e28245ed9295e6eeddbb197a83af791ece284c Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Fri, 28 Dec 2018 13:18:11 -0500 Subject: [PATCH 07/32] TAAT MaxScore and Blocked Accumulator --- .../query/algorithm/exhaustive_taat_query.hpp | 112 +++++++++++-- .../query/algorithm/maxscore_taat_query.hpp | 157 ++++++++++++++++-- src/queries.cpp | 6 + 3 files changed, 249 insertions(+), 26 deletions(-) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 7d3527a32..e45ab0805 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -39,20 +39,91 @@ template } } // namespace query + +template +struct Blocked_Accumulator { + + struct Proxy_Element { + std::ptrdiff_t document; + std::vector &accumulators; + std::vector &accumulators_max; + + Proxy_Element &operator=(float score) { + accumulators[document] = score; + auto &block_max = accumulators_max[document / block_size]; + if (score > block_max) { + block_max = score; + } + return *this; + } + Proxy_Element &operator+=(float delta) { + accumulators[document] += delta; + auto const&score = accumulators[document]; + auto &block_max = accumulators_max[document / block_size]; + if (score > block_max) { + block_max = score; + } + return *this; + } + + operator float() { return accumulators[document]; } + }; + + using reference = Proxy_Element; + + static_assert(block_size > 0, "must be positive"); + + [[nodiscard]] constexpr static auto calc_block_count(std::size_t size) noexcept -> std::size_t { + return (size + block_size - 1) / block_size; + } + + Blocked_Accumulator(std::size_t size) + : m_size(size), + m_block_count(calc_block_count(size)), m_accumulators(size), + m_accumulators_max(m_block_count) {} + + void init() { std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); } + + [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element { + return {document, m_accumulators, m_accumulators_max}; + } + + void aggregate(topk_queue &topk) { + for (size_t block = 0; block < m_block_count; ++block) { + if (not topk.would_enter(m_accumulators_max[block])) { continue; } + uint32_t doc = block * block_size; + uint32_t end = std::min((block + 1) * block_size, m_accumulators.size()); + for (; doc < end; ++doc) { + topk.insert(m_accumulators[doc], doc); + } + } + } + + [[nodiscard]] auto size() noexcept -> std::size_t { return m_size; } + + private: + std::size_t m_size; + std::size_t m_block_count; + std::vector m_accumulators; + std::vector m_accumulators_max; +}; + template struct Lazy_Accumulator { + using reference = float &; + static_assert(std::is_integral_v && std::is_unsigned_v, "must be unsigned number"); - constexpr static auto descriptor_size = sizeof(Descriptor); - constexpr static auto counters_in_descriptor = descriptor_size / counter_bit_size; - constexpr static auto mask = (1u << counter_bit_size) - 1; - constexpr static auto cycle = (1u << counter_bit_size); + constexpr static auto descriptor_size_in_bits = sizeof(Descriptor) * 8; + constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size; + constexpr static auto mask = (1u << counter_bit_size) - 1; + constexpr static auto cycle = (1u << counter_bit_size); struct Block { Descriptor descriptor{}; std::array accumulators{}; - [[nodiscard]] auto counter(int pos) -> int { + [[nodiscard]] auto counter(int pos) const noexcept -> int { return (descriptor >> (pos * counter_bit_size)) & mask; } }; @@ -66,15 +137,19 @@ struct Lazy_Accumulator { auto first = reinterpret_cast(&m_accumulators.front()); auto last = std::next(reinterpret_cast(&m_accumulators.back()), sizeof(Block)); - std::fill(first, last, std::byte{}); + std::fill(first, last, std::byte{0}); } } - float &operator[](std::ptrdiff_t document) { - auto block = document / counters_in_descriptor; - auto pos_in_block = document % counters_in_descriptor; - if (m_accumulators[block].counter(pos_in_block) < m_counter) { - m_accumulators[block].descriptor ^= (mask << pos_in_block * counter_bit_size); + float &operator[](std::ptrdiff_t const document) { + auto const block = document / counters_in_descriptor; + auto const pos_in_block = document % counters_in_descriptor; + if (m_accumulators[block].accumulators[pos_in_block] > 0 && + m_accumulators[block].counter(pos_in_block) < m_counter) + { + auto const shift = pos_in_block * counter_bit_size; + m_accumulators[block].descriptor &= ~(mask << shift); + m_accumulators[block].descriptor |= m_counter << shift; m_accumulators[block].accumulators[pos_in_block] = 0; } return m_accumulators[block].accumulators[pos_in_block]; @@ -83,8 +158,12 @@ struct Lazy_Accumulator { void aggregate(topk_queue &topk) { uint64_t docid = 0u; for (auto const &block : m_accumulators) { + int pos = 0; for (auto const &score : block.accumulators) { - topk.insert(score, docid++); + if (block.counter(pos++) == m_counter) { + topk.insert(score, docid); + } + ++docid; } }; m_counter = (m_counter + 1) % cycle; @@ -116,13 +195,13 @@ struct Taat_Traversal { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); for (uint32_t idx = 0; idx < documents.size(); ++idx) { - acc[documents[idx]] = score(documents[idx], freqs[idx]); + acc[documents[idx]] += score(documents[idx], freqs[idx]); } cursor.next_block(); } } else { for (; cursor.docid() < acc.size(); cursor.next()) { - acc[cursor.docid()] = score(cursor.docid(), cursor.freq()); + acc[cursor.docid()] += score(cursor.docid(), cursor.freq()); } } } @@ -139,6 +218,11 @@ class exhaustive_taat_query { return taat(std::move(cws.first), std::move(cws.second)); } + uint64_t operator()([[maybe_unused]] Index const &, term_id_vec terms) { + auto cws = query::cursors_with_scores(m_index, m_wdata, terms); + return taat(std::move(cws.first), std::move(cws.second)); + } + // TODO(michal): I think this should be eventually the `operator()` template uint64_t taat(std::vector cursors, std::vector score_functions) { diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/query/algorithm/maxscore_taat_query.hpp index 313da9592..ce595749e 100644 --- a/include/query/algorithm/maxscore_taat_query.hpp +++ b/include/query/algorithm/maxscore_taat_query.hpp @@ -1,6 +1,7 @@ #pragma once #include "exhaustive_taat_query.hpp" +//#include "heap.hpp" #include "topk_queue.hpp" #include "util/intrinsics.hpp" @@ -57,19 +58,123 @@ void apply_permutation(Container &container, const std::vector &p) } template -void sort(Container &key_container, Function sort_function, Containers &... containers) { +void sort_many(Container &key_container, Function sort_function, Containers &... containers) { auto permutation = sort_permutation(key_container, sort_function); (apply_permutation(containers, permutation), ...); } +namespace heap { + +} // namespace heap + +template > +class Bounded_Priority_Queue { + public: + using key_type = K; + using priority_type = P; + using entry_type = std::pair; + + struct Reverse_Function { + std::unordered_map &reverse; + void operator()(entry_type const &entry, int pos) { reverse[entry.first] = pos; } + }; + + explicit Bounded_Priority_Queue(uint64_t capacity, Order order = heap::Max_Order{}) + : m_threshold(0), + m_capacity(capacity), + m_order(std::move(order)), + m_reverse_function{m_reverse} { + m_entries.reserve(capacity + 1); + } + Bounded_Priority_Queue(Bounded_Priority_Queue const &) = default; + Bounded_Priority_Queue &operator=(Bounded_Priority_Queue const &) = default; + + void push(key_type const &key, priority_type const &priority) { + if (DS2I_UNLIKELY(priority < m_threshold)) { + return; + } + m_entries.emplace_back(key, priority); + if (DS2I_UNLIKELY(m_entries.size() < m_capacity)) { + auto pos = push_heap(m_entries.begin(), m_entries.end(), m_reverse_function, m_order); + if (DS2I_UNLIKELY(m_entries.size() == m_capacity)) { + m_threshold = m_entries.front().second; + } + m_reverse[key] = std::distance(m_entries.begin(), pos); + } else { + auto pos = pop_heap(m_entries.begin(), m_entries.end(), m_reverse_function, m_order); + m_entries.pop_back(); + m_threshold = m_entries.front().second; + m_reverse[key] = std::distance(m_entries.begin(), pos); + } + } + + [[nodiscard]] auto find(key_type const& key) const { + if (auto pos = m_reverse.find(key); pos != m_reverse.end()) { + return std::next(m_entries.begin(), *pos); + } + return m_entries.end(); + } + + template + void increase_priority(RandomAccessIterator handle, priority_type priority) { + handle->second = priority; + auto pos = heap::sift_up_and_track(m_entries.begin(), handle, m_order, m_reverse_function); + m_reverse[handle->first] = std::distance(m_entries.begin(), pos); + } + + void push_or_update(key_type const &key, priority_type const &priority) { + if (auto pos = m_reverse.find(key); pos != m_reverse.end()) { + increase_priority(std::next(m_entries.begin(), pos->second), priority); + } + return push(key, priority); + } + + bool would_enter(priority_type priority) const { + return m_entries.size() < m_capacity || priority > m_threshold; + } + + void finalize() { + std::sort_heap(m_entries.begin(), m_entries.end(), m_order); + size_t size = + std::lower_bound(m_entries.begin(), + m_entries.end(), + 0, + [](std::pair l, float r) { return l.first > r; }) - + m_entries.begin(); + m_entries.resize(size); + } + + [[nodiscard]] std::vector const &topk() const noexcept { return m_entries; } + + void clear() noexcept { m_entries.clear(); } + + [[nodiscard]] uint64_t size() const noexcept { return m_capacity; } + + private: + float m_threshold; + std::size_t m_capacity; + std::vector m_entries; + std::unordered_map m_reverse; + Order m_order; + Reverse_Function m_reverse_function; +}; + template class maxscore_taat_query { + using accumulator_reference = typename Acc::reference; + public: maxscore_taat_query(Index const &index, WandType const &wdata, uint64_t k) - : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} + : m_index(index), m_wdata(wdata), m_k(k), m_topk(k), m_accumulators(index.num_docs()) {} uint64_t operator()(term_id_vec terms) { - auto cws = query::cursors_with_scores(m_index, m_wdata, terms); + auto cws = query::cursors_with_scores(m_index, m_wdata, terms); + return maxscore_taat( + std::move(cws.first), std::move(cws.second), max_weights(m_index, m_wdata, terms)); + } + + uint64_t operator()([[maybe_unused]] Index const &, term_id_vec terms) { + auto cws = query::cursors_with_scores(m_index, m_wdata, terms); return maxscore_taat( std::move(cws.first), std::move(cws.second), max_weights(m_index, m_wdata, terms)); } @@ -82,23 +187,50 @@ class maxscore_taat_query { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); for (uint32_t idx = 0; idx < documents.size(); ++idx) { - auto& accumulator = m_accumulators[documents[idx]]; + accumulator_reference accumulator = m_accumulators[documents[idx]]; if (accumulator > 0) { - accumulator = score(documents[idx], freqs[idx]); + accumulator += score(documents[idx], freqs[idx]); } } cursor.next_block(); } } else { for (; cursor.docid() < m_accumulators.size(); cursor.next()) { - auto &accumulator = m_accumulators[cursor.docid()]; + accumulator_reference accumulator = m_accumulators[cursor.docid()]; if (accumulator > 0) { - accumulator = score(cursor.docid(), cursor.freq()); + accumulator += score(cursor.docid(), cursor.freq()); } } } } + //template + //void static traverse_term(Cursor & cursor, + // score_function_type score, + // Acc & acc, + // Bounded_Priority_Queue &heap) + //{ + // if constexpr (std::is_same_v) { + // while (cursor.docid() < acc.size()) { + // auto const &documents = cursor.document_buffer(); + // auto const &freqs = cursor.frequency_buffer(); + // for (uint32_t idx = 0; idx < documents.size(); ++idx) { + // auto document = documents[idx]; + // acc[document] += score(document, freqs[idx]); + // heap.push_or_update(document, acc[document]); + // } + // cursor.next_block(); + // } + // } else { + // for (; cursor.docid() < acc.size(); cursor.next()) { + // auto document = cursor.docid(); + // acc[document] += score(document, cursor.freq()); + // heap.push_or_update(document, acc[document]); + // } + // } + //} + // TODO(michal): I think this should be eventually the `operator()` template uint64_t maxscore_taat(std::vector cursors, @@ -108,21 +240,21 @@ class maxscore_taat_query { m_topk.clear(); return 0; } - sort(max_weights, [](auto lhs, auto rhs) { return lhs < rhs; }, cursors, score_functions); + sort_many( + max_weights, [](auto lhs, auto rhs) { return lhs > rhs; }, cursors, score_functions); - float essential_sum = 0; + //Bounded_Priority_Queue heap(m_k); float nonessential_sum = std::accumulate(max_weights.begin(), max_weights.end(), 0.0); m_accumulators.init(); uint32_t term = 0; for (; term < cursors.size(); ++term) { - essential_sum += max_weights[term]; - nonessential_sum -= max_weights[term]; - Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); m_topk.clear(); m_accumulators.aggregate(m_topk); if (not m_topk.would_enter(nonessential_sum)) { break; } + Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); + nonessential_sum -= max_weights[term]; } for (; term < cursors.size(); ++term) { @@ -140,6 +272,7 @@ class maxscore_taat_query { private: Index const & m_index; WandType const & m_wdata; + int m_k; topk_queue m_topk; Acc m_accumulators; }; diff --git a/src/queries.cpp b/src/queries.cpp index 350924796..8412652c0 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -139,8 +139,14 @@ void perftest(const std::string &index_filename, } else if (t == "exhaustive_taat_lazy" && wand_data_filename) { query_fun = pisa::make_exhaustive_taat_query>(index, wdata, k); + } else if (t == "exhaustive_taat_blocked" && wand_data_filename) { + query_fun = + pisa::make_exhaustive_taat_query>(index, wdata, k); } else if (t == "maxscore_taat" && wand_data_filename) { query_fun = pisa::make_maxscore_taat_query(index, wdata, k); + } else if (t == "maxscore_taat_blocked" && wand_data_filename) { + query_fun = + pisa::make_maxscore_taat_query>(index, wdata, k); } else { logger() << "Unsupported query type: " << t << std::endl; break; From 7737d1fba250a35c0a15e1761bfc0045cf2c26b1 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Fri, 28 Dec 2018 14:02:31 -0500 Subject: [PATCH 08/32] Remove heap stuff --- .../query/algorithm/maxscore_taat_query.hpp | 125 ------------------ 1 file changed, 125 deletions(-) diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/query/algorithm/maxscore_taat_query.hpp index ce595749e..31b6633ad 100644 --- a/include/query/algorithm/maxscore_taat_query.hpp +++ b/include/query/algorithm/maxscore_taat_query.hpp @@ -1,7 +1,6 @@ #pragma once #include "exhaustive_taat_query.hpp" -//#include "heap.hpp" #include "topk_queue.hpp" #include "util/intrinsics.hpp" @@ -63,102 +62,6 @@ void sort_many(Container &key_container, Function sort_function, Containers &... (apply_permutation(containers, permutation), ...); } -namespace heap { - -} // namespace heap - -template > -class Bounded_Priority_Queue { - public: - using key_type = K; - using priority_type = P; - using entry_type = std::pair; - - struct Reverse_Function { - std::unordered_map &reverse; - void operator()(entry_type const &entry, int pos) { reverse[entry.first] = pos; } - }; - - explicit Bounded_Priority_Queue(uint64_t capacity, Order order = heap::Max_Order{}) - : m_threshold(0), - m_capacity(capacity), - m_order(std::move(order)), - m_reverse_function{m_reverse} { - m_entries.reserve(capacity + 1); - } - Bounded_Priority_Queue(Bounded_Priority_Queue const &) = default; - Bounded_Priority_Queue &operator=(Bounded_Priority_Queue const &) = default; - - void push(key_type const &key, priority_type const &priority) { - if (DS2I_UNLIKELY(priority < m_threshold)) { - return; - } - m_entries.emplace_back(key, priority); - if (DS2I_UNLIKELY(m_entries.size() < m_capacity)) { - auto pos = push_heap(m_entries.begin(), m_entries.end(), m_reverse_function, m_order); - if (DS2I_UNLIKELY(m_entries.size() == m_capacity)) { - m_threshold = m_entries.front().second; - } - m_reverse[key] = std::distance(m_entries.begin(), pos); - } else { - auto pos = pop_heap(m_entries.begin(), m_entries.end(), m_reverse_function, m_order); - m_entries.pop_back(); - m_threshold = m_entries.front().second; - m_reverse[key] = std::distance(m_entries.begin(), pos); - } - } - - [[nodiscard]] auto find(key_type const& key) const { - if (auto pos = m_reverse.find(key); pos != m_reverse.end()) { - return std::next(m_entries.begin(), *pos); - } - return m_entries.end(); - } - - template - void increase_priority(RandomAccessIterator handle, priority_type priority) { - handle->second = priority; - auto pos = heap::sift_up_and_track(m_entries.begin(), handle, m_order, m_reverse_function); - m_reverse[handle->first] = std::distance(m_entries.begin(), pos); - } - - void push_or_update(key_type const &key, priority_type const &priority) { - if (auto pos = m_reverse.find(key); pos != m_reverse.end()) { - increase_priority(std::next(m_entries.begin(), pos->second), priority); - } - return push(key, priority); - } - - bool would_enter(priority_type priority) const { - return m_entries.size() < m_capacity || priority > m_threshold; - } - - void finalize() { - std::sort_heap(m_entries.begin(), m_entries.end(), m_order); - size_t size = - std::lower_bound(m_entries.begin(), - m_entries.end(), - 0, - [](std::pair l, float r) { return l.first > r; }) - - m_entries.begin(); - m_entries.resize(size); - } - - [[nodiscard]] std::vector const &topk() const noexcept { return m_entries; } - - void clear() noexcept { m_entries.clear(); } - - [[nodiscard]] uint64_t size() const noexcept { return m_capacity; } - - private: - float m_threshold; - std::size_t m_capacity; - std::vector m_entries; - std::unordered_map m_reverse; - Order m_order; - Reverse_Function m_reverse_function; -}; - template class maxscore_taat_query { using accumulator_reference = typename Acc::reference; @@ -204,33 +107,6 @@ class maxscore_taat_query { } } - //template - //void static traverse_term(Cursor & cursor, - // score_function_type score, - // Acc & acc, - // Bounded_Priority_Queue &heap) - //{ - // if constexpr (std::is_same_v) { - // while (cursor.docid() < acc.size()) { - // auto const &documents = cursor.document_buffer(); - // auto const &freqs = cursor.frequency_buffer(); - // for (uint32_t idx = 0; idx < documents.size(); ++idx) { - // auto document = documents[idx]; - // acc[document] += score(document, freqs[idx]); - // heap.push_or_update(document, acc[document]); - // } - // cursor.next_block(); - // } - // } else { - // for (; cursor.docid() < acc.size(); cursor.next()) { - // auto document = cursor.docid(); - // acc[document] += score(document, cursor.freq()); - // heap.push_or_update(document, acc[document]); - // } - // } - //} - // TODO(michal): I think this should be eventually the `operator()` template uint64_t maxscore_taat(std::vector cursors, @@ -243,7 +119,6 @@ class maxscore_taat_query { sort_many( max_weights, [](auto lhs, auto rhs) { return lhs > rhs; }, cursors, score_functions); - //Bounded_Priority_Queue heap(m_k); float nonessential_sum = std::accumulate(max_weights.begin(), max_weights.end(), 0.0); m_accumulators.init(); uint32_t term = 0; From 0664cbad164fc2c0b18ffabe6d31b5ca2a3dc454 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 29 Dec 2018 09:40:43 -0500 Subject: [PATCH 09/32] TAAT optimizations --- include/query/algorithm/exhaustive_taat_query.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index e45ab0805..cbfa83c35 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include "util/intrinsics.hpp" #include "topk_queue.hpp" @@ -194,6 +196,7 @@ struct Taat_Traversal { while (cursor.docid() < acc.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); + #pragma omp simd for (uint32_t idx = 0; idx < documents.size(); ++idx) { acc[documents[idx]] += score(documents[idx], freqs[idx]); } From 9ae6b1aa7874376452be8d3805c78a61e9fa9120 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 29 Dec 2018 10:06:35 -0500 Subject: [PATCH 10/32] Vectorize lookup traversal. --- include/query/algorithm/maxscore_taat_query.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/query/algorithm/maxscore_taat_query.hpp index 31b6633ad..b411f97d7 100644 --- a/include/query/algorithm/maxscore_taat_query.hpp +++ b/include/query/algorithm/maxscore_taat_query.hpp @@ -89,6 +89,7 @@ class maxscore_taat_query { while (cursor.docid() < m_accumulators.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); + #pragma omp simd for (uint32_t idx = 0; idx < documents.size(); ++idx) { accumulator_reference accumulator = m_accumulators[documents[idx]]; if (accumulator > 0) { From 26a05f120a8d5e538f7dafadc0da0e944b53219b Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Tue, 15 Jan 2019 12:34:59 +0100 Subject: [PATCH 11/32] Simple but effective ranked_or with taat --- include/pisa/query/queries.hpp | 1 + .../query/algorithm/ranked_or_taat_query.hpp | 47 +++++++++++++++++++ src/queries.cpp | 4 ++ test/test_ranked_queries.cpp | 44 +++++++++++++++++ 4 files changed, 96 insertions(+) create mode 100644 include/query/algorithm/ranked_or_taat_query.hpp diff --git a/include/pisa/query/queries.hpp b/include/pisa/query/queries.hpp index 839302c4c..638d7c028 100644 --- a/include/pisa/query/queries.hpp +++ b/include/pisa/query/queries.hpp @@ -63,3 +63,4 @@ term_freq_vec query_freqs(term_id_vec terms) { #include "algorithm/wand_query.hpp" #include "algorithm/exhaustive_taat_query.hpp" #include "algorithm/maxscore_taat_query.hpp" +#include "algorithm/ranked_or_taat_query.hpp" diff --git a/include/query/algorithm/ranked_or_taat_query.hpp b/include/query/algorithm/ranked_or_taat_query.hpp new file mode 100644 index 000000000..d83f109b1 --- /dev/null +++ b/include/query/algorithm/ranked_or_taat_query.hpp @@ -0,0 +1,47 @@ +#pragma once + +template +struct ranked_or_taat_query { + + typedef bm25 scorer_type; + + ranked_or_taat_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + + template + uint64_t operator()(Index const &index, term_id_vec terms) { + m_topk.clear(); + if (terms.empty()) + return 0; + + auto query_term_freqs = query_freqs(terms); + + uint64_t num_docs = index.num_docs(); + std::vector accumulator(num_docs, 0.0f); + for (auto term : query_term_freqs) { + auto list = index[term.first]; + auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); + auto cur_doc = list.docid(); + while(cur_doc < num_docs){ + float norm_len = m_wdata->norm_len(cur_doc); + float score = q_weight * scorer_type::doc_term_weight(list.freq(), norm_len); + accumulator[cur_doc] += score; + list.next(); + cur_doc = list.docid(); + } + } + + for(auto&& v : accumulator) { + m_topk.insert(v); + } + + m_topk.finalize(); + return m_topk.topk().size(); + } + + std::vector> const &topk() const { return m_topk.topk(); } + + private: + WandType const *m_wdata; + topk_queue m_topk; +}; + diff --git a/src/queries.cpp b/src/queries.cpp index 8412652c0..1026cd3e2 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -142,6 +142,10 @@ void perftest(const std::string &index_filename, } else if (t == "exhaustive_taat_blocked" && wand_data_filename) { query_fun = pisa::make_exhaustive_taat_query>(index, wdata, k); + } else if (t == "ranked_or_taat" && wand_data_filename) { + query_fun = [&](ds2i::term_id_vec query) { + return ranked_or_taat_query(wdata, k)(index, query); + }; } else if (t == "maxscore_taat" && wand_data_filename) { query_fun = pisa::make_maxscore_taat_query(index, wdata, k); } else if (t == "maxscore_taat_blocked" && wand_data_filename) { diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 4b8ef4a59..3d95ed4ae 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -93,6 +93,50 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "block_max_maxscore") test_against_or(bmm_q); } +TEST_CASE_METHOD(ds2i::test::index_initialization, "exhaustive_taat") +{ + ds2i::pisa::exhaustive_taat_query taat_q( + index, wdata, 10); + test_against_or(taat_q); +} + +TEST_CASE_METHOD(ds2i::test::index_initialization, "exhaustive_taat_blocked") +{ + ds2i::pisa::exhaustive_taat_query> + taat_q(index, wdata, 10); + test_against_or(taat_q); +} + +TEST_CASE_METHOD(ds2i::test::index_initialization, "maxscore_taat") +{ + ds2i::pisa::maxscore_taat_query taat_q( + index, wdata, 10); + test_against_or(taat_q); +} + +TEST_CASE_METHOD(ds2i::test::index_initialization, "maxscore_taat_blocked") +{ + ds2i::pisa::maxscore_taat_query> + taat_q(index, wdata, 10); + test_against_or(taat_q); +} + +TEST_CASE_METHOD(ds2i::test::index_initialization, "ranked_or_taat") +{ + + ds2i::ranked_or_taat_query ranked_or_taat_q(wdata, 10); + test_against_or(ranked_or_taat_q); +} + +// TODO(michal): there is a bug, investigate! +//BOOST_FIXTURE_TEST_CASE(exhaustive_taat_lazy, +// ds2i::test::index_initialization) +//{ +// ds2i::pisa::exhaustive_taat_query> +// taat_q(index, wdata, 10); +// test_against_or(taat_q); +//} + /// Issue #26 https://github.com/pisa-engine/pisa/issues/26 TEST_CASE_METHOD(pisa::test::index_initialization, "topk_size_ranked_or") { From 41ca1981ef7390d9b26b038549b1fedc539989d3 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Tue, 15 Jan 2019 09:15:28 -0500 Subject: [PATCH 12/32] Remove OpenMP --- include/query/algorithm/exhaustive_taat_query.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index cbfa83c35..01b445050 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -1,7 +1,5 @@ #pragma once -#include - #include "util/intrinsics.hpp" #include "topk_queue.hpp" From 200be058e2008b0523e8127e07ad13a1bcdafb17 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Tue, 15 Jan 2019 09:18:33 -0500 Subject: [PATCH 13/32] Remove OpenMP --- include/query/algorithm/exhaustive_taat_query.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 01b445050..e45ab0805 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -194,7 +194,6 @@ struct Taat_Traversal { while (cursor.docid() < acc.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); - #pragma omp simd for (uint32_t idx = 0; idx < documents.size(); ++idx) { acc[documents[idx]] += score(documents[idx], freqs[idx]); } From c3400d1e310b9613b915f4110e48040bea4fbab8 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Tue, 15 Jan 2019 10:38:09 -0500 Subject: [PATCH 14/32] Use template rather than std::function for faster processing --- .../query/algorithm/exhaustive_taat_query.hpp | 22 +++++++++++++------ .../query/algorithm/maxscore_taat_query.hpp | 8 +++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index e45ab0805..f103afe62 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -5,7 +5,15 @@ namespace pisa { -using score_function_type = std::function; +template +struct Score_Function { + float query_weight; + std::reference_wrapper wdata; + + [[nodiscard]] auto operator()(uint32_t doc, uint32_t freq) const -> float { + return query_weight * Scorer::doc_term_weight(freq, wdata.get().norm_len(doc)); + } +}; // TODO: These are functions common to query processing in general. // They should be moved out of this file. @@ -18,6 +26,7 @@ template // complex refactoring I want to avoid for now. using scorer_type = bm25; using cursor_type = typename Index::document_enumerator; + using score_function_type = Score_Function; auto query_term_freqs = query_freqs(terms); std::vector cursors; @@ -30,10 +39,7 @@ template uint64_t num_docs = index.num_docs(); auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); cursors.push_back(std::move(list)); - score_functions.push_back([q_weight, &wdata](auto docid, auto freq) { - float norm_len = wdata.norm_len(docid); - return q_weight * scorer_type::doc_term_weight(freq, norm_len); - }); + score_functions.push_back({q_weight, std::cref(wdata)}); } return std::make_pair(cursors, score_functions); } @@ -187,8 +193,8 @@ struct Simple_Accumulator : public std::vector { }; struct Taat_Traversal { - template - void static traverse_term(Cursor &cursor, score_function_type score, Acc &acc) { + template + void static traverse_term(Cursor &cursor, Score score, Acc &acc) { if constexpr (std::is_same_v) { while (cursor.docid() < acc.size()) { @@ -209,6 +215,8 @@ struct Taat_Traversal { template class exhaustive_taat_query { + using score_function_type = Score_Function; + public: exhaustive_taat_query(Index const &index, WandType const &wdata, uint64_t k) : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/query/algorithm/maxscore_taat_query.hpp index b411f97d7..7ec25de0d 100644 --- a/include/query/algorithm/maxscore_taat_query.hpp +++ b/include/query/algorithm/maxscore_taat_query.hpp @@ -13,7 +13,7 @@ template // complex refactoring I want to avoid for now. using scorer_type = bm25; using cursor_type = typename Index::document_enumerator; - using score_function_type = std::function; + using score_function_type = Score_Function; auto query_term_freqs = query_freqs(terms); std::vector max_weights; @@ -65,6 +65,7 @@ void sort_many(Container &key_container, Function sort_function, Containers &... template class maxscore_taat_query { using accumulator_reference = typename Acc::reference; + using score_function_type = Score_Function; public: maxscore_taat_query(Index const &index, WandType const &wdata, uint64_t k) @@ -82,14 +83,13 @@ class maxscore_taat_query { std::move(cws.first), std::move(cws.second), max_weights(m_index, m_wdata, terms)); } - template - void traverse_with_lookups(Cursor &cursor, score_function_type score) { + template + void traverse_with_lookups(Cursor &cursor, Score score) { if constexpr (std::is_same_v) { while (cursor.docid() < m_accumulators.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); - #pragma omp simd for (uint32_t idx = 0; idx < documents.size(); ++idx) { accumulator_reference accumulator = m_accumulators[documents[idx]]; if (accumulator > 0) { From 87252f39d785b8bc08559d9cd99fc4e605ef0d2b Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Tue, 15 Jan 2019 18:21:36 -0500 Subject: [PATCH 15/32] Lazy accumulator fixed --- .../query/algorithm/exhaustive_taat_query.hpp | 57 +++++++++++++++---- test/test_ranked_queries.cpp | 14 ++--- 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index f103afe62..730f7dca7 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -90,10 +90,21 @@ struct Blocked_Accumulator { void init() { std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); } - [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element { + [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element + { return {document, m_accumulators, m_accumulators_max}; } + void accumulate(std::ptrdiff_t const document, float score_delta) + { + m_accumulators[document] += score_delta; + auto const &score = m_accumulators[document]; + auto &block_max = m_accumulators_max[document / block_size]; + if (score > block_max) { + block_max = score; + } + } + void aggregate(topk_queue &topk) { for (size_t block = 0; block < m_block_count; ++block) { if (not topk.would_enter(m_accumulators_max[block])) { continue; } @@ -121,9 +132,9 @@ struct Lazy_Accumulator { static_assert(std::is_integral_v && std::is_unsigned_v, "must be unsigned number"); constexpr static auto descriptor_size_in_bits = sizeof(Descriptor) * 8; - constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size; - constexpr static auto mask = (1u << counter_bit_size) - 1; - constexpr static auto cycle = (1u << counter_bit_size); + constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size; + constexpr static auto cycle = (1u << counter_bit_size); + constexpr static Descriptor mask = (1u << counter_bit_size) - 1; struct Block { Descriptor descriptor{}; @@ -132,13 +143,22 @@ struct Lazy_Accumulator { [[nodiscard]] auto counter(int pos) const noexcept -> int { return (descriptor >> (pos * counter_bit_size)) & mask; } + + void reset_counter(int pos, int counter) + { + auto const shift = pos * counter_bit_size; + descriptor &= ~(mask << shift); + descriptor |= static_cast(counter) << shift; + accumulators[pos] = 0; + } }; Lazy_Accumulator(std::size_t size) - : m_size(size), - m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) {} + : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) + {} - void init() { + void init() + { if (m_counter == 0) { auto first = reinterpret_cast(&m_accumulators.front()); auto last = @@ -150,8 +170,8 @@ struct Lazy_Accumulator { float &operator[](std::ptrdiff_t const document) { auto const block = document / counters_in_descriptor; auto const pos_in_block = document % counters_in_descriptor; - if (m_accumulators[block].accumulators[pos_in_block] > 0 && - m_accumulators[block].counter(pos_in_block) < m_counter) + if (//m_accumulators[block].accumulators[pos_in_block] > 0 && + m_accumulators[block].counter(pos_in_block) != m_counter) { auto const shift = pos_in_block * counter_bit_size; m_accumulators[block].descriptor &= ~(mask << shift); @@ -161,6 +181,16 @@ struct Lazy_Accumulator { return m_accumulators[block].accumulators[pos_in_block]; } + void accumulate(std::ptrdiff_t const document, float score) + { + auto const block = document / counters_in_descriptor; + auto const pos_in_block = document % counters_in_descriptor; + if (m_accumulators[block].counter(pos_in_block) != m_counter) { + m_accumulators[block].reset_counter(pos_in_block, m_counter); + } + m_accumulators[block].accumulators[pos_in_block] += score; + } + void aggregate(topk_queue &topk) { uint64_t docid = 0u; for (auto const &block : m_accumulators) { @@ -175,7 +205,9 @@ struct Lazy_Accumulator { m_counter = (m_counter + 1) % cycle; } - [[nodiscard]] auto size() noexcept -> std::size_t { return m_size; } + [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; } + [[nodiscard]] auto blocks() noexcept -> std::vector & { return m_accumulators; } + [[nodiscard]] auto counter() const noexcept -> int { return m_counter; } private: std::size_t m_size; @@ -186,6 +218,7 @@ struct Lazy_Accumulator { struct Simple_Accumulator : public std::vector { Simple_Accumulator(std::ptrdiff_t size) : std::vector(size) {} void init() { std::fill(begin(), end(), 0.0); } + void accumulate(uint32_t doc, float score) { operator[](doc) += score; } void aggregate(topk_queue &topk) { uint64_t docid = 0u; std::for_each(begin(), end(), [&](auto score) { topk.insert(score, docid++); }); @@ -201,13 +234,13 @@ struct Taat_Traversal { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); for (uint32_t idx = 0; idx < documents.size(); ++idx) { - acc[documents[idx]] += score(documents[idx], freqs[idx]); + acc.accumulate(documents[idx], score(documents[idx], freqs[idx])); } cursor.next_block(); } } else { for (; cursor.docid() < acc.size(); cursor.next()) { - acc[cursor.docid()] += score(cursor.docid(), cursor.freq()); + acc.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); } } } diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 3d95ed4ae..b802dfaed 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -128,14 +128,12 @@ TEST_CASE_METHOD(ds2i::test::index_initialization, "ranked_or_taat") test_against_or(ranked_or_taat_q); } -// TODO(michal): there is a bug, investigate! -//BOOST_FIXTURE_TEST_CASE(exhaustive_taat_lazy, -// ds2i::test::index_initialization) -//{ -// ds2i::pisa::exhaustive_taat_query> -// taat_q(index, wdata, 10); -// test_against_or(taat_q); -//} +TEST_CASE_METHOD(ds2i::test::index_initialization, "exhaustive_taat_lazy") +{ + ds2i::pisa::exhaustive_taat_query> taat_q( + index, wdata, 10); + test_against_or(taat_q); +} /// Issue #26 https://github.com/pisa-engine/pisa/issues/26 TEST_CASE_METHOD(pisa::test::index_initialization, "topk_size_ranked_or") From b9dd1fdb0fb82ab9c8cc665b359b257b2ba0ac64 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Tue, 15 Jan 2019 19:07:44 -0500 Subject: [PATCH 16/32] Fix block traversal issue --- CMakeLists.txt | 6 ------ include/query/algorithm/exhaustive_taat_query.hpp | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f6cbb6cd..988fd4baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,12 +40,6 @@ list(APPEND LCOV_REMOVE_PATTERNS "'${PROJECT_SOURCE_DIR}/external/*'") if (UNIX) -<<<<<<< HEAD -======= - # C++14 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") ->>>>>>> Fetch an entire block at a time for TAAT - # For hardware popcount and other special instructions set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 730f7dca7..4bad21d1b 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -227,14 +227,15 @@ struct Simple_Accumulator : public std::vector { struct Taat_Traversal { template - void static traverse_term(Cursor &cursor, Score score, Acc &acc) { + void static traverse_term(Cursor &cursor, Score score, Acc &acc) + { if constexpr (std::is_same_v) { while (cursor.docid() < acc.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); for (uint32_t idx = 0; idx < documents.size(); ++idx) { - acc.accumulate(documents[idx], score(documents[idx], freqs[idx])); + acc.accumulate(documents[idx], score(documents[idx], freqs[idx] + 1)); } cursor.next_block(); } From 5c539b40c4435429274a7e8945ec7ef553d14f8d Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 12:48:51 +0100 Subject: [PATCH 17/32] Removed ds2i namespace --- include/pisa/block_posting_list.hpp | 2 +- include/pisa/freq_index.hpp | 2 +- .../query/algorithm/exhaustive_taat_query.hpp | 2 +- .../query/algorithm/maxscore_taat_query.hpp | 2 +- src/queries.cpp | 2 +- test/test_ranked_queries.cpp | 24 +++++++++---------- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/pisa/block_posting_list.hpp b/include/pisa/block_posting_list.hpp index f2f13315e..7489530c5 100644 --- a/include/pisa/block_posting_list.hpp +++ b/include/pisa/block_posting_list.hpp @@ -82,7 +82,7 @@ namespace pisa { class document_enumerator { public: - using enumerator_category = ds2i::block_enumerator_tag; + using enumerator_category = pisa::block_enumerator_tag; document_enumerator(uint8_t const* data, uint64_t universe, size_t term_id = 0) diff --git a/include/pisa/freq_index.hpp b/include/pisa/freq_index.hpp index 38804f615..38f9d748d 100644 --- a/include/pisa/freq_index.hpp +++ b/include/pisa/freq_index.hpp @@ -76,7 +76,7 @@ namespace pisa { class document_enumerator { public: - using enumerator_category = ds2i::input_enumerator_tag; + using enumerator_category = pisa::input_enumerator_tag; void reset() { m_cur_pos = 0; diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/query/algorithm/exhaustive_taat_query.hpp index 4bad21d1b..f4b90cef1 100644 --- a/include/query/algorithm/exhaustive_taat_query.hpp +++ b/include/query/algorithm/exhaustive_taat_query.hpp @@ -230,7 +230,7 @@ struct Taat_Traversal { void static traverse_term(Cursor &cursor, Score score, Acc &acc) { if constexpr (std::is_same_v) { + pisa::block_enumerator_tag>) { while (cursor.docid() < acc.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/query/algorithm/maxscore_taat_query.hpp index 7ec25de0d..5e4cd9ad1 100644 --- a/include/query/algorithm/maxscore_taat_query.hpp +++ b/include/query/algorithm/maxscore_taat_query.hpp @@ -86,7 +86,7 @@ class maxscore_taat_query { template void traverse_with_lookups(Cursor &cursor, Score score) { if constexpr (std::is_same_v) { + pisa::block_enumerator_tag>) { while (cursor.docid() < m_accumulators.size()) { auto const &documents = cursor.document_buffer(); auto const &freqs = cursor.frequency_buffer(); diff --git a/src/queries.cpp b/src/queries.cpp index 1026cd3e2..2e950d5fc 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -143,7 +143,7 @@ void perftest(const std::string &index_filename, query_fun = pisa::make_exhaustive_taat_query>(index, wdata, k); } else if (t == "ranked_or_taat" && wand_data_filename) { - query_fun = [&](ds2i::term_id_vec query) { + query_fun = [&](pisa::term_id_vec query) { return ranked_or_taat_query(wdata, k)(index, query); }; } else if (t == "maxscore_taat" && wand_data_filename) { diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index b802dfaed..8953c0578 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -93,44 +93,44 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "block_max_maxscore") test_against_or(bmm_q); } -TEST_CASE_METHOD(ds2i::test::index_initialization, "exhaustive_taat") +TEST_CASE_METHOD(pisa::test::index_initialization, "exhaustive_taat") { - ds2i::pisa::exhaustive_taat_query taat_q( + pisa::pisa::exhaustive_taat_query taat_q( index, wdata, 10); test_against_or(taat_q); } -TEST_CASE_METHOD(ds2i::test::index_initialization, "exhaustive_taat_blocked") +TEST_CASE_METHOD(pisa::test::index_initialization, "exhaustive_taat_blocked") { - ds2i::pisa::exhaustive_taat_query> + pisa::pisa::exhaustive_taat_query> taat_q(index, wdata, 10); test_against_or(taat_q); } -TEST_CASE_METHOD(ds2i::test::index_initialization, "maxscore_taat") +TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat") { - ds2i::pisa::maxscore_taat_query taat_q( + pisa::pisa::maxscore_taat_query taat_q( index, wdata, 10); test_against_or(taat_q); } -TEST_CASE_METHOD(ds2i::test::index_initialization, "maxscore_taat_blocked") +TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat_blocked") { - ds2i::pisa::maxscore_taat_query> + pisa::pisa::maxscore_taat_query> taat_q(index, wdata, 10); test_against_or(taat_q); } -TEST_CASE_METHOD(ds2i::test::index_initialization, "ranked_or_taat") +TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat") { - ds2i::ranked_or_taat_query ranked_or_taat_q(wdata, 10); + pisa::ranked_or_taat_query ranked_or_taat_q(wdata, 10); test_against_or(ranked_or_taat_q); } -TEST_CASE_METHOD(ds2i::test::index_initialization, "exhaustive_taat_lazy") +TEST_CASE_METHOD(pisa::test::index_initialization, "exhaustive_taat_lazy") { - ds2i::pisa::exhaustive_taat_query> taat_q( + pisa::pisa::exhaustive_taat_query> taat_q( index, wdata, 10); test_against_or(taat_q); } From 0c3663290393e8bd8d9e204895517d13f7a340f9 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 12:50:26 +0100 Subject: [PATCH 18/32] Moved algos --- include/{ => pisa}/query/algorithm/exhaustive_taat_query.hpp | 0 include/{ => pisa}/query/algorithm/maxscore_taat_query.hpp | 0 include/{ => pisa}/query/algorithm/ranked_or_taat_query.hpp | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename include/{ => pisa}/query/algorithm/exhaustive_taat_query.hpp (100%) rename include/{ => pisa}/query/algorithm/maxscore_taat_query.hpp (100%) rename include/{ => pisa}/query/algorithm/ranked_or_taat_query.hpp (100%) diff --git a/include/query/algorithm/exhaustive_taat_query.hpp b/include/pisa/query/algorithm/exhaustive_taat_query.hpp similarity index 100% rename from include/query/algorithm/exhaustive_taat_query.hpp rename to include/pisa/query/algorithm/exhaustive_taat_query.hpp diff --git a/include/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp similarity index 100% rename from include/query/algorithm/maxscore_taat_query.hpp rename to include/pisa/query/algorithm/maxscore_taat_query.hpp diff --git a/include/query/algorithm/ranked_or_taat_query.hpp b/include/pisa/query/algorithm/ranked_or_taat_query.hpp similarity index 100% rename from include/query/algorithm/ranked_or_taat_query.hpp rename to include/pisa/query/algorithm/ranked_or_taat_query.hpp From 6e96d163261097e3ed60491f4818ed2c6d5c4657 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 13:22:11 +0100 Subject: [PATCH 19/32] Merge with master --- CMakeLists.txt | 2 - .../pisa/accumulator/blocked_accumulator.hpp | 84 ++++++++ include/pisa/accumulator/lazy_accumulator.hpp | 95 +++++++++ .../pisa/accumulator/simple_accumulator.hpp | 15 ++ .../query/algorithm/exhaustive_taat_query.hpp | 183 +----------------- .../query/algorithm/ranked_or_taat_query.hpp | 16 +- test/test_ranked_queries.cpp | 10 +- 7 files changed, 212 insertions(+), 193 deletions(-) create mode 100644 include/pisa/accumulator/blocked_accumulator.hpp create mode 100644 include/pisa/accumulator/lazy_accumulator.hpp create mode 100644 include/pisa/accumulator/simple_accumulator.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 988fd4baa..064603f17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,6 @@ endif() set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) -link_libraries(Threads::Threads) - include_directories(include) add_library(pisa INTERFACE) diff --git a/include/pisa/accumulator/blocked_accumulator.hpp b/include/pisa/accumulator/blocked_accumulator.hpp new file mode 100644 index 000000000..7d03dcb40 --- /dev/null +++ b/include/pisa/accumulator/blocked_accumulator.hpp @@ -0,0 +1,84 @@ +#pragma once + +namespace pisa { + +template +struct Blocked_Accumulator { + + struct Proxy_Element { + std::ptrdiff_t document; + std::vector &accumulators; + std::vector &accumulators_max; + + Proxy_Element &operator=(float score) { + accumulators[document] = score; + auto &block_max = accumulators_max[document / block_size]; + if (score > block_max) { + block_max = score; + } + return *this; + } + Proxy_Element &operator+=(float delta) { + accumulators[document] += delta; + auto const&score = accumulators[document]; + auto &block_max = accumulators_max[document / block_size]; + if (score > block_max) { + block_max = score; + } + return *this; + } + + operator float() { return accumulators[document]; } + }; + + using reference = Proxy_Element; + + static_assert(block_size > 0, "must be positive"); + + [[nodiscard]] constexpr static auto calc_block_count(std::size_t size) noexcept -> std::size_t { + return (size + block_size - 1) / block_size; + } + + Blocked_Accumulator(std::size_t size) + : m_size(size), + m_block_count(calc_block_count(size)), m_accumulators(size), + m_accumulators_max(m_block_count) {} + + void init() { std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); } + + [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element + { + return {document, m_accumulators, m_accumulators_max}; + } + + void accumulate(std::ptrdiff_t const document, float score_delta) + { + m_accumulators[document] += score_delta; + auto const &score = m_accumulators[document]; + auto &block_max = m_accumulators_max[document / block_size]; + if (score > block_max) { + block_max = score; + } + } + + void aggregate(topk_queue &topk) { + for (size_t block = 0; block < m_block_count; ++block) { + if (not topk.would_enter(m_accumulators_max[block])) { continue; } + uint32_t doc = block * block_size; + uint32_t end = std::min((block + 1) * block_size, m_accumulators.size()); + for (; doc < end; ++doc) { + topk.insert(m_accumulators[doc], doc); + } + } + } + + [[nodiscard]] auto size() noexcept -> std::size_t { return m_size; } + + private: + std::size_t m_size; + std::size_t m_block_count; + std::vector m_accumulators; + std::vector m_accumulators_max; +}; + +} // pisa diff --git a/include/pisa/accumulator/lazy_accumulator.hpp b/include/pisa/accumulator/lazy_accumulator.hpp new file mode 100644 index 000000000..347a8eba2 --- /dev/null +++ b/include/pisa/accumulator/lazy_accumulator.hpp @@ -0,0 +1,95 @@ +#pragma once + +namespace pisa { + +template +struct Lazy_Accumulator { + using reference = float &; + + static_assert(std::is_integral_v && std::is_unsigned_v, + "must be unsigned number"); + constexpr static auto descriptor_size_in_bits = sizeof(Descriptor) * 8; + constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size; + constexpr static auto cycle = (1u << counter_bit_size); + constexpr static Descriptor mask = (1u << counter_bit_size) - 1; + + struct Block { + Descriptor descriptor{}; + std::array accumulators{}; + + [[nodiscard]] auto counter(int pos) const noexcept -> int { + return (descriptor >> (pos * counter_bit_size)) & mask; + } + + void reset_counter(int pos, int counter) + { + auto const shift = pos * counter_bit_size; + descriptor &= ~(mask << shift); + descriptor |= static_cast(counter) << shift; + accumulators[pos] = 0; + } + }; + + Lazy_Accumulator(std::size_t size) + : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) + {} + + void init() + { + if (m_counter == 0) { + auto first = reinterpret_cast(&m_accumulators.front()); + auto last = + std::next(reinterpret_cast(&m_accumulators.back()), sizeof(Block)); + std::fill(first, last, std::byte{0}); + } + } + + float &operator[](std::ptrdiff_t const document) { + auto const block = document / counters_in_descriptor; + auto const pos_in_block = document % counters_in_descriptor; + if (//m_accumulators[block].accumulators[pos_in_block] > 0 && + m_accumulators[block].counter(pos_in_block) != m_counter) + { + auto const shift = pos_in_block * counter_bit_size; + m_accumulators[block].descriptor &= ~(mask << shift); + m_accumulators[block].descriptor |= m_counter << shift; + m_accumulators[block].accumulators[pos_in_block] = 0; + } + return m_accumulators[block].accumulators[pos_in_block]; + } + + void accumulate(std::ptrdiff_t const document, float score) + { + auto const block = document / counters_in_descriptor; + auto const pos_in_block = document % counters_in_descriptor; + if (m_accumulators[block].counter(pos_in_block) != m_counter) { + m_accumulators[block].reset_counter(pos_in_block, m_counter); + } + m_accumulators[block].accumulators[pos_in_block] += score; + } + + void aggregate(topk_queue &topk) { + uint64_t docid = 0u; + for (auto const &block : m_accumulators) { + int pos = 0; + for (auto const &score : block.accumulators) { + if (block.counter(pos++) == m_counter) { + topk.insert(score, docid); + } + ++docid; + } + }; + m_counter = (m_counter + 1) % cycle; + } + + [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; } + [[nodiscard]] auto blocks() noexcept -> std::vector & { return m_accumulators; } + [[nodiscard]] auto counter() const noexcept -> int { return m_counter; } + + private: + std::size_t m_size; + std::vector m_accumulators; + int m_counter{}; +}; + +} \ No newline at end of file diff --git a/include/pisa/accumulator/simple_accumulator.hpp b/include/pisa/accumulator/simple_accumulator.hpp new file mode 100644 index 000000000..07d34f5ba --- /dev/null +++ b/include/pisa/accumulator/simple_accumulator.hpp @@ -0,0 +1,15 @@ +#pragma once + +namespace pisa { + +struct Simple_Accumulator : public std::vector { + Simple_Accumulator(std::ptrdiff_t size) : std::vector(size) {} + void init() { std::fill(begin(), end(), 0.0); } + void accumulate(uint32_t doc, float score) { operator[](doc) += score; } + void aggregate(topk_queue &topk) { + uint64_t docid = 0u; + std::for_each(begin(), end(), [&](auto score) { topk.insert(score, docid++); }); + } +}; + +} \ No newline at end of file diff --git a/include/pisa/query/algorithm/exhaustive_taat_query.hpp b/include/pisa/query/algorithm/exhaustive_taat_query.hpp index f4b90cef1..2dd51af86 100644 --- a/include/pisa/query/algorithm/exhaustive_taat_query.hpp +++ b/include/pisa/query/algorithm/exhaustive_taat_query.hpp @@ -3,6 +3,10 @@ #include "util/intrinsics.hpp" #include "topk_queue.hpp" +#include "accumulator/simple_accumulator.hpp" +#include "accumulator/lazy_accumulator.hpp" +#include "accumulator/blocked_accumulator.hpp" + namespace pisa { template @@ -46,185 +50,6 @@ template } // namespace query -template -struct Blocked_Accumulator { - - struct Proxy_Element { - std::ptrdiff_t document; - std::vector &accumulators; - std::vector &accumulators_max; - - Proxy_Element &operator=(float score) { - accumulators[document] = score; - auto &block_max = accumulators_max[document / block_size]; - if (score > block_max) { - block_max = score; - } - return *this; - } - Proxy_Element &operator+=(float delta) { - accumulators[document] += delta; - auto const&score = accumulators[document]; - auto &block_max = accumulators_max[document / block_size]; - if (score > block_max) { - block_max = score; - } - return *this; - } - - operator float() { return accumulators[document]; } - }; - - using reference = Proxy_Element; - - static_assert(block_size > 0, "must be positive"); - - [[nodiscard]] constexpr static auto calc_block_count(std::size_t size) noexcept -> std::size_t { - return (size + block_size - 1) / block_size; - } - - Blocked_Accumulator(std::size_t size) - : m_size(size), - m_block_count(calc_block_count(size)), m_accumulators(size), - m_accumulators_max(m_block_count) {} - - void init() { std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0); } - - [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element - { - return {document, m_accumulators, m_accumulators_max}; - } - - void accumulate(std::ptrdiff_t const document, float score_delta) - { - m_accumulators[document] += score_delta; - auto const &score = m_accumulators[document]; - auto &block_max = m_accumulators_max[document / block_size]; - if (score > block_max) { - block_max = score; - } - } - - void aggregate(topk_queue &topk) { - for (size_t block = 0; block < m_block_count; ++block) { - if (not topk.would_enter(m_accumulators_max[block])) { continue; } - uint32_t doc = block * block_size; - uint32_t end = std::min((block + 1) * block_size, m_accumulators.size()); - for (; doc < end; ++doc) { - topk.insert(m_accumulators[doc], doc); - } - } - } - - [[nodiscard]] auto size() noexcept -> std::size_t { return m_size; } - - private: - std::size_t m_size; - std::size_t m_block_count; - std::vector m_accumulators; - std::vector m_accumulators_max; -}; - -template -struct Lazy_Accumulator { - using reference = float &; - - static_assert(std::is_integral_v && std::is_unsigned_v, - "must be unsigned number"); - constexpr static auto descriptor_size_in_bits = sizeof(Descriptor) * 8; - constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size; - constexpr static auto cycle = (1u << counter_bit_size); - constexpr static Descriptor mask = (1u << counter_bit_size) - 1; - - struct Block { - Descriptor descriptor{}; - std::array accumulators{}; - - [[nodiscard]] auto counter(int pos) const noexcept -> int { - return (descriptor >> (pos * counter_bit_size)) & mask; - } - - void reset_counter(int pos, int counter) - { - auto const shift = pos * counter_bit_size; - descriptor &= ~(mask << shift); - descriptor |= static_cast(counter) << shift; - accumulators[pos] = 0; - } - }; - - Lazy_Accumulator(std::size_t size) - : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) - {} - - void init() - { - if (m_counter == 0) { - auto first = reinterpret_cast(&m_accumulators.front()); - auto last = - std::next(reinterpret_cast(&m_accumulators.back()), sizeof(Block)); - std::fill(first, last, std::byte{0}); - } - } - - float &operator[](std::ptrdiff_t const document) { - auto const block = document / counters_in_descriptor; - auto const pos_in_block = document % counters_in_descriptor; - if (//m_accumulators[block].accumulators[pos_in_block] > 0 && - m_accumulators[block].counter(pos_in_block) != m_counter) - { - auto const shift = pos_in_block * counter_bit_size; - m_accumulators[block].descriptor &= ~(mask << shift); - m_accumulators[block].descriptor |= m_counter << shift; - m_accumulators[block].accumulators[pos_in_block] = 0; - } - return m_accumulators[block].accumulators[pos_in_block]; - } - - void accumulate(std::ptrdiff_t const document, float score) - { - auto const block = document / counters_in_descriptor; - auto const pos_in_block = document % counters_in_descriptor; - if (m_accumulators[block].counter(pos_in_block) != m_counter) { - m_accumulators[block].reset_counter(pos_in_block, m_counter); - } - m_accumulators[block].accumulators[pos_in_block] += score; - } - - void aggregate(topk_queue &topk) { - uint64_t docid = 0u; - for (auto const &block : m_accumulators) { - int pos = 0; - for (auto const &score : block.accumulators) { - if (block.counter(pos++) == m_counter) { - topk.insert(score, docid); - } - ++docid; - } - }; - m_counter = (m_counter + 1) % cycle; - } - - [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; } - [[nodiscard]] auto blocks() noexcept -> std::vector & { return m_accumulators; } - [[nodiscard]] auto counter() const noexcept -> int { return m_counter; } - - private: - std::size_t m_size; - std::vector m_accumulators; - int m_counter{}; -}; - -struct Simple_Accumulator : public std::vector { - Simple_Accumulator(std::ptrdiff_t size) : std::vector(size) {} - void init() { std::fill(begin(), end(), 0.0); } - void accumulate(uint32_t doc, float score) { operator[](doc) += score; } - void aggregate(topk_queue &topk) { - uint64_t docid = 0u; - std::for_each(begin(), end(), [&](auto score) { topk.insert(score, docid++); }); - } -}; - struct Taat_Traversal { template void static traverse_term(Cursor &cursor, Score score, Acc &acc) diff --git a/include/pisa/query/algorithm/ranked_or_taat_query.hpp b/include/pisa/query/algorithm/ranked_or_taat_query.hpp index d83f109b1..f3ee57144 100644 --- a/include/pisa/query/algorithm/ranked_or_taat_query.hpp +++ b/include/pisa/query/algorithm/ranked_or_taat_query.hpp @@ -1,5 +1,6 @@ #pragma once +namespace pisa { template struct ranked_or_taat_query { @@ -15,22 +16,22 @@ struct ranked_or_taat_query { auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = index.num_docs(); std::vector accumulator(num_docs, 0.0f); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = index[term.first]; auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); - auto cur_doc = list.docid(); - while(cur_doc < num_docs){ + auto cur_doc = list.docid(); + while (cur_doc < num_docs) { float norm_len = m_wdata->norm_len(cur_doc); - float score = q_weight * scorer_type::doc_term_weight(list.freq(), norm_len); + float score = q_weight * scorer_type::doc_term_weight(list.freq(), norm_len); accumulator[cur_doc] += score; list.next(); cur_doc = list.docid(); } } - for(auto&& v : accumulator) { + for (auto &&v : accumulator) { m_topk.insert(v); } @@ -42,6 +43,7 @@ struct ranked_or_taat_query { private: WandType const *m_wdata; - topk_queue m_topk; + topk_queue m_topk; }; +} // namespace pisa \ No newline at end of file diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 8953c0578..954805c80 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -95,28 +95,28 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "block_max_maxscore") TEST_CASE_METHOD(pisa::test::index_initialization, "exhaustive_taat") { - pisa::pisa::exhaustive_taat_query taat_q( + pisa::exhaustive_taat_query taat_q( index, wdata, 10); test_against_or(taat_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "exhaustive_taat_blocked") { - pisa::pisa::exhaustive_taat_query> + pisa::exhaustive_taat_query> taat_q(index, wdata, 10); test_against_or(taat_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat") { - pisa::pisa::maxscore_taat_query taat_q( + pisa::maxscore_taat_query taat_q( index, wdata, 10); test_against_or(taat_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat_blocked") { - pisa::pisa::maxscore_taat_query> + pisa::maxscore_taat_query> taat_q(index, wdata, 10); test_against_or(taat_q); } @@ -130,7 +130,7 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat") TEST_CASE_METHOD(pisa::test::index_initialization, "exhaustive_taat_lazy") { - pisa::pisa::exhaustive_taat_query> taat_q( + pisa::exhaustive_taat_query> taat_q( index, wdata, 10); test_against_or(taat_q); } From f2c93f99bc026dbbdbaa71bd54503570843b188b Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 13:31:42 +0100 Subject: [PATCH 20/32] code cleanup --- .../query/algorithm/exhaustive_taat_query.hpp | 41 ------------------- .../query/algorithm/maxscore_taat_query.hpp | 29 ++++++++++++- include/pisa/query/queries.hpp | 40 ++++++++++++++++++ 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/include/pisa/query/algorithm/exhaustive_taat_query.hpp b/include/pisa/query/algorithm/exhaustive_taat_query.hpp index 2dd51af86..5f6e9186e 100644 --- a/include/pisa/query/algorithm/exhaustive_taat_query.hpp +++ b/include/pisa/query/algorithm/exhaustive_taat_query.hpp @@ -9,47 +9,6 @@ namespace pisa { -template -struct Score_Function { - float query_weight; - std::reference_wrapper wdata; - - [[nodiscard]] auto operator()(uint32_t doc, uint32_t freq) const -> float { - return query_weight * Scorer::doc_term_weight(freq, wdata.get().norm_len(doc)); - } -}; - -// TODO: These are functions common to query processing in general. -// They should be moved out of this file. -namespace query { - -template -[[nodiscard]] auto cursors_with_scores(Index const& index, WandType const &wdata, term_id_vec terms) -{ - // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more - // complex refactoring I want to avoid for now. - using scorer_type = bm25; - using cursor_type = typename Index::document_enumerator; - using score_function_type = Score_Function; - - auto query_term_freqs = query_freqs(terms); - std::vector cursors; - std::vector score_functions; - cursors.reserve(query_term_freqs.size()); - score_functions.reserve(query_term_freqs.size()); - - for (auto term : query_term_freqs) { - auto list = index[term.first]; - uint64_t num_docs = index.num_docs(); - auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); - cursors.push_back(std::move(list)); - score_functions.push_back({q_weight, std::cref(wdata)}); - } - return std::make_pair(cursors, score_functions); -} - -} // namespace query - struct Taat_Traversal { template void static traverse_term(Cursor &cursor, Score score, Acc &acc) diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index 5e4cd9ad1..931b06d19 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -1,11 +1,36 @@ #pragma once -#include "exhaustive_taat_query.hpp" #include "topk_queue.hpp" #include "util/intrinsics.hpp" +#include "accumulator/simple_accumulator.hpp" +#include "accumulator/lazy_accumulator.hpp" +#include "accumulator/blocked_accumulator.hpp" + namespace pisa { +struct Maxscore_Taat_Traversal { + template + void static traverse_term(Cursor &cursor, Score score, Acc &acc) + { + if constexpr (std::is_same_v) { + while (cursor.docid() < acc.size()) { + auto const &documents = cursor.document_buffer(); + auto const &freqs = cursor.frequency_buffer(); + for (uint32_t idx = 0; idx < documents.size(); ++idx) { + acc.accumulate(documents[idx], score(documents[idx], freqs[idx] + 1)); + } + cursor.next_block(); + } + } else { + for (; cursor.docid() < acc.size(); cursor.next()) { + acc.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); + } + } + } +}; + template [[nodiscard]] auto max_weights(Index const& index, WandType const &wdata, term_id_vec terms) { @@ -129,7 +154,7 @@ class maxscore_taat_query { if (not m_topk.would_enter(nonessential_sum)) { break; } - Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); + Maxscore_Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); nonessential_sum -= max_weights[term]; } diff --git a/include/pisa/query/queries.hpp b/include/pisa/query/queries.hpp index 638d7c028..45d165099 100644 --- a/include/pisa/query/queries.hpp +++ b/include/pisa/query/queries.hpp @@ -51,6 +51,46 @@ term_freq_vec query_freqs(term_id_vec terms) { return query_term_freqs; } +template +struct Score_Function { + float query_weight; + std::reference_wrapper wdata; + + [[nodiscard]] auto operator()(uint32_t doc, uint32_t freq) const -> float { + return query_weight * Scorer::doc_term_weight(freq, wdata.get().norm_len(doc)); + } +}; + +// TODO: These are functions common to query processing in general. +// They should be moved out of this file. +namespace query { + +template +[[nodiscard]] auto cursors_with_scores(Index const& index, WandType const &wdata, term_id_vec terms) +{ + // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more + // complex refactoring I want to avoid for now. + using scorer_type = bm25; + using cursor_type = typename Index::document_enumerator; + using score_function_type = Score_Function; + + auto query_term_freqs = query_freqs(terms); + std::vector cursors; + std::vector score_functions; + cursors.reserve(query_term_freqs.size()); + score_functions.reserve(query_term_freqs.size()); + + for (auto term : query_term_freqs) { + auto list = index[term.first]; + uint64_t num_docs = index.num_docs(); + auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); + cursors.push_back(std::move(list)); + score_functions.push_back({q_weight, std::cref(wdata)}); + } + return std::make_pair(cursors, score_functions); +} + +} // namespace query } // namespace pisa #include "algorithm/and_query.hpp" From 7ca53d57a36ad3b06771fb1876b3e7aeb5f01e27 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 13:37:15 +0100 Subject: [PATCH 21/32] Added comment [skip ci] --- include/pisa/query/algorithm/maxscore_taat_query.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index 931b06d19..0361f884e 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -9,6 +9,11 @@ namespace pisa { +// TODO(antonio): basically here we can do a bit better. +// before scoring a document, we read its accumulator value and check if the sum of +// the accumulator value and the upper bound of the maxscores of the missing terms +// (current included) is greater than the threshold. If it is we score and add it to the accumulator, +// we go to the next document otherwise. struct Maxscore_Taat_Traversal { template void static traverse_term(Cursor &cursor, Score score, Acc &acc) From 30782acb26b6f3a9255342a4df02cc1959398eee Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 15:33:45 +0100 Subject: [PATCH 22/32] Improved queries interface --- include/pisa/query/algorithm/and_query.hpp | 14 ++++++--- .../algorithm/block_max_maxscore_query.hpp | 17 +++++----- .../query/algorithm/block_max_wand_query.hpp | 13 ++++---- .../query/algorithm/exhaustive_taat_query.hpp | 13 +------- .../pisa/query/algorithm/maxscore_query.hpp | 16 +++++----- .../query/algorithm/maxscore_taat_query.hpp | 31 +++++++++---------- include/pisa/query/algorithm/or_query.hpp | 16 ++++++---- .../pisa/query/algorithm/ranked_and_query.hpp | 15 ++++----- .../pisa/query/algorithm/ranked_or_query.hpp | 17 +++++----- .../query/algorithm/ranked_or_taat_query.hpp | 13 ++++---- include/pisa/query/algorithm/wand_query.hpp | 13 ++++---- src/profile_queries.cpp | 15 +++++---- src/queries.cpp | 20 ++++++------ test/test_bmw_queries.cpp | 12 +++---- test/test_ranked_queries.cpp | 22 ++++++------- 15 files changed, 124 insertions(+), 123 deletions(-) diff --git a/include/pisa/query/algorithm/and_query.hpp b/include/pisa/query/algorithm/and_query.hpp index f9d36f865..3601b246b 100644 --- a/include/pisa/query/algorithm/and_query.hpp +++ b/include/pisa/query/algorithm/and_query.hpp @@ -2,11 +2,12 @@ namespace pisa { -template +template struct and_query { - template - uint64_t operator()(Index const &index, term_id_vec terms) const { + and_query(Index const &index) : m_index(index) {} + + uint64_t operator()(term_id_vec terms) const { if (terms.empty()) return 0; remove_duplicate_terms(terms); @@ -16,7 +17,7 @@ struct and_query { enums.reserve(terms.size()); for (auto term : terms) { - enums.push_back(index[term]); + enums.push_back(m_index[term]); } // sort by increasing frequency @@ -27,7 +28,7 @@ struct and_query { uint64_t results = 0; uint64_t candidate = enums[0].docid(); size_t i = 1; - while (candidate < index.num_docs()) { + while (candidate < m_index.num_docs()) { for (; i < enums.size(); ++i) { enums[i].next_geq(candidate); if (enums[i].docid() != candidate) { @@ -52,6 +53,9 @@ struct and_query { } return results; } + + private: + Index const &m_index; }; } // namespace pisa \ No newline at end of file diff --git a/include/pisa/query/algorithm/block_max_maxscore_query.hpp b/include/pisa/query/algorithm/block_max_maxscore_query.hpp index 5ef880321..a0a95dee1 100644 --- a/include/pisa/query/algorithm/block_max_maxscore_query.hpp +++ b/include/pisa/query/algorithm/block_max_maxscore_query.hpp @@ -2,22 +2,22 @@ namespace pisa { -template +template struct block_max_maxscore_query { typedef bm25 scorer_type; - block_max_maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + block_max_maxscore_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec const &terms) { + uint64_t operator()(term_id_vec const &terms) { m_topk.clear(); if (terms.empty()) return 0; auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); typedef typename Index::document_enumerator enum_type; typedef typename WandType::wand_data_enumerator wdata_enum; @@ -32,7 +32,7 @@ struct block_max_maxscore_query { enums.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto w_enum = m_wdata->getenum(term.first); auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); auto max_weight = q_weight * m_wdata->max_term_weight(term.first); @@ -66,10 +66,10 @@ struct block_max_maxscore_query { }) ->docs_enum.docid(); - while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) { + while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) { float score = 0; float norm_len = m_wdata->norm_len(cur_doc); - uint64_t next_doc = index.num_docs(); + uint64_t next_doc = m_index.num_docs(); for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) { if (ordered_enums[i]->docs_enum.docid() == cur_doc) { score += @@ -129,6 +129,7 @@ struct block_max_maxscore_query { std::vector> const &topk() const { return m_topk.topk(); } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/include/pisa/query/algorithm/block_max_wand_query.hpp b/include/pisa/query/algorithm/block_max_wand_query.hpp index 1bbd0c396..138083839 100644 --- a/include/pisa/query/algorithm/block_max_wand_query.hpp +++ b/include/pisa/query/algorithm/block_max_wand_query.hpp @@ -2,20 +2,20 @@ namespace pisa { -template +template struct block_max_wand_query { typedef bm25 scorer_type; - block_max_wand_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + block_max_wand_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec const &terms) { + uint64_t operator()(term_id_vec const &terms) { m_topk.clear(); if (terms.empty()) return 0; auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); typedef typename Index::document_enumerator enum_type; typedef typename WandType::wand_data_enumerator wdata_enum; @@ -30,7 +30,7 @@ struct block_max_wand_query { enums.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto w_enum = m_wdata->getenum(term.first); auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); @@ -204,6 +204,7 @@ struct block_max_wand_query { topk_queue const &get_topk() const { return m_topk; } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/include/pisa/query/algorithm/exhaustive_taat_query.hpp b/include/pisa/query/algorithm/exhaustive_taat_query.hpp index 5f6e9186e..96b6a9d8e 100644 --- a/include/pisa/query/algorithm/exhaustive_taat_query.hpp +++ b/include/pisa/query/algorithm/exhaustive_taat_query.hpp @@ -40,18 +40,7 @@ class exhaustive_taat_query { : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} uint64_t operator()(term_id_vec terms) { - auto cws = query::cursors_with_scores(m_index, m_wdata, terms); - return taat(std::move(cws.first), std::move(cws.second)); - } - - uint64_t operator()([[maybe_unused]] Index const &, term_id_vec terms) { - auto cws = query::cursors_with_scores(m_index, m_wdata, terms); - return taat(std::move(cws.first), std::move(cws.second)); - } - - // TODO(michal): I think this should be eventually the `operator()` - template - uint64_t taat(std::vector cursors, std::vector score_functions) { + auto [cursors, score_functions] = query::cursors_with_scores(m_index, m_wdata, terms); m_topk.clear(); if (cursors.empty()) { return 0; diff --git a/include/pisa/query/algorithm/maxscore_query.hpp b/include/pisa/query/algorithm/maxscore_query.hpp index 37f45c999..c7674fb1d 100644 --- a/include/pisa/query/algorithm/maxscore_query.hpp +++ b/include/pisa/query/algorithm/maxscore_query.hpp @@ -2,22 +2,21 @@ namespace pisa { -template +template struct maxscore_query { typedef bm25 scorer_type; - maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + maxscore_query(Index const &index, WandType const &wdata, uint64_t k) : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec const &terms) { + uint64_t operator()(term_id_vec const &terms) { m_topk.clear(); if (terms.empty()) return 0; auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); typedef typename Index::document_enumerator enum_type; struct scored_enum { enum_type docs_enum; @@ -29,7 +28,7 @@ struct maxscore_query { enums.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); auto max_weight = q_weight * m_wdata->max_term_weight(term.first); enums.push_back(scored_enum{std::move(list), q_weight, max_weight}); @@ -62,10 +61,10 @@ struct maxscore_query { }) ->docs_enum.docid(); - while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) { + while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) { float score = 0; float norm_len = m_wdata->norm_len(cur_doc); - uint64_t next_doc = index.num_docs(); + uint64_t next_doc = m_index.num_docs(); for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) { if (ordered_enums[i]->docs_enum.docid() == cur_doc) { score += @@ -109,6 +108,7 @@ struct maxscore_query { std::vector> const &topk() const { return m_topk.topk(); } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index 0361f884e..5170f26d7 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -3,21 +3,20 @@ #include "topk_queue.hpp" #include "util/intrinsics.hpp" -#include "accumulator/simple_accumulator.hpp" -#include "accumulator/lazy_accumulator.hpp" #include "accumulator/blocked_accumulator.hpp" +#include "accumulator/lazy_accumulator.hpp" +#include "accumulator/simple_accumulator.hpp" namespace pisa { // TODO(antonio): basically here we can do a bit better. // before scoring a document, we read its accumulator value and check if the sum of // the accumulator value and the upper bound of the maxscores of the missing terms -// (current included) is greater than the threshold. If it is we score and add it to the accumulator, -// we go to the next document otherwise. +// (current included) is greater than the threshold. If it is we score and add it to the +// accumulator, we go to the next document otherwise. struct Maxscore_Taat_Traversal { template - void static traverse_term(Cursor &cursor, Score score, Acc &acc) - { + void static traverse_term(Cursor &cursor, Score score, Acc &acc) { if constexpr (std::is_same_v) { while (cursor.docid() < acc.size()) { @@ -37,15 +36,14 @@ struct Maxscore_Taat_Traversal { }; template -[[nodiscard]] auto max_weights(Index const& index, WandType const &wdata, term_id_vec terms) -{ +[[nodiscard]] auto max_weights(Index const &index, WandType const &wdata, term_id_vec terms) { // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more // complex refactoring I want to avoid for now. using scorer_type = bm25; using cursor_type = typename Index::document_enumerator; using score_function_type = Score_Function; - auto query_term_freqs = query_freqs(terms); + auto query_term_freqs = query_freqs(terms); std::vector max_weights; max_weights.reserve(query_term_freqs.size()); @@ -74,7 +72,7 @@ void apply_permutation(Container &container, const std::vector &p) if (done[i]) { continue; } - done[i] = true; + done[i] = true; std::size_t prev_j = i; std::size_t j = p[i]; while (i != j) { @@ -159,7 +157,8 @@ class maxscore_taat_query { if (not m_topk.would_enter(nonessential_sum)) { break; } - Maxscore_Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); + Maxscore_Taat_Traversal::traverse_term( + cursors[term], score_functions[term], m_accumulators); nonessential_sum -= max_weights[term]; } @@ -176,11 +175,11 @@ class maxscore_taat_query { std::vector> const &topk() const { return m_topk.topk(); } private: - Index const & m_index; - WandType const & m_wdata; - int m_k; - topk_queue m_topk; - Acc m_accumulators; + Index const & m_index; + WandType const &m_wdata; + int m_k; + topk_queue m_topk; + Acc m_accumulators; }; template diff --git a/include/pisa/query/algorithm/or_query.hpp b/include/pisa/query/algorithm/or_query.hpp index 452bed52e..55fba0f64 100644 --- a/include/pisa/query/algorithm/or_query.hpp +++ b/include/pisa/query/algorithm/or_query.hpp @@ -2,11 +2,12 @@ namespace pisa { -template +template struct or_query { - template - uint64_t operator()(Index const &index, term_id_vec terms) const { + or_query(Index const &index) : m_index(index) {} + + uint64_t operator()(term_id_vec terms) const { if (terms.empty()) return 0; remove_duplicate_terms(terms); @@ -16,7 +17,7 @@ struct or_query { enums.reserve(terms.size()); for (auto term : terms) { - enums.push_back(index[term]); + enums.push_back(m_index[term]); } uint64_t results = 0; @@ -27,9 +28,9 @@ struct or_query { }) ->docid(); - while (cur_doc < index.num_docs()) { + while (cur_doc < m_index.num_docs()) { results += 1; - uint64_t next_doc = index.num_docs(); + uint64_t next_doc = m_index.num_docs(); for (size_t i = 0; i < enums.size(); ++i) { if (enums[i].docid() == cur_doc) { if (with_freqs) { @@ -47,6 +48,9 @@ struct or_query { return results; } + + private: + Index const &m_index; }; } // namespace pisa \ No newline at end of file diff --git a/include/pisa/query/algorithm/ranked_and_query.hpp b/include/pisa/query/algorithm/ranked_and_query.hpp index 3d330cc15..ef9e7da68 100644 --- a/include/pisa/query/algorithm/ranked_and_query.hpp +++ b/include/pisa/query/algorithm/ranked_and_query.hpp @@ -2,15 +2,15 @@ namespace pisa { -template +template struct ranked_and_query { typedef bm25 scorer_type; - ranked_and_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + ranked_and_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec terms) { + uint64_t operator()(term_id_vec terms) { size_t results = 0; m_topk.clear(); if (terms.empty()) @@ -18,7 +18,7 @@ struct ranked_and_query { auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); typedef typename Index::document_enumerator enum_type; struct scored_enum { enum_type docs_enum; @@ -29,7 +29,7 @@ struct ranked_and_query { enums.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); enums.push_back(scored_enum{std::move(list), q_weight}); } @@ -41,7 +41,7 @@ struct ranked_and_query { uint64_t candidate = enums[0].docs_enum.docid(); size_t i = 1; - while (candidate < index.num_docs()) { + while (candidate < m_index.num_docs()) { for (; i < enums.size(); ++i) { enums[i].docs_enum.next_geq(candidate); if (enums[i].docs_enum.docid() != candidate) { @@ -80,6 +80,7 @@ struct ranked_and_query { topk_queue &get_topk() { return m_topk; } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/include/pisa/query/algorithm/ranked_or_query.hpp b/include/pisa/query/algorithm/ranked_or_query.hpp index a5ab88216..dbf9fb06e 100644 --- a/include/pisa/query/algorithm/ranked_or_query.hpp +++ b/include/pisa/query/algorithm/ranked_or_query.hpp @@ -2,22 +2,22 @@ namespace pisa { -template +template struct ranked_or_query { typedef bm25 scorer_type; - ranked_or_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + ranked_or_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec terms) { + uint64_t operator()(term_id_vec terms) { m_topk.clear(); if (terms.empty()) return 0; auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); typedef typename Index::document_enumerator enum_type; struct scored_enum { enum_type docs_enum; @@ -28,7 +28,7 @@ struct ranked_or_query { enums.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); enums.push_back(scored_enum{std::move(list), q_weight}); } @@ -41,10 +41,10 @@ struct ranked_or_query { }) ->docs_enum.docid(); - while (cur_doc < index.num_docs()) { + while (cur_doc < m_index.num_docs()) { float score = 0; float norm_len = m_wdata->norm_len(cur_doc); - uint64_t next_doc = index.num_docs(); + uint64_t next_doc = m_index.num_docs(); for (size_t i = 0; i < enums.size(); ++i) { if (enums[i].docs_enum.docid() == cur_doc) { score += enums[i].q_weight * @@ -67,6 +67,7 @@ struct ranked_or_query { std::vector> const &topk() const { return m_topk.topk(); } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/include/pisa/query/algorithm/ranked_or_taat_query.hpp b/include/pisa/query/algorithm/ranked_or_taat_query.hpp index f3ee57144..06cd29c2a 100644 --- a/include/pisa/query/algorithm/ranked_or_taat_query.hpp +++ b/include/pisa/query/algorithm/ranked_or_taat_query.hpp @@ -1,25 +1,25 @@ #pragma once namespace pisa { -template +template struct ranked_or_taat_query { typedef bm25 scorer_type; - ranked_or_taat_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + ranked_or_taat_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec terms) { + uint64_t operator()(term_id_vec terms) { m_topk.clear(); if (terms.empty()) return 0; auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); std::vector accumulator(num_docs, 0.0f); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); auto cur_doc = list.docid(); while (cur_doc < num_docs) { @@ -42,6 +42,7 @@ struct ranked_or_taat_query { std::vector> const &topk() const { return m_topk.topk(); } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/include/pisa/query/algorithm/wand_query.hpp b/include/pisa/query/algorithm/wand_query.hpp index 2194cdde2..00839493e 100644 --- a/include/pisa/query/algorithm/wand_query.hpp +++ b/include/pisa/query/algorithm/wand_query.hpp @@ -2,22 +2,22 @@ namespace pisa { -template +template struct wand_query { typedef bm25 scorer_type; - wand_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {} + wand_query(Index const &index, WandType const &wdata, uint64_t k) + : m_index(index), m_wdata(&wdata), m_topk(k) {} - template - uint64_t operator()(Index const &index, term_id_vec const &terms) { + uint64_t operator()(term_id_vec const &terms) { m_topk.clear(); if (terms.empty()) return 0; auto query_term_freqs = query_freqs(terms); - uint64_t num_docs = index.num_docs(); + uint64_t num_docs = m_index.num_docs(); typedef typename Index::document_enumerator enum_type; struct scored_enum { enum_type docs_enum; @@ -29,7 +29,7 @@ struct wand_query { enums.reserve(query_term_freqs.size()); for (auto term : query_term_freqs) { - auto list = index[term.first]; + auto list = m_index[term.first]; auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs); auto max_weight = q_weight * m_wdata->max_term_weight(term.first); @@ -114,6 +114,7 @@ struct wand_query { std::vector> const &topk() const { return m_topk.topk(); } private: + Index const & m_index; WandType const *m_wdata; topk_queue m_topk; }; diff --git a/src/profile_queries.cpp b/src/profile_queries.cpp index 6e09c387f..5066de5eb 100644 --- a/src/profile_queries.cpp +++ b/src/profile_queries.cpp @@ -14,9 +14,8 @@ #include "query/queries.hpp" #include "util/util.hpp" -template -void op_profile(IndexType const& index, - QueryOperator const& query_op, +template +void op_profile(QueryOperator const& query_op, std::vector const& queries) { using namespace pisa; @@ -34,7 +33,7 @@ void op_profile(IndexType const& index, logger() << i << " queries processed" << std::endl; } - query_op_copy(index, queries[i]); + query_op_copy(queries[i]); } }); } @@ -86,13 +85,13 @@ void profile(const std::string index_filename, for (auto const& t: query_types) { logger() << "Query type: " << t << std::endl; if (t == "and") { - op_profile(index, and_query(), queries); + op_profile(and_query::type, false>(index), queries); } else if (t == "ranked_and" && wand_data_filename) { - op_profile(index, ranked_and_query(wdata, 10), queries); + op_profile(ranked_and_query::type, WandType>(index, wdata, 10), queries); } else if (t == "wand" && wand_data_filename) { - op_profile(index, wand_query(wdata, 10), queries); + op_profile(wand_query::type, WandType>(index, wdata, 10), queries); } else if (t == "maxscore" && wand_data_filename) { - op_profile(index, maxscore_query(wdata, 10), queries); + op_profile(maxscore_query::type, WandType>(index, wdata, 10), queries); } else { logger() << "Unsupported query type: " << t << std::endl; } diff --git a/src/queries.cpp b/src/queries.cpp index 2e950d5fc..5eb2c7af6 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -107,32 +107,32 @@ void perftest(const std::string &index_filename, logger() << "Query type: " << t << std::endl; std::function query_fun; if (t == "and") { - query_fun = [&](term_id_vec query) { return and_query()(index, query); }; + query_fun = [&](term_id_vec query) { return and_query(index)(query); }; } else if (t == "and_freq") { - query_fun = [&](term_id_vec query) { return and_query()(index, query); }; + query_fun = [&](term_id_vec query) { return and_query(index)(query); }; } else if (t == "or") { - query_fun = [&](term_id_vec query) { return or_query()(index, query); }; + query_fun = [&](term_id_vec query) { return or_query(index)(query); }; } else if (t == "or_freq") { - query_fun = [&](term_id_vec query) { return or_query()(index, query); }; + query_fun = [&](term_id_vec query) { return or_query(index)(query); }; } else if (t == "wand" && wand_data_filename) { query_fun = [&](term_id_vec query) { - return wand_query(wdata, k)(index, query); + return wand_query(index, wdata, k)(query); }; } else if (t == "block_max_wand" && wand_data_filename) { query_fun = [&](term_id_vec query) { - return block_max_wand_query(wdata, k)(index, query); + return block_max_wand_query(index, wdata, k)(query); }; } else if (t == "block_max_maxscore" && wand_data_filename) { query_fun = [&](term_id_vec query) { - return block_max_maxscore_query(wdata, k)(index, query); + return block_max_maxscore_query(index, wdata, k)(query); }; } else if (t == "ranked_or" && wand_data_filename) { query_fun = [&](term_id_vec query) { - return ranked_or_query(wdata, k)(index, query); + return ranked_or_query(index, wdata, k)(query); }; } else if (t == "maxscore" && wand_data_filename) { query_fun = [&](term_id_vec query) { - return maxscore_query(wdata, k)(index, query); + return maxscore_query(index, wdata, k)(query); }; } else if (t == "exhaustive_taat" && wand_data_filename) { query_fun = pisa::make_exhaustive_taat_query(index, wdata, k); @@ -144,7 +144,7 @@ void perftest(const std::string &index_filename, pisa::make_exhaustive_taat_query>(index, wdata, k); } else if (t == "ranked_or_taat" && wand_data_filename) { query_fun = [&](pisa::term_id_vec query) { - return ranked_or_taat_query(wdata, k)(index, query); + return ranked_or_taat_query(index, wdata, k)(query); }; } else if (t == "maxscore_taat" && wand_data_filename) { query_fun = pisa::make_maxscore_taat_query(index, wdata, k); diff --git a/test/test_bmw_queries.cpp b/test/test_bmw_queries.cpp index 67b5c82fe..a229e4b21 100644 --- a/test/test_bmw_queries.cpp +++ b/test/test_bmw_queries.cpp @@ -57,11 +57,11 @@ struct index_initialization { template void test_against_wand(QueryOp &op_q) const { - wand_query or_q(wdata, 10); + wand_query or_q(index, wdata, 10); for (auto const &q : queries) { - or_q(index, q); - op_q(index, q); + or_q(q); + op_q(q); REQUIRE(or_q.topk().size() == op_q.topk().size()); for (size_t i = 0; i < or_q.topk().size(); ++i) { @@ -76,9 +76,9 @@ struct index_initialization { } // namespace pisa TEST_CASE_METHOD(pisa::test::index_initialization, "block_max_wand") { - pisa::block_max_wand_query block_max_wand_q(wdata, 10); - pisa::block_max_wand_query block_max_wand_uniform_q(wdata_uniform, 10); - pisa::block_max_wand_query block_max_wand_fixed_q(wdata_fixed, 10); + pisa::block_max_wand_query block_max_wand_q(index, wdata, 10); + pisa::block_max_wand_query block_max_wand_uniform_q(index, wdata_uniform, 10); + pisa::block_max_wand_query block_max_wand_fixed_q(index, wdata_fixed, 10); test_against_wand(block_max_wand_uniform_q); test_against_wand(block_max_wand_q); test_against_wand(block_max_wand_fixed_q); diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 954805c80..df3d89537 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -42,11 +42,11 @@ namespace pisa { namespace test { template void test_against_or(QueryOp& op_q) const { - ranked_or_query or_q(wdata, 10); + ranked_or_query or_q(index, wdata, 10); for (auto const& q: queries) { - or_q(index, q); - op_q(index, q); + or_q(q); + op_q(q); REQUIRE(or_q.topk().size() == op_q.topk().size()); for (size_t i = 0; i < or_q.topk().size(); ++i) { REQUIRE(or_q.topk()[i].first == Approx(op_q.topk()[i].first).epsilon(0.1)); // tolerance is % relative @@ -56,12 +56,12 @@ namespace pisa { namespace test { void test_k_size() const { - ranked_or_query or_10(wdata, 10); - ranked_or_query or_1(wdata, 1); + ranked_or_query or_10(index, wdata, 10); + ranked_or_query or_1(index, wdata, 1); for (auto const &q : queries) { - or_10(index, q); - or_1(index, q); + or_10(q); + or_1(q); if (not or_10.topk().empty()) { REQUIRE(not or_1.topk().empty()); REQUIRE(or_1.topk().front().first == Approx(or_10.topk().front().first).epsilon(0.1)); @@ -77,19 +77,19 @@ namespace pisa { namespace test { TEST_CASE_METHOD(pisa::test::index_initialization, "wand") { - pisa::wand_query wand_q(wdata, 10); + pisa::wand_query wand_q(index, wdata, 10); test_against_or(wand_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore") { - pisa::maxscore_query maxscore_q(wdata, 10); + pisa::maxscore_query maxscore_q(index, wdata, 10); test_against_or(maxscore_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "block_max_maxscore") { - pisa::block_max_maxscore_query bmm_q(wdata, 10); + pisa::block_max_maxscore_query bmm_q(index, wdata, 10); test_against_or(bmm_q); } @@ -124,7 +124,7 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat_blocked") TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat") { - pisa::ranked_or_taat_query ranked_or_taat_q(wdata, 10); + pisa::ranked_or_taat_query ranked_or_taat_q(index, wdata, 10); test_against_or(ranked_or_taat_q); } From 3649a7d23c854460037e519c2e457bbf0c8dcd3e Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 16:14:10 +0100 Subject: [PATCH 23/32] removed unused lambdas --- src/queries.cpp | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/src/queries.cpp b/src/queries.cpp index 5eb2c7af6..cdcfe3a21 100644 --- a/src/queries.cpp +++ b/src/queries.cpp @@ -107,33 +107,23 @@ void perftest(const std::string &index_filename, logger() << "Query type: " << t << std::endl; std::function query_fun; if (t == "and") { - query_fun = [&](term_id_vec query) { return and_query(index)(query); }; + query_fun = and_query(index); } else if (t == "and_freq") { - query_fun = [&](term_id_vec query) { return and_query(index)(query); }; + query_fun = and_query(index); } else if (t == "or") { - query_fun = [&](term_id_vec query) { return or_query(index)(query); }; + query_fun = or_query(index); } else if (t == "or_freq") { - query_fun = [&](term_id_vec query) { return or_query(index)(query); }; + query_fun = or_query(index); } else if (t == "wand" && wand_data_filename) { - query_fun = [&](term_id_vec query) { - return wand_query(index, wdata, k)(query); - }; + query_fun = wand_query(index, wdata, k); } else if (t == "block_max_wand" && wand_data_filename) { - query_fun = [&](term_id_vec query) { - return block_max_wand_query(index, wdata, k)(query); - }; + query_fun =block_max_wand_query(index, wdata, k); } else if (t == "block_max_maxscore" && wand_data_filename) { - query_fun = [&](term_id_vec query) { - return block_max_maxscore_query(index, wdata, k)(query); - }; + query_fun = block_max_maxscore_query(index, wdata, k); } else if (t == "ranked_or" && wand_data_filename) { - query_fun = [&](term_id_vec query) { - return ranked_or_query(index, wdata, k)(query); - }; + query_fun = ranked_or_query(index, wdata, k); } else if (t == "maxscore" && wand_data_filename) { - query_fun = [&](term_id_vec query) { - return maxscore_query(index, wdata, k)(query); - }; + query_fun = maxscore_query(index, wdata, k); } else if (t == "exhaustive_taat" && wand_data_filename) { query_fun = pisa::make_exhaustive_taat_query(index, wdata, k); } else if (t == "exhaustive_taat_lazy" && wand_data_filename) { @@ -143,9 +133,7 @@ void perftest(const std::string &index_filename, query_fun = pisa::make_exhaustive_taat_query>(index, wdata, k); } else if (t == "ranked_or_taat" && wand_data_filename) { - query_fun = [&](pisa::term_id_vec query) { - return ranked_or_taat_query(index, wdata, k)(query); - }; + query_fun =ranked_or_taat_query(index, wdata, k); } else if (t == "maxscore_taat" && wand_data_filename) { query_fun = pisa::make_maxscore_taat_query(index, wdata, k); } else if (t == "maxscore_taat_blocked" && wand_data_filename) { From 81db028e302ff8f223cfc79bd8011c28f592c028 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Wed, 16 Jan 2019 18:15:37 +0100 Subject: [PATCH 24/32] Removed buffers --- include/pisa/block_posting_list.hpp | 20 ---------- include/pisa/freq_index.hpp | 2 +- .../query/algorithm/exhaustive_taat_query.hpp | 28 +++---------- .../query/algorithm/maxscore_taat_query.hpp | 39 +++---------------- include/pisa/util/util.hpp | 4 -- 5 files changed, 12 insertions(+), 81 deletions(-) diff --git a/include/pisa/block_posting_list.hpp b/include/pisa/block_posting_list.hpp index 7489530c5..f2cd24c81 100644 --- a/include/pisa/block_posting_list.hpp +++ b/include/pisa/block_posting_list.hpp @@ -82,7 +82,6 @@ namespace pisa { class document_enumerator { public: - using enumerator_category = pisa::block_enumerator_tag; document_enumerator(uint8_t const* data, uint64_t universe, size_t term_id = 0) @@ -158,25 +157,6 @@ namespace pisa { } } - // TODO(michal): I recommend using some view, like gsl::span or something - // instead of a reference to a vector. - [[nodiscard]] auto document_buffer() -> std::vector const & { - return m_docs_buf; - } - - [[nodiscard]] auto frequency_buffer() -> std::vector const & { - if (!m_freqs_decoded) { - decode_freqs_block(); - } - return m_freqs_buf; - } - - void next_block() - { - m_pos_in_block = m_cur_block_size - 1; - next(); - } - uint64_t docid() const { return m_cur_docid; diff --git a/include/pisa/freq_index.hpp b/include/pisa/freq_index.hpp index 38f9d748d..82caaffa3 100644 --- a/include/pisa/freq_index.hpp +++ b/include/pisa/freq_index.hpp @@ -76,7 +76,7 @@ namespace pisa { class document_enumerator { public: - using enumerator_category = pisa::input_enumerator_tag; + void reset() { m_cur_pos = 0; diff --git a/include/pisa/query/algorithm/exhaustive_taat_query.hpp b/include/pisa/query/algorithm/exhaustive_taat_query.hpp index 96b6a9d8e..a6ef15955 100644 --- a/include/pisa/query/algorithm/exhaustive_taat_query.hpp +++ b/include/pisa/query/algorithm/exhaustive_taat_query.hpp @@ -9,28 +9,6 @@ namespace pisa { -struct Taat_Traversal { - template - void static traverse_term(Cursor &cursor, Score score, Acc &acc) - { - if constexpr (std::is_same_v) { - while (cursor.docid() < acc.size()) { - auto const &documents = cursor.document_buffer(); - auto const &freqs = cursor.frequency_buffer(); - for (uint32_t idx = 0; idx < documents.size(); ++idx) { - acc.accumulate(documents[idx], score(documents[idx], freqs[idx] + 1)); - } - cursor.next_block(); - } - } else { - for (; cursor.docid() < acc.size(); cursor.next()) { - acc.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); - } - } - } -}; - template class exhaustive_taat_query { using score_function_type = Score_Function; @@ -47,7 +25,11 @@ class exhaustive_taat_query { } m_accumulators.init(); for (uint32_t term = 0; term < cursors.size(); ++term) { - Taat_Traversal::traverse_term(cursors[term], score_functions[term], m_accumulators); + auto cursor = cursors[term]; + const auto score = score_functions[term]; + for (; cursor.docid() < m_accumulators.size(); cursor.next()) { + m_accumulators.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); + } } m_accumulators.aggregate(m_topk); m_topk.finalize(); diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index 5170f26d7..8bf4098bf 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -17,20 +17,8 @@ namespace pisa { struct Maxscore_Taat_Traversal { template void static traverse_term(Cursor &cursor, Score score, Acc &acc) { - if constexpr (std::is_same_v) { - while (cursor.docid() < acc.size()) { - auto const &documents = cursor.document_buffer(); - auto const &freqs = cursor.frequency_buffer(); - for (uint32_t idx = 0; idx < documents.size(); ++idx) { - acc.accumulate(documents[idx], score(documents[idx], freqs[idx] + 1)); - } - cursor.next_block(); - } - } else { - for (; cursor.docid() < acc.size(); cursor.next()) { - acc.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); - } + for (; cursor.docid() < acc.size(); cursor.next()) { + acc.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); } } }; @@ -113,25 +101,10 @@ class maxscore_taat_query { template void traverse_with_lookups(Cursor &cursor, Score score) { - if constexpr (std::is_same_v) { - while (cursor.docid() < m_accumulators.size()) { - auto const &documents = cursor.document_buffer(); - auto const &freqs = cursor.frequency_buffer(); - for (uint32_t idx = 0; idx < documents.size(); ++idx) { - accumulator_reference accumulator = m_accumulators[documents[idx]]; - if (accumulator > 0) { - accumulator += score(documents[idx], freqs[idx]); - } - } - cursor.next_block(); - } - } else { - for (; cursor.docid() < m_accumulators.size(); cursor.next()) { - accumulator_reference accumulator = m_accumulators[cursor.docid()]; - if (accumulator > 0) { - accumulator += score(cursor.docid(), cursor.freq()); - } + for (; cursor.docid() < m_accumulators.size(); cursor.next()) { + accumulator_reference accumulator = m_accumulators[cursor.docid()]; + if (accumulator > 0) { + accumulator += score(cursor.docid(), cursor.freq()); } } } diff --git a/include/pisa/util/util.hpp b/include/pisa/util/util.hpp index 01223ac59..6281152ea 100644 --- a/include/pisa/util/util.hpp +++ b/include/pisa/util/util.hpp @@ -265,8 +265,4 @@ namespace pisa { bool first; }; - // TODO(michal): We should extract it in a better place, couldn't find anything better quickly. - struct input_enumerator_tag {}; - struct block_enumerator_tag : public input_enumerator_tag {}; - } From f1a8b64e08ccd39d392bc64327461406289a8744 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Sat, 19 Jan 2019 13:13:51 +0100 Subject: [PATCH 25/32] Simplified Maxscore --- .../query/algorithm/maxscore_taat_query.hpp | 70 +++++++------------ 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index 8bf4098bf..efb3716f6 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -9,20 +9,6 @@ namespace pisa { -// TODO(antonio): basically here we can do a bit better. -// before scoring a document, we read its accumulator value and check if the sum of -// the accumulator value and the upper bound of the maxscores of the missing terms -// (current included) is greater than the threshold. If it is we score and add it to the -// accumulator, we go to the next document otherwise. -struct Maxscore_Taat_Traversal { - template - void static traverse_term(Cursor &cursor, Score score, Acc &acc) { - for (; cursor.docid() < acc.size(); cursor.next()) { - acc.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); - } - } -}; - template [[nodiscard]] auto max_weights(Index const &index, WandType const &wdata, term_id_vec terms) { // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more @@ -89,39 +75,17 @@ class maxscore_taat_query { uint64_t operator()(term_id_vec terms) { auto cws = query::cursors_with_scores(m_index, m_wdata, terms); - return maxscore_taat( - std::move(cws.first), std::move(cws.second), max_weights(m_index, m_wdata, terms)); - } - - uint64_t operator()([[maybe_unused]] Index const &, term_id_vec terms) { - auto cws = query::cursors_with_scores(m_index, m_wdata, terms); - return maxscore_taat( - std::move(cws.first), std::move(cws.second), max_weights(m_index, m_wdata, terms)); - } - - template - void traverse_with_lookups(Cursor &cursor, Score score) { - for (; cursor.docid() < m_accumulators.size(); cursor.next()) { - accumulator_reference accumulator = m_accumulators[cursor.docid()]; - if (accumulator > 0) { - accumulator += score(cursor.docid(), cursor.freq()); - } - } - } - - // TODO(michal): I think this should be eventually the `operator()` - template - uint64_t maxscore_taat(std::vector cursors, - std::vector score_functions, - std::vector max_weights) { + auto cursors = cws.first; + auto score_functions = cws.second; + auto m_w = max_weights(m_index, m_wdata, terms); if (cursors.empty()) { m_topk.clear(); return 0; } sort_many( - max_weights, [](auto lhs, auto rhs) { return lhs > rhs; }, cursors, score_functions); + m_w, [](auto lhs, auto rhs) { return lhs > rhs; }, cursors, score_functions); - float nonessential_sum = std::accumulate(max_weights.begin(), max_weights.end(), 0.0); + float nonessential_sum = std::accumulate(m_w.begin(), m_w.end(), 0.0); m_accumulators.init(); uint32_t term = 0; for (; term < cursors.size(); ++term) { @@ -130,13 +94,28 @@ class maxscore_taat_query { if (not m_topk.would_enter(nonessential_sum)) { break; } - Maxscore_Taat_Traversal::traverse_term( - cursors[term], score_functions[term], m_accumulators); - nonessential_sum -= max_weights[term]; + auto cursor = cursors[term]; + auto score = score_functions[term]; + // TODO(antonio): basically here we can do a bit better. + // before scoring a document, we read its accumulator value and check if the sum of + // the accumulator value and the upper bound of the maxscores of the missing terms + // (current included) is greater than the threshold. If it is we score and add it to the + // accumulator, we go to the next document otherwise. + for (; cursor.docid() < m_accumulators.size(); cursor.next()) { + m_accumulators.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); + } + nonessential_sum -= m_w[term]; } for (; term < cursors.size(); ++term) { - traverse_with_lookups(cursors[term], score_functions[term]); + auto cursor = cursors[term]; + auto score = score_functions[term]; + for (; cursor.docid() < m_accumulators.size(); cursor.next()) { + accumulator_reference accumulator = m_accumulators[cursor.docid()]; + if (accumulator > 0) { + accumulator += score(cursor.docid(), cursor.freq()); + } + } } m_topk.clear(); @@ -145,6 +124,7 @@ class maxscore_taat_query { return m_topk.topk().size(); } + std::vector> const &topk() const { return m_topk.topk(); } private: From d08a06314a3f527b969b353a76d6d6c3a58f44bc Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Mon, 21 Jan 2019 16:16:03 +0100 Subject: [PATCH 26/32] Delete exhaustive_taat_query.hpp --- .../query/algorithm/exhaustive_taat_query.hpp | 55 ------------------- 1 file changed, 55 deletions(-) delete mode 100644 include/pisa/query/algorithm/exhaustive_taat_query.hpp diff --git a/include/pisa/query/algorithm/exhaustive_taat_query.hpp b/include/pisa/query/algorithm/exhaustive_taat_query.hpp deleted file mode 100644 index a6ef15955..000000000 --- a/include/pisa/query/algorithm/exhaustive_taat_query.hpp +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#include "util/intrinsics.hpp" -#include "topk_queue.hpp" - -#include "accumulator/simple_accumulator.hpp" -#include "accumulator/lazy_accumulator.hpp" -#include "accumulator/blocked_accumulator.hpp" - -namespace pisa { - -template -class exhaustive_taat_query { - using score_function_type = Score_Function; - - public: - exhaustive_taat_query(Index const &index, WandType const &wdata, uint64_t k) - : m_index(index), m_wdata(wdata), m_topk(k), m_accumulators(index.num_docs()) {} - - uint64_t operator()(term_id_vec terms) { - auto [cursors, score_functions] = query::cursors_with_scores(m_index, m_wdata, terms); - m_topk.clear(); - if (cursors.empty()) { - return 0; - } - m_accumulators.init(); - for (uint32_t term = 0; term < cursors.size(); ++term) { - auto cursor = cursors[term]; - const auto score = score_functions[term]; - for (; cursor.docid() < m_accumulators.size(); cursor.next()) { - m_accumulators.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); - } - } - m_accumulators.aggregate(m_topk); - m_topk.finalize(); - return m_topk.topk().size(); - } - - std::vector> const &topk() const { return m_topk.topk(); } - - private: - Index const & m_index; - WandType const & m_wdata; - topk_queue m_topk; - Acc m_accumulators; -}; - -template -[[nodiscard]] auto make_exhaustive_taat_query(Index const & index, - WandType const &wdata, - uint64_t k) { - return exhaustive_taat_query(index, wdata, k); -} - -}; // namespace pisa From f254277ac36c852936b63d71e5a489dbd02b3dcd Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Mon, 21 Jan 2019 17:51:14 +0100 Subject: [PATCH 27/32] Update test_ranked_queries.cpp --- test/test_ranked_queries.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 788e3de43..4ee9a828c 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -116,22 +116,22 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat_blocked") TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat") { - pisa::ranked_or_taat_query taat_q( + pisa::ranked_or_taat_query ranked_or_taat_q( test_against_or(ranked_or_taat_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat_blocked") { pisa::ranked_or_taat_query> - taat_q(index, wdata, 10); - test_against_or(taat_q); + ranked_or_taat_q(index, wdata, 10); + test_against_or(ranked_or_taat_q); } TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat_query_lazy") { - pisa::ranked_or_taat_query> taat_q( + pisa::ranked_or_taat_query> ranked_or_taat_q( index, wdata, 10); - test_against_or(taat_q); + test_against_or(ranked_or_taat_q); } /// Issue #26 https://github.com/pisa-engine/pisa/issues/26 From b92e93de3d897a0f5712e1f0eb7800afa89d9878 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Mon, 21 Jan 2019 19:18:57 +0100 Subject: [PATCH 28/32] Update test_ranked_queries.cpp --- test/test_ranked_queries.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 4ee9a828c..099a20df8 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -117,6 +117,7 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat") { pisa::ranked_or_taat_query ranked_or_taat_q( + index, wdata, 10); test_against_or(ranked_or_taat_q); } From c8c1b35e1195a39b2fa96fa2f72d1b80c738d82b Mon Sep 17 00:00:00 2001 From: Antonio Date: Tue, 22 Jan 2019 09:25:28 +0000 Subject: [PATCH 29/32] Faster MaxScore --- include/pisa/query/algorithm/maxscore_taat_query.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index efb3716f6..63d8638f1 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -74,12 +74,12 @@ class maxscore_taat_query { : m_index(index), m_wdata(wdata), m_k(k), m_topk(k), m_accumulators(index.num_docs()) {} uint64_t operator()(term_id_vec terms) { + m_topk.clear(); auto cws = query::cursors_with_scores(m_index, m_wdata, terms); auto cursors = cws.first; auto score_functions = cws.second; auto m_w = max_weights(m_index, m_wdata, terms); if (cursors.empty()) { - m_topk.clear(); return 0; } sort_many( @@ -89,11 +89,10 @@ class maxscore_taat_query { m_accumulators.init(); uint32_t term = 0; for (; term < cursors.size(); ++term) { - m_topk.clear(); - m_accumulators.aggregate(m_topk); if (not m_topk.would_enter(nonessential_sum)) { break; } + m_topk.clear(); auto cursor = cursors[term]; auto score = score_functions[term]; // TODO(antonio): basically here we can do a bit better. @@ -103,6 +102,7 @@ class maxscore_taat_query { // accumulator, we go to the next document otherwise. for (; cursor.docid() < m_accumulators.size(); cursor.next()) { m_accumulators.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); + m_topk.insert(m_accumulators[cursor.docid()]); } nonessential_sum -= m_w[term]; } From e60142b826ec99350cbe7c75227e396dc5207b88 Mon Sep 17 00:00:00 2001 From: Antonio Date: Tue, 22 Jan 2019 15:40:18 +0000 Subject: [PATCH 30/32] Added extra check --- include/pisa/query/algorithm/maxscore_taat_query.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/pisa/query/algorithm/maxscore_taat_query.hpp b/include/pisa/query/algorithm/maxscore_taat_query.hpp index 63d8638f1..48548fa61 100644 --- a/include/pisa/query/algorithm/maxscore_taat_query.hpp +++ b/include/pisa/query/algorithm/maxscore_taat_query.hpp @@ -101,8 +101,10 @@ class maxscore_taat_query { // (current included) is greater than the threshold. If it is we score and add it to the // accumulator, we go to the next document otherwise. for (; cursor.docid() < m_accumulators.size(); cursor.next()) { - m_accumulators.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); - m_topk.insert(m_accumulators[cursor.docid()]); + if(m_topk.would_enter(nonessential_sum + m_accumulators[cursor.docid()])) { + m_accumulators.accumulate(cursor.docid(), score(cursor.docid(), cursor.freq())); + m_topk.insert(m_accumulators[cursor.docid()]); + } } nonessential_sum -= m_w[term]; } From 18be6a0883a5a218f6ed549961c867a78a841925 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Fri, 25 Jan 2019 17:12:29 -0500 Subject: [PATCH 31/32] Use int for 8 bits --- include/pisa/accumulator/lazy_accumulator.hpp | 22 ++++++++++++++----- .../pisa/accumulator/simple_accumulator.hpp | 4 +++- include/pisa/topk_queue.hpp | 2 ++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/pisa/accumulator/lazy_accumulator.hpp b/include/pisa/accumulator/lazy_accumulator.hpp index 347a8eba2..caa334ec4 100644 --- a/include/pisa/accumulator/lazy_accumulator.hpp +++ b/include/pisa/accumulator/lazy_accumulator.hpp @@ -1,5 +1,7 @@ #pragma once +#include "topk_queue.hpp" + namespace pisa { template @@ -18,15 +20,23 @@ struct Lazy_Accumulator { std::array accumulators{}; [[nodiscard]] auto counter(int pos) const noexcept -> int { - return (descriptor >> (pos * counter_bit_size)) & mask; + if constexpr (counter_bit_size == 8) { + return static_cast(*(reinterpret_cast(&descriptor) + pos)); + } else { + return (descriptor >> (pos * counter_bit_size)) & mask; + } } void reset_counter(int pos, int counter) { - auto const shift = pos * counter_bit_size; - descriptor &= ~(mask << shift); - descriptor |= static_cast(counter) << shift; - accumulators[pos] = 0; + if constexpr (counter_bit_size == 8) { + *(reinterpret_cast(&descriptor) + pos) = static_cast(counter); + } else { + auto const shift = pos * counter_bit_size; + descriptor &= ~(mask << shift); + descriptor |= static_cast(counter) << shift; + accumulators[pos] = 0; + } } }; @@ -92,4 +102,4 @@ struct Lazy_Accumulator { int m_counter{}; }; -} \ No newline at end of file +} diff --git a/include/pisa/accumulator/simple_accumulator.hpp b/include/pisa/accumulator/simple_accumulator.hpp index 07d34f5ba..01a9d5876 100644 --- a/include/pisa/accumulator/simple_accumulator.hpp +++ b/include/pisa/accumulator/simple_accumulator.hpp @@ -1,5 +1,7 @@ #pragma once +#include "topk_queue.hpp" + namespace pisa { struct Simple_Accumulator : public std::vector { @@ -12,4 +14,4 @@ struct Simple_Accumulator : public std::vector { } }; -} \ No newline at end of file +} diff --git a/include/pisa/topk_queue.hpp b/include/pisa/topk_queue.hpp index 2ff4af9f8..f906e0689 100644 --- a/include/pisa/topk_queue.hpp +++ b/include/pisa/topk_queue.hpp @@ -1,5 +1,7 @@ #pragma once +#include "util/util.hpp" + namespace pisa { struct topk_queue { From 48a0f8092cbe051de710e56aca892256c56c7419 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Fri, 25 Jan 2019 20:46:31 -0500 Subject: [PATCH 32/32] Bug fix: reset counter --- include/pisa/accumulator/lazy_accumulator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/pisa/accumulator/lazy_accumulator.hpp b/include/pisa/accumulator/lazy_accumulator.hpp index caa334ec4..991e5a253 100644 --- a/include/pisa/accumulator/lazy_accumulator.hpp +++ b/include/pisa/accumulator/lazy_accumulator.hpp @@ -35,8 +35,8 @@ struct Lazy_Accumulator { auto const shift = pos * counter_bit_size; descriptor &= ~(mask << shift); descriptor |= static_cast(counter) << shift; - accumulators[pos] = 0; } + accumulators[pos] = 0; } };