Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
653d21e
First take at exhaustive TAAT
elshize Dec 20, 2018
8f0b54a
Fetch an entire block at a time for TAAT
elshize Dec 22, 2018
9f4e325
Add prefetching for TAAT
elshize Dec 22, 2018
c28e5d2
Return buffer references from posting lists instead of moved buffers.
elshize Dec 22, 2018
5d5519b
Blocked accumulator array for TAAT
elshize Dec 22, 2018
886e580
TAAT maxscore and lazy accumulator
elshize Dec 27, 2018
b0e2824
TAAT MaxScore and Blocked Accumulator
elshize Dec 28, 2018
7737d1f
Remove heap stuff
elshize Dec 28, 2018
0664cba
TAAT optimizations
elshize Dec 29, 2018
9ae6b1a
Vectorize lookup traversal.
elshize Dec 29, 2018
26a05f1
Simple but effective ranked_or with taat
amallia Jan 15, 2019
41ca198
Remove OpenMP
elshize Jan 15, 2019
200be05
Remove OpenMP
elshize Jan 15, 2019
c3400d1
Use template rather than std::function for faster processing
elshize Jan 15, 2019
87252f3
Lazy accumulator fixed
elshize Jan 15, 2019
b9dd1fd
Fix block traversal issue
elshize Jan 16, 2019
5c539b4
Removed ds2i namespace
amallia Jan 16, 2019
0c36632
Moved algos
amallia Jan 16, 2019
6e96d16
Merge with master
amallia Jan 16, 2019
f2c93f9
code cleanup
amallia Jan 16, 2019
7ca53d5
Added comment [skip ci]
amallia Jan 16, 2019
30782ac
Improved queries interface
amallia Jan 16, 2019
3649a7d
removed unused lambdas
amallia Jan 16, 2019
81db028
Removed buffers
amallia Jan 16, 2019
09fbc02
Merge branch 'master' into taat
elshize Jan 17, 2019
f1a8b64
Simplified Maxscore
amallia Jan 19, 2019
5e3923e
Including only ranked_or_taat and simple_accumulator
amallia Jan 21, 2019
4016c26
Merge branch 'master' into ranked_or_taat
amallia Jan 21, 2019
1471dde
Update queries.hpp
amallia Jan 21, 2019
56ecfc6
Update thresholds.cpp
amallia Jan 21, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ list(APPEND LCOV_REMOVE_PATTERNS "'${PROJECT_SOURCE_DIR}/external/*'")


if (UNIX)

# For hardware popcount and other special instructions
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")

Expand All @@ -62,8 +61,6 @@ endif()

set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)


include_directories(include)
add_library(pisa INTERFACE)
Expand Down
15 changes: 15 additions & 0 deletions include/pisa/accumulator/simple_accumulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

namespace pisa {

/// Score accumulator backed by a std::vector<float> that is indexed
/// directly by document ID.
struct Simple_Accumulator : public std::vector<float> {
    /// Creates an accumulator holding `size` score slots.
    Simple_Accumulator(std::ptrdiff_t size) : std::vector<float>(size) {}

    /// Resets every slot to zero.
    void init() {
        for (float &slot : *this) {
            slot = 0.0;
        }
    }

    /// Adds `score` to the slot belonging to document `doc`.
    /// No bounds checking is performed on `doc`.
    void accumulate(uint32_t doc, float score) { (*this)[doc] += score; }

    /// Feeds every (score, docid) pair, in increasing docid order, to `topk`.
    /// All slots are offered, whatever score they hold.
    void aggregate(topk_queue &topk) {
        uint64_t docid = 0u;
        for (float score : *this) {
            topk.insert(score, docid++);
        }
    }
};

}
1 change: 1 addition & 0 deletions include/pisa/block_posting_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ namespace pisa {

class document_enumerator {
public:

document_enumerator(uint8_t const* data, uint64_t universe,
size_t term_id = 0)
: m_n(0) // just to silence warnings
Expand Down
1 change: 1 addition & 0 deletions include/pisa/freq_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ namespace pisa {

class document_enumerator {
public:

void reset()
{
m_cur_pos = 0;
Expand Down
14 changes: 9 additions & 5 deletions include/pisa/query/algorithm/and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

namespace pisa {

template <bool with_freqs>
template <typename Index, bool with_freqs>
struct and_query {

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) const {
and_query(Index const &index) : m_index(index) {}

uint64_t operator()(term_id_vec terms) const {
if (terms.empty())
return 0;
remove_duplicate_terms(terms);
Expand All @@ -16,7 +17,7 @@ struct and_query {
enums.reserve(terms.size());

for (auto term : terms) {
enums.push_back(index[term]);
enums.push_back(m_index[term]);
}

// sort by increasing frequency
Expand All @@ -27,7 +28,7 @@ struct and_query {
uint64_t results = 0;
uint64_t candidate = enums[0].docid();
size_t i = 1;
while (candidate < index.num_docs()) {
while (candidate < m_index.num_docs()) {
for (; i < enums.size(); ++i) {
enums[i].next_geq(candidate);
if (enums[i].docid() != candidate) {
Expand All @@ -52,6 +53,9 @@ struct and_query {
}
return results;
}

private:
Index const &m_index;
};

} // namespace pisa
17 changes: 9 additions & 8 deletions include/pisa/query/algorithm/block_max_maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct block_max_maxscore_query {

typedef bm25 scorer_type;

block_max_maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
block_max_maxscore_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
typedef typename WandType::wand_data_enumerator wdata_enum;

Expand All @@ -32,7 +32,7 @@ struct block_max_maxscore_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto w_enum = m_wdata->getenum(term.first);
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
auto max_weight = q_weight * m_wdata->max_term_weight(term.first);
Expand Down Expand Up @@ -66,10 +66,10 @@ struct block_max_maxscore_query {
})
->docs_enum.docid();

while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) {
while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) {
float score = 0;
float norm_len = m_wdata->norm_len(cur_doc);
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) {
if (ordered_enums[i]->docs_enum.docid() == cur_doc) {
score +=
Expand Down Expand Up @@ -129,6 +129,7 @@ struct block_max_maxscore_query {
std::vector<std::pair<float, uint64_t>> const &topk() const { return m_topk.topk(); }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
13 changes: 7 additions & 6 deletions include/pisa/query/algorithm/block_max_wand_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct block_max_wand_query {
typedef bm25 scorer_type;

block_max_wand_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
block_max_wand_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();

if (terms.empty())
return 0;
auto query_term_freqs = query_freqs(terms);
uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
typedef typename WandType::wand_data_enumerator wdata_enum;

Expand All @@ -30,7 +30,7 @@ struct block_max_wand_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto w_enum = m_wdata->getenum(term.first);
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);

Expand Down Expand Up @@ -204,6 +204,7 @@ struct block_max_wand_query {
topk_queue const &get_topk() const { return m_topk; }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
16 changes: 8 additions & 8 deletions include/pisa/query/algorithm/maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,21 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct maxscore_query {

typedef bm25 scorer_type;

maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
maxscore_query(Index const &index, WandType const &wdata, uint64_t k) : m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
struct scored_enum {
enum_type docs_enum;
Expand All @@ -29,7 +28,7 @@ struct maxscore_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
auto max_weight = q_weight * m_wdata->max_term_weight(term.first);
enums.push_back(scored_enum{std::move(list), q_weight, max_weight});
Expand Down Expand Up @@ -62,10 +61,10 @@ struct maxscore_query {
})
->docs_enum.docid();

while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) {
while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) {
float score = 0;
float norm_len = m_wdata->norm_len(cur_doc);
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) {
if (ordered_enums[i]->docs_enum.docid() == cur_doc) {
score +=
Expand Down Expand Up @@ -109,6 +108,7 @@ struct maxscore_query {
std::vector<std::pair<float, uint64_t>> const &topk() const { return m_topk.topk(); }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
16 changes: 10 additions & 6 deletions include/pisa/query/algorithm/or_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

namespace pisa {

template <bool with_freqs>
template <typename Index, bool with_freqs>
struct or_query {

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) const {
or_query(Index const &index) : m_index(index) {}

uint64_t operator()(term_id_vec terms) const {
if (terms.empty())
return 0;
remove_duplicate_terms(terms);
Expand All @@ -16,7 +17,7 @@ struct or_query {
enums.reserve(terms.size());

for (auto term : terms) {
enums.push_back(index[term]);
enums.push_back(m_index[term]);
}

uint64_t results = 0;
Expand All @@ -27,9 +28,9 @@ struct or_query {
})
->docid();

while (cur_doc < index.num_docs()) {
while (cur_doc < m_index.num_docs()) {
results += 1;
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = 0; i < enums.size(); ++i) {
if (enums[i].docid() == cur_doc) {
if (with_freqs) {
Expand All @@ -47,6 +48,9 @@ struct or_query {

return results;
}

private:
Index const &m_index;
};

} // namespace pisa
15 changes: 8 additions & 7 deletions include/pisa/query/algorithm/ranked_and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct ranked_and_query {

typedef bm25 scorer_type;

ranked_and_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
ranked_and_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) {
uint64_t operator()(term_id_vec terms) {
size_t results = 0;
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
struct scored_enum {
enum_type docs_enum;
Expand All @@ -29,7 +29,7 @@ struct ranked_and_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
enums.push_back(scored_enum{std::move(list), q_weight});
}
Expand All @@ -41,7 +41,7 @@ struct ranked_and_query {

uint64_t candidate = enums[0].docs_enum.docid();
size_t i = 1;
while (candidate < index.num_docs()) {
while (candidate < m_index.num_docs()) {
for (; i < enums.size(); ++i) {
enums[i].docs_enum.next_geq(candidate);
if (enums[i].docs_enum.docid() != candidate) {
Expand Down Expand Up @@ -80,6 +80,7 @@ struct ranked_and_query {
topk_queue &get_topk() { return m_topk; }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
Loading