@@ -18,27 +18,58 @@ namespace bits {
1818
1919 under the name "darray" (short for "dense" array).
2020
21- The bitvector is split into variable-length super- blocks, each
22- containing L ones (except for, possibly, the last super- block).
23- A super- block is said to be "sparse" if its length is >= L2;
24- otherwise it is said "dense". Sparse super- blocks are represented
21+ The bitvector is split into variable-length blocks, each
22+ containing L ones (except for, possibly, the last block).
23+ A block is said to be "sparse" if its length is >= L2;
24+ otherwise it is said to be "dense". Sparse blocks are represented
2525 verbatim, i.e., the positions of the L ones are coded using 64-bit
26- integers. A dense super- block, instead, is sparsified: we keep
26+ integers. A dense block, instead, is sparsified: we keep
2727 one position every L3 positions. The positions are coded relatively
28- to the beginning of each super- block, hence using log2(L2) bits per
28+ to the beginning of each block, hence using log2(L2) bits per
2929 position.
3030
31- A select query first checks if the super-block is sparse: if it is,
32- then the query is solved in O(1). If the super-block is dense instead,
33- the corresponding block is accessed and a linear scan is performed
34- for a worst-case cost of O(L2/L3).
31+ Let m be the number of bits set in B. The data structure stores:
32+ - An array `block_inventory[0..m/L)` such that `block_inventory[i]`
33+ holds the result of Select(iL) if block i is dense; if block i is sparse, it holds
34+ instead the index in `overflow_positions` where the positions of its ones begin.
35+ Its space is m/L*64 bits.
36+ - An array `overflow_positions` holding the positions of the L ones
37+ in sparse blocks. As we have at most m/L2 sparse blocks, its space is
38+ m/L2*L*64 bits at most.
39+ - An array `subblock_inventory[0..m/L3)` such that `subblock_inventory[i]`
40+ holds the result of Select(iL3), relative to the beginning of its block.
40+ Its space is m/L3*log2(L2) bits.
41+
42+ A Select(i) query first checks if the block i/L is sparse by accessing
43+ `block_inventory[i/L]`: if it is, then the query is solved in O(1) as
44+ `overflow_positions[block_inventory[i/L] + i%L]`; otherwise the corresponding
45+ sub-block is accessed and a sequential scan of at most L2 bits is performed.
46+ If p is the position computed during the scan, the final result is
47+ `block_inventory[i/L] + subblock_inventory[i/L3] + p`.
48+
49+ If Scan(Q) = Q/B is the number of cache-misses involved in a sequential scan
50+ of Q bits with a cache-line of B bits, it follows that the number of cache-misses
51+ per Select(i) query is:
52+ 2, if position i belongs to a sparse block;
53+ 2 + Scan(L2), if i belongs to a dense block.
3554
3655 This implementation uses, by default:
3756
3857 L = 1,024 (block_size)
3958 L2 = 65,536 (so that each position in a dense block can be coded
4059 using 16-bit integers)
4160 L3 = 32 (subblock_size)
61+
62+ For these block sizes, we have a space usage of at most:
63+ m/2^10*64 (block_inventory) +
64+ m/2^16*2^10*64 (overflow_positions, for sparse blocks) +
65+ m/2^5*2^4 (subblock_inventory, for dense blocks) =
66+ 25/16 m = 1.5625 m bits.
67+
68+ (When used to index the high bitvector of Elias-Fano, sparse blocks are rare;
69+ so the space usage is likely close to 9/16 m.)
70+
71+ If n is the length of the bitvector and 0.0 <= d = m/n <= 1.0 is its density,
72+ the index costs at most (25/16)dn extra bits, for a total of n(1 + (25/16)d) bits.
4273*/
4374
4475template < //
@@ -85,6 +116,32 @@ struct darray {
85116 m_block_inventory.swap (block_inventory);
86117 m_subblock_inventory.swap (subblock_inventory);
87118 m_overflow_positions.swap (overflow_positions);
119+
120+ // {
121+ // std::cout << "num_blocks = " << m_block_inventory.size()
122+ // << " (expected = " << ((m_positions + block_size - 1) / block_size) << ")"
123+ // << std::endl;
124+ // std::cout << "num_subblocks = " << m_subblock_inventory.size()
125+ // << " (expected = " << ((m_positions + subblock_size - 1) / subblock_size)
126+ // << ")" << std::endl;
127+
128+ // std::cout << "block_inventory: got "
129+ // << (essentials::vec_bytes(m_block_inventory) * 8.0 - 64) / m_positions
130+ // << " bits/bit (expected "
131+ // << ((m_positions + block_size - 1) / block_size * 64.0) / m_positions <<
132+ // ")"
133+ // << std::endl;
134+
135+ // std::cout << "overflow_positions: got "
136+ // << (essentials::vec_bytes(m_overflow_positions) * 8.0 - 64) / m_positions
137+ // << " bits/bit (at most 1)" << std::endl;
138+
139+ // std::cout << "subblock_inventory: got "
140+ // << (essentials::vec_bytes(m_subblock_inventory) * 8.0 - 16) / m_positions
141+ // << " bits/bit (expected "
142+ // << ((m_positions + subblock_size - 1) / subblock_size * 16.0) / m_positions
143+ // << ")" << std::endl;
144+ // }
88145 }
89146
90147 /*
@@ -95,7 +152,7 @@ struct darray {
95152 assert (i < num_positions ());
96153 uint64_t block = i / block_size;
97154 int64_t block_pos = m_block_inventory[block];
98- if (block_pos < 0 ) { // sparse super- block
155+ if (block_pos < 0 ) { // sparse block
99156 uint64_t overflow_pos = uint64_t (-block_pos - 1 );
100157 return m_overflow_positions[overflow_pos + (i & (block_size - 1 ))];
101158 }
0 commit comments