Skip to content

Commit f587569

Browse files
committed
refactoring of encoders
1 parent 23bbac3 commit f587569

7 files changed

Lines changed: 123 additions & 316 deletions

File tree

README.md

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,6 @@ For a testing environment, use the following instead:
100100

101101
(NOTE: The software runs much more slowly when compiled in debug mode with sanitizers enabled. Use this configuration only for debugging purposes, not to run performance tests.)
102102

103-
### Enable All Encoders
104-
105-
By default, you can choose between three encoders to compress the PTHash
106-
data structure (see the output of `./build --help` for suggestions).
107-
108-
If you want to test all the encoders we tested in the papers,
109-
compile again with
110-
111-
cmake .. -D PTHASH_ENABLE_ALL_ENCODERS=On
112-
113103
### Enable Large Bucket-Id Type
114104

115105
By default, PTHash assumes there are fewer than $2^{32}$ buckets, hence 32-bit integers are used
@@ -233,10 +223,6 @@ Giulio on 13/04/2025: Update this section with new benchmarks.
233223

234224
<!-- The script `script/run_benchmark.sh` runs some trade-off configurations (encoder, $\alpha$, $\lambda$) that have been tested in the papers, on 100M and 1000M keys.
235225
236-
Be sure you run the benchmark after compiling with
237-
238-
cmake .. -D PTHASH_ENABLE_ALL_ENCODERS=On
239-
240226
From within the directory where the code has been compiled, just run
241227
242228
bash ../script/run_benchmark.sh 100000000 2> results.json

include/dense_partitioned_phf.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,6 @@ struct dense_partitioned_phf //
166166
};
167167

168168
template <typename Hasher>
169-
using phobic = dense_partitioned_phf<Hasher, opt_bucketer, inter_C_inter_R, true>;
169+
using phobic = dense_partitioned_phf<Hasher, opt_bucketer, R_int, true>;
170170

171171
} // namespace pthash

include/utils/dense_encoders.hpp

Lines changed: 10 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ struct dense_mono : dense_encoder {
2424
}
2525

2626
static std::string name() {
27-
return "mono-" + Encoder::name();
27+
return Encoder::name();
2828
}
2929

3030
size_t size() const {
@@ -100,7 +100,7 @@ struct dense_interleaved : dense_encoder {
100100
}
101101

102102
static std::string name() {
103-
return "inter-" + Encoder::name();
103+
return Encoder::name() + "-int";
104104
}
105105

106106
inline uint64_t access(const uint64_t partition, const uint64_t bucket) const {
@@ -109,7 +109,7 @@ struct dense_interleaved : dense_encoder {
109109
}
110110

111111
uint64_t num_bits() const {
112-
uint64_t sum = 0;
112+
uint64_t sum = 8 * sizeof(uint64_t); // for std::vector size
113113
for (auto const& e : m_encoders) sum += e.num_bits();
114114
return sum;
115115
}
@@ -133,93 +133,13 @@ struct dense_interleaved : dense_encoder {
133133
std::vector<Encoder> m_encoders;
134134
};
135135

136-
template <typename Front, typename Back, uint64_t numerator = 1, uint64_t denominator = 3>
137-
struct dense_dual : dense_encoder {
138-
template <typename Iterator>
139-
void encode(Iterator begin, //
140-
const uint64_t num_partitions, //
141-
const uint64_t num_buckets_per_partition, const uint64_t num_threads) //
142-
{
143-
m_front_size = num_buckets_per_partition * (static_cast<double>(numerator) / denominator);
144-
if (num_threads == 1) {
145-
if (m_front_size != 0) m_front.encode(begin, num_partitions, m_front_size, 1);
146-
if (num_buckets_per_partition - m_front_size != 0)
147-
m_back.encode(begin + m_front_size * num_partitions, num_partitions,
148-
num_buckets_per_partition - m_front_size, 1);
149-
} else {
150-
uint64_t m_front_threads =
151-
(num_threads * m_front_size + num_buckets_per_partition - 1) /
152-
num_buckets_per_partition;
153-
auto exe = [&]() {
154-
if (m_front_size != 0)
155-
m_front.encode(begin, num_partitions, m_front_size, m_front_threads);
156-
};
157-
std::thread frontThread = std::thread(exe);
158-
if (num_buckets_per_partition - m_front_size != 0)
159-
m_back.encode(begin + m_front_size * num_partitions, num_partitions,
160-
num_buckets_per_partition - m_front_size,
161-
num_threads - m_front_threads);
162-
if (frontThread.joinable()) frontThread.join();
163-
}
164-
}
165-
166-
static std::string name() {
167-
std::ostringstream oss;
168-
oss << Front::name() << "-" << Back::name() << "-" << std::fixed << std::setprecision(2)
169-
<< static_cast<double>(numerator) / denominator;
170-
return oss.str();
171-
}
172-
173-
size_t num_bits() const {
174-
return sizeof(m_front_size) * 8 + m_front.num_bits() + m_back.num_bits();
175-
}
176-
177-
uint64_t access(uint64_t i) const {
178-
if (i < m_front.size()) return m_front.access(i);
179-
return m_back.access(i - m_front.size());
180-
}
181-
182-
inline uint64_t access(const uint64_t partition, const uint64_t bucket) const {
183-
if (bucket < m_front_size) return m_front.access(partition, bucket);
184-
return m_back.access(partition, bucket - m_front_size);
185-
}
186-
187-
template <typename Visitor>
188-
void visit(Visitor& visitor) const {
189-
visit_impl(visitor, *this);
190-
}
191-
192-
template <typename Visitor>
193-
void visit(Visitor& visitor) {
194-
visit_impl(visitor, *this);
195-
}
196-
197-
private:
198-
template <typename Visitor, typename T>
199-
static void visit_impl(Visitor& visitor, T&& t) {
200-
visitor.visit(t.m_front_size);
201-
visitor.visit(t.m_front);
202-
visitor.visit(t.m_back);
203-
}
204-
205-
uint64_t m_front_size;
206-
Front m_front;
207-
Back m_back;
208-
};
136+
typedef dense_mono<compact> C_mono;
137+
typedef dense_mono<dictionary> D_mono;
138+
typedef dense_mono<rice> R_mono;
139+
typedef dense_mono<elias_fano> EF_mono;
209140

210-
typedef dense_mono<rice> mono_R;
211-
typedef dense_interleaved<rice> inter_R;
212-
typedef dense_mono<compact> mono_C;
213-
typedef dense_interleaved<compact> inter_C;
214-
typedef dense_mono<dictionary> mono_D;
215-
typedef dense_interleaved<dictionary> inter_D;
216-
typedef dense_mono<elias_fano> mono_EF;
217-
typedef dense_interleaved<elias_fano> inter_EF;
218-
219-
/* dual_interleaved encoders */
220-
typedef dense_dual<mono_C, mono_R, 1, 3> mono_C_mono_R;
221-
typedef dense_dual<inter_C, inter_R, 1, 3> inter_C_inter_R;
222-
typedef dense_dual<mono_D, mono_R, 1, 3> mono_D_mono_R;
223-
typedef dense_dual<inter_D, inter_R, 1, 3> inter_D_inter_R;
141+
typedef dense_interleaved<compact> C_int;
142+
typedef dense_interleaved<dictionary> D_int;
143+
typedef dense_interleaved<rice> R_int;
224144

225145
} // namespace pthash

script/run_benchmark.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -54,28 +54,29 @@ def run_build(n, base_filename=None):
5454

5555
for l in lambda_values:
5656
for a in alpha_values:
57-
58-
cmd = [
59-
"./build",
60-
"-n", str(n),
61-
"-l", f"{l:.2f}",
62-
"-a", f"{a:.2f}",
63-
"-e", "all",
64-
"-b", "skew",
65-
"-s", "0",
66-
"-q", str(n),
67-
"-t", str(num_threads),
68-
"--minimal",
69-
"--check",
70-
"--verbose"
71-
]
72-
73-
run_cmd("SINGLE", l, a, cmd, log_file, results_file)
74-
75-
avg_partition_size = n / (num_threads * num_partitions_per_thread)
76-
run_cmd("PARTITIONED", l, a, cmd + ["-p", str(avg_partition_size)], log_file, results_file)
77-
78-
run_cmd("DENSE-PARTITIONED", l, a, cmd + ["--dense"], log_file, results_file)
57+
for b in ["skew", "opt"]:
58+
cmd = [
59+
"./build",
60+
"-n", str(n),
61+
"-l", str(l),
62+
"-a", str(a),
63+
"-e", "all",
64+
"-b", b,
65+
"-s", "0",
66+
"-q", str(n),
67+
"-t", str(num_threads),
68+
"--minimal",
69+
"--check",
70+
"--verbose"
71+
]
72+
73+
run_cmd("SINGLE", l, a, cmd, log_file, results_file)
74+
75+
avg_partition_size = n / (num_threads * num_partitions_per_thread)
76+
run_cmd("PARTITIONED", l, a, cmd + ["-p", str(avg_partition_size)], log_file, results_file)
77+
78+
if a == 1.0:
79+
run_cmd("DENSE-PARTITIONED", l, a, cmd + ["--dense"], log_file, results_file)
7980

8081
log_file.close()
8182
results_file.close()

0 commit comments

Comments
 (0)