Skip to content

Commit 0f7a1c3

Browse files
authored
[opt](expr) Remove FixedContainer optimization for StringValueSet (#62243)
Benchmark results show that `FixedContainer` (SIMD linear scan) is slower than `DynamicContainer` (hash set) for `std::string`/`StringRef` when N >= 2. The `FixedContainer` optimization only benefits integer types at small sizes, but hurts string lookups because the cost of the expensive comparisons scales linearly with N.

This PR removes the `FixedContainer` specialization from `create_string_value_set`, always using the hash-based `DynamicContainer` instead. Additionally, it fixes the deprecated `DoNotOptimize` warnings in the benchmark files and adds a new `FixedContainer` vs `DynamicContainer` benchmark.

```
Fixed_Int32_N1       5.28 us   5.25 us   100000
Dynamic_Int32_N1     29.4 us   29.1 us    23538
Fixed_Int32_N2       15.7 us   15.6 us    46383
Dynamic_Int32_N2     41.4 us   41.2 us    16213
Fixed_Int32_N3       25.7 us   25.6 us    27386
Dynamic_Int32_N3     45.9 us   45.5 us    14574
Fixed_Int32_N4       9.45 us   9.40 us    79614
Dynamic_Int32_N4     40.5 us   40.3 us    15873
Fixed_Int32_N5       22.9 us   22.8 us    29587
Dynamic_Int32_N5     46.5 us   46.3 us    14624
Fixed_Int32_N6       31.5 us   31.3 us    22306
Dynamic_Int32_N6     46.5 us   46.3 us    14851
Fixed_Int32_N7       41.6 us   41.4 us    17142
Dynamic_Int32_N7     56.1 us   55.8 us    12542
Fixed_Int32_N8       11.1 us   11.1 us    70730
Dynamic_Int32_N8     43.3 us   42.1 us    12838
Fixed_Int64_N1       5.91 us   5.88 us   121146
Dynamic_Int64_N1     41.5 us   41.3 us    19183
Fixed_Int64_N2       16.0 us   16.0 us    47690
Dynamic_Int64_N2     40.2 us   40.0 us    18290
Fixed_Int64_N3       27.0 us   26.9 us    27097
Dynamic_Int64_N3     46.9 us   46.6 us    15495
Fixed_Int64_N4       24.8 us   24.6 us    28495
Dynamic_Int64_N4     40.5 us   40.2 us    17581
Fixed_Int64_N5       39.9 us   39.6 us    18133
Dynamic_Int64_N5     47.4 us   47.2 us    14943
Fixed_Int64_N6       45.8 us   45.5 us    15396
Dynamic_Int64_N6     49.2 us   48.9 us    14062
Fixed_Int64_N7       55.0 us   54.8 us    12726
Dynamic_Int64_N7     57.6 us   57.3 us    12673
Fixed_Int64_N8       14.5 us   14.4 us    48230
Dynamic_Int64_N8     40.3 us   40.1 us    17560
Fixed_String_N1       243 us    242 us     2884
Dynamic_String_N1    1364 us   1358 us      516
Fixed_String_N2      1195 us   1189 us      591
Dynamic_String_N2     739 us    736 us      946
Fixed_String_N3       163 us    163 us     4433
Dynamic_String_N3     226 us    225 us     3257
Fixed_String_N4       214 us    213 us     3290
Dynamic_String_N4     205 us    204 us     3437
Fixed_String_N5       266 us    265 us     2634
Dynamic_String_N5     220 us    219 us     2900
Fixed_String_N6       313 us    312 us     2246
Dynamic_String_N6     220 us    219 us     3195
Fixed_String_N7       362 us    360 us     1962
Dynamic_String_N7     234 us    233 us     3008
Fixed_String_N8      2586 us   2575 us      274
Dynamic_String_N8     213 us    212 us     3350
```
1 parent 8c53f07 commit 0f7a1c3

6 files changed

Lines changed: 230 additions & 46 deletions

File tree

be/benchmark/benchmark_bit_pack.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,10 @@ static void BM_BitPack(benchmark::State& state) {
6565
std::vector<uint8_t> output(size);
6666

6767
for (auto _ : state) {
68-
benchmark::DoNotOptimize(test_data.data());
69-
benchmark::DoNotOptimize(output.data());
68+
auto* td = test_data.data();
69+
auto* od = output.data();
70+
benchmark::DoNotOptimize(td);
71+
benchmark::DoNotOptimize(od);
7072
bit_pack(test_data.data(), (uint8_t)n, w, output.data());
7173
benchmark::ClobberMemory();
7274
}
@@ -92,8 +94,10 @@ static void BM_BitPackOptimized(benchmark::State& state) {
9294
ForEncoder<__int128_t> forEncoder(nullptr);
9395

9496
for (auto _ : state) {
95-
benchmark::DoNotOptimize(test_data.data());
96-
benchmark::DoNotOptimize(output.data());
97+
auto* td = test_data.data();
98+
auto* od = output.data();
99+
benchmark::DoNotOptimize(td);
100+
benchmark::DoNotOptimize(od);
97101
forEncoder.bit_pack(test_data.data(), (uint8_t)n, w, output.data());
98102
benchmark::ClobberMemory();
99103
}
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
// ============================================================
19+
// Benchmark: FixedContainer vs DynamicContainer (from hybrid_set.h)
20+
//
21+
// Measures find() performance for different container sizes (1-8)
22+
// and element types (int32_t, int64_t, std::string).
23+
// ============================================================
24+
25+
#pragma once
26+
27+
#include <benchmark/benchmark.h>
28+
29+
#include <cstdint>
30+
#include <string>
31+
#include <vector>
32+
33+
#include "exprs/hybrid_set.h"
34+
35+
namespace doris {
36+
37+
// ============================================================
38+
// Test data generators
39+
// ============================================================
40+
41+
// Produces `n` deterministic benchmark keys of type T. Only the explicit
// specializations below (int32_t, int64_t, std::string) are defined; the
// i-th key is always derived from the number i * 7 + 13, so a longer
// sequence starts with the same keys as a shorter one.
template <typename T>
static std::vector<T> generate_values(size_t n);

// int32_t keys: 13, 20, 27, ...
template <>
std::vector<int32_t> generate_values<int32_t>(size_t n) {
    std::vector<int32_t> vals;
    vals.reserve(n);
    for (size_t i = 0; i < n; ++i) {
        vals.push_back(static_cast<int32_t>(i * 7 + 13));
    }
    return vals;
}

// int64_t keys: 13, 20, 27, ...
template <>
std::vector<int64_t> generate_values<int64_t>(size_t n) {
    std::vector<int64_t> vals;
    vals.reserve(n);
    for (size_t i = 0; i < n; ++i) {
        vals.push_back(static_cast<int64_t>(i * 7 + 13));
    }
    return vals;
}

// std::string keys: "key_13", "key_20", "key_27", ...
template <>
std::vector<std::string> generate_values<std::string>(size_t n) {
    std::vector<std::string> vals;
    vals.reserve(n);
    for (size_t i = 0; i < n; ++i) {
        vals.push_back("key_" + std::to_string(i * 7 + 13));
    }
    return vals;
}
73+
74+
// Number of inner-loop rounds per benchmark iteration, used to amortize loop
// overhead. Each round issues two find() calls (one hit and one miss), so a
// benchmark iteration performs 2 * FIND_ITERS lookups.
static constexpr size_t FIND_ITERS = 10000;
76+
77+
// ============================================================
78+
// FixedContainer benchmark: insert N values, then find them
79+
// ============================================================
80+
81+
template <typename T, size_t N>
82+
static void BM_FixedContainer_Find(benchmark::State& state) {
83+
auto values = generate_values<T>(N);
84+
85+
// Also prepare a "miss" value for interleaving hit/miss lookups.
86+
auto miss_values = generate_values<T>(N + 4);
87+
88+
for (auto _ : state) {
89+
FixedContainer<T, N> container;
90+
for (size_t i = 0; i < N; ++i) {
91+
container.insert(values[i]);
92+
}
93+
94+
int64_t found = 0;
95+
for (size_t iter = 0; iter < FIND_ITERS; ++iter) {
96+
// Hit: search for existing value
97+
found += container.find(values[iter % N]);
98+
// Miss: search for non-existing value
99+
found += container.find(miss_values[N + (iter % 4)]);
100+
}
101+
benchmark::DoNotOptimize(found);
102+
}
103+
}
104+
105+
// ============================================================
106+
// DynamicContainer benchmark: insert N values, then find them
107+
// ============================================================
108+
109+
template <typename T, size_t N>
110+
static void BM_DynamicContainer_Find(benchmark::State& state) {
111+
auto values = generate_values<T>(N);
112+
auto miss_values = generate_values<T>(N + 4);
113+
114+
for (auto _ : state) {
115+
DynamicContainer<T> container;
116+
for (size_t i = 0; i < N; ++i) {
117+
container.insert(values[i]);
118+
}
119+
120+
int64_t found = 0;
121+
for (size_t iter = 0; iter < FIND_ITERS; ++iter) {
122+
found += container.find(values[iter % N]);
123+
found += container.find(miss_values[N + (iter % 4)]);
124+
}
125+
benchmark::DoNotOptimize(found);
126+
}
127+
}
128+
129+
// ============================================================
130+
// Register benchmarks for int32_t
131+
// ============================================================
132+
133+
#define REGISTER_FIXED_INT32(N) \
134+
BENCHMARK(BM_FixedContainer_Find<int32_t, N>)->Name("Fixed_Int32_N" #N)->Unit( \
135+
benchmark::kMicrosecond); \
136+
BENCHMARK(BM_DynamicContainer_Find<int32_t, N>)->Name("Dynamic_Int32_N" #N)->Unit( \
137+
benchmark::kMicrosecond);
138+
139+
REGISTER_FIXED_INT32(1)
140+
REGISTER_FIXED_INT32(2)
141+
REGISTER_FIXED_INT32(3)
142+
REGISTER_FIXED_INT32(4)
143+
REGISTER_FIXED_INT32(5)
144+
REGISTER_FIXED_INT32(6)
145+
REGISTER_FIXED_INT32(7)
146+
REGISTER_FIXED_INT32(8)
147+
148+
// ============================================================
149+
// Register benchmarks for int64_t
150+
// ============================================================
151+
152+
#define REGISTER_FIXED_INT64(N) \
153+
BENCHMARK(BM_FixedContainer_Find<int64_t, N>)->Name("Fixed_Int64_N" #N)->Unit( \
154+
benchmark::kMicrosecond); \
155+
BENCHMARK(BM_DynamicContainer_Find<int64_t, N>)->Name("Dynamic_Int64_N" #N)->Unit( \
156+
benchmark::kMicrosecond);
157+
158+
REGISTER_FIXED_INT64(1)
159+
REGISTER_FIXED_INT64(2)
160+
REGISTER_FIXED_INT64(3)
161+
REGISTER_FIXED_INT64(4)
162+
REGISTER_FIXED_INT64(5)
163+
REGISTER_FIXED_INT64(6)
164+
REGISTER_FIXED_INT64(7)
165+
REGISTER_FIXED_INT64(8)
166+
167+
// ============================================================
168+
// Register benchmarks for std::string
169+
// ============================================================
170+
171+
#define REGISTER_FIXED_STRING(N) \
172+
BENCHMARK(BM_FixedContainer_Find<std::string, N>)->Name("Fixed_String_N" #N)->Unit( \
173+
benchmark::kMicrosecond); \
174+
BENCHMARK(BM_DynamicContainer_Find<std::string, N>)->Name("Dynamic_String_N" #N)->Unit( \
175+
benchmark::kMicrosecond);
176+
177+
REGISTER_FIXED_STRING(1)
178+
REGISTER_FIXED_STRING(2)
179+
REGISTER_FIXED_STRING(3)
180+
REGISTER_FIXED_STRING(4)
181+
REGISTER_FIXED_STRING(5)
182+
REGISTER_FIXED_STRING(6)
183+
REGISTER_FIXED_STRING(7)
184+
REGISTER_FIXED_STRING(8)
185+
186+
#undef REGISTER_FIXED_INT32
187+
#undef REGISTER_FIXED_INT64
188+
#undef REGISTER_FIXED_STRING
189+
190+
} // namespace doris

be/benchmark/benchmark_main.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "benchmark_column_view.hpp"
2424
#include "benchmark_fastunion.hpp"
2525
#include "benchmark_hll_merge.hpp"
26+
#include "benchmark_hybrid_set.hpp"
2627
#include "benchmark_string.hpp"
2728
#include "binary_cast_benchmark.hpp"
2829
#include "core/block/block.h"

be/benchmark/benchmark_string.hpp

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ static void BM_ToBase64Impl_Old(benchmark::State& state) {
211211
for (auto _ : state) {
212212
dst_data.clear();
213213
dst_offsets.clear();
214-
benchmark::DoNotOptimize(OldToBase64Impl::vector(data, offsets, dst_data, dst_offsets));
214+
auto status = OldToBase64Impl::vector(data, offsets, dst_data, dst_offsets);
215+
benchmark::DoNotOptimize(status);
215216
}
216217
}
217218

@@ -228,7 +229,8 @@ static void BM_ToBase64Impl_New(benchmark::State& state) {
228229
for (auto _ : state) {
229230
dst_data.clear();
230231
dst_offsets.clear();
231-
benchmark::DoNotOptimize(ToBase64Impl::vector(data, offsets, dst_data, dst_offsets));
232+
auto status = ToBase64Impl::vector(data, offsets, dst_data, dst_offsets);
233+
benchmark::DoNotOptimize(status);
232234
}
233235
}
234236

@@ -258,8 +260,9 @@ static void BM_FromBase64Impl_Old(benchmark::State& state) {
258260
for (auto _ : state) {
259261
dst_data.clear();
260262
dst_offsets.clear();
261-
benchmark::DoNotOptimize(OldFromBase64Impl::vector(data, offsets, dst_data, dst_offsets,
262-
null_map->get_data()));
263+
auto status = OldFromBase64Impl::vector(data, offsets, dst_data, dst_offsets,
264+
null_map->get_data());
265+
benchmark::DoNotOptimize(status);
263266
}
264267
}
265268

@@ -277,8 +280,9 @@ static void BM_FromBase64Impl_New(benchmark::State& state) {
277280
for (auto _ : state) {
278281
dst_data.clear();
279282
dst_offsets.clear();
280-
benchmark::DoNotOptimize(
281-
FromBase64Impl::vector(data, offsets, dst_data, dst_offsets, null_map->get_data()));
283+
auto status =
284+
FromBase64Impl::vector(data, offsets, dst_data, dst_offsets, null_map->get_data());
285+
benchmark::DoNotOptimize(status);
282286
}
283287
}
284288

@@ -307,7 +311,8 @@ static void BM_UnhexImpl_Old(benchmark::State& state) {
307311
for (auto _ : state) {
308312
dst_data.clear();
309313
dst_offsets.clear();
310-
benchmark::DoNotOptimize(OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets));
314+
auto status = OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets);
315+
benchmark::DoNotOptimize(status);
311316
}
312317
}
313318

@@ -324,8 +329,9 @@ static void BM_UnhexImpl_New(benchmark::State& state) {
324329
for (auto _ : state) {
325330
dst_data.clear();
326331
dst_offsets.clear();
327-
benchmark::DoNotOptimize(
328-
UnHexImpl<UnHexImplEmpty>::vector(data, offsets, dst_data, dst_offsets));
332+
auto status =
333+
UnHexImpl<UnHexImplEmpty>::vector(data, offsets, dst_data, dst_offsets);
334+
benchmark::DoNotOptimize(status);
329335
}
330336
}
331337

@@ -355,8 +361,9 @@ static void BM_UnhexNullImpl_Old(benchmark::State& state) {
355361
for (auto _ : state) {
356362
dst_data.clear();
357363
dst_offsets.clear();
358-
benchmark::DoNotOptimize(
359-
OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets, &null_map->get_data()));
364+
auto status =
365+
OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets, &null_map->get_data());
366+
benchmark::DoNotOptimize(status);
360367
}
361368
}
362369

@@ -374,8 +381,9 @@ static void BM_UnhexNullImpl_New(benchmark::State& state) {
374381
for (auto _ : state) {
375382
dst_data.clear();
376383
dst_offsets.clear();
377-
benchmark::DoNotOptimize(UnHexImpl<UnHexImplNull>::vector(
378-
data, offsets, dst_data, dst_offsets, &null_map->get_data()));
384+
auto status = UnHexImpl<UnHexImplNull>::vector(
385+
data, offsets, dst_data, dst_offsets, &null_map->get_data());
386+
benchmark::DoNotOptimize(status);
379387
}
380388
}
381389

be/benchmark/binary_cast_benchmark.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ static void BM_BinaryCast_UI64_to_DateTimeV2(benchmark::State& state) {
138138
state.ResumeTiming();
139139

140140
for (auto _ : state) {
141-
benchmark::DoNotOptimize(test_data.data());
141+
auto* td = test_data.data();
142+
benchmark::DoNotOptimize(td);
142143

143144
for (size_t i = 0; i < data_size; ++i) {
144145
auto result = binary_cast<uint64_t, DateV2Value<DateTimeV2ValueType>>(test_data[i]);
@@ -160,7 +161,8 @@ static void BM_OldBinaryCast_UI64_to_DateTimeV2(benchmark::State& state) {
160161
state.ResumeTiming();
161162

162163
for (auto _ : state) {
163-
benchmark::DoNotOptimize(test_data.data());
164+
auto* td = test_data.data();
165+
benchmark::DoNotOptimize(td);
164166

165167
for (size_t i = 0; i < data_size; ++i) {
166168
auto result = old_binary_cast<uint64_t, DateV2Value<DateTimeV2ValueType>>(test_data[i]);
@@ -183,7 +185,8 @@ static void BM_BinaryCast_DateTimeV2_to_UI64(benchmark::State& state) {
183185
state.ResumeTiming();
184186

185187
for (auto _ : state) {
186-
benchmark::DoNotOptimize(test_data.data());
188+
auto* td = test_data.data();
189+
benchmark::DoNotOptimize(td);
187190

188191
for (size_t i = 0; i < data_size; ++i) {
189192
auto result = binary_cast<DateV2Value<DateTimeV2ValueType>, uint64_t>(test_data[i]);
@@ -206,7 +209,8 @@ static void BM_OldBinaryCast_DateTimeV2_to_UI64(benchmark::State& state) {
206209
state.ResumeTiming();
207210

208211
for (auto _ : state) {
209-
benchmark::DoNotOptimize(test_data.data());
212+
auto* td = test_data.data();
213+
benchmark::DoNotOptimize(td);
210214

211215
for (size_t i = 0; i < data_size; ++i) {
212216
auto result = old_binary_cast<DateV2Value<DateTimeV2ValueType>, uint64_t>(test_data[i]);

be/src/exprs/create_predicate_function.h

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -188,35 +188,12 @@ inline auto create_set(PrimitiveType type, size_t size, bool null_aware) {
188188
}
189189
}
190190

191-
template <size_t N = 0>
192191
inline HybridSetBase* create_string_value_set(bool null_aware) {
193-
if constexpr (N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE) {
194-
return new StringValueSet<FixedContainer<StringRef, N>>(null_aware);
195-
} else {
196-
return new StringValueSet(null_aware);
197-
}
192+
return new StringValueSet(null_aware);
198193
}
199194

200195
inline HybridSetBase* create_string_value_set(size_t size, bool null_aware) {
201-
if (size == 1) {
202-
return create_string_value_set<1>(null_aware);
203-
} else if (size == 2) {
204-
return create_string_value_set<2>(null_aware);
205-
} else if (size == 3) {
206-
return create_string_value_set<3>(null_aware);
207-
} else if (size == 4) {
208-
return create_string_value_set<4>(null_aware);
209-
} else if (size == 5) {
210-
return create_string_value_set<5>(null_aware);
211-
} else if (size == 6) {
212-
return create_string_value_set<6>(null_aware);
213-
} else if (size == 7) {
214-
return create_string_value_set<7>(null_aware);
215-
} else if (size == FIXED_CONTAINER_MAX_SIZE) {
216-
return create_string_value_set<FIXED_CONTAINER_MAX_SIZE>(null_aware);
217-
} else {
218-
return create_string_value_set(null_aware);
219-
}
196+
return create_string_value_set(null_aware);
220197
}
221198

222199
inline auto create_bloom_filter(PrimitiveType type, bool null_aware) {

0 commit comments

Comments
 (0)