diff --git a/scripts/setup-common.sh b/scripts/setup-common.sh index 9699939d2af..6673e432a16 100755 --- a/scripts/setup-common.sh +++ b/scripts/setup-common.sh @@ -48,7 +48,12 @@ function install_fmt { function install_folly { wget_and_untar https://github.com/facebook/folly/archive/refs/tags/"${FB_OS_VERSION}".tar.gz folly - local FOLLY_FLAGS=(-DBUILD_SHARED_LIBS="$VELOX_BUILD_SHARED" -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON) + local FOLLY_FLAGS=( + -DBUILD_SHARED_LIBS="$VELOX_BUILD_SHARED" + -DBUILD_TESTS=OFF + -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" + -DFOLLY_HAVE_INT128_T=ON + ) # When folly is static, use static gflags to avoid dual gflags flag # registration when .so plugins are dlopen'd (both the binary and plugin # would register the same flags in a shared gflags registry). diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh index a50fb02ae0e..30bfb7d523d 100755 --- a/scripts/setup-helper-functions.sh +++ b/scripts/setup-helper-functions.sh @@ -81,7 +81,8 @@ function github_checkout { # The values that CPU_ARCH can take are as follows: # arm64 : Target Apple silicon. # aarch64: Target general 64 bit arm cpus. -# avx: Target Intel CPUs with AVX. +# avx512: Target Intel CPUs with AVX-512F. +# avx: Target Intel CPUs with AVX2. # sse: Target Intel CPUs with sse. 
# Echo's the appropriate compiler flags which can be captured as so # CXX_FLAGS=$(get_cxx_flags) or @@ -102,7 +103,11 @@ function get_cxx_flags { else # x86_64 local CPU_CAPABILITIES - CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}') + # AVX2 and AVX-512F are listed under machdep.cpu.leaf7_features rather than + # machdep.cpu.features, so read both keys. + CPU_CAPABILITIES=$(sysctl -a | grep -E 'machdep\.cpu\.(leaf7_)?features' | awk '{print tolower($0)}') - if [[ $CPU_CAPABILITIES =~ "avx" ]]; then + if [[ $CPU_CAPABILITIES =~ "avx512f" ]]; then + CPU_ARCH="avx512" + elif [[ $CPU_CAPABILITIES =~ "avx" ]]; then CPU_ARCH="avx" else CPU_ARCH="sse" @@ -114,7 +119,9 @@ function get_cxx_flags { else # x86_64 local CPU_CAPABILITIES CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1 | awk '{print tolower($0)}') - if [[ $CPU_CAPABILITIES =~ "avx" ]]; then + if [[ $CPU_CAPABILITIES =~ "avx512f" ]]; then + CPU_ARCH="avx512" + elif [[ $CPU_CAPABILITIES =~ "avx" ]]; then CPU_ARCH="avx" elif [[ $CPU_CAPABILITIES =~ "sse" ]]; then CPU_ARCH="sse" @@ -131,8 +138,12 @@ function get_cxx_flags { echo -n "-mcpu=apple-m1+crc" ;; + "avx512") + echo -n "-mavx512f -mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2" + ;; + "avx") - echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2" + echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2" ;; "sse") diff --git a/velox/common/process/ProcessBase.cpp b/velox/common/process/ProcessBase.cpp index 0b9a4df2c64..3cbb7fa6a42 100644 --- a/velox/common/process/ProcessBase.cpp +++ b/velox/common/process/ProcessBase.cpp @@ -32,6 +32,8 @@ DECLARE_bool(avx2); // Enables use of AVX2 when available NOLINT DECLARE_bool(bmi2); // Enables use of BMI2 when available NOLINT +DECLARE_bool(avx512f); + namespace facebook { namespace velox { namespace process { @@ -106,6 +108,7 @@ uint64_t threadCpuNanos() { namespace { bool bmi2CpuFlag = folly::CpuId().bmi2(); bool avx2CpuFlag = folly::CpuId().avx2(); +bool avx512fCpuFlag = folly::CpuId().avx512f(); } // namespace bool hasAvx2() { @@ -124,6 +127,14 @@ bool hasBmi2() { #endif } +bool hasAvx512f() { +#ifdef __AVX512F__ + return avx512fCpuFlag && FLAGS_avx512f; +#else + return false; +#endif +} + } // 
namespace process } // namespace velox } // namespace facebook diff --git a/velox/common/process/ProcessBase.h b/velox/common/process/ProcessBase.h index 34edd6d1467..7ca400b4efa 100644 --- a/velox/common/process/ProcessBase.h +++ b/velox/common/process/ProcessBase.h @@ -46,6 +46,10 @@ uint64_t threadCpuNanos(); /// by flag. bool hasAvx2(); +/// True if the machine has Intel AVX512F instructions and these are not +/// disabled by flag. +bool hasAvx512f(); + /// True if the machine has Intel BMI2 instructions and these are not disabled /// by flag. bool hasBmi2(); diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 062a507fc64..f52aeb7dd37 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -132,7 +132,8 @@ void HiveConnector::registerSerDe() { std::unique_ptr HivePartitionFunctionSpec::create( int numPartitions, - bool localExchange) const { + bool localExchange, + bool /*useOptimizedPartitionFunction*/) const { std::vector bucketToPartitions; if (bucketToPartition_.empty()) { // NOTE: if hive partition function spec doesn't specify bucket to partition diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h index 95c175c4f69..e3508cb4729 100644 --- a/velox/connectors/hive/HiveConnector.h +++ b/velox/connectors/hive/HiveConnector.h @@ -141,7 +141,8 @@ class HivePartitionFunctionSpec : public core::PartitionFunctionSpec { std::unique_ptr create( int numPartitions, - bool localExchange) const override; + bool localExchange, + bool useOptimizedPartitionFunction = false) const override; std::string toString() const override; diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 4a1ba1579cd..109caf0f45d 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -2500,9 +2500,13 @@ class PartitionFunctionSpec : public ISerializable { public: /// If 'localExchange' is true, the partition function is used for local /// 
exchange within a velox task. + /// TODO: useOptimizedPartitionFunction = true is only supported in + /// HashPartitionFunction now. Will extend the optimization to other + /// PartitionFunctions soon. virtual std::unique_ptr create( int numPartitions, - bool localExchange = false) const = 0; + bool localExchange = false, + bool useOptimizedPartitionFunction = false) const = 0; virtual ~PartitionFunctionSpec() = default; @@ -2515,7 +2519,8 @@ class GatherPartitionFunctionSpec : public PartitionFunctionSpec { public: std::unique_ptr create( int /*numPartitions*/, - bool /*localExchange*/) const override { + bool /*localExchange*/, + bool /*useOptimizedPartitionFunction*/ = false) const override { VELOX_UNREACHABLE(); } diff --git a/velox/core/QueryConfig.cpp b/velox/core/QueryConfig.cpp index 4a31862590a..8493d6546c7 100644 --- a/velox/core/QueryConfig.cpp +++ b/velox/core/QueryConfig.cpp @@ -90,6 +90,7 @@ const std::vector& QueryConfig::registeredProperties() { // Partitioned output. VELOX_REGISTER_QUERY_CONFIG(kPartitionedOutputEagerFlush); + VELOX_REGISTER_QUERY_CONFIG(kOptimizedHashPartitionFunctionEnabled); VELOX_REGISTER_QUERY_CONFIG(kMaxPartitionedOutputBufferSize); VELOX_REGISTER_QUERY_CONFIG(kMaxOutputBufferSize); diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 772015daa1f..b30fb47bd1a 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -454,6 +454,16 @@ class QueryConfig { false, "Flush PartitionedOutput rows eagerly without buffering.") + /// If true, use OptimizedHashPartitionFunction in place of + /// HashPartitionFunction. + VELOX_QUERY_CONFIG( + kOptimizedHashPartitionFunctionEnabled, + optimizedHashPartitionFunctionEnabled, + "optimized_hash_partition_function_enabled", + bool, + false, + "Use OptimizedHashPartitionFunction instead of HashPartitionFunction.") + /// The maximum number of bytes to buffer in PartitionedOutput operator to /// avoid creating tiny SerializedPages. 
VELOX_QUERY_CONFIG( @@ -1469,6 +1479,16 @@ class QueryConfig { 1000, "Batch size threshold for zero-copy in MarkSorted operator.") + /// If true, use the OptimizedPartitionedOutput operator in place of + /// PartitionedOutput. + VELOX_QUERY_CONFIG( + kOptimizedPartitionedOutputEnabled, + optimizedPartitionedOutputEnabled, + "optimized_repartitioning", + bool, + false, + "Enable OptimizedPartitionedOutput operator.") + // --- Hand-written accessors for properties that need custom logic --- // Generated by VELOX_QUERY_CONFIG for simple properties above. diff --git a/velox/exec/CMakeLists.txt b/velox/exec/CMakeLists.txt index 3a5bec7e6e8..626c7c06570 100644 --- a/velox/exec/CMakeLists.txt +++ b/velox/exec/CMakeLists.txt @@ -71,6 +71,9 @@ velox_add_library( OperatorTraceScan.cpp OperatorTraceWriter.cpp OperatorUtils.cpp + OptimizedHashPartitionFunction.cpp + OptimizedPartitionedOutput.cpp + OptimizedVectorHasher.cpp OrderBy.cpp OutputBuffer.cpp OutputBufferManager.cpp @@ -177,6 +180,7 @@ velox_add_library( OperatorTraceWriter.h OperatorType.h OperatorUtils.h + OptimizedVectorHasher.h OrderBy.h OutputBuffer.h OutputBufferManager.h diff --git a/velox/exec/HashPartitionFunction.cpp b/velox/exec/HashPartitionFunction.cpp index 896facc4efa..44f012e5e00 100644 --- a/velox/exec/HashPartitionFunction.cpp +++ b/velox/exec/HashPartitionFunction.cpp @@ -13,8 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include +#include "velox/exec/HashPartitionFunction.h" + +#include "velox/exec/OptimizedHashPartitionFunction.h" +#include "velox/exec/VectorHasher.h" #define XXH_INLINE_ALL #include // @manual=third-party//xxHash:xxhash @@ -123,9 +125,15 @@ std::optional HashPartitionFunction::partition( std::unique_ptr HashPartitionFunctionSpec::create( int numPartitions, - bool localExchange) const { - return std::make_unique( - localExchange, numPartitions, inputType_, keyChannels_, constValues_); + bool localExchange, + bool useOptimizedPartitionFunction) const { + return createHashPartitionFunction( + localExchange, + numPartitions, + inputType_, + keyChannels_, + constValues_, + useOptimizedPartitionFunction); } std::string HashPartitionFunctionSpec::toString() const { @@ -180,4 +188,33 @@ core::PartitionFunctionSpecPtr HashPartitionFunctionSpec::deserialize( return std::make_shared( ISerializable::deserialize(obj["inputType"]), keys, constValues); } + +std::unique_ptr createHashPartitionFunction( + bool localExchange, + int numPartitions, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues, + bool useOptimizedPartitionFunction) { + if (useOptimizedPartitionFunction) { + return std::make_unique( + localExchange, numPartitions, inputType, keyChannels, constValues); + } + return std::make_unique( + localExchange, numPartitions, inputType, keyChannels, constValues); +} + +std::unique_ptr createHashPartitionFunction( + const HashBitRange& hashBitRange, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues, + bool useOptimizedPartitionFunction) { + if (useOptimizedPartitionFunction) { + return std::make_unique( + hashBitRange, inputType, keyChannels, constValues); + } + return std::make_unique( + hashBitRange, inputType, keyChannels, constValues); +} } // namespace facebook::velox::exec diff --git a/velox/exec/HashPartitionFunction.h b/velox/exec/HashPartitionFunction.h index 
7aa6a032d6b..848fd42e0ac 100644 --- a/velox/exec/HashPartitionFunction.h +++ b/velox/exec/HashPartitionFunction.h @@ -15,19 +15,28 @@ */ #pragma once -#include -#include #include "velox/core/PlanNode.h" +#include "velox/exec/HashBitRange.h" +#include "velox/exec/VectorHasher.h" namespace facebook::velox::exec { +/// Extends PartitionFunction with access to the configured number of +/// partitions. +class HashPartitionFunctionBase : public core::PartitionFunction { + public: + ~HashPartitionFunctionBase() override = default; + + virtual int numPartitions() const = 0; +}; + /// Calculates partition number for each row of the specified vector using a /// hash function. The constructor with hashBitRange parameter requires both /// hashBitRange and keyChannels to be non-empty. The constructor with /// numPartitions allows the keyChannels argument to be empty. If keyChannels is /// empty, then the resulting partition number of partition() will always be /// zero. -class HashPartitionFunction : public core::PartitionFunction { +class HashPartitionFunction : public HashPartitionFunctionBase { public: HashPartitionFunction( bool localExchange, @@ -48,7 +57,7 @@ class HashPartitionFunction : public core::PartitionFunction { const RowVector& input, std::vector& partitions) override; - int numPartitions() const { + int numPartitions() const override { return numPartitions_; } @@ -85,7 +94,8 @@ class HashPartitionFunctionSpec : public core::PartitionFunctionSpec { std::unique_ptr create( int numPartitions, - bool localExchange) const override; + bool localExchange, + bool useOptimizedPartitionFunction = false) const override; std::string toString() const override; @@ -100,4 +110,22 @@ class HashPartitionFunctionSpec : public core::PartitionFunctionSpec { const std::vector keyChannels_; const std::vector constValues_; }; + +/// Creates either HashPartitionFunction or OptimizedHashPartitionFunction +/// based on 'useOptimizedPartitionFunction'. 
+std::unique_ptr createHashPartitionFunction( + bool localExchange, + int numPartitions, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues = {}, + bool useOptimizedPartitionFunction = false); + +std::unique_ptr createHashPartitionFunction( + const HashBitRange& hashBitRange, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues = {}, + bool useOptimizedPartitionFunction = false); + } // namespace facebook::velox::exec diff --git a/velox/exec/LocalPartition.cpp b/velox/exec/LocalPartition.cpp index eb6eb81add3..231b873d7fa 100644 --- a/velox/exec/LocalPartition.cpp +++ b/velox/exec/LocalPartition.cpp @@ -339,10 +339,13 @@ LocalPartition::LocalPartition( ctx->task->getLocalExchangeQueues(ctx->splitGroupId, planNode->id())}, numPartitions_{queues_.size()}, partitionFunction_( - numPartitions_ == 1 ? nullptr - : planNode->partitionFunctionSpec().create( - numPartitions_, - /*localExchange=*/true)), + numPartitions_ == 1 + ? 
nullptr + : planNode->partitionFunctionSpec().create( + numPartitions_, + /*localExchange=*/true, + ctx->queryConfig() + .optimizedHashPartitionFunctionEnabled())), singlePartitionBufferSize_{ (numPartitions_ < ctx->queryConfig() diff --git a/velox/exec/LocalPlanner.cpp b/velox/exec/LocalPlanner.cpp index 39f009fe39a..a46daa8b4f1 100644 --- a/velox/exec/LocalPlanner.cpp +++ b/velox/exec/LocalPlanner.cpp @@ -37,6 +37,7 @@ #include "velox/exec/NestedLoopJoinBuild.h" #include "velox/exec/NestedLoopJoinProbe.h" #include "velox/exec/OperatorTraceScan.h" +#include "velox/exec/OptimizedPartitionedOutput.h" #include "velox/exec/OrderBy.h" #include "velox/exec/ParallelProject.h" #include "velox/exec/PartitionedOutput.h" @@ -553,9 +554,15 @@ std::shared_ptr DriverFactory::createDriver( auto partitionedOutputNode = std::dynamic_pointer_cast( planNode)) { - operators.push_back( - std::make_unique( - id, ctx.get(), partitionedOutputNode, eagerFlush(*planNode))); + if (ctx->queryConfig().optimizedPartitionedOutputEnabled()) { + operators.push_back( + std::make_unique( + id, ctx.get(), partitionedOutputNode)); + } else { + operators.push_back( + std::make_unique( + id, ctx.get(), partitionedOutputNode, eagerFlush(*planNode))); + } } else if ( auto joinNode = std::dynamic_pointer_cast(planNode)) { diff --git a/velox/exec/MarkDistinct.cpp b/velox/exec/MarkDistinct.cpp index 2b562c714af..83ae15a2cad 100644 --- a/velox/exec/MarkDistinct.cpp +++ b/velox/exec/MarkDistinct.cpp @@ -356,8 +356,14 @@ void MarkDistinct::setupInputSpiller( &spillConfig_.value(), spillStats_.get()); - spillHashFunction_ = std::make_unique( - inputSpiller_->hashBits(), inputType_, distinctKeyChannels_); + spillHashFunction_ = createHashPartitionFunction( + inputSpiller_->hashBits(), + inputType_, + distinctKeyChannels_, + {}, + operatorCtx_->driverCtx() + ->queryConfig() + .optimizedHashPartitionFunctionEnabled()); } void MarkDistinct::spill() { diff --git a/velox/exec/MarkDistinct.h 
b/velox/exec/MarkDistinct.h index c8c582b5ea8..f386ff77bd9 100644 --- a/velox/exec/MarkDistinct.h +++ b/velox/exec/MarkDistinct.h @@ -106,7 +106,7 @@ class MarkDistinct : public Operator { SpillPartitionSet spillInputPartitionSet_; - std::unique_ptr spillHashFunction_; + std::unique_ptr spillHashFunction_; SpillPartitionSet spillHashTablePartitionSet_; diff --git a/velox/exec/OptimizedHashPartitionFunction.cpp b/velox/exec/OptimizedHashPartitionFunction.cpp new file mode 100644 index 00000000000..ac83b7d8d27 --- /dev/null +++ b/velox/exec/OptimizedHashPartitionFunction.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/exec/OptimizedHashPartitionFunction.h" + +#include + +#include + +#include "velox/common/process/ProcessBase.h" + +#if defined(__AVX2__) || defined(__AVX512F__) +#include +#endif + +#define XXH_INLINE_ALL +#include // @manual=third-party//xxHash:xxhash + +namespace facebook::velox::exec { +namespace { +// Gets the hash value for local exchange with given 'rawHash'. 'rawHash' +// is the value computed by this hash function which is used for remote +// shuffle across stages like for Prestissimo. +static inline uint32_t localExchangeHash(uint32_t rawHash) { + // Mix the bits so we don't use the same hash used to distribute between + // stages. 
+ bits::reverseBits(reinterpret_cast(&rawHash), sizeof(rawHash)); + return XXH32(&rawHash, sizeof(rawHash), 0); +} + +FOLLY_ALWAYS_INLINE uint32_t mixedHash(uint64_t hash) { + return static_cast(hash) ^ static_cast(hash >> 32); +} + +FOLLY_ALWAYS_INLINE uint32_t +reduceRange(uint64_t hash, uint32_t numPartitions) { + return (static_cast(mixedHash(hash)) * numPartitions) >> 32; +} + +void rangeReductionPowerOfTwo( + const uint64_t* hashes, + uint32_t* partitions, + vector_size_t size, + uint32_t numPartitions) { + VELOX_DCHECK(bits::isPowerOfTwo(numPartitions)); + + if (numPartitions == 1) { + std::fill(partitions, partitions + size, 0); + return; + } + + const auto shift = 32 - __builtin_ctz(numPartitions); + for (vector_size_t index = 0; index < size; ++index) { + partitions[index] = mixedHash(hashes[index]) >> shift; + } +} + +#if defined(__AVX512F__) +void rangeReductionAvx512( + const uint64_t* hashes, + uint32_t* partitions, + vector_size_t size, + uint32_t numPartitions) { + const __m512i numPartitionsVec = _mm512_set1_epi64(numPartitions); + + vector_size_t index = 0; + for (; index + 8 <= size; index += 8) { + const auto hashesVec = + _mm512_loadu_si512(reinterpret_cast(hashes + index)); + + const auto mixedHashesVec = + _mm512_xor_si512(hashesVec, _mm512_srli_epi64(hashesVec, 32)); + const auto productVec = _mm512_mul_epu32(mixedHashesVec, numPartitionsVec); + const auto shiftedVec = _mm512_srli_epi64(productVec, 32); + const auto packedResults = _mm512_cvtepi64_epi32(shiftedVec); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(partitions + index), packedResults); + } + + for (; index < size; ++index) { + partitions[index] = reduceRange(hashes[index], numPartitions); + } +} +#endif + +#if defined(__AVX2__) +void rangeReductionAvx2( + const uint64_t* hashes, + uint32_t* partitions, + vector_size_t size, + uint32_t numPartitions) { + const auto packIndexes = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); + const auto numPartitionsVec = 
_mm256_set1_epi64x(numPartitions); + + vector_size_t index = 0; + for (; index + 4 <= size; index += 4) { + const auto hashesVec = + _mm256_loadu_si256(reinterpret_cast(hashes + index)); + const auto mixedHashesVec = + _mm256_xor_si256(hashesVec, _mm256_srli_epi64(hashesVec, 32)); + const auto productVec = _mm256_mul_epu32(mixedHashesVec, numPartitionsVec); + const auto shiftedVec = _mm256_srli_epi64(productVec, 32); + const auto packedResults = + _mm256_permutevar8x32_epi32(shiftedVec, packIndexes); + _mm_storeu_si128( + reinterpret_cast<__m128i*>(partitions + index), + _mm256_castsi256_si128(packedResults)); + } + + for (; index < size; ++index) { + partitions[index] = reduceRange(hashes[index], numPartitions); + } +} +#endif + +void rangeReductionImpl( + const uint64_t* hashes, + uint32_t* partitions, + vector_size_t size, + uint32_t numPartitions) { + if (bits::isPowerOfTwo(numPartitions)) { + rangeReductionPowerOfTwo(hashes, partitions, size, numPartitions); + return; + } + +#if defined(__AVX512F__) + if (process::hasAvx512f()) { + rangeReductionAvx512(hashes, partitions, size, numPartitions); + return; + } +#endif + +#if defined(__AVX2__) + if (process::hasAvx2()) { + rangeReductionAvx2(hashes, partitions, size, numPartitions); + return; + } +#endif + + for (vector_size_t index = 0; index < size; ++index) { + partitions[index] = reduceRange(hashes[index], numPartitions); + } +} + +void applyLocalExchangeHash(raw_vector& hashes) { + for (auto& hash : hashes) { + hash = localExchangeHash(hash); + } +} + +void applyHashBitRange( + const HashBitRange& hashBitRange, + const raw_vector& hashes, + std::vector& partitions) { + partitions.resize(hashes.size()); + for (auto index = 0; index < hashes.size(); ++index) { + partitions[index] = hashBitRange.partition(hashes[index]); + } +} + +} // namespace + +void rangeReduction( + const uint64_t* hashes, + uint32_t* partitions, + vector_size_t size, + uint32_t numPartitions) { + rangeReductionImpl(hashes, partitions, 
size, numPartitions); +} + +OptimizedHashPartitionFunction::OptimizedHashPartitionFunction( + bool localExchange, + int numPartitions, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues) + : localExchange_{localExchange}, numPartitions_{numPartitions} { + init(inputType, keyChannels, constValues); +} + +OptimizedHashPartitionFunction::OptimizedHashPartitionFunction( + const HashBitRange& hashBitRange, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues) + : localExchange_{false}, + numPartitions_{hashBitRange.numPartitions()}, + hashBitRange_(hashBitRange) { + VELOX_CHECK_GT(hashBitRange.numPartitions(), 0); + VELOX_CHECK(!keyChannels.empty()); + init(inputType, keyChannels, constValues); +} + +std::optional OptimizedHashPartitionFunction::partition( + const RowVector& input, + std::vector& partitions) { + if (hashers_.empty()) { + return 0u; + } + + const auto size = input.size(); + if (size == 0) { + partitions.clear(); + return std::nullopt; + } + + if (!hashBitRange_.has_value() && numPartitions_ == 1) { + return 0u; + } + + rows_.resize(size); + rows_.setAll(); + + hashes_.resize(size); + for (auto i = 0; i < hashers_.size(); ++i) { + auto& hasher = hashers_[i]; + if (hasher->channel() != kConstantChannel) { + hashers_[i]->decode(*input.childAt(hasher->channel()), rows_); + hashers_[i]->hash(rows_, i > 0, hashes_); + } else { + hashers_[i]->hashPrecomputed(i > 0, hashes_); + } + } + + if (localExchange_) { + applyLocalExchangeHash(hashes_); + } + + if (hashBitRange_.has_value()) { + applyHashBitRange(*hashBitRange_, hashes_, partitions); + } else { + partitions.resize(size); + rangeReduction(hashes_.data(), partitions.data(), size, numPartitions_); + } + + return std::nullopt; +} + +void OptimizedHashPartitionFunction::init( + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues) { + hashers_.reserve(keyChannels.size()); + 
size_t constChannel{0}; + for (const auto channel : keyChannels) { + if (channel != kConstantChannel) { + hashers_.emplace_back( + OptimizedVectorHasher::create(inputType->childAt(channel), channel)); + } else { + const auto& constValue = constValues[constChannel++]; + hashers_.emplace_back( + OptimizedVectorHasher::create(constValue->type(), channel)); + hashers_.back()->precompute(*constValue); + } + } +} + +} // namespace facebook::velox::exec diff --git a/velox/exec/OptimizedHashPartitionFunction.h b/velox/exec/OptimizedHashPartitionFunction.h new file mode 100644 index 00000000000..bc7ceb1ac0b --- /dev/null +++ b/velox/exec/OptimizedHashPartitionFunction.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/exec/HashPartitionFunction.h" +#include "velox/exec/OptimizedVectorHasher.h" + +namespace facebook::velox::exec { + +/// Maps hashes to partitions using range reduction. Visible for testing. +void rangeReduction( + const uint64_t* hashes, + uint32_t* partitions, + vector_size_t size, + uint32_t numPartitions); + +/// Calculates partition numbers using OptimizedVectorHasher. 
+class OptimizedHashPartitionFunction : public HashPartitionFunctionBase { + public: + OptimizedHashPartitionFunction( + bool localExchange, + int numPartitions, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues = {}); + + OptimizedHashPartitionFunction( + const HashBitRange& hashBitRange, + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues = {}); + + ~OptimizedHashPartitionFunction() override = default; + + std::optional partition( + const RowVector& input, + std::vector& partitions) override; + + int numPartitions() const override { + return numPartitions_; + } + + private: + void init( + const RowTypePtr& inputType, + const std::vector& keyChannels, + const std::vector& constValues); + + const bool localExchange_; + const int numPartitions_; + const std::optional hashBitRange_ = std::nullopt; + std::vector> hashers_; + + // Reusable memory. + SelectivityVector rows_; + raw_vector hashes_; +}; + +} // namespace facebook::velox::exec diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp new file mode 100644 index 00000000000..a8da3786b81 --- /dev/null +++ b/velox/exec/OptimizedPartitionedOutput.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/exec/OptimizedPartitionedOutput.h" + +#include + +#include "velox/exec/HashPartitionFunction.h" +#include "velox/exec/SerializedPage.h" +#include "velox/exec/Task.h" + +namespace facebook::velox::exec { + +OptimizedPartitionedOutput::OptimizedPartitionedOutput( + int32_t operatorId, + DriverCtx* ctx, + const std::shared_ptr& planNode) + : Operator( + ctx, + planNode->outputType(), + operatorId, + planNode->id(), + "OptimizedPartitionedOutput"), + taskId_(operatorCtx_->taskId()), + inputType_(planNode->inputType()), + keyChannels_(toChannels(planNode->inputType(), planNode->keys())), + outputChannels_(calculateOutputChannels( + planNode->inputType(), + planNode->outputType(), + planNode->outputType())), + numDestinations_(planNode->numPartitions()), + replicateNullsAndAny_(planNode->isReplicateNullsAndAny()), + bufferManager_(OutputBufferManager::getInstanceRef()), + // NOTE: 'bufferReleaseFn_' holds a reference on the associated task to + // prevent it from deleting while there are output buffers being accessed + // out of the partitioned output buffer manager such as in Prestissimo, + // the http server holds the buffers while sending the data response. + bufferReleaseFn_([task = operatorCtx_->task()]() {}), + maxOutputBufferBytes_(ctx->task->queryCtx() + ->queryConfig() + .maxPartitionedOutputBufferSize()), + pool_(pool()), + partitionFunction_( + numDestinations_ == 1 ? 
nullptr + : planNode->partitionFunctionSpec().create( + numDestinations_, + /*localExchange=*/false, + true)) { + if (!planNode->isPartitioned()) { + VELOX_USER_CHECK_EQ(numDestinations_, 1); + } + if (numDestinations_ == 1) { + VELOX_USER_CHECK(keyChannels_.empty()); + } + + serializer::presto::SerdeOpts options; + options.compressionKind = common::stringToCompressionKind( + operatorCtx_->driverCtx()->queryConfig().shuffleCompressionKind()); + options.minCompressionRatio = 0.8; + + initializeSerializerLayout(); + + serializer_ = std::make_unique< + serializer::presto::PrestoIterativePartitioningSerializer>( + outputType_, + numDestinations_, + options, + pool_, + serializerInputByOutput_, + [bufferManager = + bufferManager_]() -> std::unique_ptr { + auto lockedBufferManager = bufferManager.lock(); + VELOX_CHECK_NOT_NULL( + lockedBufferManager, "OutputBufferManager was already destructed"); + return lockedBufferManager->newListener(); + }); +} + +void OptimizedPartitionedOutput::addInput(RowVectorPtr input) { + VELOX_USER_CHECK( + !replicateNullsAndAny_, + "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput"); + + auto serializerInput = prepareSerializerInput(input); + + if (serializer_->estimateBytesAfterAppend(serializerInput) > + maxOutputBufferBytes_) { + flush(); + } + + const auto numRows = input->size(); + partitions_.resize(numRows); + + if (numDestinations_ == 1) { + std::fill(partitions_.begin(), partitions_.end(), 0u); + } else { + std::optional partition = + partitionFunction_->partition(*input, partitions_); + if (partition.has_value()) { + // All rows go to the same partition + std::fill(partitions_.begin(), partitions_.end(), partition.value()); + } + } + + serializer_->append(serializerInput, partitions_); + + auto lockedStats = stats_.wlock(); + ++numAppends_; + lockedStats->addRuntimeStat("numAppends", RuntimeCounter(1)); +} + +bool OptimizedPartitionedOutput::needsInput() const { + return blockingReason_ == 
BlockingReason::kNotBlocked; +} + +// Flushes buffered data when input is exhausted or the buffer limit is +// reached. Always returns nullptr; once all input has been flushed, signals +// noMoreData to the OutputBufferManager and marks the operator finished. +RowVectorPtr OptimizedPartitionedOutput::getOutput() { + if (finished_) { + return nullptr; + } + + blockingReason_ = BlockingReason::kNotBlocked; + + if (noMoreInput_ || serializer_->bytesBuffered() >= maxOutputBufferBytes_) { + flush(); + } + + // If blocked, stop here. We avoid advancing operator state while blocked, + // even if noMoreInput_ may already be true. The driver will resume and call + // getOutput() again once the OutputBuffer has space. + if (blockingReason_ != BlockingReason::kNotBlocked) { + return nullptr; + } + + if (noMoreInput_ && serializer_->bytesBuffered() == 0) { + // TODO: merge serializer runtime stats into operator stats once + // PrestoIterativePartitioningSerializer exposes runtimeStats(). + // Lock and verify the manager, as flush() does, rather than dereferencing + // a possibly expired reference. + auto bufferManager = bufferManager_.lock(); + VELOX_CHECK_NOT_NULL( + bufferManager, "OutputBufferManager was already destructed"); + bufferManager->noMoreData(operatorCtx_->task()->taskId()); + finished_ = true; + } + + return nullptr; +} + +BlockingReason OptimizedPartitionedOutput::isBlocked(ContinueFuture* future) { + if (blockingReason_ != BlockingReason::kNotBlocked) { + *future = std::move(future_); + blockingReason_ = BlockingReason::kNotBlocked; + return BlockingReason::kWaitForConsumer; + } + return BlockingReason::kNotBlocked; +} + +bool OptimizedPartitionedOutput::isFinished() { + return finished_; +} + +void OptimizedPartitionedOutput::initializeSerializerLayout() { + if (outputType_->size() == 0 || outputChannels_.empty()) { + serializerInputType_ = outputType_; + return; + } + + std::unordered_map outputToSerializerInput; + outputToSerializerInput.reserve(outputChannels_.size()); + + std::vector names; + std::vector types; + names.reserve(outputChannels_.size()); + types.reserve(outputChannels_.size()); + serializerInputByOutput_.reserve(outputChannels_.size()); + + for (const auto outputChannel : outputChannels_) { + auto it = outputToSerializerInput.find(outputChannel); + if (it == outputToSerializerInput.end()) { + const auto serializerInputChannel = + static_cast(serializerInputChannels_.size()); + 
serializerInputChannels_.push_back(outputChannel); + names.push_back(inputType_->nameOf(outputChannel)); + types.push_back(inputType_->childAt(outputChannel)); + it = + outputToSerializerInput.emplace(outputChannel, serializerInputChannel) + .first; + } + serializerInputByOutput_.push_back(it->second); + } + + serializerInputType_ = ROW(std::move(names), std::move(types)); +} + +RowVectorPtr OptimizedPartitionedOutput::prepareSerializerInput( + const RowVectorPtr& input) const { + VELOX_CHECK_NOT_NULL(input); + + if (serializerInputType_->size() == 0) { + return std::make_shared( + input->pool(), + serializerInputType_, + nullptr /*nulls*/, + input->size(), + std::vector{}); + } + + if (serializerInputChannels_.empty()) { + input->loadedVector(); + return input; + } + + std::vector serializerInputColumns; + serializerInputColumns.reserve(serializerInputChannels_.size()); + for (auto channel : serializerInputChannels_) { + auto loadedChild = BaseVector::loadedVectorShared(input->childAt(channel)); + serializerInputColumns.push_back(loadedChild); + } + + return std::make_shared( + input->pool(), + serializerInputType_, + nullptr /*nulls*/, + input->size(), + std::move(serializerInputColumns)); +} + +void OptimizedPartitionedOutput::flush() { + const auto flushedBytes = serializer_->bytesBuffered(); + const auto flushedRows = serializer_->rowsBuffered(); + + // This will serialize all destinations and reset serializer_->bytesBuffered() + // to 0. + auto serializedIOBufs = serializer_->flush(); + auto bufferManager = bufferManager_.lock(); + VELOX_CHECK_NOT_NULL( + bufferManager, "OutputBufferManager was already destructed"); + + bool shouldBlock = false; + ContinueFuture future = ContinueFuture::makeEmpty(); + for (auto& [destination, pageData] : serializedIOBufs) { + // We will only pass the future to bufferManager->enqueue() for the first + // blocked destination. This is to avoid unnecessary creation of + // ContinueFuture objects for the remaining destinations. 
+ ContinueFuture* futurePtr = shouldBlock ? nullptr : &future; + + // Enqueue the data for each non-empty partition. Since the pageData is + // already serialized, enqueueing them would not cause new memory + // allocations. This will always move the pageData to the OutputBuffers no + // matter if the OutputBuffer is blocked. + bool blocked = bufferManager->enqueue( + taskId_, + static_cast(destination), + std::make_unique( + std::move(pageData.first), + [fn = bufferReleaseFn_](folly::IOBuf&) { fn(); }, + pageData.second), + futurePtr); + + if (blocked && !shouldBlock) { + blockingReason_ = BlockingReason::kWaitForConsumer; + shouldBlock = true; + future_ = std::move(future); + } + } + + auto lockedStats = stats_.wlock(); + lockedStats->addOutputVector(flushedBytes, flushedRows); + if (flushedRows > 0) { + ++numFlushes_; + lockedStats->addRuntimeStat("numFlushes", RuntimeCounter(1)); + } + if (shouldBlock) { + ++numBlockedTimes_; + lockedStats->addRuntimeStat("numBlockedTimes", RuntimeCounter(1)); + } +} + +} // namespace facebook::velox::exec diff --git a/velox/exec/OptimizedPartitionedOutput.h b/velox/exec/OptimizedPartitionedOutput.h new file mode 100644 index 00000000000..78ddcaf4a6f --- /dev/null +++ b/velox/exec/OptimizedPartitionedOutput.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "velox/exec/Operator.h" +#include "velox/exec/OutputBufferManager.h" +#include "velox/serializers/PrestoIterativePartitioningSerializer.h" + +namespace facebook::velox::exec { + +/// Partitioned output operator backed by PrestoIterativePartitioningSerializer. +/// +/// Routes each input row to a partition via a hash function, buffers the +/// partitioned data, and flushes serialized Presto pages into the output +/// buffer manager when the buffer is full or the pipeline is draining. +class OptimizedPartitionedOutput : public Operator { + public: + /// Minimum flush size for non-final flush; 60 KB + overhead fits a 64 KB + /// network MTU. + static constexpr uint64_t kMinDestinationSize = 60 * 1024; + + OptimizedPartitionedOutput( + int32_t operatorId, + DriverCtx* ctx, + const std::shared_ptr& planNode); + + void addInput(RowVectorPtr input) override; + + /// Returns true when the operator is not waiting for the output buffer to + /// drain. The driver checks this before calling addInput() so a blocked + /// state does not accumulate additional rows. + bool needsInput() const override; + + /// Always returns nullptr; output is pushed into the buffer manager as a + /// side-effect. Flushes the serializer when the buffer is full or the + /// pipeline is draining, then signals noMoreData() once all rows are sent. + RowVectorPtr getOutput() override; + + BlockingReason isBlocked(ContinueFuture* future) override; + + bool isFinished() override; + + private: + /// Computes the serializer input columns and the mapping from output columns + /// to serializer input columns. + void initializeSerializerLayout(); + + /// Builds the RowVector consumed by the serializer. When the output layout + /// has duplicated columns, this projects only the distinct columns and + /// leaves duplication to flush time. 
+ RowVectorPtr prepareSerializerInput(const RowVectorPtr& input) const; + + /// Serializes all buffered rows into Presto pages and enqueues each page + /// into the output buffer manager. All destinations are always enqueued; + /// sets blockingReason_ and records a future if the output buffer is full. + /// Increments numFlushes_ only when at least one row was flushed. + void flush(); + + const std::string taskId_; + const RowTypePtr inputType_; + const std::vector keyChannels_; + /// Non-empty when the output layout differs from the input + const std::vector outputChannels_; + const int32_t numDestinations_; + + const bool replicateNullsAndAny_; + const std::weak_ptr bufferManager_; + /// Holds a reference to the owning task to prevent it from being destroyed + /// while serialized pages are in flight inside the buffer manager. + const std::function bufferReleaseFn_; + const int64_t maxOutputBufferBytes_; + + velox::memory::MemoryPool* pool_; + + /// Computes per-row partition assignments. Null when numDestinations_ == 1. + std::unique_ptr partitionFunction_; + /// Reusable buffer for per-row partition assignments. + std::vector partitions_; + + std::unique_ptr + serializer_; + /// Row type passed to serializer_->append(). It only includes distinct + /// columns from the output layout. + RowTypePtr serializerInputType_; + /// Input channels that make up the serializer input type. Empty if the output + /// layout is the same as the input. + std::vector serializerInputChannels_; + /// For each output column index, store the corresponding serializer input + /// column. + std::vector serializerInputByOutput_; + + BlockingReason blockingReason_{BlockingReason::kNotBlocked}; + ContinueFuture future_; + bool finished_{false}; + + /// Counts addInput() calls; incremented once per call after the append. + /// Exposed as the "numAppends" runtime stat. + uint64_t numAppends_{0}; + /// Counts non-empty flush() calls — flushes that serialized at least one + /// row. 
Exposed as the "numFlushes" runtime stat for test verification. + uint64_t numFlushes_{0}; + /// Counts flush() calls that caused the driver to block on a full output + /// buffer. Exposed as the "numBlockedTimes" runtime stat. + uint64_t numBlockedTimes_{0}; +}; + +} // namespace facebook::velox::exec diff --git a/velox/exec/OptimizedVectorHasher.cpp b/velox/exec/OptimizedVectorHasher.cpp new file mode 100644 index 00000000000..507ffc9edb1 --- /dev/null +++ b/velox/exec/OptimizedVectorHasher.cpp @@ -0,0 +1,407 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/exec/OptimizedVectorHasher.h" + +#include "velox/common/base/SimdUtil.h" +#include "velox/type/FloatingPointUtil.h" + +namespace facebook::velox::exec { +namespace { + +// Hashes the value of `decoded` at row `index`. ROW, ARRAY and MAP kinds +// delegate to the base vector's hashValueAt(); other kinds hash the decoded +// value with the type's custom hash, a NaN-aware hash for floating point, or +// folly::hasher. Does not check for null; callers must handle null rows. +template +uint64_t hashOne(const DecodedVector& decoded, vector_size_t index) { + if constexpr ( + Kind == TypeKind::ROW || Kind == TypeKind::ARRAY || + Kind == TypeKind::MAP) { + return decoded.base()->hashValueAt(decoded.index(index)); + } else { + using T = typename KindToFlatVector::HashRowType; + const T value = decoded.valueAt(index); + + if constexpr (typeProvidesCustomComparison) { + return static_cast*>( + decoded.base()->type().get()) + ->hash(value); + } else if constexpr (std::is_floating_point_v) { + return util::floating_point::NaNAwareHash()(value); + } else { + return folly::hasher()(value); + } + } +} + +constexpr uint64_t kNullHash = OptimizedVectorHasher::kNullHash; + +// Fills `result[0..size)` with `hash`, mixing into the existing values when +// `Mix` is true. +template +inline void broadcastHash(vector_size_t size, uint64_t* result, uint64_t hash) { + if constexpr (Mix) { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = bits::hashMix(result[i], hash); + } + } else { + std::fill(result, result + size, hash); + } +} + +// Computes one hash per row via `computeHash(i)`. Caller guarantees no nulls. +template +inline void +hashLoopNoNulls(vector_size_t size, uint64_t* result, ComputeHash computeHash) { + if constexpr (Mix) { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = bits::hashMix(result[i], computeHash(i)); + } + } else { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = computeHash(i); + } + } +} + +// Computes one hash per row, substituting `kNullHash` for null rows. +template +inline void hashLoopWithNulls( + vector_size_t size, + uint64_t* result, + const DecodedVector& decoded, + ComputeHash computeHash) { + if constexpr (Mix) { + for (vector_size_t i = 0; i < size; ++i) { + const uint64_t hash = decoded.isNullAt(i) ? 
kNullHash : computeHash(i); + result[i] = bits::hashMix(result[i], hash); + } + } else { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = decoded.isNullAt(i) ? kNullHash : computeHash(i); + } + } +} + +// Writes (or, when `Mix` is true, mixes) `baseHashes[indices[i]]` into +// `result[i]` for every i in [0, size). Does not consult nulls. +template +inline void scatterDictionaryHashes( + vector_size_t size, + uint64_t* result, + const vector_size_t* indices, + const uint64_t* baseHashes) { + if constexpr (Mix) { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = bits::hashMix(result[i], baseHashes[indices[i]]); + } + } else { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = baseHashes[indices[i]]; + } + } +} + +// Same as scatterDictionaryHashes(), except that rows flagged null in `nulls` +// (per bits::isBitNull) get `kNullHash` and their index is not read. +template +inline void scatterDictionaryHashesWithExtraNulls( + vector_size_t size, + uint64_t* result, + const vector_size_t* indices, + const uint64_t* nulls, + const uint64_t* baseHashes) { + if constexpr (Mix) { + for (vector_size_t i = 0; i < size; ++i) { + const uint64_t hash = + bits::isBitNull(nulls, i) ? kNullHash : baseHashes[indices[i]]; + result[i] = bits::hashMix(result[i], hash); + } + } else { + for (vector_size_t i = 0; i < size; ++i) { + result[i] = + bits::isBitNull(nulls, i) ? kNullHash : baseHashes[indices[i]]; + } + } +} + +/// Converts Velox’s packed boolean storage into one hash per row. 
+/// @param values: a bitmap: one bit per row, where set means true and unset +/// means false +/// @param nulls: optional null bitmap, may be nullptr; rows flagged null by +/// bits::isBitNull() receive kNullHash +/// @param result: receives one hash per row; the hash is mixed into the +/// existing value when Mix is true +template +inline void scatterBoolHashes( + vector_size_t size, + uint64_t* result, + const uint64_t* values, + const uint64_t* nulls) { + using Batch = xsimd::batch; + static constexpr vector_size_t kSimdBatchSize = Batch::size; + const auto falseHash = folly::hasher()(false); + const auto trueHash = folly::hasher()(true); + + vector_size_t row{0}; + // Vectorized path, taken only when not mixing: picks between the broadcast + // true/false/null hashes using kSimdBatchSize bits of each bitmap at a time. + if constexpr (!Mix) { + const auto falseHashBatch = + xsimd::broadcast(static_cast(falseHash)); + const auto trueHashBatch = + xsimd::broadcast(static_cast(trueHash)); + const auto nullHashBatch = + xsimd::broadcast(static_cast(kNullHash)); + auto* const signedResult = reinterpret_cast(result); + + for (; row + kSimdBatchSize <= size; row += kSimdBatchSize) { + // Each batch reads its bits from a single 64-bit word. + // NOTE(review): this assumes kSimdBatchSize divides 64 so that a batch + // never straddles two words -- confirm for every xsimd target. + const auto bitOffset = row & 63; + const auto valueBits = (values[row / 64] >> bitOffset) & + bits::lowMask(static_cast(kSimdBatchSize)); + auto hashes = xsimd::select( + simd::fromBitMask(valueBits), trueHashBatch, falseHashBatch); + + if (nulls != nullptr) { + const auto notNullBits = (nulls[row / 64] >> bitOffset) & + bits::lowMask(static_cast(kSimdBatchSize)); + hashes = xsimd::select( + simd::fromBitMask(notNullBits), hashes, nullHashBatch); + } + + hashes.store_unaligned(signedResult + row); + } + } + + // Scalar path: handles the rows left over by the vectorized loop, or every + // row when Mix is true. + // TODO: improve performance + for (; row < size; ++row) { + const auto hash = nulls != nullptr && bits::isBitNull(nulls, row) + ? kNullHash + : (bits::isBitSet(values, row) ? trueHash : falseHash); + if constexpr (Mix) { + result[row] = bits::hashMix(result[row], hash); + } else { + result[row] = hash; + } + } +} + +// Dispatches `body` with `Mix` resolved as a compile-time bool. 
+template +inline void dispatchMix(bool mix, Body body) { + if (mix) { + body(std::true_type{}); + } else { + body(std::false_type{}); + } +} + +template +inline void hashDecoded( + bool mix, + vector_size_t size, + uint64_t* result, + const DecodedVector& decoded, + ComputeHash computeHash) { + dispatchMix(mix, [&](auto mixTag) { + constexpr bool kMix = decltype(mixTag)::value; + if (decoded.mayHaveNulls()) { + hashLoopWithNulls(size, result, decoded, computeHash); + } else { + hashLoopNoNulls(size, result, computeHash); + } + }); +} + +} // namespace + +OptimizedVectorHasher::OptimizedVectorHasher( + TypePtr type, + column_index_t channel) + : channel_(channel), + type_(std::move(type)), + typeKind_(type_->kind()), + typeProvidesCustomComparison_(type_->providesCustomComparison()) {} + +void OptimizedVectorHasher::decode( + const BaseVector& vector, + const SelectivityVector& rows) { + VELOX_CHECK( + type_->kindEquals(vector.type()), + "Type mismatch: {} vs. {}", + type_->toString(), + vector.type()->toString()); + decoded_.decode(vector, rows); +} + +void OptimizedVectorHasher::hash(bool mix, raw_vector& result) { + if (typeKind_ == TypeKind::UNKNOWN) { + dispatchMix(mix, [&](auto mixTag) { + broadcastHash( + decoded_.size(), result.data(), kNullHash); + }); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH(hashValues, typeKind_, mix, result.data()); + } +} + +void OptimizedVectorHasher::hash( + const SelectivityVector& rows, + bool mix, + raw_vector& result) { + if (decoded_.size() == 0 || result.empty() || rows.isAllSelected()) { + hash(mix, result); + return; + } + + const auto original = result; + + hash(mix, result); + + // The specialized hash() path computes values for the full decoded extent. + // Restore rows that were not selected to match VectorHasher semantics. 
+ for (vector_size_t row = 0; row < result.size(); ++row) { + if (!rows.isValid(row)) { + result[row] = original[row]; + } + } +} + +template +void OptimizedVectorHasher::hashValues(bool mix, uint64_t* result) { + using T = typename TypeTraits::NativeType; + if constexpr ( + Kind == TypeKind::ROW || Kind == TypeKind::ARRAY || + Kind == TypeKind::MAP) { + if (typeProvidesCustomComparison_) { + hashTyped(mix, result); + } else { + hashTyped(mix, result); + } + return; + } + + if (decoded_.isConstantMapping() || !decoded_.isIdentityMapping() || + typeProvidesCustomComparison_) { + if (typeProvidesCustomComparison_) { + hashTyped(mix, result); + } else { + hashTyped(mix, result); + } + return; + } + hashFlatValues(mix, result); +} + +template +void OptimizedVectorHasher::hashTyped(bool mix, uint64_t* result) { + const auto size = decoded_.size(); + + // Constant column: compute the value once and broadcast. + if (decoded_.isConstantMapping()) { + const uint64_t hash = decoded_.isNullAt(0) + ? kNullHash + : hashOne(decoded_, 0); + dispatchMix(mix, [&](auto mixTag) { + broadcastHash(size, result, hash); + }); + return; + } + + // Dictionary mapping more rows than its base: calculate the hashes for the + // dictionary first, then scatter. 
+ if (!decoded_.isIdentityMapping() && size > decoded_.base()->size()) { + const DecodedVector baseDecoded(*decoded_.base()); + const auto baseSize = decoded_.base()->size(); + dictionaryHashes_.resize(baseSize); + const auto computeBaseHash = [&](vector_size_t i) { + return hashOne(baseDecoded, i); + }; + hashDecoded( + /*mix=*/false, + baseSize, + dictionaryHashes_.data(), + baseDecoded, + computeBaseHash); + + // NOTE(review): the scatter loops below read indices[i] for every i in + // [0, size). Confirm decoded_.indices() is populated for rows outside the + // SelectivityVector passed to decode(); otherwise dictionaryHashes_ could + // be indexed out of range. + const auto* const indices = decoded_.indices(); + dispatchMix(mix, [&](auto mixTag) { + constexpr bool kMix = decltype(mixTag)::value; + if (decoded_.hasExtraNulls()) { + scatterDictionaryHashesWithExtraNulls( + size, result, indices, decoded_.nulls(), dictionaryHashes_.data()); + } else { + scatterDictionaryHashes( + size, result, indices, dictionaryHashes_.data()); + } + }); + return; + } + + // Generic fallback + const auto computeHash = [&](vector_size_t i) { + return hashOne(decoded_, i); + }; + hashDecoded(mix, size, result, decoded_, computeHash); +} + +// Hashes values read straight from decoded_.data(), indexed by row; floating +// point values use the NaN-aware hash, everything else folly::hasher. +template +void OptimizedVectorHasher::hashFlatValues(bool mix, uint64_t* result) { + if constexpr (std::is_void_v) { + VELOX_NYI(); + } else { + const T* const values = decoded_.data(); + const auto size = decoded_.size(); + const auto computeHash = [&](vector_size_t i) { + if constexpr (std::is_floating_point_v) { + return util::floating_point::NaNAwareHash()(values[i]); + } else { + return folly::hasher()(values[i]); + } + }; + hashDecoded(mix, size, result, decoded_, computeHash); + } +} + +template <> +void OptimizedVectorHasher::hashFlatValues(bool mix, uint64_t* result) { + const auto* const values = decoded_.data(); + const auto* const nulls = + decoded_.mayHaveNulls() ? 
decoded_.nulls() : nullptr; + dispatchMix(mix, [&](auto mixTag) { + scatterBoolHashes( + decoded_.size(), result, values, nulls); + }); +} + +void OptimizedVectorHasher::hashPrecomputed( + bool mix, + raw_vector& result) const { + dispatchMix(mix, [&](auto mixTag) { + broadcastHash( + result.size(), result.data(), precomputedHash_); + }); +} + +void OptimizedVectorHasher::precompute(const BaseVector& value) { + if (value.isNullAt(0)) { + precomputedHash_ = kNullHash; + return; + } + + decoded_.decode(value); + if (typeKind_ == TypeKind::UNKNOWN) { + precomputedHash_ = kNullHash; + return; + } + + if (typeProvidesCustomComparison_) { + precomputedHash_ = VELOX_DYNAMIC_TEMPLATE_TYPE_DISPATCH( + hashOne, true, typeKind_, decoded_, 0); + } else { + precomputedHash_ = VELOX_DYNAMIC_TEMPLATE_TYPE_DISPATCH( + hashOne, false, typeKind_, decoded_, 0); + } +} + +} // namespace facebook::velox::exec diff --git a/velox/exec/OptimizedVectorHasher.h b/velox/exec/OptimizedVectorHasher.h new file mode 100644 index 00000000000..830b453abe8 --- /dev/null +++ b/velox/exec/OptimizedVectorHasher.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "velox/common/memory/RawVector.h" +#include "velox/exec/Operator.h" +#include "velox/vector/VectorTypeUtils.h" + +namespace facebook::velox::exec { + +class OptimizedVectorHasher { + public: + OptimizedVectorHasher(TypePtr type, column_index_t channel); + + static std::unique_ptr create( + TypePtr type, + column_index_t channel) { + return std::make_unique(std::move(type), channel); + } + + column_index_t channel() const { + return channel_; + } + + // Decodes 'vector' for the given 'rows' in preparation for calling hash(). + // Fails if the kind of 'vector' does not match the type passed to the + // constructor. + void decode(const BaseVector& vector, const SelectivityVector& rows); + + void hash(bool mix, raw_vector& result); + + void + hash(const SelectivityVector& rows, bool mix, raw_vector& result); + + void hashPrecomputed(bool mix, raw_vector& result) const; + + void precompute(const BaseVector& value); + + static constexpr uint64_t kNullHash = BaseVector::kNullHash; + + template + void hashValues(bool mix, uint64_t* result); + + private: + template + void hashTyped(bool mix, uint64_t* result); + + template + void hashFlatValues(bool mix, uint64_t* result); + + const column_index_t channel_; + const TypePtr type_; + const TypeKind typeKind_; + const bool typeProvidesCustomComparison_; + + DecodedVector decoded_; + raw_vector dictionaryHashes_; + uint64_t precomputedHash_{0}; +}; + +} // namespace facebook::velox::exec diff --git a/velox/exec/PartitionedOutput.cpp b/velox/exec/PartitionedOutput.cpp index ba4e23d738b..74320389489 100644 --- a/velox/exec/PartitionedOutput.cpp +++ b/velox/exec/PartitionedOutput.cpp @@ -207,10 +207,13 @@ PartitionedOutput::PartitionedOutput( numDestinations_(planNode->numPartitions()), replicateNullsAndAny_(planNode->isReplicateNullsAndAny()), partitionFunction_( - numDestinations_ == 1 ? 
nullptr - : planNode->partitionFunctionSpec().create( - numDestinations_, - /*localExchange=*/false)), + numDestinations_ == 1 + ? nullptr + : planNode->partitionFunctionSpec().create( + numDestinations_, + /*localExchange=*/false, + ctx->queryConfig() + .optimizedHashPartitionFunctionEnabled())), outputChannels_(calculateOutputChannels( planNode->inputType(), planNode->outputType(), diff --git a/velox/exec/RoundRobinPartitionFunction.h b/velox/exec/RoundRobinPartitionFunction.h index b84c6d2ffaf..a13ed529f55 100644 --- a/velox/exec/RoundRobinPartitionFunction.h +++ b/velox/exec/RoundRobinPartitionFunction.h @@ -43,7 +43,8 @@ class RoundRobinPartitionFunctionSpec : public core::PartitionFunctionSpec { public: std::unique_ptr create( int numPartitions, - bool /*localExchange*/) const override { + bool /*localExchange*/, + bool /*useOptimizedPartitionFunction*/ = false) const override { return std::make_unique( numPartitions); } diff --git a/velox/exec/RowNumber.cpp b/velox/exec/RowNumber.cpp index cd2cd4ce36a..04427975120 100644 --- a/velox/exec/RowNumber.cpp +++ b/velox/exec/RowNumber.cpp @@ -449,8 +449,14 @@ void RowNumber::setupInputSpiller( keyChannels.push_back(hasher->channel()); } - spillHashFunction_ = std::make_unique( - inputSpiller_->hashBits(), inputType_, keyChannels); + spillHashFunction_ = createHashPartitionFunction( + inputSpiller_->hashBits(), + inputType_, + keyChannels, + {}, + operatorCtx_->driverCtx() + ->queryConfig() + .optimizedHashPartitionFunctionEnabled()); } void RowNumber::spill() { diff --git a/velox/exec/RowNumber.h b/velox/exec/RowNumber.h index b34fc9d9c20..8e53713fc77 100644 --- a/velox/exec/RowNumber.h +++ b/velox/exec/RowNumber.h @@ -142,7 +142,7 @@ class RowNumber : public Operator { SpillPartitionSet spillInputPartitionSet_; // Used to calculate the spill partition numbers of the inputs. 
- std::unique_ptr spillHashFunction_; + std::unique_ptr spillHashFunction_; // The cpu may be voluntarily yield after running too long when processing // input from spilled file. diff --git a/velox/exec/ScaleWriterLocalPartition.cpp b/velox/exec/ScaleWriterLocalPartition.cpp index 7530ff403a0..1764adabf6a 100644 --- a/velox/exec/ScaleWriterLocalPartition.cpp +++ b/velox/exec/ScaleWriterLocalPartition.cpp @@ -57,7 +57,10 @@ ScaleWriterPartitioningLocalPartition::ScaleWriterPartitioningLocalPartition( ? nullptr : planNode->partitionFunctionSpec().create( numTablePartitions_, - /*localExchange=*/true); + /*localExchange=*/true, + operatorCtx_->driverCtx() + ->queryConfig() + .optimizedHashPartitionFunctionEnabled()); } void ScaleWriterPartitioningLocalPartition::initialize() { diff --git a/velox/exec/SubPartitionedSortWindowBuild.cpp b/velox/exec/SubPartitionedSortWindowBuild.cpp index 2f2a247a8d4..db437748fbb 100644 --- a/velox/exec/SubPartitionedSortWindowBuild.cpp +++ b/velox/exec/SubPartitionedSortWindowBuild.cpp @@ -22,6 +22,7 @@ namespace facebook::velox::exec { SubPartitionedSortWindowBuild::SubPartitionedSortWindowBuild( const std::shared_ptr& node, int32_t numSubPartitions, + const core::QueryConfig& queryConfig, velox::memory::MemoryPool* pool, common::PrefixSortConfig&& prefixSortConfig, const common::SpillConfig* spillConfig, @@ -40,8 +41,13 @@ SubPartitionedSortWindowBuild::SubPartitionedSortWindowBuild( for (int i = 0; i < numPartitionKeys_; i++) { keyChannels[i] = inputChannels_[i]; } - subPartitioningFunction_ = std::make_unique( - false, numSubPartitions_, node->inputType(), keyChannels); + subPartitioningFunction_ = createHashPartitionFunction( + /*localExchange=*/false, + numSubPartitions_, + node->inputType(), + keyChannels, + {}, + queryConfig.optimizedHashPartitionFunctionEnabled()); subWindowBuilds_.resize(numSubPartitions_); for (int i = 0; i < numSubPartitions_; i++) { subWindowBuilds_[i] = std::make_unique( @@ -59,7 +65,12 @@ void 
SubPartitionedSortWindowBuild::addInput(RowVectorPtr input) { VELOX_CHECK_LT(currentSubPartition_, 0); subPartitionIdsBuffer_.resize(input->size()); - subPartitioningFunction_->partition(*input, subPartitionIdsBuffer_); + std::optional singlePartition = + subPartitioningFunction_->partition(*input, subPartitionIdsBuffer_); + if (singlePartition.has_value()) { + simd::simdFill( + subPartitionIdsBuffer_.data(), singlePartition.value(), input->size()); + } for (auto i = 0; i < inputChannels_.size(); ++i) { decodedInputVectors_[i].decode(*input->childAt(inputChannels_[i])); diff --git a/velox/exec/SubPartitionedSortWindowBuild.h b/velox/exec/SubPartitionedSortWindowBuild.h index 8735f438d30..f0da95bdf95 100644 --- a/velox/exec/SubPartitionedSortWindowBuild.h +++ b/velox/exec/SubPartitionedSortWindowBuild.h @@ -33,6 +33,7 @@ class SubPartitionedSortWindowBuild : public WindowBuild { SubPartitionedSortWindowBuild( const std::shared_ptr& node, int32_t numSubPartitions, + const core::QueryConfig& queryConfig, velox::memory::MemoryPool* pool, common::PrefixSortConfig&& prefixSortConfig, const common::SpillConfig* spillConfig, @@ -80,7 +81,7 @@ class SubPartitionedSortWindowBuild : public WindowBuild { exec::SpillStats* const spillStats_; // Divide input rows to the corresponding sub partitions. - std::unique_ptr subPartitioningFunction_; + std::unique_ptr subPartitioningFunction_; // WindowBuilds for each sub partition. 
std::vector> subWindowBuilds_; diff --git a/velox/exec/Window.cpp b/velox/exec/Window.cpp index f9107522f0a..b763371a801 100644 --- a/velox/exec/Window.cpp +++ b/velox/exec/Window.cpp @@ -75,6 +75,7 @@ Window::Window( windowBuild_ = std::make_unique( windowNode, numSubPartitions, + driverCtx->queryConfig(), pool(), makePrefixSortConfig(driverCtx->queryConfig()), spillConfig, diff --git a/velox/exec/benchmarks/CMakeLists.txt b/velox/exec/benchmarks/CMakeLists.txt index 7a721bf91a6..3ccff61baae 100644 --- a/velox/exec/benchmarks/CMakeLists.txt +++ b/velox/exec/benchmarks/CMakeLists.txt @@ -20,6 +20,27 @@ target_link_libraries( Folly::follybenchmark ) +add_executable(velox_exec_optimized_vector_hasher_benchmark OptimizedVectorHasherBenchmark.cpp) + +target_link_libraries( + velox_exec_optimized_vector_hasher_benchmark + velox_exec + velox_vector_test_lib + Folly::follybenchmark +) + +add_executable( + velox_exec_optimized_hash_partition_function_benchmark + OptimizedHashPartitionFunctionBenchmark.cpp +) + +target_link_libraries( + velox_exec_optimized_hash_partition_function_benchmark + velox_exec + velox_vector_test_lib + Folly::follybenchmark +) + add_executable(velox_filter_project_benchmark FilterProjectBenchmark.cpp) target_link_libraries( @@ -40,6 +61,16 @@ target_link_libraries( Folly::follybenchmark ) +add_executable(velox_local_exchange_benchmark LocalExchangeBenchmark.cpp) + +target_link_libraries( + velox_local_exchange_benchmark + velox_exec + velox_exec_test_lib + velox_vector_test_lib + Folly::follybenchmark +) + add_executable(velox_merge_benchmark MergeBenchmark.cpp) target_link_libraries( diff --git a/velox/exec/benchmarks/ExchangeBenchmark.cpp b/velox/exec/benchmarks/ExchangeBenchmark.cpp index 45689ccbf64..d204f4ed666 100644 --- a/velox/exec/benchmarks/ExchangeBenchmark.cpp +++ b/velox/exec/benchmarks/ExchangeBenchmark.cpp @@ -17,7 +17,6 @@ #include #include "velox/core/QueryConfig.h" -#include "velox/dwio/common/tests/utils/BatchMaker.h" #include 
"velox/exec/Exchange.h" #include "velox/exec/PlanNodeStats.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" @@ -32,15 +31,13 @@ DEFINE_int32(width, 16, "Number of parties in shuffle"); DEFINE_int32(task_width, 4, "Number of threads in each task in shuffle"); -DEFINE_int32(num_local_tasks, 8, "Number of concurrent local shuffles"); -DEFINE_int32(num_local_repeat, 8, "Number of repeats of local exchange query"); -DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch."); -DEFINE_int64( - local_exchange_buffer_mb, - 32, - "task-wide buffer in local exchange"); DEFINE_int64(exchange_buffer_mb, 32, "task-wide buffer in remote exchange"); -DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary"); +DEFINE_int32( + dict_pct, + 0, + "Percentage of vectors per column wrapped in dictionary encoding. " + "Applied independently to each column across all generated row vectors " + "and recursively to nested children."); // Add the following definitions to allow Clion runs DEFINE_bool(gtest_color, false, ""); DEFINE_string(gtest_filter, "*", ""); @@ -59,70 +56,401 @@ using namespace facebook::velox::test; namespace { -struct LocalPartitionWaitStats { - int64_t totalProducerWaitMs = 0; - int64_t totalConsumerWaitMs = 0; - std::vector consumerWaitMs; - std::vector producerWaitMs; - std::vector wallMs; +bool shouldWrapVector( + int32_t vectorIndex, + int32_t numVectors, + int32_t dictPct) { + VELOX_CHECK_GE(dictPct, 0); + VELOX_CHECK_LE(dictPct, 100); + return dictPct > 0 && (vectorIndex * 100) / numVectors < dictPct; +} + +void wrapDictionaryRecursive(VectorPtr& vector) { + if (!vector) { + return; + } + + switch (vector->encoding()) { + case VectorEncoding::Simple::ROW: { + auto row = vector->as(); + for (auto i = 0; i < row->childrenSize(); ++i) { + wrapDictionaryRecursive(row->childAt(i)); + } + break; + } + case VectorEncoding::Simple::ARRAY: { + auto array = vector->as(); + auto elements = array->elements(); + wrapDictionaryRecursive(elements); + 
array->setElements(std::move(elements)); + break; + } + case VectorEncoding::Simple::MAP: { + auto map = vector->as(); + auto keys = map->mapKeys(); + auto values = map->mapValues(); + wrapDictionaryRecursive(keys); + wrapDictionaryRecursive(values); + map->setKeysAndValues(std::move(keys), std::move(values)); + break; + } + default: + break; + } + + auto indices = facebook::velox::test::makeIndices( + vector->size(), [](auto row) { return row; }, vector->pool()); + vector = + BaseVector::wrapInDictionary(nullptr, indices, vector->size(), vector); +} + +struct ExchangeRunStats { + int64_t wallUs = 0; + PlanNodeStats partitionedOutputStats; + PlanNodeStats exchangeStats; +}; + +enum class ExchangeMode { + kNormal, + kOptimized, +}; + +/// Column element type dimension for simple-schema exchange benchmarks. +enum class SimpleColType { + kBoolean, + kTinyint, + kInteger, + kBigint, + kHugeint, + kLongDecimal, + kDouble, +}; + +TypePtr simpleColTypeToType(SimpleColType colType) { + switch (colType) { + case SimpleColType::kBoolean: + return BOOLEAN(); + case SimpleColType::kTinyint: + return TINYINT(); + case SimpleColType::kInteger: + return INTEGER(); + case SimpleColType::kBigint: + return BIGINT(); + case SimpleColType::kHugeint: + return HUGEINT(); + case SimpleColType::kLongDecimal: + return DECIMAL(20, 3); + case SimpleColType::kDouble: + return DOUBLE(); + } + VELOX_UNREACHABLE(); +} + +std::string simpleColTypeName(SimpleColType colType) { + switch (colType) { + case SimpleColType::kBoolean: + return "Boolean"; + case SimpleColType::kTinyint: + return "Tinyint"; + case SimpleColType::kInteger: + return "Integer"; + case SimpleColType::kBigint: + return "Bigint"; + case SimpleColType::kHugeint: + return "Hugeint"; + case SimpleColType::kLongDecimal: + return "LongDecimal"; + case SimpleColType::kDouble: + return "Double"; + } + VELOX_UNREACHABLE(); +} + +enum class ExchangeInputKind { + kDeep10K, + kDeep50, + kStruct1K, +}; + +struct ExchangeInputSpec { + 
std::string name; + RowTypePtr type; + int32_t numVectors; + int32_t rowsPerVector; +}; + +struct ExchangeBenchmarkResult { + std::string datasetName; + ExchangeMode mode; + ExchangeRunStats stats; }; -void sortByMax(std::vector& metrics) { - std::sort( - metrics.begin(), - metrics.end(), - [](const RuntimeMetric& left, const RuntimeMetric& right) { - return left.max > right.max; - }); +std::vector benchmarkResults; + +std::string modeName(ExchangeMode mode) { + switch (mode) { + case ExchangeMode::kNormal: + return "normal"; + case ExchangeMode::kOptimized: + return "optimized"; + } + + VELOX_UNREACHABLE(); +} + +/// Creates a simple row type with `numCols` columns all of type `colType`. +RowTypePtr makeSimpleType(const TypePtr& colType, int32_t numCols) { + std::vector names; + std::vector types; + names.reserve(numCols); + types.reserve(numCols); + for (int32_t i = 0; i < numCols; ++i) { + names.push_back(fmt::format("c{}", i)); + types.push_back(colType); + } + return ROW(std::move(names), std::move(types)); +} + +RowTypePtr makeStructType() { + return ROW( + {{"c0", BIGINT()}, + {"r1", + ROW( + {{"k2", BIGINT()}, + {"r2", + ROW( + {{"i1", BIGINT()}, + {"i2", BIGINT()}, + {"r3", + ROW( + {{"s3", VARCHAR()}, + {"i5", INTEGER()}, + {"d5", DOUBLE()}, + {"b5", BOOLEAN()}, + {"a5", ARRAY(TINYINT())}})}})}})}}); +} + +RowTypePtr makeDeepType() { + return ROW( + {{"c0", BIGINT()}, + {"long_array_val", ARRAY(ARRAY(BIGINT()))}, + {"array_val", ARRAY(VARCHAR())}, + {"struct_val", ROW({{"s_int", INTEGER()}, {"s_array", ARRAY(REAL())}})}, + {"map_val", + MAP(VARCHAR(), + MAP(BIGINT(), + ROW({{"s2_int", INTEGER()}, {"s2_string", VARCHAR()}})))}}); } -void sortByAndPrintMax( - const char* title, - int64_t total, - std::vector& metrics) { - sortByMax(metrics); - VELOX_CHECK(!metrics.empty()); - std::cout << title << "\n Total " << succinctNanos(total) - << "\n Max: " << metrics.front().toString() - << "\n Median: " << metrics[metrics.size() / 2].toString() - << "\n Min: " << 
metrics.back().toString() << std::endl; +ExchangeInputSpec makeInputSpec(ExchangeInputKind kind) { + switch (kind) { + case ExchangeInputKind::kDeep10K: + return {"Deep10K", makeDeepType(), 10, 10000}; + case ExchangeInputKind::kDeep50: + return {"Deep50", makeDeepType(), 2000, 50}; + case ExchangeInputKind::kStruct1K: + return {"Struct1K", makeStructType(), 100, 1000}; + } + + VELOX_UNREACHABLE(); +} + +ExchangeInputSpec makeInputSpec(SimpleColType colType, int32_t numCols) { + return { + fmt::format("Simple10K_{}_col{}", simpleColTypeName(colType), numCols), + makeSimpleType(simpleColTypeToType(colType), numCols), + 10, + 10'000}; +} + +std::string formatStat(const ExchangeRunStats* stats, auto formatter) { + if (stats == nullptr) { + return "N/A"; + } + return formatter(*stats); +} + +void printAllExchangeStats() { + struct PairedStats { + const ExchangeRunStats* normal = nullptr; + const ExchangeRunStats* optimized = nullptr; + }; + + std::vector datasetOrder; + std::unordered_map groupedStats; + for (const auto& result : benchmarkResults) { + auto [it, inserted] = + groupedStats.try_emplace(result.datasetName, PairedStats{}); + if (inserted) { + datasetOrder.push_back(result.datasetName); + } + if (result.mode == ExchangeMode::kNormal) { + it->second.normal = &result.stats; + } else { + it->second.optimized = &result.stats; + } + } + + for (const auto& datasetName : datasetOrder) { + const auto statsIt = groupedStats.find(datasetName); + VELOX_CHECK(statsIt != groupedStats.end()); + const auto& paired = statsIt->second; + std::cout << "--------------------" << datasetName << "--------------------" + << std::endl; + std::cout << "Wall Time (ms) | normal: " + << formatStat( + paired.normal, + [](const ExchangeRunStats& stats) { + return succinctMicros(stats.wallUs); + }) + << " | optimized: " + << formatStat( + paired.optimized, + [](const ExchangeRunStats& stats) { + return succinctMicros(stats.wallUs); + }) + << std::endl; + std::cout << "Normal" << std::endl 
+ << " - PartitionedOutput: " + << formatStat( + paired.normal, + [](const ExchangeRunStats& stats) { + return stats.partitionedOutputStats.toString(); + }) + << std::endl + << " - Exchange: " + << formatStat( + paired.normal, + [](const ExchangeRunStats& stats) { + return stats.exchangeStats.toString(); + }) + << std::endl; + std::cout << "Optimized" << std::endl + << " - PartitionedOutput: " + << formatStat( + paired.optimized, + [](const ExchangeRunStats& stats) { + return stats.partitionedOutputStats.toString(); + }) + << std::endl + << " - Exchange: " + << formatStat( + paired.optimized, + [](const ExchangeRunStats& stats) { + return stats.exchangeStats.toString(); + }) + << std::endl; + } +} + +template +ExchangeRunStats runBenchmarkIterations(unsigned int iters, Fn&& runOnce) { + ExchangeRunStats stats; + while (iters--) { + stats = runOnce(); + } + return stats; } class ExchangeBenchmark : public VectorTestBase { public: + /// Creates a single flat column of `type` with `numRows` rows. + /// Approximately `nullPct` percent of rows are set to null, distributed + /// uniformly (row % 100 < nullPct). Non-null values are sequential integers + /// cast to the native type. 
+ VectorPtr makeColumn(const TypePtr& type, int32_t numRows, int32_t nullPct) { + std::function isNull; + if (nullPct == 100) { + isNull = [](auto) { return true; }; + } else if (nullPct > 0) { + isNull = [nullPct](vector_size_t row) { return (row % 100) < nullPct; }; + } + + switch (type->kind()) { + case TypeKind::BOOLEAN: + return makeFlatVector( + numRows, [](auto row) { return row % 2 == 0; }, isNull); + case TypeKind::TINYINT: + return makeFlatVector( + numRows, [](auto row) { return static_cast(row); }, isNull); + case TypeKind::SMALLINT: + return makeFlatVector( + numRows, + [](auto row) { return static_cast(row); }, + isNull); + case TypeKind::INTEGER: + return makeFlatVector( + numRows, [](auto row) { return row; }, isNull); + case TypeKind::BIGINT: + // Handles plain BIGINT and short-decimal columns (DECIMAL(p,s), p≤18). + return makeFlatVector( + numRows, + [](auto row) { return static_cast(row); }, + isNull, + type); + case TypeKind::REAL: + return makeFlatVector( + numRows, [](auto row) { return static_cast(row); }, isNull); + case TypeKind::DOUBLE: + return makeFlatVector( + numRows, [](auto row) { return static_cast(row); }, isNull); + case TypeKind::HUGEINT: + // Handles long-decimal columns (DECIMAL(p,s), p>18). + return makeFlatVector( + numRows, + [](auto row) { return static_cast(row); }, + isNull, + type); + default: + VELOX_NYI( + "makeColumn does not support complex type {} yet", + type->toString()); + } + } + + /// Generates input batches for the exchange benchmark. + /// + /// `dictPct` is the percentage of vectors for each column that should be + /// wrapped in dictionary encoding across the full set of generated batches. + /// For example, with `numVectors = 10` and `dictPct = 30`, each top-level + /// column will have 3 dictionary-encoded vectors and 7 simple vectors. + /// Nested children of complex columns use the same rule recursively. 
+ /// + /// `nullPct` controls what fraction of values in each column are null: + /// 0 = no nulls, 50 = half the rows null, 100 = all rows null. std::vector makeRows( - RowTypePtr type, + const RowTypePtr& type, int32_t numVectors, int32_t rowsPerVector, - int32_t dictPct = 0) { + int32_t dictPct = 0, + int32_t nullPct = 0) { std::vector vectors; - BufferPtr indices; + vectors.reserve(numVectors); for (int32_t i = 0; i < numVectors; ++i) { - auto vector = std::dynamic_pointer_cast( - BatchMaker::createBatch(type, rowsPerVector, *pool_)); - - auto width = vector->childrenSize(); - for (auto child = 0; child < width; ++child) { - if (100 * child / width > dictPct) { - if (!indices) { - indices = makeIndices(vector->size(), [&](auto i) { return i; }); - } - vector->childAt(child) = BaseVector::wrapInDictionary( - nullptr, indices, vector->size(), vector->childAt(child)); + std::vector children; + children.reserve(type->size()); + for (int32_t col = 0; col < type->size(); ++col) { + children.push_back( + makeColumn(type->childAt(col), rowsPerVector, nullPct)); + } + auto vector = makeRowVector(type->names(), children); + if (shouldWrapVector(i, numVectors, dictPct)) { + for (auto child = 0; child < vector->childrenSize(); ++child) { + wrapDictionaryRecursive(vector->childAt(child)); } } - vectors.push_back(vector); + vectors.push_back(std::move(vector)); } return vectors; } - void run( - std::vector& vectors, + ExchangeRunStats run( + const std::vector& vectors, int32_t width, int32_t taskWidth, - int64_t& wallUs, - PlanNodeStats& partitionedOutputStats, - PlanNodeStats& exchangeStats) { + ExchangeMode mode) { + VELOX_CHECK(!vectors.empty()); + core::PlanNodePtr plan; core::PlanNodeId exchangeId; core::PlanNodeId leafPartitionedOutputId; @@ -136,9 +464,7 @@ class ExchangeBenchmark : public VectorTestBase { const auto startUs = getCurrentTimeMicro(); BENCHMARK_SUSPEND { - assert(!vectors.empty()); - configSettings_[core::QueryConfig::kMaxPartitionedOutputBufferSize] = 
- fmt::format("{}", FLAGS_exchange_buffer_mb << 20); + configureQuerySettings(mode); const auto iteration = ++iteration_; // leafPlan: PartitionedOutput/kPartitioned(1) <-- Values(0) @@ -159,7 +485,6 @@ class ExchangeBenchmark : public VectorTestBase { // finalAggPlan: PartitionedOutput/kPartitioned(2) <-- Agg/kSingle(1) <-- // Exchange(0) - std::vector finalAggTaskIds; core::PlanNodePtr finalAggPlan = exec::test::PlanBuilder() .exchange(leafPlan->outputType(), "Presto") @@ -194,139 +519,44 @@ class ExchangeBenchmark : public VectorTestBase { .splits(finalAggSplits) .assertResults(expected); + ExchangeRunStats stats; BENCHMARK_SUSPEND { - wallUs = getCurrentTimeMicro() - startUs; - std::vector taskWallMs; + stats.wallUs = getCurrentTimeMicro() - startUs; for (const auto& task : leafTasks) { const auto& taskStats = task->taskStats(); - taskWallMs.push_back( - taskStats.executionEndTimeMs - taskStats.executionStartTimeMs); const auto& planStats = toPlanStats(taskStats); auto& taskPartitionedOutputStats = planStats.at(leafPartitionedOutputId); - partitionedOutputStats += taskPartitionedOutputStats; + stats.partitionedOutputStats += taskPartitionedOutputStats; } for (const auto& task : finalAggTasks) { const auto& taskStats = task->taskStats(); - taskWallMs.push_back( - taskStats.executionEndTimeMs - taskStats.executionStartTimeMs); const auto& planStats = toPlanStats(taskStats); auto& taskPartitionedOutputStats = planStats.at(finalAggPartitionedOutputId); - partitionedOutputStats += taskPartitionedOutputStats; + stats.partitionedOutputStats += taskPartitionedOutputStats; auto& taskExchangeStats = planStats.at(exchangeId); - exchangeStats += taskExchangeStats; - } - }; - } - - void runLocal( - std::vector& vectors, - int32_t taskWidth, - int32_t numTasks, - int64_t& localPartitionWallUs, - PlanNodeStats& partitionedOutputStats, - LocalPartitionWaitStats& localPartitionWaitStats) { - assert(!vectors.empty()); - - core::PlanNodePtr plan; - core::PlanNodeId 
localPartitionId1; - core::PlanNodeId localPartitionId2; - std::vector> tasks; - std::vector threads; - - RowVectorPtr expected; - - BENCHMARK_SUSPEND { - std::vector aggregates = {"count(1)"}; - auto& rowType = vectors[0]->type()->as(); - for (auto i = 1; i < rowType.size(); ++i) { - aggregates.push_back(fmt::format("checksum({})", rowType.nameOf(i))); + stats.exchangeStats += taskExchangeStats; } - - // plan: Agg/kSingle(4) <-- LocalPartition/Gather(3) <-- Agg/kGather(2) - // <-- LocalPartition/kRepartition(1) <-- Values(0) - plan = exec::test::PlanBuilder() - .values(vectors, true) - .localPartition({"c0"}) - .capturePlanNodeId(localPartitionId1) - .singleAggregation({}, aggregates) - .localPartition(std::vector{}) - .capturePlanNodeId(localPartitionId2) - .singleAggregation({}, {"sum(a0)"}) - .planNode(); - - threads.reserve(numTasks); - expected = makeRowVector({makeFlatVector(1, [&](auto /*row*/) { - return vectors.size() * vectors[0]->size() * taskWidth; - })}); }; - auto startMicros = getCurrentTimeMicro(); - std::mutex mutex; - for (int32_t i = 0; i < numTasks; ++i) { - threads.push_back(std::thread([&]() { - for (auto repeat = 0; repeat < FLAGS_num_local_repeat; ++repeat) { - auto task = - exec::test::AssertQueryBuilder(plan) - .config( - core::QueryConfig::kMaxLocalExchangeBufferSize, - fmt::format("{}", FLAGS_local_exchange_buffer_mb << 20)) - .maxDrivers(taskWidth) - .assertResults(expected); - { - std::lock_guard l(mutex); - tasks.push_back(task); - } - } - })); - } - for (auto& thread : threads) { - thread.join(); - } - - BENCHMARK_SUSPEND { - localPartitionWallUs = getCurrentTimeMicro() - startMicros; - - std::vector localPartitionNodeIds{ - localPartitionId1, localPartitionId2}; - - localPartitionWaitStats.totalProducerWaitMs = 0; - localPartitionWaitStats.totalConsumerWaitMs = 0; - for (const auto& task : tasks) { - auto taskStats = task->taskStats(); - localPartitionWaitStats.wallMs.push_back( - taskStats.executionEndTimeMs - 
taskStats.executionStartTimeMs); - auto planStats = toPlanStats(taskStats); - - for (const auto& nodeId : localPartitionNodeIds) { - auto& taskLocalPartition1Stats = planStats.at(nodeId); - partitionedOutputStats += taskLocalPartition1Stats; - - auto& taskLocalPartition1RuntimeStats = - taskLocalPartition1Stats.customStats; - localPartitionWaitStats.producerWaitMs.push_back( - taskLocalPartition1RuntimeStats - ["blockedWaitForProducerWallNanos"]); - localPartitionWaitStats.consumerWaitMs.push_back( - taskLocalPartition1RuntimeStats - ["blockedWaitForConsumerWallNanos"]); - localPartitionWaitStats.totalProducerWaitMs += - localPartitionWaitStats.producerWaitMs.back().sum; - localPartitionWaitStats.totalConsumerWaitMs += - localPartitionWaitStats.consumerWaitMs.back().sum; - } - } - }; + return stats; } private: static constexpr int64_t kMaxMemory = 6UL << 30; // 6GB + void configureQuerySettings(ExchangeMode mode) { + configSettings_[core::QueryConfig::kMaxPartitionedOutputBufferSize] = + fmt::format("{}", FLAGS_exchange_buffer_mb << 20); + configSettings_[core::QueryConfig::kOptimizedPartitionedOutputEnabled] = + mode == ExchangeMode::kOptimized ? "true" : "false"; + } + static std::string makeTaskId(int32_t iteration, const std::string& prefix, int num) { return fmt::format("local://{}-{}-{}", iteration, prefix, num); @@ -373,223 +603,131 @@ int32_t ExchangeBenchmark::iteration_; std::unique_ptr bm; -void runBenchmarks() { - std::vector flatNames = {"c0"}; - std::vector flatTypes = {BIGINT()}; - std::vector typeSelection = { - BOOLEAN(), - TINYINT(), - DECIMAL(20, 3), - INTEGER(), - BIGINT(), - REAL(), - DECIMAL(10, 2), - DOUBLE(), - VARCHAR()}; - - int64_t flatSize = 0; - // Add enough columns of different types to make a 10K row batch be - // flat_batch_mb in flat size. 
- while (flatSize * 10000 < static_cast(FLAGS_flat_batch_mb) << 20) { - flatNames.push_back(fmt::format("c{}", flatNames.size())); - assert(!flatNames.empty()); - flatTypes.push_back(typeSelection[flatTypes.size() % typeSelection.size()]); - if (flatTypes.back()->isFixedWidth()) { - flatSize += flatTypes.back()->cppSizeInBytes(); - } else { - flatSize += 20; - } - } - auto flatType = ROW(std::move(flatNames), std::move(flatTypes)); - - auto structType = ROW( - {{"c0", BIGINT()}, - {"r1", - ROW( - {{"k2", BIGINT()}, - {"r2", - ROW( - {{"i1", BIGINT()}, - {"i2", BIGINT()}, - {"r3}, ROW({{s3", VARCHAR()}, - {"i5", INTEGER()}, - {"d5", DOUBLE()}, - {"b5", BOOLEAN()}, - {"a5", ARRAY(TINYINT())}})}})}}); - - auto deepType = ROW( - {{"c0", BIGINT()}, - {"long_array_val", ARRAY(ARRAY(BIGINT()))}, - {"array_val", ARRAY(VARCHAR())}, - {"struct_val", ROW({{"s_int", INTEGER()}, {"s_array", ARRAY(REAL())}})}, - {"map_val", - MAP(VARCHAR(), - MAP(BIGINT(), - ROW({{"s2_int", INTEGER()}, {"s2_string", VARCHAR()}})))}}); - - std::vector flat10k( - bm->makeRows(flatType, 10, 10000, FLAGS_dict_pct)); - std::vector deep10k( - bm->makeRows(deepType, 10, 10000, FLAGS_dict_pct)); - std::vector flat50( - bm->makeRows(flatType, 2000, 50, FLAGS_dict_pct)); - std::vector deep50( - bm->makeRows(deepType, 2000, 50, FLAGS_dict_pct)); - std::vector struct1k( - bm->makeRows(structType, 100, 1000, FLAGS_dict_pct)); - - int64_t flat10KWallUs; - PlanNodeStats partitionedOutputStatsFlat10K; - PlanNodeStats exchangeStatsFlat10K; - folly::addBenchmark(__FILE__, "exchangeFlat10k", [&]() { - bm->run( - flat10k, - FLAGS_width, - FLAGS_task_width, - flat10KWallUs, - partitionedOutputStatsFlat10K, - exchangeStatsFlat10K); - return 1; - }); - - int64_t flat50KWallUs; - PlanNodeStats partitionedOutputStatsFlat50; - PlanNodeStats exchangeStatsFlat50; - folly::addBenchmark(__FILE__, "exchangeFlat50", [&]() { - bm->run( - flat50, - FLAGS_width, - FLAGS_task_width, - flat50KWallUs, - partitionedOutputStatsFlat50, 
- exchangeStatsFlat50); - return 1; - }); - - int64_t deep10KWallUs; - PlanNodeStats partitionedOutputStatsDeep10K; - PlanNodeStats exchangeStatsDeep10K; - folly::addBenchmark(__FILE__, "exchangeDeep10k", [&]() { - bm->run( - deep10k, - FLAGS_width, - FLAGS_task_width, - deep10KWallUs, - partitionedOutputStatsDeep10K, - exchangeStatsDeep10K); - return 1; - }); - - int64_t deep50KWallUs; - PlanNodeStats partitionedOutputStatsDeep50; - PlanNodeStats exchangeStatsDeep50; - folly::addBenchmark(__FILE__, "exchangeDeep50", [&]() { - bm->run( - deep50, - FLAGS_width, - FLAGS_task_width, - deep50KWallUs, - partitionedOutputStatsDeep50, - exchangeStatsDeep50); - return 1; - }); - - int64_t stuct1KWallUs; - PlanNodeStats partitionedOutputStatsStruct1K; - PlanNodeStats exchangeStatsStruct1K; - folly::addBenchmark(__FILE__, "exchangeStruct1K", [&]() { - bm->run( - struct1k, - FLAGS_width, - FLAGS_task_width, - stuct1KWallUs, - partitionedOutputStatsStruct1K, - exchangeStatsStruct1K); - return 1; - }); - - int64_t localPartitionWallUs; - PlanNodeStats localPartitionStatsFlat10K; - LocalPartitionWaitStats localPartitionWaitStats; - folly::addBenchmark(__FILE__, "localFlat10k", [&]() { - bm->runLocal( - flat10k, - FLAGS_width, - FLAGS_num_local_tasks, - localPartitionWallUs, - localPartitionStatsFlat10K, - localPartitionWaitStats); - return 1; +void benchmarkExchange( + unsigned int iters, + const ExchangeInputSpec& input, + ExchangeMode mode, + int32_t dictPct, + int32_t nullPct) { + auto vectors = bm->makeRows( + input.type, input.numVectors, input.rowsPerVector, dictPct, nullPct); + auto stats = runBenchmarkIterations(iters, [&]() { + return bm->run(vectors, FLAGS_width, FLAGS_task_width, mode); }); + benchmarkResults.push_back( + {fmt::format("{}_dict{}_null{}", input.name, dictPct, nullPct), + mode, + std::move(stats)}); +} - folly::runBenchmarks(); +#define EXCHANGE_BENCHMARK_NAMED_PARAM(name, param_name, ...) 
\ + BENCHMARK_IMPL( \ + FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ + FOLLY_PP_STRINGIZE(param_name), \ + iters, \ + unsigned, \ + iters) { \ + name(iters, ##__VA_ARGS__); \ + } - std::cout - << "----------------------------------Flat10K----------------------------------" - << std::endl; - std::cout << "Wall Time (ms): " << succinctMicros(flat10KWallUs) << std::endl; - std::cout << "PartitionOutput: " << partitionedOutputStatsFlat10K.toString() - << std::endl; - std::cout << "Exchange: " << exchangeStatsFlat10K.toString() << std::endl; - - std::cout - << "----------------------------------Flat50K----------------------------------" - << std::endl; - std::cout << "Wall Time (ms): " << succinctMicros(flat50KWallUs) << std::endl; - std::cout << "PartitionOutput: " << partitionedOutputStatsFlat50.toString() - << std::endl; - std::cout << "Exchange: " << exchangeStatsFlat10K.toString() << std::endl; - - std::cout - << "----------------------------------Deep10K----------------------------------" - << std::endl; - std::cout << "Wall Time (ms): " << succinctMicros(deep10KWallUs) << std::endl; - std::cout << "PartitionOutput: " << partitionedOutputStatsDeep10K.toString() - << std::endl; - std::cout << "Exchange: " << exchangeStatsDeep10K.toString() << std::endl; - - std::cout - << "----------------------------------Deep50K----------------------------------" - << std::endl; - std::cout << "Wall Time (ms): " << succinctMicros(deep50KWallUs) << std::endl; - std::cout << "PartitionOutput: " << partitionedOutputStatsDeep50.toString() - << std::endl; - std::cout << "Exchange: " << exchangeStatsDeep50.toString() << std::endl; - - std::cout - << "----------------------------------Struct1K---------------------------------" - << std::endl; - std::cout << "Wall Time (ms): " << succinctMicros(stuct1KWallUs) << std::endl; - std::cout << "PartitionOutput: " << partitionedOutputStatsStruct1K.toString() - << std::endl; - std::cout << "Exchange: " << 
exchangeStatsStruct1K.toString() << std::endl; - - std::cout - << "--------------------------------LocalFlat10K-------------------------------" - << std::endl; - std::cout << "Wall Time (ms): " << "\n Total: " - << succinctMicros(localPartitionWallUs) - << "\n Max: " << localPartitionWaitStats.wallMs.back() - << "\n Median: " - << localPartitionWaitStats - .wallMs[localPartitionWaitStats.wallMs.size() / 2] - << "\n Min: " << localPartitionWaitStats.wallMs.front() - << std::endl; - std::cout << "LocalPartition: " << localPartitionStatsFlat10K.toString() - << std::endl; - sortByAndPrintMax( - "Producer Wait Time (ms)", - localPartitionWaitStats.totalProducerWaitMs, - localPartitionWaitStats.producerWaitMs); - sortByAndPrintMax( - "Consumer Wait Time (ms)", - localPartitionWaitStats.totalConsumerWaitMs, - localPartitionWaitStats.consumerWaitMs); - std::sort( - localPartitionWaitStats.wallMs.begin(), - localPartitionWaitStats.wallMs.end()); - assert(!localPartitionWaitStats.wallMs.empty()); -} +// ── Benchmarks: input spec × nullPct × mode ─────────────────────────────── + +#define EXCHANGE_BENCHMARK_INPUT( \ + _case_name, _input_expr, _mode_name, _dict_pct, _null_pct, _mode) \ + EXCHANGE_BENCHMARK_NAMED_PARAM( \ + benchmarkExchange, \ + _case_name##_dict##_dict_pct##_null##_null_pct##_##_mode_name, \ + _input_expr, \ + ExchangeMode::_mode, \ + _dict_pct, \ + _null_pct) + +#define EXCHANGE_BENCHMARK_MODES( \ + _case_name, _input_expr, _dict_pct, _null_pct) \ + EXCHANGE_BENCHMARK_INPUT( \ + _case_name, _input_expr, normal, _dict_pct, _null_pct, kNormal); \ + EXCHANGE_BENCHMARK_INPUT( \ + _case_name, _input_expr, optimized, _dict_pct, _null_pct, kOptimized) + +#define EXCHANGE_BENCHMARK_CASE(_case_name, _input_expr) \ + EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 0); \ + EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 50); \ + EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 100) + +EXCHANGE_BENCHMARK_CASE( + Simple10K_Boolean_col1, + 
makeInputSpec(SimpleColType::kBoolean, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Boolean_col4, + makeInputSpec(SimpleColType::kBoolean, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Boolean_col16, + makeInputSpec(SimpleColType::kBoolean, 16)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Tinyint_col1, + makeInputSpec(SimpleColType::kTinyint, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Tinyint_col4, + makeInputSpec(SimpleColType::kTinyint, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Tinyint_col16, + makeInputSpec(SimpleColType::kTinyint, 16)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Integer_col1, + makeInputSpec(SimpleColType::kInteger, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Integer_col4, + makeInputSpec(SimpleColType::kInteger, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Integer_col16, + makeInputSpec(SimpleColType::kInteger, 16)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Bigint_col1, + makeInputSpec(SimpleColType::kBigint, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Bigint_col4, + makeInputSpec(SimpleColType::kBigint, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Bigint_col16, + makeInputSpec(SimpleColType::kBigint, 16)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Hugeint_col1, + makeInputSpec(SimpleColType::kHugeint, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Hugeint_col4, + makeInputSpec(SimpleColType::kHugeint, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Hugeint_col16, + makeInputSpec(SimpleColType::kHugeint, 16)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_LongDecimal_col1, + makeInputSpec(SimpleColType::kLongDecimal, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_LongDecimal_col4, + makeInputSpec(SimpleColType::kLongDecimal, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_LongDecimal_col16, + makeInputSpec(SimpleColType::kLongDecimal, 16)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Double_col1, + makeInputSpec(SimpleColType::kDouble, 1)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Double_col4, + makeInputSpec(SimpleColType::kDouble, 4)); +EXCHANGE_BENCHMARK_CASE( + Simple10K_Double_col16, + 
makeInputSpec(SimpleColType::kDouble, 16)); + +// The complex type benchmarks are temporarily disabled. +// EXCHANGE_BENCHMARK_CASE(Deep10K, makeInputSpec(ExchangeInputKind::kDeep10K)); +// EXCHANGE_BENCHMARK_CASE(Deep50, makeInputSpec(ExchangeInputKind::kDeep50)); +// EXCHANGE_BENCHMARK_CASE(Struct1K, +// makeInputSpec(ExchangeInputKind::kStruct1K)); + +#undef EXCHANGE_BENCHMARK_CASE +#undef EXCHANGE_BENCHMARK_MODES +#undef EXCHANGE_BENCHMARK_INPUT +#undef EXCHANGE_BENCHMARK_NAMED_PARAM } // namespace @@ -605,7 +743,8 @@ int main(int argc, char** argv) { exec::ExchangeSource::registerFactory(exec::test::createLocalExchangeSource); bm = std::make_unique(); - runBenchmarks(); + folly::runBenchmarks(); + printAllExchangeStats(); bm.reset(); return 0; diff --git a/velox/exec/benchmarks/LocalExchangeBenchmark.cpp b/velox/exec/benchmarks/LocalExchangeBenchmark.cpp new file mode 100644 index 00000000000..50b7637fd92 --- /dev/null +++ b/velox/exec/benchmarks/LocalExchangeBenchmark.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include +#include +#include + +#include "velox/core/QueryConfig.h" +#include "velox/dwio/common/tests/utils/BatchMaker.h" +#include "velox/exec/PlanNodeStats.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" +#include "velox/functions/prestosql/registration/RegistrationFunctions.h" +#include "velox/parse/TypeResolver.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +DEFINE_int32(width, 16, "Number of drivers in each local exchange task"); +DEFINE_int32(num_local_tasks, 8, "Number of concurrent local shuffles"); +DEFINE_int32(num_local_repeat, 8, "Number of repeats of local exchange query"); +DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch."); +DEFINE_int64( + local_exchange_buffer_mb, + 32, + "task-wide buffer in local exchange"); +DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary"); +// Add the following definitions to allow Clion runs +DEFINE_bool(gtest_color, false, ""); +DEFINE_string(gtest_filter, "*", ""); + +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::test; + +namespace { + +struct LocalPartitionWaitStats { + int64_t totalProducerWaitMs = 0; + int64_t totalConsumerWaitMs = 0; + std::vector consumerWaitMs; + std::vector producerWaitMs; + std::vector wallMs; +}; + +void sortByMax(std::vector& metrics) { + std::sort( + metrics.begin(), + metrics.end(), + [](const RuntimeMetric& left, const RuntimeMetric& right) { + return left.max > right.max; + }); +} + +void sortByAndPrintMax( + const char* title, + int64_t total, + std::vector& metrics) { + sortByMax(metrics); + VELOX_CHECK(!metrics.empty()); + std::cout << title << "\n Total " << succinctNanos(total) + << "\n Max: " << metrics.front().toString() + << "\n Median: " << metrics[metrics.size() / 2].toString() + << "\n Min: " << 
metrics.back().toString() << std::endl; +} + +class LocalExchangeBenchmark : public VectorTestBase { + public: + std::vector makeRows( + RowTypePtr type, + int32_t numVectors, + int32_t rowsPerVector, + int32_t dictPct = 0) { + std::vector vectors; + BufferPtr indices; + for (int32_t i = 0; i < numVectors; ++i) { + auto vector = std::dynamic_pointer_cast( + BatchMaker::createBatch(type, rowsPerVector, *pool_)); + + auto width = vector->childrenSize(); + for (auto child = 0; child < width; ++child) { + if (100 * child / width > dictPct) { + if (!indices) { + indices = makeIndices(vector->size(), [&](auto i) { return i; }); + } + vector->childAt(child) = BaseVector::wrapInDictionary( + nullptr, indices, vector->size(), vector->childAt(child)); + } + } + vectors.push_back(vector); + } + return vectors; + } + + void runLocal( + std::vector& vectors, + int32_t taskWidth, + int32_t numTasks, + int64_t& localPartitionWallUs, + PlanNodeStats& partitionedOutputStats, + LocalPartitionWaitStats& localPartitionWaitStats) { + VELOX_CHECK(!vectors.empty()); + + core::PlanNodePtr plan; + core::PlanNodeId localPartitionId1; + core::PlanNodeId localPartitionId2; + std::vector> tasks; + std::vector threads; + + RowVectorPtr expected; + + BENCHMARK_SUSPEND { + std::vector aggregates = {"count(1)"}; + auto& rowType = vectors[0]->type()->as(); + for (auto i = 1; i < rowType.size(); ++i) { + aggregates.push_back(fmt::format("checksum({})", rowType.nameOf(i))); + } + + // plan: Agg/kSingle(4) <-- LocalPartition/Gather(3) <-- Agg/kGather(2) + // <-- LocalPartition/kRepartition(1) <-- Values(0) + plan = exec::test::PlanBuilder() + .values(vectors, true) + .localPartition({"c0"}) + .capturePlanNodeId(localPartitionId1) + .singleAggregation({}, aggregates) + .localPartition(std::vector{}) + .capturePlanNodeId(localPartitionId2) + .singleAggregation({}, {"sum(a0)"}) + .planNode(); + + threads.reserve(numTasks); + expected = makeRowVector({makeFlatVector(1, [&](auto /*row*/) { + return 
vectors.size() * vectors[0]->size() * taskWidth; + })}); + }; + + const auto startMicros = getCurrentTimeMicro(); + std::mutex mutex; + for (int32_t i = 0; i < numTasks; ++i) { + threads.push_back(std::thread([&]() { + for (auto repeat = 0; repeat < FLAGS_num_local_repeat; ++repeat) { + auto task = + exec::test::AssertQueryBuilder(plan) + .config( + core::QueryConfig::kMaxLocalExchangeBufferSize, + fmt::format("{}", FLAGS_local_exchange_buffer_mb << 20)) + .maxDrivers(taskWidth) + .assertResults(expected); + { + std::lock_guard l(mutex); + tasks.push_back(task); + } + } + })); + } + for (auto& thread : threads) { + thread.join(); + } + + BENCHMARK_SUSPEND { + localPartitionWallUs = getCurrentTimeMicro() - startMicros; + + std::vector localPartitionNodeIds{ + localPartitionId1, localPartitionId2}; + + localPartitionWaitStats.totalProducerWaitMs = 0; + localPartitionWaitStats.totalConsumerWaitMs = 0; + for (const auto& task : tasks) { + const auto taskStats = task->taskStats(); + localPartitionWaitStats.wallMs.push_back( + taskStats.executionEndTimeMs - taskStats.executionStartTimeMs); + const auto planStats = toPlanStats(taskStats); + + for (const auto& nodeId : localPartitionNodeIds) { + const auto planStatsIt = planStats.find(nodeId); + if (planStatsIt == planStats.end()) { + continue; + } + const auto& taskLocalPartitionStats = planStatsIt->second; + partitionedOutputStats += taskLocalPartitionStats; + + const auto& runtimeStats = taskLocalPartitionStats.customStats; + const auto producerWaitIt = + runtimeStats.find("blockedWaitForProducerWallNanos"); + const auto consumerWaitIt = + runtimeStats.find("blockedWaitForConsumerWallNanos"); + const RuntimeMetric producerWait = + producerWaitIt == runtimeStats.end() ? RuntimeMetric{} + : producerWaitIt->second; + const RuntimeMetric consumerWait = + consumerWaitIt == runtimeStats.end() ? 
RuntimeMetric{} + : consumerWaitIt->second; + localPartitionWaitStats.producerWaitMs.push_back(producerWait); + localPartitionWaitStats.consumerWaitMs.push_back(consumerWait); + localPartitionWaitStats.totalProducerWaitMs += + localPartitionWaitStats.producerWaitMs.back().sum; + localPartitionWaitStats.totalConsumerWaitMs += + localPartitionWaitStats.consumerWaitMs.back().sum; + } + } + }; + } +}; + +std::unique_ptr bm; + +void runBenchmarks() { + std::vector flatNames = {"c0"}; + std::vector flatTypes = {BIGINT()}; + std::vector typeSelection = { + BOOLEAN(), + TINYINT(), + DECIMAL(20, 3), + INTEGER(), + BIGINT(), + REAL(), + DECIMAL(10, 2), + DOUBLE(), + VARCHAR()}; + + int64_t flatSize = 0; + // Add enough columns of different types to make a 10K row batch be + // flat_batch_mb in flat size. + while (flatSize * 10000 < static_cast(FLAGS_flat_batch_mb) << 20) { + flatNames.push_back(fmt::format("c{}", flatNames.size())); + flatTypes.push_back(typeSelection[flatTypes.size() % typeSelection.size()]); + if (flatTypes.back()->isFixedWidth()) { + flatSize += flatTypes.back()->cppSizeInBytes(); + } else { + flatSize += 20; + } + } + auto flatType = ROW(std::move(flatNames), std::move(flatTypes)); + std::vector flat10k( + bm->makeRows(flatType, 10, 10000, FLAGS_dict_pct)); + + int64_t localPartitionWallUs; + PlanNodeStats localPartitionStatsFlat10K; + LocalPartitionWaitStats localPartitionWaitStats; + folly::addBenchmark(__FILE__, "localFlat10k", [&]() { + bm->runLocal( + flat10k, + FLAGS_width, + FLAGS_num_local_tasks, + localPartitionWallUs, + localPartitionStatsFlat10K, + localPartitionWaitStats); + return 1; + }); + + folly::runBenchmarks(); + + std::sort( + localPartitionWaitStats.wallMs.begin(), + localPartitionWaitStats.wallMs.end()); + VELOX_CHECK(!localPartitionWaitStats.wallMs.empty()); + + std::cout + << "--------------------------------LocalFlat10K-------------------------------" + << std::endl; + std::cout << "Wall Time (ms): " << "\n Total: " + << 
succinctMicros(localPartitionWallUs) + << "\n Max: " << localPartitionWaitStats.wallMs.back() + << "\n Median: " + << localPartitionWaitStats + .wallMs[localPartitionWaitStats.wallMs.size() / 2] + << "\n Min: " << localPartitionWaitStats.wallMs.front() + << std::endl; + std::cout << "LocalPartition: " << localPartitionStatsFlat10K.toString() + << std::endl; + sortByAndPrintMax( + "Producer Wait Time (ms)", + localPartitionWaitStats.totalProducerWaitMs, + localPartitionWaitStats.producerWaitMs); + sortByAndPrintMax( + "Consumer Wait Time (ms)", + localPartitionWaitStats.totalConsumerWaitMs, + localPartitionWaitStats.consumerWaitMs); +} + +} // namespace + +int main(int argc, char** argv) { + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize(memory::MemoryManager::Options{}); + functions::prestosql::registerAllScalarFunctions(); + aggregate::prestosql::registerAllAggregateFunctions(); + parse::registerTypeResolver(); + + bm = std::make_unique(); + runBenchmarks(); + bm.reset(); + + return 0; +} diff --git a/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp b/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp new file mode 100644 index 00000000000..3d2635fda94 --- /dev/null +++ b/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp @@ -0,0 +1,469 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include "velox/exec/OptimizedHashPartitionFunction.h" +#include "velox/vector/BaseVector.h" +#include "velox/vector/tests/utils/VectorMaker.h" + +// Add the following definitions to allow Clion runs. +DEFINE_bool(gtest_color, false, ""); +DEFINE_string(gtest_filter, "*", ""); + +using namespace facebook; +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::test; + +namespace { + +constexpr vector_size_t kSize = 10'000; +constexpr vector_size_t kDictionarySize = kSize / 5; + +enum class FunctionKind { + kNormal, + kOptimized, +}; + +enum class EncodingMode { + kFlat, + kDictionary, + kConstant, +}; + +enum class NullMode { + kNoNulls, + kHalfNulls, + kAllNulls, +}; + +enum class PartitionMode { + kRemote, + kLocalExchange, + kHashBitRangeFirst8, + kHashBitRangeLast8, +}; + +template +T makeValue(vector_size_t row) { + return static_cast((row * 8191) ^ (row >> 3)); +} + +template <> +bool makeValue(vector_size_t row) { + return (row & 1) == 0; +} + +template <> +StringView makeValue(vector_size_t row) { + thread_local std::array buffer; + const auto length = 5 + row % 16; + for (vector_size_t index = 0; index < length; ++index) { + buffer[index] = 'a' + (row + index * 7) % 26; + } + return StringView(buffer.data(), length); +} + +std::function makeNulls(NullMode nullMode) { + switch (nullMode) { + case NullMode::kNoNulls: + return nullptr; + case NullMode::kHalfNulls: + return [](vector_size_t row) { return (row & 1) == 0; }; + case NullMode::kAllNulls: + return [](vector_size_t /*row*/) { return true; }; + } + + VELOX_UNREACHABLE(); +} + +VectorPtr wrapInDictionary( + const VectorPtr& base, + vector_size_t size, + memory::MemoryPool* pool, + NullMode nullMode = NullMode::kNoNulls) { + auto indices = AlignedBuffer::allocate(size, pool); + auto* rawIndices = indices->asMutable(); + const auto baseSize = base->size(); + for (vector_size_t row = 0; row < size; ++row) { + 
rawIndices[row] = (size - row - 1) % baseSize; + } + + BufferPtr nulls; + if (nullMode == NullMode::kHalfNulls) { + nulls = AlignedBuffer::allocate(size, pool); + auto* rawNulls = nulls->asMutable(); + bits::fillBits(rawNulls, 0, size, bits::kNotNull); + for (vector_size_t row = 0; row < size; row += 2) { + bits::setNull(rawNulls, row); + } + } else if (nullMode == NullMode::kAllNulls) { + nulls = AlignedBuffer::allocate(size, pool); + auto* rawNulls = nulls->asMutable(); + bits::fillBits(rawNulls, 0, size, bits::kNull); + } + + return BaseVector::wrapInDictionary(nulls, indices, size, base); +} + +template +VectorPtr makeValuesVector( + VectorMaker& vectorMaker, + memory::MemoryPool* pool, + EncodingMode encodingMode, + NullMode nullMode, + vector_size_t size) { + const auto flatSize = + encodingMode == EncodingMode::kDictionary ? kDictionarySize : size; + auto flat = vectorMaker.flatVector( + flatSize, + [](vector_size_t row) { return makeValue(row); }, + makeNulls(nullMode)); + + switch (encodingMode) { + case EncodingMode::kFlat: + return flat; + case EncodingMode::kDictionary: + return wrapInDictionary(flat, size, pool); + case EncodingMode::kConstant: + if (nullMode == NullMode::kAllNulls) { + return BaseVector::createNullConstant( + CppToType::create(), size, pool); + } + if (nullMode == NullMode::kHalfNulls) { + auto constant = BaseVector::wrapInConstant(size, 1, flat); + // ConstantVector has one nullness for all logical rows. Use a + // dictionary wrapper to express alternating nulls while keeping the + // repeated-value payload constant. 
+ return wrapInDictionary(constant, size, pool, nullMode); + } + return BaseVector::wrapInConstant(size, 0, flat); + } + + VELOX_UNREACHABLE(); +} + +template +std::unique_ptr makePartitionFunction( + PartitionMode partitionMode, + const RowTypePtr& inputType, + int numPartitions) { + switch (partitionMode) { + case PartitionMode::kRemote: + if constexpr (Kind == FunctionKind::kNormal) { + return std::make_unique( + false, numPartitions, inputType, std::vector{0}); + } else { + return std::make_unique( + false, numPartitions, inputType, std::vector{0}); + } + case PartitionMode::kLocalExchange: + if constexpr (Kind == FunctionKind::kNormal) { + return std::make_unique( + true, numPartitions, inputType, std::vector{0}); + } else { + return std::make_unique( + true, numPartitions, inputType, std::vector{0}); + } + case PartitionMode::kHashBitRangeFirst8: + if constexpr (Kind == FunctionKind::kNormal) { + return std::make_unique( + HashBitRange{0, 8}, inputType, std::vector{0}); + } else { + return std::make_unique( + HashBitRange{0, 8}, inputType, std::vector{0}); + } + case PartitionMode::kHashBitRangeLast8: + if constexpr (Kind == FunctionKind::kNormal) { + return std::make_unique( + HashBitRange{56, 64}, inputType, std::vector{0}); + } else { + return std::make_unique( + HashBitRange{56, 64}, inputType, std::vector{0}); + } + } + + VELOX_UNREACHABLE(); +} + +void normalRangeReduction( + const uint64_t* hashes, + uint32_t* partitions, + int size, + uint32_t numPartitions) { + for (int index = 0; index < size; ++index) { + partitions[index] = hashes[index] % numPartitions; + } +} + +template +void runRangeReductionBenchmark(uint32_t iterations, uint32_t numPartitions) { + folly::BenchmarkSuspender suspender; + + std::vector hashes(kSize); + std::vector partitions(kSize); + for (vector_size_t row = 0; row < kSize; ++row) { + hashes[row] = (static_cast(row * 8191) << 32) ^ + static_cast(row * 1315423911ULL + 17); + } + + suspender.dismiss(); + + for (uint32_t 
iteration = 0; iteration < iterations; ++iteration) { + if constexpr (Kind == FunctionKind::kNormal) { + normalRangeReduction( + hashes.data(), partitions.data(), kSize, numPartitions); + } else { + rangeReduction(hashes.data(), partitions.data(), kSize, numPartitions); + } + folly::doNotOptimizeAway(partitions.data()); + } +} + +template +void runPartitionBenchmark( + uint32_t iterations, + PartitionMode partitionMode, + EncodingMode encodingMode, + NullMode nullMode, + int numPartitions) { + folly::BenchmarkSuspender suspender; + + auto pool = memory::memoryManager()->addLeafPool(); + VectorMaker vectorMaker(pool.get()); + auto values = makeValuesVector( + vectorMaker, pool.get(), encodingMode, nullMode, kSize); + auto input = vectorMaker.rowVector({values}); + auto partitionFunction = makePartitionFunction( + partitionMode, asRowType(input->type()), numPartitions); + std::vector partitions; + + suspender.dismiss(); + + for (uint32_t iteration = 0; iteration < iterations; ++iteration) { + std::optional singlePartition = + partitionFunction->partition(*input, partitions); + if (singlePartition.has_value()) { + std::fill(partitions.begin(), partitions.end(), singlePartition.value()); + } + folly::doNotOptimizeAway(partitions.data()); + } +} + +template +void benchmarkNormalHashPartitionFunction( + uint32_t iterations, + PartitionMode partitionMode, + EncodingMode encodingMode, + NullMode nullMode, + int numPartitions) { + runPartitionBenchmark( + iterations, partitionMode, encodingMode, nullMode, numPartitions); +} + +template +void benchmarkOptimizedHashPartitionFunction( + uint32_t iterations, + PartitionMode partitionMode, + EncodingMode encodingMode, + NullMode nullMode, + int numPartitions) { + runPartitionBenchmark( + iterations, partitionMode, encodingMode, nullMode, numPartitions); +} + +#define REGISTER_PARTITION_PAIR( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + ENCODING_MODE, \ + 
ENCODING_NAME, \ + NULL_MODE, \ + NULL_NAME) \ + BENCHMARK( \ + partition_##TYPE_NAME##_##PARTITION_NAME##_##NUM_PARTITIONS_NAME##_##ENCODING_NAME##_##NULL_NAME, \ + iterations) { \ + benchmarkNormalHashPartitionFunction( \ + iterations, PARTITION_MODE, ENCODING_MODE, NULL_MODE, NUM_PARTITIONS); \ + } \ + BENCHMARK_RELATIVE( \ + optimized_partition_##TYPE_NAME##_##PARTITION_NAME##_##NUM_PARTITIONS_NAME##_##ENCODING_NAME##_##NULL_NAME, \ + iterations) { \ + benchmarkOptimizedHashPartitionFunction( \ + iterations, PARTITION_MODE, ENCODING_MODE, NULL_MODE, NUM_PARTITIONS); \ + } \ + BENCHMARK_DRAW_LINE(); + +#define REGISTER_PARTITION_NULL_MODES( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + ENCODING_MODE, \ + ENCODING_NAME) \ + REGISTER_PARTITION_PAIR( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + NullMode::kNoNulls, \ + no_null) \ + REGISTER_PARTITION_PAIR( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + NullMode::kHalfNulls, \ + half_null) \ + REGISTER_PARTITION_PAIR( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + NullMode::kAllNulls, \ + all_null) + +#define REGISTER_PARTITION_ENCODINGS( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME) \ + REGISTER_PARTITION_NULL_MODES( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + EncodingMode::kFlat, \ + flat) \ + REGISTER_PARTITION_NULL_MODES( \ + T, \ + TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + EncodingMode::kDictionary, \ + dictionary) \ + REGISTER_PARTITION_NULL_MODES( \ + T, \ + 
TYPE_NAME, \ + PARTITION_MODE, \ + PARTITION_NAME, \ + NUM_PARTITIONS, \ + NUM_PARTITIONS_NAME, \ + EncodingMode::kConstant, \ + constant) + +#define REGISTER_PARTITION_COUNTS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME) \ + REGISTER_PARTITION_ENCODINGS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1, p1) \ + REGISTER_PARTITION_ENCODINGS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 4, p4) \ + REGISTER_PARTITION_ENCODINGS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 16, p16) \ + REGISTER_PARTITION_ENCODINGS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 100, p100) \ + REGISTER_PARTITION_ENCODINGS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1'000, p1000) \ + REGISTER_PARTITION_ENCODINGS( \ + T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1'024, p1024) + +#define REGISTER_PARTITION_MODES(T, TYPE_NAME) \ + REGISTER_PARTITION_COUNTS(T, TYPE_NAME, PartitionMode::kRemote, remote) \ + REGISTER_PARTITION_COUNTS( \ + T, TYPE_NAME, PartitionMode::kLocalExchange, local_exchange) \ + REGISTER_PARTITION_ENCODINGS( \ + T, \ + TYPE_NAME, \ + PartitionMode::kHashBitRangeFirst8, \ + hashbits_0_8, \ + 0, \ + hashbits) \ + REGISTER_PARTITION_ENCODINGS( \ + T, \ + TYPE_NAME, \ + PartitionMode::kHashBitRangeLast8, \ + hashbits_last_8, \ + 0, \ + hashbits) + +REGISTER_PARTITION_MODES(bool, bool) +REGISTER_PARTITION_MODES(int8_t, tinyint) +REGISTER_PARTITION_MODES(int16_t, smallint) +REGISTER_PARTITION_MODES(int32_t, integer) +REGISTER_PARTITION_MODES(int64_t, bigint) +REGISTER_PARTITION_MODES(StringView, varchar) + +#define REGISTER_RANGE_REDUCTION_PAIR(NUM_PARTITIONS, NUM_PARTITIONS_NAME) \ + BENCHMARK(normal_range_reduction_##NUM_PARTITIONS_NAME, iterations) { \ + runRangeReductionBenchmark( \ + iterations, NUM_PARTITIONS); \ + } \ + BENCHMARK_RELATIVE( \ + optimized_range_reduction_##NUM_PARTITIONS_NAME, iterations) { \ + runRangeReductionBenchmark( \ + iterations, NUM_PARTITIONS); \ + } \ + BENCHMARK_DRAW_LINE(); + +REGISTER_RANGE_REDUCTION_PAIR(1, 
p1) +REGISTER_RANGE_REDUCTION_PAIR(4, p4) +REGISTER_RANGE_REDUCTION_PAIR(16, p16) +REGISTER_RANGE_REDUCTION_PAIR(100, p100) +REGISTER_RANGE_REDUCTION_PAIR(1'000, p1000) +REGISTER_RANGE_REDUCTION_PAIR(1'024, p1024) + +#undef REGISTER_PARTITION_MODES +#undef REGISTER_PARTITION_COUNTS +#undef REGISTER_PARTITION_ENCODINGS +#undef REGISTER_PARTITION_NULL_MODES +#undef REGISTER_PARTITION_PAIR +#undef REGISTER_RANGE_REDUCTION_PAIR + +} // namespace + +int main(int argc, char** argv) { + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize(memory::MemoryManager::Options{}); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp new file mode 100644 index 00000000000..32fdc278857 --- /dev/null +++ b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include "velox/exec/OptimizedVectorHasher.h" +#include "velox/exec/VectorHasher.h" +#include "velox/type/HugeInt.h" +#include "velox/vector/BaseVector.h" +#include "velox/vector/tests/utils/VectorMaker.h" + +// Add the following definitions to allow Clion runs. 
+DEFINE_bool(gtest_color, false, ""); +DEFINE_string(gtest_filter, "*", ""); + +using namespace facebook; +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::test; + +namespace { + +enum class NullMode { + kNoNulls, + kHalfNulls, + kAllNulls, +}; + +enum class EncodingMode { + kFlat, + kDictionary, + kConstant, +}; + +template +T makeValue(vector_size_t row) { + return static_cast((row * 8191) ^ (row >> 3)); +} + +template <> +bool makeValue(vector_size_t row) { + return (row & 1) == 0; +} + +template <> +float makeValue(vector_size_t row) { + return static_cast(row) * 1.25f - 1000.0f; +} + +template <> +double makeValue(vector_size_t row) { + return static_cast(row) * 1.25 - 1000.0; +} + +template <> +int128_t makeValue(vector_size_t row) { + return HugeInt::build( + static_cast(row * 31), + static_cast(row * 1315423911ULL + 17)); +} + +template <> +StringView makeValue(vector_size_t row) { + thread_local std::array buffer; + const auto length = 5 + row % 16; + for (vector_size_t i = 0; i < length; ++i) { + buffer[i] = 'a' + (row + i * 7) % 26; + } + return StringView(buffer.data(), length); +} + +std::function makeNulls(NullMode nullMode) { + switch (nullMode) { + case NullMode::kNoNulls: + return nullptr; + case NullMode::kHalfNulls: + return [](vector_size_t row) { return (row & 1) == 0; }; + case NullMode::kAllNulls: + return [](vector_size_t /*row*/) { return true; }; + } + + VELOX_UNREACHABLE(); +} + +template +VectorPtr makeValuesVector( + VectorMaker& vectorMaker, + memory::MemoryPool* pool, + NullMode nullMode, + EncodingMode encodingMode, + vector_size_t numValues, + vector_size_t dictionarySize) { + auto flat = vectorMaker.flatVector( + encodingMode == EncodingMode::kDictionary ? 
dictionarySize : numValues, + [](vector_size_t row) { return makeValue(row); }, + makeNulls(nullMode)); + + switch (encodingMode) { + case EncodingMode::kFlat: + return flat; + case EncodingMode::kDictionary: { + auto indices = AlignedBuffer::allocate(numValues, pool); + auto* rawIndices = indices->asMutable(); + for (vector_size_t i = 0; i < numValues; ++i) { + rawIndices[i] = (numValues - i - 1) % dictionarySize; + } + return BaseVector::wrapInDictionary( + BufferPtr(nullptr), indices, numValues, flat); + } + case EncodingMode::kConstant: + if (nullMode == NullMode::kAllNulls) { + return BaseVector::createNullConstant( + CppToType::create(), numValues, pool); + } + return BaseVector::wrapInConstant(numValues, 0, flat); + } + + VELOX_UNREACHABLE(); +} + +template +struct HasherRunner; + +template <> +struct HasherRunner { + static std::unique_ptr create(const TypePtr& type) { + return VectorHasher::create(type, 0); + } +}; + +template <> +struct HasherRunner { + static std::unique_ptr create(const TypePtr& type) { + return OptimizedVectorHasher::create(type, 0); + } +}; + +template +void runHashBenchmark( + uint32_t iterations, + NullMode nullMode, + EncodingMode encodingMode, + bool mix, + vector_size_t size, + vector_size_t dictionarySize) { + folly::BenchmarkSuspender suspender; + + auto pool = memory::memoryManager()->addLeafPool(); + VectorMaker vectorMaker(pool.get()); + auto values = makeValuesVector( + vectorMaker, pool.get(), nullMode, encodingMode, size, dictionarySize); + auto hasher = HasherRunner::create(CppToType::create()); + raw_vector hashes(size, pool.get()); + + SelectivityVector rows(size); + hasher->decode(*values, rows); + if (mix) { + std::iota(hashes.begin(), hashes.end(), 0); + } + + suspender.dismiss(); + + for (uint32_t i = 0; i < iterations; ++i) { + hasher->hash(rows, mix, hashes); + folly::doNotOptimizeAway(hashes.data()); + } +} + +template +void benchmarkVectorHasher( + uint32_t iterations, + NullMode nullMode, + EncodingMode 
encodingMode, + bool mix, + vector_size_t size, + vector_size_t dictionarySize) { + runHashBenchmark( + iterations, nullMode, encodingMode, mix, size, dictionarySize); +} + +template +void benchmarkOptimizedVectorHasher( + uint32_t iterations, + NullMode nullMode, + EncodingMode encodingMode, + bool mix, + vector_size_t size, + vector_size_t dictionarySize) { + runHashBenchmark( + iterations, nullMode, encodingMode, mix, size, dictionarySize); +} + +#define REGISTER_HASHER_PAIR( \ + T, \ + TYPE_NAME, \ + NULL_MODE, \ + NULL_NAME, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + DICTIONARY_SIZE) \ + BENCHMARK(TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME, n) { \ + benchmarkVectorHasher( \ + n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE); \ + } \ + BENCHMARK_RELATIVE( \ + optimized_##TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME, n) { \ + benchmarkOptimizedVectorHasher( \ + n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE); \ + } \ + BENCHMARK_DRAW_LINE(); + +#define REGISTER_HASHER_NULL_MODES( \ + T, \ + TYPE_NAME, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + DICTIONARY_SIZE) \ + REGISTER_HASHER_PAIR( \ + T, \ + TYPE_NAME, \ + NullMode::kNoNulls, \ + no_null, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + DICTIONARY_SIZE) \ + REGISTER_HASHER_PAIR( \ + T, \ + TYPE_NAME, \ + NullMode::kHalfNulls, \ + half_null, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + DICTIONARY_SIZE) \ + REGISTER_HASHER_PAIR( \ + T, \ + TYPE_NAME, \ + NullMode::kAllNulls, \ + all_null, \ + ENCODING_MODE, \ + ENCODING_NAME, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + DICTIONARY_SIZE) + +#define REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, SIZE) \ + REGISTER_HASHER_PAIR( \ + T, \ + TYPE_NAME, \ + NullMode::kNoNulls, \ + no_null, \ + EncodingMode::kConstant, \ + constant, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + SIZE) \ + REGISTER_HASHER_PAIR( 
\ + T, \ + TYPE_NAME, \ + NullMode::kAllNulls, \ + all_null, \ + EncodingMode::kConstant, \ + constant, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + SIZE) + +#define REGISTER_HASHER_SIZES( \ + T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME) \ + REGISTER_HASHER_NULL_MODES( \ + T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME, 10000, 10000) + +#define REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME) \ + REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 10000) + +#define REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT( \ + T, TYPE_NAME, MIX, MIX_NAME, SIZE, PERCENT, PERCENT_NAME) \ + REGISTER_HASHER_NULL_MODES( \ + T, \ + TYPE_NAME, \ + EncodingMode::kDictionary, \ + dictionary_##PERCENT_NAME, \ + MIX, \ + MIX_NAME, \ + SIZE, \ + SIZE* PERCENT / 100) + +#define REGISTER_HASHER_SIZES_DICTIONARY(T, TYPE_NAME, MIX, MIX_NAME) \ + REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT( \ + T, TYPE_NAME, MIX, MIX_NAME, 10000, 80, 80pct) \ + REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT( \ + T, TYPE_NAME, MIX, MIX_NAME, 10000, 60, 60pct) \ + REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT( \ + T, TYPE_NAME, MIX, MIX_NAME, 10000, 40, 40pct) \ + REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT( \ + T, TYPE_NAME, MIX, MIX_NAME, 10000, 20, 20pct) \ + REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT( \ + T, TYPE_NAME, MIX, MIX_NAME, 10000, 5, 5pct) + +#define REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, MIX, MIX_NAME) \ + REGISTER_HASHER_SIZES( \ + T, TYPE_NAME, EncodingMode::kFlat, flat, MIX, MIX_NAME) \ + REGISTER_HASHER_SIZES_DICTIONARY(T, TYPE_NAME, MIX, MIX_NAME) \ + REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME) + +#define REGISTER_HASHER_TYPE(T, TYPE_NAME) \ + REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, false, no_mix) \ + REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, true, mix) + +REGISTER_HASHER_TYPE(bool, boolean) +REGISTER_HASHER_TYPE(int8_t, tinyint) +REGISTER_HASHER_TYPE(int16_t, smallint) +REGISTER_HASHER_TYPE(int32_t, integer) 
+REGISTER_HASHER_TYPE(int64_t, bigint) +REGISTER_HASHER_TYPE(int128_t, hugeint) +REGISTER_HASHER_TYPE(float, real) +REGISTER_HASHER_TYPE(double, double) +REGISTER_HASHER_TYPE(StringView, varchar) + +#undef REGISTER_HASHER_TYPE +#undef REGISTER_HASHER_SIZES_DICTIONARY +#undef REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT +#undef REGISTER_HASHER_SIZES +#undef REGISTER_HASHER_NULL_MODES +#undef REGISTER_HASHER_PAIR + +} // namespace + +int main(int argc, char** argv) { + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize(memory::MemoryManager::Options{}); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt index a97d63ccd5b..189e7fc8680 100644 --- a/velox/exec/tests/CMakeLists.txt +++ b/velox/exec/tests/CMakeLists.txt @@ -65,6 +65,7 @@ set( EnforceDistinctTest.cpp TraceUtilTest.cpp HashPartitionFunctionTest.cpp + OptimizedHashPartitionFunctionTest.cpp SpatialIndexTest.cpp ValuesTest.cpp ParallelProjectTest.cpp @@ -148,6 +149,8 @@ set( AssignUniqueIdTest.cpp FilterProjectTest.cpp AsyncConnectorTest.cpp + OptimizedPartitionedOutputTest.cpp + OptimizedVectorHasherTest.cpp ) set( diff --git a/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp b/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp new file mode 100644 index 00000000000..b9d6b193159 --- /dev/null +++ b/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/exec/OptimizedHashPartitionFunction.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook; +using namespace facebook::velox; +using namespace facebook::velox::exec; + +class OptimizedHashPartitionFunctionTest : public velox::test::VectorTestBase, + public testing::Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); + } +}; + +TEST_F( + OptimizedHashPartitionFunctionTest, + powerOfTwoRangeReductionMatchesMultiplyHigh) { + const std::vector hashes = { + 0, + 1, + 0x0000'0001'0000'0000ULL, + 0x1234'5678'9abc'def0ULL, + 0xffff'ffff'ffff'ffffULL, + }; + + for (const auto numPartitions : {1, 2, 4, 1'024}) { + std::vector partitions(hashes.size()); + rangeReduction( + hashes.data(), + partitions.data(), + static_cast(hashes.size()), + numPartitions); + + std::vector expected; + expected.reserve(hashes.size()); + for (const auto hash : hashes) { + const auto mixedHash = + static_cast(hash) ^ static_cast(hash >> 32); + expected.push_back( + (static_cast(mixedHash) * numPartitions) >> 32); + } + + EXPECT_EQ(partitions, expected); + } +} + +TEST_F( + OptimizedHashPartitionFunctionTest, + optimizedHashBitRangeMatchesRegular) { + const auto numRows = 10'000; + auto input = makeRowVector( + {makeNullableFlatVector([&] { + std::vector> values; + values.reserve(numRows); + for (auto row = 0; row < numRows; ++row) { + values.emplace_back( + row % 17 == 0 ? 
std::nullopt : std::optional(row * 13)); + } + return values; + }()), + makeFlatVector(numRows, [](auto row) { + return StringView::makeInline(fmt::format("value_{}", row % 97)); + })}); + const auto rowType = asRowType(input->type()); + + HashPartitionFunction regular(HashBitRange{0, 5}, rowType, {0, 1}); + OptimizedHashPartitionFunction optimized(HashBitRange{0, 5}, rowType, {0, 1}); + + std::vector regularPartitions; + std::vector optimizedPartitions; + EXPECT_EQ( + regular.partition(*input, regularPartitions), + optimized.partition(*input, optimizedPartitions)); + EXPECT_EQ(regularPartitions, optimizedPartitions); +} + +TEST_F(OptimizedHashPartitionFunctionTest, onePartitionReturnsConstantResult) { + auto input = makeRowVector({makeConstant(true, 10'000)}); + const auto rowType = asRowType(input->type()); + OptimizedHashPartitionFunction partitionFunction( + /*localExchange=*/true, 1, rowType, {0}); + + std::vector partitions{123}; + EXPECT_EQ(partitionFunction.partition(*input, partitions), 0u); + EXPECT_EQ(partitions, std::vector{123}); +} + +TEST_F(OptimizedHashPartitionFunctionTest, emptyConstantKeyReturnsEmptyResult) { + auto input = makeRowVector({makeConstant(true, 0)}); + const auto rowType = asRowType(input->type()); + OptimizedHashPartitionFunction optimized( + /*localExchange=*/true, 16, rowType, {0}); + + std::vector optimizedPartitions{123}; + EXPECT_EQ(optimized.partition(*input, optimizedPartitions), std::nullopt); + EXPECT_TRUE(optimizedPartitions.empty()); +} + +TEST_F(OptimizedHashPartitionFunctionTest, specUsesConfiguredImplementation) { + auto input = makeRowVector( + {makeFlatVector({1, 2, 3, 4}), + makeFlatVector({"a", "b", "c", "d"})}); + const auto rowType = asRowType(input->type()); + HashPartitionFunctionSpec spec(rowType, std::vector{0, 1}); + auto optimizedFunction = spec.create(8, /*localExchange=*/false, true); + ASSERT_NE( + dynamic_cast(optimizedFunction.get()), + nullptr); + + auto regularFunction = spec.create(8, 
/*localExchange=*/false); + ASSERT_NE( + dynamic_cast(regularFunction.get()), nullptr); + + std::vector optimizedPartitions; + ASSERT_EQ( + optimizedFunction->partition(*input, optimizedPartitions), std::nullopt); + ASSERT_EQ(optimizedPartitions.size(), input->size()); + for (const auto partition : optimizedPartitions) { + EXPECT_LT(partition, 8); + } +} diff --git a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp new file mode 100644 index 00000000000..ed9fa875624 --- /dev/null +++ b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp @@ -0,0 +1,1036 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include "velox/common/base/BitUtil.h" +#include "velox/common/memory/ByteStream.h" +#include "velox/exec/HashPartitionFunction.h" +#include "velox/exec/OptimizedPartitionedOutput.h" +#include "velox/exec/Task.h" +#include "velox/exec/tests/utils/OperatorTestBase.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/exec/tests/utils/QueryAssertions.h" +#include "velox/serializers/PrestoSerializer.h" +#include "velox/serializers/PrestoSerializerSerializationUtils.h" + +namespace facebook::velox::exec::test { + +namespace { + +int64_t simpleColumnPageBytes( + std::string_view encodingName, + int64_t numRows, + int64_t numNulls, + int64_t valueWidth) { + return serializer::presto::detail::kHeaderSize + // page header + 4 + // numColumns + 4 + static_cast(encodingName.size()) + // encoding header + 4 + // rowCount + 1 + // null flag + (numNulls > 0 ? bits::nbytes(numRows) : 0) + // null bitmap + (numRows - numNulls) * valueWidth; // values +} + +} // namespace + +/// How null values are distributed in value columns. +enum class NullMode { + kNoNull, // no null values + kPartialNull, // row i is null if i % 2 == 0 + kAllNull, // all values are null +}; + +/// Describes one parameterized test configuration. +struct TestParam { + /// Short lowercase name used as the gtest parameter suffix. + std::string name; + /// Element type for value columns. Ignored when numValueCols == 0. + TypePtr valueType; + /// Number of partition-key columns (all INTEGER). + int numPartitionCols; + /// Number of value columns of valueType. + int numValueCols; + /// Null pattern applied to value columns. 
+ NullMode nullMode; +}; + +/// Returns the full set of TestParam combinations: +/// - numValueCols==0: 1 entry per numPartitionCols (type/nullMode irrelevant) +/// - numValueCols∈{1,256}: all 4 types × 2 pk counts × 3 null modes +std::vector testParams() { + std::vector params; + + const std::vector> types = { + {"bool", BOOLEAN()}, + {"tinyint", TINYINT()}, + {"bigint", BIGINT()}, + {"hugeint", HUGEINT()}, + }; + + const std::vector> nullModes = { + {"no_null", NullMode::kNoNull}, + {"partial_null", NullMode::kPartialNull}, + {"all_null", NullMode::kAllNull}, + }; + + // Zero value columns: type and null mode do not affect test behavior. + for (int numPk : {1, 4}) { + params.push_back({ + .name = "pk" + std::to_string(numPk) + "_val0", + .valueType = BIGINT(), + .numPartitionCols = numPk, + .numValueCols = 0, + .nullMode = NullMode::kNoNull, + }); + } + + // One and many value columns: all type × pk-count × null-mode combinations. + for (int numVal : {1, 256}) { + for (const auto& [typeName, type] : types) { + for (int numPk : {1, 4}) { + for (const auto& [nullName, nullMode] : nullModes) { + params.push_back({ + .name = "pk" + std::to_string(numPk) + "_val" + + std::to_string(numVal) + "_" + typeName + "_" + nullName, + .valueType = type, + .numPartitionCols = numPk, + .numValueCols = numVal, + .nullMode = nullMode, + }); + } + } + } + } + + return params; +} + +/// Collected output from a single run of runPartitionedOutput(). +struct PartitionedOutputResult { + // Declared first so it is destroyed last: the IOBufs in pages reference the + // task's memory pool, so the task must outlive all the pages. + std::shared_ptr task; + /// Serialized output pages per partition, indexed by partition ID. + std::vector>> pages; + /// Number of pages received by each partition. + std::vector pageCounts; + /// Total rows deserialized from each partition's pages. + std::vector rowCounts; + /// Number of partitions that received at least one page. 
+ int numNonEmptyPartitions{0}; + /// Sum of operator's numAppends runtime stat. + int64_t numAppends{0}; + /// Sum of operator's numFlushes runtime stat. + int64_t numFlushes{0}; + /// Sum of operator's numBlockedTimes runtime stat. + int64_t numBlockedTimes{0}; +}; + +/// Shared infrastructure for all OptimizedPartitionedOutput tests. +class OptimizedPartitionedOutputTest : public OperatorTestBase { + protected: + void SetUp() override { + OperatorTestBase::SetUp(); + bufferManager_->setListenerFactory([]() { + return std::make_unique(); + }); + } + + std::shared_ptr createQueryContext( + std::unordered_map config) { + config[core::QueryConfig::kOptimizedPartitionedOutputEnabled] = "true"; + return core::QueryCtx::create( + executor_.get(), core::QueryConfig(std::move(config))); + } + + /// Fetches one batch of serialized pages from the output buffer for the given + /// destination. Returns the pages via a promise/future callback. + std::vector> + getData(const std::string& taskId, int destination, int64_t sequence) { + auto [promise, semiFuture] = folly::makePromiseContract< + std::vector>>(); + VELOX_CHECK(bufferManager_->getData( + taskId, + destination, + OptimizedPartitionedOutput::kMinDestinationSize, + sequence, + [result = std::make_shared< + folly::Promise>>>( + std::move(promise))]( + std::vector> pages, + int64_t /*sequence*/, + std::vector /*remainingBytes*/) { + result->setValue(std::move(pages)); + })); + auto future = std::move(semiFuture).via(executor_.get()); + future.wait(std::chrono::seconds{10}); + VELOX_CHECK(future.isReady()); + return std::move(future).value(); + } + + /// Drains all pages for a destination until the null sentinel is received. 
+ std::vector> getAllData( + const std::string& taskId, + int destination) { + std::vector> result; + int attempts = 0; + bool done = false; + while (!done) { + VELOX_CHECK_LT(++attempts, 10'000); + auto pages = getData(taskId, destination, result.size()); + for (auto& page : pages) { + if (page) { + result.push_back(std::move(page)); + } else { + bufferManager_->deleteResults(taskId, destination); + done = true; + break; + } + } + } + return result; + } + + /// Deserializes a single Presto-serialized IOBuf page into a RowVector. + RowVectorPtr deserializePage( + const folly::IOBuf* iobuf, + const RowTypePtr& rowType) { + auto byteRanges = byteRangesFromIOBuf(const_cast(iobuf)); + auto byteStream = + std::make_unique(std::move(byteRanges)); + serializer::presto::PrestoVectorSerde serde; + RowVectorPtr result; + serde.deserialize(byteStream.get(), pool(), rowType, &result, 0, nullptr); + return result; + } + + /// Deserializes and concatenates all pages for one partition into a single + /// RowVector. Returns an empty RowVector when pages is empty. 
+ RowVectorPtr concatPages( + const std::vector>& pages, + const RowTypePtr& rowType) { + RowVectorPtr result; + for (const auto& iobuf : pages) { + auto page = deserializePage(iobuf.get(), rowType); + if (!result) { + result = page; + } else { + result->append(page.get()); + } + } + if (!result) { + result = std::static_pointer_cast( + BaseVector::create(rowType, 0, pool())); + } + return result; + } + + RowTypePtr outputTypeForLayout( + const RowTypePtr& inputType, + const std::vector& outputLayout) { + if (outputLayout.empty()) { + return inputType; + } + + std::vector types; + types.reserve(outputLayout.size()); + for (const auto& name : outputLayout) { + types.push_back(inputType->findChild(name)); + } + return ROW(outputLayout, std::move(types)); + } + + RowVectorPtr buildOutput( + const RowVectorPtr& input, + const std::vector& outputLayout) { + const auto inputType = asRowType(input->type()); + const auto outputType = outputTypeForLayout(inputType, outputLayout); + + std::vector columns; + columns.reserve(outputLayout.size()); + for (const auto& name : outputLayout) { + columns.push_back(input->childAt(inputType->getChildIdx(name))); + } + return std::make_shared( + input->pool(), outputType, nullptr, input->size(), std::move(columns)); + } + + /// Sorts a vector by value for order-independent comparison. Returns a + /// dictionary vector with rows sorted in ascending order. + VectorPtr canonicalize(const VectorPtr& vector) { + const auto numRows = vector->size(); + auto indices = makeIndices(numRows, [](auto i) { return i; }); + auto* data = indices->asMutable(); + std::stable_sort(data, data + numRows, [&](auto a, auto b) { + return vector->compare(vector.get(), a, b) < 0; + }); + return BaseVector::wrapInDictionary(nullptr, indices, numRows, vector); + } + + /// Builds a RowVector by gathering rows from inputBatches at the given + /// (batchIdx, rowIdx) positions. Used to construct the per-partition expected + /// RowVector. 
+ RowVectorPtr gatherRows( + const std::vector& batches, + const std::vector>& rowList, + const RowTypePtr& rowType) { + const auto numRows = static_cast(rowList.size()); + auto result = std::static_pointer_cast( + BaseVector::create(rowType, numRows, pool())); + for (vector_size_t r = 0; r < numRows; ++r) { + result->copy(batches[rowList[r].first].get(), r, rowList[r].second, 1); + } + return result; + } + + int64_t getIntRuntimeStat(Task* task, const std::string& statName) { + const auto taskStats = task->taskStats(); + const auto& runtimeStats = + taskStats.pipelineStats[0].operatorStats.back().runtimeStats; + auto it = runtimeStats.find(statName); + return it != runtimeStats.end() ? it->second.sum : 0; + } + + /// Builds a plan from inputBatches, creates and starts a task, drains all + /// numPartitions destinations concurrently, waits for task completion, and + /// returns the collected pages, per-partition row counts, and operator + /// runtime stats. extraConfig is merged into the query config on top of the + /// OptimizedPartitionedOutput enable flag. 
+ PartitionedOutputResult runPartitionedOutput( + const std::string& taskId, + const std::vector& inputBatches, + const std::vector& partitionKeys, + int numPartitions, + std::unordered_map extraConfig = {}, + std::chrono::seconds timeout = std::chrono::seconds{30}) { + return runPartitionedOutputWithLayout( + taskId, + inputBatches, + partitionKeys, + numPartitions, + {}, + std::move(extraConfig), + timeout); + } + + PartitionedOutputResult runPartitionedOutputWithLayout( + const std::string& taskId, + const std::vector& inputBatches, + const std::vector& partitionKeys, + int numPartitions, + const std::vector& outputLayout, + std::unordered_map extraConfig = {}, + std::chrono::seconds timeout = std::chrono::seconds{30}) { + VELOX_CHECK(!inputBatches.empty()); + const auto rowType = + std::dynamic_pointer_cast(inputBatches[0]->type()); + const auto outputType = outputTypeForLayout(rowType, outputLayout); + + auto plan = + PlanBuilder() + .values(inputBatches) + .partitionedOutput(partitionKeys, numPartitions, outputLayout) + .planNode(); + + auto task = Task::create( + taskId, + core::PlanFragment{plan}, + 0, + createQueryContext(std::move(extraConfig)), + Task::ExecutionMode::kParallel); + task->start(1); + + // Drain all partitions concurrently to avoid deadlock with the driver. 
+ std::vector>>> + futures; + futures.reserve(numPartitions); + for (int p = 0; p < numPartitions; ++p) { + futures.push_back(std::async(std::launch::async, [&, p] { + return getAllData(taskId, p); + })); + } + + const auto taskWaitUs = + std::chrono::duration_cast(timeout).count(); + EXPECT_TRUE(waitForTaskCompletion(task.get(), taskWaitUs)); + + PartitionedOutputResult result; + result.pages.resize(numPartitions); + result.pageCounts.resize(numPartitions, 0); + result.rowCounts.resize(numPartitions, 0); + + for (int p = 0; p < numPartitions; ++p) { + result.pages[p] = futures[p].get(); + result.pageCounts[p] = result.pages[p].size(); + if (result.pageCounts[p] > 0) { + ++result.numNonEmptyPartitions; + } + result.rowCounts[p] = concatPages(result.pages[p], outputType)->size(); + } + + result.numAppends = getIntRuntimeStat(task.get(), "numAppends"); + result.numFlushes = getIntRuntimeStat(task.get(), "numFlushes"); + result.numBlockedTimes = getIntRuntimeStat(task.get(), "numBlockedTimes"); + result.task = task; + + return result; + } + + private: + const std::shared_ptr bufferManager_{ + OutputBufferManager::getInstanceRef()}; +}; + +// ─── Parameterized fixture ─────────────────────────────────────────────────── + +/// Parameterized fixture that exercises every TestParam combination. +class OptimizedPartitionedOutputParamTest + : public OptimizedPartitionedOutputTest, + public ::testing::WithParamInterface { + protected: + const TestParam& param() const { + return GetParam(); + } + + /// Names for pk columns: ["p1"] or ["p1","p2","p3","p4"]. + std::vector pkColNames() const { + std::vector names; + for (int i = 0; i < param().numPartitionCols; ++i) { + names.push_back("p" + std::to_string(i + 1)); + } + return names; + } + + /// Names for value columns: ["v0", ..., "v{N-1}"]. 
+ std::vector valueColNames() const { + std::vector names; + for (int i = 0; i < param().numValueCols; ++i) { + names.push_back("v" + std::to_string(i)); + } + return names; + } + + /// Full input ROW type: pk cols (INTEGER) followed by value cols. + RowTypePtr inputType() const { + std::vector names = pkColNames(); + std::vector types(param().numPartitionCols, INTEGER()); + for (const auto& name : valueColNames()) { + names.push_back(name); + types.push_back(param().valueType); + } + return ROW(std::move(names), std::move(types)); + } + + /// Channel indices of the pk columns within the input type. + std::vector pkChannels() const { + std::vector channels(param().numPartitionCols); + std::iota(channels.begin(), channels.end(), 0); + return channels; + } + + /// Returns true if row i should be null in value columns for the current + /// null mode. + bool isNull(int rowIdx) const { + switch (param().nullMode) { + case NullMode::kNoNull: + return false; + case NullMode::kAllNull: + return true; + case NullMode::kPartialNull: + return rowIdx % 2 == 0; + } + VELOX_UNREACHABLE(); + } + + /// Creates a flat vector of the param's value type with random values and + /// nulls applied according to nullMode. 
+ VectorPtr makeRandomValueVector(int numRows, std::mt19937_64& rng) { + auto isNullFn = [this](vector_size_t i) -> bool { return isNull(i); }; + + switch (param().valueType->kind()) { + case TypeKind::BOOLEAN: + return vectorMaker_.flatVector( + numRows, + [&](auto /*i*/) -> bool { return rng() % 2 == 0; }, + isNullFn); + case TypeKind::TINYINT: + return vectorMaker_.flatVector( + numRows, + [&](auto /*i*/) -> int8_t { return static_cast(rng()); }, + isNullFn); + case TypeKind::BIGINT: + return vectorMaker_.flatVector( + numRows, + [&](auto /*i*/) -> int64_t { return static_cast(rng()); }, + isNullFn); + case TypeKind::HUGEINT: + return vectorMaker_.flatVector( + numRows, + [&](auto /*i*/) -> int128_t { + int64_t hi = static_cast(rng()); + uint64_t lo = rng(); + return (static_cast(hi) << 64) | + static_cast(lo); + }, + isNullFn); + default: + VELOX_UNREACHABLE( + "Unsupported value type: {}", param().valueType->toString()); + } + } + + /// Builds one input RowVector. p0Values holds the first pk column; each + /// subsequent pk column i is p0 + i. Value columns are filled with + /// independent random data drawn from rng. + RowVectorPtr makeInputBatch( + const std::vector& p0Values, + std::mt19937_64& rng) { + const int numRows = p0Values.size(); + std::vector names; + std::vector vecs; + + // pk columns + for (int k = 0; k < param().numPartitionCols; ++k) { + names.push_back("p" + std::to_string(k + 1)); + vecs.push_back(vectorMaker_.flatVector( + numRows, [&, k](auto i) { return p0Values[i] + k; })); + } + + // value columns + for (int v = 0; v < param().numValueCols; ++v) { + names.push_back("v" + std::to_string(v)); + vecs.push_back(makeRandomValueVector(numRows, rng)); + } + + return makeRowVector(names, vecs); + } + + /// Verifies that the deserialized pages for each partition exactly match the + /// rows from inputBatches that were routed to that partition. 
Both expected + /// and actual rows are sorted (canonicalized) before comparison to allow + /// order-independent matching. + void verifyDataIntegrity( + const std::vector& inputBatches, + const std::vector>>& allPages, + int numPartitions) { + // Compute expected per-partition row list using the same hash function as + // the operator. + auto partitionFn = std::make_unique( + false, numPartitions, inputType(), pkChannels()); + + std::vector>> expectedRows(numPartitions); + for (int batchIdx = 0; batchIdx < static_cast(inputBatches.size()); + ++batchIdx) { + std::vector assignments(inputBatches[batchIdx]->size()); + partitionFn->partition(*inputBatches[batchIdx], assignments); + for (int rowIdx = 0; rowIdx < static_cast(assignments.size()); + ++rowIdx) { + expectedRows[assignments[rowIdx]].emplace_back(batchIdx, rowIdx); + } + } + + const auto rowType = inputType(); + int64_t totalRows = 0; + + for (int p = 0; p < numPartitions; ++p) { + auto expected = gatherRows(inputBatches, expectedRows[p], rowType); + auto actual = concatPages(allPages[p], rowType); + + totalRows += expected->size(); + ASSERT_EQ(expected->size(), actual->size()) + << "partition " << p << " row count mismatch"; + + // Sort both vectors before comparing to allow order-independent matching. + auto expectedSorted = canonicalize(expected); + auto actualSorted = canonicalize(actual); + velox::test::assertEqualVectors(expectedSorted, actualSorted); + } + + int64_t sentRows = 0; + for (const auto& batch : inputBatches) { + sentRows += batch->size(); + } + EXPECT_EQ(totalRows, sentRows); + } +}; + +// ─── singleFlush ───────────────────────────────────────────────────────────── + +// Sends one batch into a large-buffer operator. All data is buffered without +// triggering an intermediate flush; the final noMoreInput flush serializes +// everything once. Verifies numFlushes==1, numBlockedTimes==0, and that every +// deserialized row matches its source. 
+TEST_P(OptimizedPartitionedOutputParamTest, singleFlush) { + constexpr int kNumPartitions = 4; + // One row per partition key, so every partition gets data. + std::vector p0Values; + for (int i = 0; i < kNumPartitions; ++i) { + p0Values.push_back(i); + } + + std::mt19937_64 rng(42); + const std::vector inputBatches = { + makeInputBatch(p0Values, rng)}; + + auto result = runPartitionedOutput( + "local://test-single-flush-" + param().name, + inputBatches, + pkColNames(), + kNumPartitions); + + verifyDataIntegrity(inputBatches, result.pages, kNumPartitions); + EXPECT_EQ(result.numAppends, 1); + EXPECT_EQ(result.numFlushes, 1); + EXPECT_EQ(result.numBlockedTimes, 0); +} + +// ─── multipleFlushes ───────────────────────────────────────────────────────── + +// Sends multiple batches through a 1-byte serializer ceiling so each addInput +// triggers its own flush. A 10-byte OutputBuffer ceiling forces blocking. +// Concurrent consumers drain each partition so the driver can unblock. +// Verifies numFlushes==kBatches, numBlockedTimes==kBatches, and data integrity. +TEST_P(OptimizedPartitionedOutputParamTest, multipleFlushes) { + constexpr int kNumPartitions = 4; + constexpr int kBatches = 10; + + // For wide schemas, reduce rows per batch so each batch stays small. + const int kRowsPerBatch = param().numValueCols >= 64 ? 2 : kNumPartitions; + + std::vector p0Values(kRowsPerBatch); + for (int i = 0; i < kRowsPerBatch; ++i) { + p0Values[i] = i % kNumPartitions; + } + std::mt19937_64 rng(42); + std::vector inputBatches; + inputBatches.reserve(kBatches); + for (int b = 0; b < kBatches; ++b) { + inputBatches.push_back(makeInputBatch(p0Values, rng)); + } + + auto result = runPartitionedOutput( + "local://test-multiple-flushes-" + param().name, + inputBatches, + pkColNames(), + kNumPartitions, + // 1-byte serializer ceiling flushes before every addInput. + // 10-byte OutputBuffer ceiling forces blocking on every enqueue.
+ {{core::QueryConfig::kMaxPartitionedOutputBufferSize, "1"}, + {core::QueryConfig::kMaxOutputBufferSize, "10"}}, + std::chrono::seconds{30}); + + verifyDataIntegrity(inputBatches, result.pages, kNumPartitions); + EXPECT_EQ(result.numAppends, kBatches); + EXPECT_EQ(result.numFlushes, kBatches); + EXPECT_EQ(result.numBlockedTimes, kBatches); +} + +// ─── uniformDistribution ───────────────────────────────────────────────────── + +// Sends many batches with p1 cycling through all partition keys so every +// partition receives rows. Uses the default buffer size (no intermediate +// flush). Verifies that all partitions are non-empty and that data integrity +// holds across all rows. +TEST_P(OptimizedPartitionedOutputParamTest, uniformDistribution) { + constexpr int kNumPartitions = 4; + constexpr int kBatches = 10; + + std::mt19937_64 rng(123); + // Use enough distinct p1 values across a wide range so all partitions receive + // rows regardless of how the hash distributes them. With 50 distinct p1 + // values and 4 partitions the probability of any partition being empty is < + // 1e-6. + constexpr int kRowsPerBatch = 50; + std::uniform_int_distribution dist(0, 999); + + std::vector inputBatches; + inputBatches.reserve(kBatches); + for (int b = 0; b < kBatches; ++b) { + std::vector p0Values(kRowsPerBatch); + for (auto& v : p0Values) { + v = dist(rng); + } + inputBatches.push_back(makeInputBatch(p0Values, rng)); + } + + auto result = runPartitionedOutput( + "local://test-uniform-" + param().name, + inputBatches, + pkColNames(), + kNumPartitions); + + verifyDataIntegrity(inputBatches, result.pages, kNumPartitions); + + // With 50 distinct p1 values per batch and 4 partitions, every partition must + // receive rows (probability of any bucket being empty is < 1e-6). 
+ EXPECT_EQ(result.numNonEmptyPartitions, kNumPartitions); +} + +// ─── skewed distributions +// ────────────────────────────────────────────────────── + +// Sends batches with 6 distinct key values whose frequencies decrease by +// roughly 2x per step, so non-empty partitions end up with very different row +// counts. Because 6 < 8 some partitions stay empty; because 6 > 8/2 most +// partitions receive rows. This sits between uniformDistribution (all full) +// and twoDestinations (at most 2 of 64 filled). +TEST_P(OptimizedPartitionedOutputParamTest, moderateSkew) { + constexpr int kNumPartitions = 8; + constexpr int kBatches = 5; + + // Key i appears 2^(5-i) times per batch: key 0 → 32 rows, key 1 → 16, + // key 2 → 8, key 3 → 4, key 4 → 2, key 5 → 1. Total: 63 rows per batch. + std::vector keyPattern; + for (int key = 0; key < 6; ++key) { + const int count = 1 << (5 - key); // 32, 16, 8, 4, 2, 1 + for (int j = 0; j < count; ++j) { + keyPattern.push_back(key); + } + } + + std::mt19937_64 rng(55); + std::vector inputBatches; + inputBatches.reserve(kBatches); + for (int b = 0; b < kBatches; ++b) { + auto p0Values = keyPattern; + std::shuffle(p0Values.begin(), p0Values.end(), rng); + inputBatches.push_back(makeInputBatch(p0Values, rng)); + } + + auto result = runPartitionedOutput( + "local://test-moderate-skew-" + param().name, + inputBatches, + pkColNames(), + kNumPartitions); + + verifyDataIntegrity(inputBatches, result.pages, kNumPartitions); + + // 6 distinct keys → at most 6 non-empty partitions; 6 < 8 → at least one + // empty partition. + EXPECT_LE(result.numNonEmptyPartitions, 6); + + // Verify a wide spread in per-partition row counts: the heaviest non-empty + // partition must have at least 2x the average non-empty partition size. + // This remains stable even when several low-frequency keys hash to the same + // bucket, unlike a comparison against the minimum non-empty partition.
+ int64_t maxRows = 0; + int64_t totalNonZeroRows = 0; + int64_t numNonZeroPartitions = 0; + for (int p = 0; p < kNumPartitions; ++p) { + if (result.rowCounts[p] > 0) { + maxRows = std::max(maxRows, result.rowCounts[p]); + totalNonZeroRows += result.rowCounts[p]; + ++numNonZeroPartitions; + } + } + ASSERT_GT(numNonZeroPartitions, 0); + EXPECT_GE(maxRows * numNonZeroPartitions, totalNonZeroRows * 2); +} + +// Sends many batches with p1 restricted to {0, 1} into a 64-partition +// operator. At most 2 of the 64 partitions will receive any rows; the rest +// must be empty. Verifies data integrity and the empty-partition invariant. +TEST_P(OptimizedPartitionedOutputParamTest, twoDestinations) { + constexpr int kNumPartitions = 64; + constexpr int kBatches = 10; + constexpr int kRowsPerBatch = 4; + + std::mt19937_64 rng(7); + std::vector inputBatches; + inputBatches.reserve(kBatches); + for (int b = 0; b < kBatches; ++b) { + // p1 only takes values 0 and 1; at most 2 of 64 partitions receive rows. + std::vector p0Values(kRowsPerBatch); + for (int i = 0; i < kRowsPerBatch; ++i) { + p0Values[i] = i % 2; + } + inputBatches.push_back(makeInputBatch(p0Values, rng)); + } + + auto result = runPartitionedOutput( + "local://test-skewed-" + param().name, + inputBatches, + pkColNames(), + kNumPartitions); + + verifyDataIntegrity(inputBatches, result.pages, kNumPartitions); + + // p1 ∈ {0, 1}: at most 2 distinct hash buckets receive rows. + EXPECT_LE(result.numNonEmptyPartitions, 2); + EXPECT_GE(result.numNonEmptyPartitions, 1); +} + +// Sends multiple batches where every row carries the same partition key value +// so all rows hash to a single destination. Verifies that exactly one partition +// receives all rows and the remaining partitions stay empty. 
+TEST_P(OptimizedPartitionedOutputParamTest, singleDestination) { + constexpr int kNumPartitions = 8; + constexpr int kBatches = 5; + constexpr int kRowsPerBatch = 10; + + // Every row has p1=0 (p2=1, p3=2, p4=3 for multi-pk params), so the hash is + // identical for every row and all rows land in one partition. + std::mt19937_64 rng(99); + std::vector inputBatches; + inputBatches.reserve(kBatches); + for (int b = 0; b < kBatches; ++b) { + inputBatches.push_back( + makeInputBatch(std::vector(kRowsPerBatch, 0), rng)); + } + + auto result = runPartitionedOutput( + "local://test-single-dest-" + param().name, + inputBatches, + pkColNames(), + kNumPartitions); + + verifyDataIntegrity(inputBatches, result.pages, kNumPartitions); + + // All rows must land in exactly one partition. + EXPECT_EQ(result.numNonEmptyPartitions, 1); + + // That one partition must hold every row from every batch. + const int64_t totalInputRows = static_cast(kBatches) * kRowsPerBatch; + for (int p = 0; p < kNumPartitions; ++p) { + if (result.rowCounts[p] > 0) { + EXPECT_EQ(result.rowCounts[p], totalInputRows) << "partition " << p; + } + } +} + +// ─── instantiation ─────────────────────────────────────────────────────────── + +INSTANTIATE_TEST_SUITE_P( + Params, + OptimizedPartitionedOutputParamTest, + ::testing::ValuesIn(testParams()), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +// ─── non-parameterized tests ───────────────────────────────────────────────── + +// In single-partition case, if the second addInput() is estimated to stay +// below the partitioned-output limit, it doesn't flush before appending. 
+TEST_F(OptimizedPartitionedOutputTest, noPreFlushWhenEstimateBelowLimit) { + auto rowType = ROW({"v"}, {BIGINT()}); + std::vector inputBatches = { + makeRowVector({"v"}, {makeFlatVector({10})}), + makeRowVector({"v"}, {makeFlatVector({20})})}; + + const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8); + auto result = runPartitionedOutput( + "local://test-buffer-below-limit", + inputBatches, + {}, + 1, + {{core::QueryConfig::kMaxPartitionedOutputBufferSize, + std::to_string(twoRowPageBytes + 1)}}); + + EXPECT_EQ(result.numAppends, 2); + EXPECT_EQ(result.numFlushes, 1); + + auto expected = makeRowVector({"v"}, {makeFlatVector({10, 20})}); + auto actual = concatPages(result.pages[0], rowType); + velox::test::assertEqualVectors(expected, actual); +} + +// In single-partition case, if the second addInput() is estimated to land +// exactly on the partitioned-output limit, it doesn't flush before appending. +TEST_F(OptimizedPartitionedOutputTest, noPreFlushWhenEstimateAtLimit) { + auto rowType = ROW({"v"}, {BIGINT()}); + std::vector inputBatches = { + makeRowVector({"v"}, {makeFlatVector({10})}), + makeRowVector({"v"}, {makeFlatVector({20})})}; + + const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8); + auto result = runPartitionedOutput( + "local://test-buffer-equals-limit", + inputBatches, + {}, + 1, + {{core::QueryConfig::kMaxPartitionedOutputBufferSize, + std::to_string(twoRowPageBytes)}}); + + EXPECT_EQ(result.numAppends, 2); + EXPECT_EQ(result.numFlushes, 1); + + auto expected = makeRowVector({"v"}, {makeFlatVector({10, 20})}); + auto actual = concatPages(result.pages[0], rowType); + velox::test::assertEqualVectors(expected, actual); +} + +// In the single-partition case, if the second addInput() is estimated to +// exceed the partitioned-output limit, addInput() flushes before appending. 
+TEST_F(OptimizedPartitionedOutputTest, preFlushWhenEstimateExceedsLimit) { + auto rowType = ROW({"v"}, {BIGINT()}); + std::vector inputBatches = { + makeRowVector({"v"}, {makeFlatVector({10})}), + makeRowVector({"v"}, {makeFlatVector({20})})}; + + const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8); + auto result = runPartitionedOutput( + "local://test-buffer-exceeds-limit", + inputBatches, + {}, + 1, + {{core::QueryConfig::kMaxPartitionedOutputBufferSize, + std::to_string(twoRowPageBytes - 1)}}); + + EXPECT_EQ(result.numAppends, 2); + EXPECT_EQ(result.numFlushes, 2); + + auto expected = makeRowVector({"v"}, {makeFlatVector({10, 20})}); + auto actual = concatPages(result.pages[0], rowType); + velox::test::assertEqualVectors(expected, actual); +} + +// In multi-partition case, estimateBytesAfterAppend() may conservatively +// assume an input could go to the last empty partition even when every row +// actually goes to an existing partition, causing a pre-flush. +TEST_F( + OptimizedPartitionedOutputTest, + preFlushWhenConservativeEstimateExceedsLimit) { + auto rowType = ROW({"p1"}, {INTEGER()}); + std::vector inputBatches = { + makeRowVector({"p1"}, {makeFlatVector({5})}), + makeRowVector({"p1"}, {makeFlatVector({5})})}; + + const auto twoRowPageBytes = simpleColumnPageBytes("INT_ARRAY", 2, 0, 4); + auto result = runPartitionedOutput( + "local://test-buffer-conservative-exceeds-limit", + inputBatches, + {"p1"}, + 2, + {{core::QueryConfig::kMaxPartitionedOutputBufferSize, + std::to_string( + twoRowPageBytes)}}); // exact append fits; estimate does not + + EXPECT_EQ(result.numAppends, 2); + EXPECT_EQ(result.numFlushes, 2); + EXPECT_EQ(result.numNonEmptyPartitions, 1); + + EXPECT_THAT(result.pageCounts, testing::UnorderedElementsAre(2, 0)); + EXPECT_THAT(result.rowCounts, testing::UnorderedElementsAre(2, 0)); + + const auto nonEmptyPartition = result.rowCounts[0] > 0 ? 
0 : 1; + + auto expected = makeRowVector({"p1"}, {makeFlatVector({5, 5})}); + auto actual = concatPages(result.pages[nonEmptyPartition], rowType); + velox::test::assertEqualVectors(expected, actual); +} + +// Verifies that replicateNullsAndAny raises an error since it is not yet +// supported by OptimizedPartitionedOutput. +TEST_F(OptimizedPartitionedOutputTest, replicateNullsAndAnyUnsupported) { + auto input = makeRowVector( + {"p1", "v1"}, + {makeNullableFlatVector({0, std::nullopt, 1}), + makeFlatVector({"a", "b", "c"})}); + + auto plan = + PlanBuilder() + .values({input}) + .partitionedOutput({"p1"}, 2, /*replicateNullsAndAny=*/true, {"v1"}) + .planNode(); + + auto taskId = "local://test-replicate-nulls-unsupported-0"; + auto task = Task::create( + taskId, + core::PlanFragment{plan}, + 0, + createQueryContext({}), + Task::ExecutionMode::kParallel); + task->start(1); + + const auto taskWaitUs = std::chrono::duration_cast( + std::chrono::seconds{10}) + .count(); + ASSERT_TRUE(waitForTaskFailure(task.get(), taskWaitUs)); + ASSERT_THAT( + task->errorMessage(), + testing::HasSubstr( + "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput")); +} + +TEST_F(OptimizedPartitionedOutputTest, outputLayout) { + auto input = makeRowVector( + {"p1", "v1", "v2", "unused"}, + {makeFlatVector({0, 1, 2, 3, 4, 5, 6, 7}), + makeFlatVector({10, 11, 12, 13, 14, 15, 16, 17}), + makeFlatVector({20, 21, 22, 23, 24, 25, 26, 27}), + makeFlatVector({30, 31, 32, 33, 34, 35, 36, 37})}); + auto inputCopy = + std::static_pointer_cast(BaseVector::copy(*input, pool())); + + const std::vector outputLayout = {"v2", "v1"}; + const auto inputType = asRowType(input->type()); + const auto outputType = outputTypeForLayout(inputType, outputLayout); + auto expected = buildOutput(inputCopy, outputLayout); + + auto result = runPartitionedOutputWithLayout( + "local://test-optimized-output-layout", {input}, {}, 1, outputLayout); + + auto actual = concatPages(result.pages[0], outputType); + 
velox::test::assertEqualVectors(expected, actual); +} + +TEST_F(OptimizedPartitionedOutputTest, duplicateOutputColumns) { + constexpr int kNumPartitions = 4; + auto input = makeRowVector( + {"p1", "v1"}, + {makeFlatVector({0, 1, 2, 3, 0, 1, 2, 3}), + makeFlatVector({10, 11, 12, 13, 14, 15, 16, 17})}); + auto inputCopy = + std::static_pointer_cast(BaseVector::copy(*input, pool())); + const std::vector outputLayout = {"v1", "v1"}; + const auto inputType = asRowType(input->type()); + const auto outputType = outputTypeForLayout(inputType, outputLayout); + auto output = buildOutput(inputCopy, outputLayout); + + auto result = runPartitionedOutputWithLayout( + "local://test-optimized-output-layout-duplicated-columns", + {input}, + {"p1"}, + kNumPartitions, + outputLayout); + + std::vector assignments(inputCopy->size()); + auto partitionFn = std::make_unique( + false, kNumPartitions, inputType, std::vector{0}); + partitionFn->partition(*inputCopy, assignments); + + std::vector>> expectedRows(kNumPartitions); + for (vector_size_t i = 0; i < assignments.size(); ++i) { + expectedRows[assignments[i]].emplace_back(0, i); + } + + for (int p = 0; p < kNumPartitions; ++p) { + auto expected = gatherRows({output}, expectedRows[p], outputType); + auto actual = concatPages(result.pages[p], outputType); + ASSERT_EQ(expected->size(), actual->size()) << "partition " << p; + velox::test::assertEqualVectors( + canonicalize(expected), canonicalize(actual)); + } +} + +} // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/OptimizedVectorHasherTest.cpp b/velox/exec/tests/OptimizedVectorHasherTest.cpp new file mode 100644 index 00000000000..e0a107b6fd4 --- /dev/null +++ b/velox/exec/tests/OptimizedVectorHasherTest.cpp @@ -0,0 +1,308 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/exec/OptimizedVectorHasher.h" +#include "velox/exec/VectorHasher.h" +#include "velox/type/tests/utils/CustomTypesForTesting.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook; +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::test; + +namespace { + +/* Fixture whose helpers run VectorHasher and OptimizedVectorHasher on the same input and expect identical hash buffers. */ class OptimizedVectorHasherTest : public testing::Test, public VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); + } + + /* Allocates an indices buffer of `size` entries and sets entry i to indexAt(i). */ BufferPtr makeIndices( + vector_size_t size, + std::function indexAt) { + auto indices = AlignedBuffer::allocate(size, pool()); + auto rawIndices = indices->asMutable(); + for (vector_size_t i = 0; i < size; ++i) { + rawIndices[i] = indexAt(i); + } + return indices; + } + + /* Returns a selectivity vector of `size` rows in which every even row has been deselected. */ static SelectivityVector makeOddRows(vector_size_t size) { + SelectivityVector oddRows(size); + for (vector_size_t i = 0; i < size; i += 2) { + oddRows.setValid(i, false); + } + oddRows.updateBounds(); + return oddRows; + } + + /* Decodes and hashes `vector` over `rows` with both hashers. When `mix` is true both result buffers start as seed, seed + 1, ...; otherwise they start as zeros. */ void compareHashes( + const TypePtr& type, + const VectorPtr& vector, + const SelectivityVector& rows, + bool mix, + uint64_t seed = 0) { + auto expectedHasher = VectorHasher::create(type, 0); + auto actualHasher = OptimizedVectorHasher::create(type, 0); + + raw_vector expected(vector->size(), pool()); + raw_vector actual(vector->size(), pool()); + if (mix) { + std::iota(expected.begin(), expected.end(), seed); +
std::iota(actual.begin(), actual.end(), seed); + } else { + std::fill(expected.begin(), expected.end(), 0); + std::fill(actual.begin(), actual.end(), 0); + } + + expectedHasher->decode(*vector, rows); + actualHasher->decode(*vector, rows); + + expectedHasher->hash(rows, mix, expected); + actualHasher->hash(rows, mix, actual); + + /* Every position is compared, including rows that are not selected in `rows`. */ for (vector_size_t i = 0; i < vector->size(); ++i) { + EXPECT_EQ(expected[i], actual[i]) << "at " << i; + } + } + + /* Precomputes the hash of `value` on both hashers and expects hashPrecomputed() to produce the same `size` results; the buffers are pre-filled exactly as in compareHashes. */ void comparePrecomputed( + const TypePtr& type, + const VectorPtr& value, + vector_size_t size, + bool mix, + uint64_t seed = 0) { + auto expectedHasher = VectorHasher::create(type, 0); + auto actualHasher = OptimizedVectorHasher::create(type, 0); + + raw_vector expected(size, pool()); + raw_vector actual(size, pool()); + if (mix) { + std::iota(expected.begin(), expected.end(), seed); + std::iota(actual.begin(), actual.end(), seed); + } else { + std::fill(expected.begin(), expected.end(), 0); + std::fill(actual.begin(), actual.end(), 0); + } + + const SelectivityVector rows(size); + expectedHasher->precompute(*value); + actualHasher->precompute(*value); + + /* The reference hasher takes the row set; the optimized hasher's hashPrecomputed takes only `mix` and the result buffer. */ expectedHasher->hashPrecomputed(rows, mix, expected); + actualHasher->hashPrecomputed(mix, actual); + + for (vector_size_t i = 0; i < size; ++i) { + EXPECT_EQ(expected[i], actual[i]) << "at " << i; + } + } +}; + +TEST_F(OptimizedVectorHasherTest, flat) { + auto vector = BaseVector::create(BIGINT(), 100, pool()); + auto flatVector = vector->asFlatVector(); + /* Rows divisible by 5 are null; every other row holds its own row number. */ for (vector_size_t i = 0; i < 100; ++i) { + if (i % 5 == 0) { + flatVector->setNull(i, true); + } else { + flatVector->set(i, i); + } + } + + const SelectivityVector allRows(100); + const auto oddRows = makeOddRows(100); + + compareHashes(BIGINT(), vector, oddRows, false); + compareHashes(BIGINT(), vector, allRows, false); + compareHashes(BIGINT(), vector, allRows, true, 10); + + /* The precompute checks below change row 0 between calls: null, then 7, then 55. */ flatVector->setNull(0, true); + comparePrecomputed(BIGINT(), vector, 100, false); + + flatVector->setNull(0, false); + flatVector->set(0, 7);
+ comparePrecomputed(BIGINT(), vector, 100, false); + + flatVector->set(0, 55); + comparePrecomputed(BIGINT(), vector, 100, true, 20); +} + +TEST_F(OptimizedVectorHasherTest, boolFlat) { + constexpr vector_size_t kSize = 137; + /* The first vector is built from two generators; the second is presumably the null predicate - confirm against makeFlatVector. */ auto vector = makeFlatVector( + kSize, + [](vector_size_t row) { return row % 7 == 0 || row % 11 == 3; }, + [](vector_size_t row) { return row % 13 == 5; }); + const SelectivityVector allRows(vector->size()); + const auto oddRows = makeOddRows(vector->size()); + + compareHashes(BOOLEAN(), vector, oddRows, false); + compareHashes(BOOLEAN(), vector, allRows, false); + compareHashes(BOOLEAN(), vector, allRows, true, 17); + + /* Second vector of the same size built from a single value generator. */ vector = makeFlatVector( + kSize, [](vector_size_t row) { return row % 5 < 2; }); + compareHashes(BOOLEAN(), vector, allRows, false); + compareHashes(BOOLEAN(), vector, allRows, true, 23); +} + +/* Covers quiet NaN, signaling NaN, and both positive and negative zero. */ TEST_F(OptimizedVectorHasherTest, nans) { + static const auto kNaN = std::numeric_limits::quiet_NaN(); + static const auto kSNaN = std::numeric_limits::signaling_NaN(); + auto vector = makeFlatVector({1.0, -1.0, kNaN, kSNaN, 0.0, -0.0}); + const SelectivityVector allRows(vector->size()); + + compareHashes(DOUBLE(), vector, allRows, false); + compareHashes(DOUBLE(), vector, allRows, true, 15); +} + +/* Constant INTEGER vector of value 123 and size 6. */ TEST_F(OptimizedVectorHasherTest, nonNullConstant) { + auto vector = BaseVector::createConstant(INTEGER(), 123, 6, pool()); + const SelectivityVector allRows(vector->size()); + const auto oddRows = makeOddRows(vector->size()); + + compareHashes(INTEGER(), vector, oddRows, false); + compareHashes(INTEGER(), vector, allRows, false); + compareHashes(INTEGER(), vector, allRows, true, 7); +} + +/* Null constant INTEGER vector of size 6. */ TEST_F(OptimizedVectorHasherTest, nullConstant) { + auto vector = BaseVector::createNullConstant(INTEGER(), 6, pool()); + const SelectivityVector allRows(vector->size()); + const auto oddRows = makeOddRows(vector->size()); + + compareHashes(INTEGER(), vector, oddRows, false); + compareHashes(INTEGER(), vector, allRows, false); + compareHashes(INTEGER(),
vector, allRows, true, 11); +} + +/* All-null vector of 100 rows hashed with the UNKNOWN type. */ TEST_F(OptimizedVectorHasherTest, unknown) { + auto vector = makeAllNullFlatVector(100); + const SelectivityVector allRows(vector->size()); + const auto oddRows = makeOddRows(vector->size()); + + compareHashes(UNKNOWN(), vector, oddRows, false); + compareHashes(UNKNOWN(), vector, allRows, false); + compareHashes(UNKNOWN(), vector, allRows, true, 0); +} + +TEST_F(OptimizedVectorHasherTest, dictionary) { + /* Five-element base with a null at index 2. */ auto base = makeNullableFlatVector({10, 20, std::nullopt, 40, 50}); + constexpr vector_size_t kSize = 100; + /* The dictionary adds its own nulls at rows 1 and 7 and cycles through the base indices with row % base->size(). */ auto dictionary = BaseVector::wrapInDictionary( + makeNulls(kSize, [&](vector_size_t row) { return row == 1 || row == 7; }), + makeIndices(kSize, [&](vector_size_t row) { return row % base->size(); }), + kSize, + base); + const SelectivityVector allRows(dictionary->size()); + const auto oddRows = makeOddRows(dictionary->size()); + + compareHashes(BIGINT(), dictionary, oddRows, false); + compareHashes(BIGINT(), dictionary, allRows, false); + compareHashes(BIGINT(), dictionary, allRows, true, 10); +} + +/* Flat vector typed as BIGINT_TYPE_WITH_CUSTOM_COMPARISON with one null entry. */ TEST_F(OptimizedVectorHasherTest, customComparison) { + auto vector = makeNullableFlatVector( + {0, 1, 256, 257, std::nullopt, 512, 513}, + BIGINT_TYPE_WITH_CUSTOM_COMPARISON()); + const SelectivityVector allRows(vector->size()); + + compareHashes(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), vector, allRows, false); + compareHashes(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), vector, allRows, true, 9); +} + +/* Array of the custom-comparison type; only the non-mixing hash path is exercised here. */ TEST_F(OptimizedVectorHasherTest, customComparisonArray) { + auto vector = makeNullableArrayVector( + {{0, 1, 2}, + {256, 257, 258}, + {512, 513, 514}, + {3, 4, 5}, + {259, 260, 261}, + {515, 516, 517}, + {std::nullopt}}, + ARRAY(BIGINT_TYPE_WITH_CUSTOM_COMPARISON())); + const SelectivityVector allRows(vector->size()); + + compareHashes( + ARRAY(BIGINT_TYPE_WITH_CUSTOM_COMPARISON()), vector, allRows, false); +} + +TEST_F(OptimizedVectorHasherTest, customComparisonMap) { + auto vector = makeNullableMapVector( + {std::vector>>{ + {0, 10},
{1, 11}, {2, 12}}, + std::vector>>{ + {256, 266}, {257, 267}, {258, 268}}, + std::vector>>{ + {512, 522}, {513, 523}, {514, 524}}, + std::vector>>{ + {3, 103}, {4, 104}, {5, 105}}, + std::vector>>{ + {259, 359}, {260, 360}, {261, 361}}, + std::vector>>{ + {515, 615}, {516, 616}, {517, 617}}, + std::vector>>{ + {0, std::nullopt}}}, + MAP(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), + BIGINT_TYPE_WITH_CUSTOM_COMPARISON())); + const SelectivityVector allRows(vector->size()); + + /* Only the non-mixing hash path is exercised for the map type. */ compareHashes( + MAP(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), + BIGINT_TYPE_WITH_CUSTOM_COMPARISON()), + vector, + allRows, + false); +} + +/* Row vector with a single custom-comparison child whose first entry is null. */ TEST_F(OptimizedVectorHasherTest, customComparisonRow) { + auto vector = makeRowVector( + {"a"}, + {makeNullableFlatVector( + {std::nullopt, 0, 1, 256, 257, 512, 513}, + BIGINT_TYPE_WITH_CUSTOM_COMPARISON())}); + const SelectivityVector allRows(vector->size()); + + compareHashes(vector->type(), vector, allRows, false); +} + +/* Precomputes a single-element vector (null, then 7, then 55) and checks the 100 hashes produced from it. */ TEST_F(OptimizedVectorHasherTest, precompute) { + auto value = makeNullableFlatVector({std::nullopt}); + comparePrecomputed(BIGINT(), value, 100, false); + + value = makeNullableFlatVector({7}); + comparePrecomputed(BIGINT(), value, 100, false); + + value = makeNullableFlatVector({55}); + comparePrecomputed(BIGINT(), value, 100, true, 100); +} + +/* A hasher created for BIGINT must throw when asked to decode a vector built from string values. */ TEST_F(OptimizedVectorHasherTest, typeMismatch) { + auto hasher = OptimizedVectorHasher::create(BIGINT(), 0); + auto vector = makeFlatVector({"a", "b", "c"}); + SelectivityVector rows(vector->size()); + + VELOX_ASSERT_THROW( + hasher->decode(*vector, rows), "Type mismatch: BIGINT vs.
VARCHAR"); +} + +} // namespace diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 9117cba55ee..468ae20bf79 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -1689,7 +1689,8 @@ class RoundRobinRowPartitionFunctionSpec : public core::PartitionFunctionSpec { public: std::unique_ptr create( int numPartitions, - bool /*localExchange*/) const override { + bool /*localExchange*/, + bool /*useOptimizedPartitionFunction*/ = false) const override { return std::make_unique(numPartitions); } diff --git a/velox/flag_definitions/flags.cpp b/velox/flag_definitions/flags.cpp index 8648e80a68e..4adc6a5a22c 100644 --- a/velox/flag_definitions/flags.cpp +++ b/velox/flag_definitions/flags.cpp @@ -55,6 +55,8 @@ DEFINE_int32( DEFINE_bool(avx2, true, "Enables use of AVX2 when available"); +DEFINE_bool(avx512f, true, "Enables use of AVX512F when available"); + DEFINE_bool(bmi2, true, "Enables use of BMI2 when available"); // Used in exec/Expr.cpp diff --git a/velox/serializers/CMakeLists.txt b/velox/serializers/CMakeLists.txt index c5227f763ff..366b043aeb1 100644 --- a/velox/serializers/CMakeLists.txt +++ b/velox/serializers/CMakeLists.txt @@ -29,6 +29,7 @@ velox_add_library( UnsafeRowSerializer.cpp PrestoBatchVectorSerializer.cpp PrestoHeader.cpp + PrestoIterativePartitioningSerializer.cpp PrestoIterativeVectorSerializer.cpp PrestoSerializerDeserializationUtils.cpp PrestoSerializerEstimationUtils.cpp diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp new file mode 100644 index 00000000000..533b8d6bb75 --- /dev/null +++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp @@ -0,0 +1,1166 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/serializers/PrestoIterativePartitioningSerializer.h" + +#include +#include + +#include "velox/common/base/BitUtil.h" +#include "velox/type/Type.h" +#include "velox/vector/ComplexVector.h" +#include "velox/vector/ConstantVector.h" +#include "velox/vector/FlatVector.h" + +namespace facebook::velox::serializer::presto { + +namespace { + +/* Bit that getCodecMarker() sets in the codec byte when checksumming is enabled. */ constexpr int8_t kCheckSumBitMask = 4; +constexpr int64_t kVectorSizeTypeSize{sizeof(vector_size_t)}; +// [numRows:4][codec:1] +constexpr int64_t kUncompressedSizeOffset{kVectorSizeTypeSize + 1}; +// [numRows:4][codec:1][uncompressedSize:4][compressedSize:4][checksum:8] +constexpr int64_t kHeaderSize{kUncompressedSizeOffset + 4 + 4 + 8}; + +// chunk size for flushing constant values +constexpr int32_t kChunkBytes = 4096; + +/* Encoding names returned by typeToEncodingName(). */ static inline const std::string_view kByteArray{"BYTE_ARRAY"}; +static inline const std::string_view kShortArray{"SHORT_ARRAY"}; +static inline const std::string_view kIntArray{"INT_ARRAY"}; +static inline const std::string_view kLongArray{"LONG_ARRAY"}; +static inline const std::string_view kInt128Array{"INT128_ARRAY"}; +static inline const std::string_view kVariableWidth{"VARIABLE_WIDTH"}; +static inline const std::string_view kRow{"ROW"}; + +/* Writes the sizeof(value) raw bytes of `value` to `out`; no byte-order conversion is applied. */ inline void writeInt32(OutputStream* out, int32_t value) { + out->write(reinterpret_cast(&value), sizeof(value)); +} + +/* 64-bit counterpart of writeInt32. */ inline void writeInt64(OutputStream* out, int64_t value) { + out->write(reinterpret_cast(&value), sizeof(value)); +} + +/* Returns the codec marker byte: kCheckSumBitMask when checksumEnabled is true, otherwise 0. */ char getCodecMarker(bool checksumEnabled) { + char marker = 0; + if (checksumEnabled) { + marker
|= kCheckSumBitMask; + } + return marker; +} + +/* Maps a type kind to one of the encoding-name constants and fails for any kind without a case below. */ std::string_view typeToEncodingName(const TypePtr& type) { + switch (type->kind()) { + case TypeKind::BOOLEAN: + case TypeKind::TINYINT: + return kByteArray; + case TypeKind::SMALLINT: + return kShortArray; + case TypeKind::INTEGER: + case TypeKind::REAL: + return kIntArray; + case TypeKind::BIGINT: + case TypeKind::DOUBLE: + case TypeKind::TIMESTAMP: + return kLongArray; + case TypeKind::HUGEINT: + return kInt128Array; + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return kVariableWidth; + case TypeKind::ROW: + return kRow; + default: + VELOX_FAIL("Unsupported type kind: {}", static_cast(type->kind())); + } +} + +/// Finalizes the Presto page CRC by mixing in the codec marker, row count, +/// and uncompressed size on top of the listener's accumulated data checksum. +int64_t computeChecksum( + PrestoOutputStreamListener& listener, + int8_t codecMarker, + int32_t numRows, + int32_t uncompressedSize) { + /* Works on the object returned by listener.crc(); the three header fields are fed in this fixed order. */ auto crc = listener.crc(); + crc.process_bytes(&codecMarker, 1); + crc.process_bytes(&numRows, 4); + crc.process_bytes(&uncompressedSize, 4); + return static_cast(crc.checksum()); +} + +/// Returns the serialized byte width of a fixed-width type, matching the +/// sizeof(T) used in flushFlatValues. +int32_t fixedTypeWidth(TypeKind kind) { + switch (kind) { + case TypeKind::BOOLEAN: + case TypeKind::TINYINT: + return 1; + case TypeKind::SMALLINT: + return 2; + case TypeKind::INTEGER: + case TypeKind::REAL: + return 4; + case TypeKind::BIGINT: + case TypeKind::DOUBLE: + return 8; + /* NOTE(review): TIMESTAMP reports 16 bytes here while typeToEncodingName() maps it to LONG_ARRAY together with the 8-byte kinds. ColumnBufferState::create() currently rejects TIMESTAMP, so confirm the intended width before enabling it. */ case TypeKind::TIMESTAMP: + case TypeKind::HUGEINT: + return 16; + /* Every other kind reports a width of 0. */ default: + return 0; + } +} + +/// Returns the exact bytes for one fixed-width column in one partition. +int64_t +simpleColumnBytes(const TypePtr& colType, int64_t numRows, int64_t numNulls) { + const auto encodingName = typeToEncodingName(colType); + return 4 + static_cast(encodingName.size()) + // header + 4 + // rowCount + 1 + // nullFlag + (numNulls > 0 ?
bits::nbytes(numRows) : 0) + // null bitmap + (numRows - numNulls) * fixedTypeWidth(colType->kind()); // values +} + +/// Returns the number of nulls in the vector, or std::nullopt for encodings +/// not handled below. Dictionary vectors are counted with a row-by-row scan. +std::optional countNulls(const BaseVector& vector) { + if (!vector.mayHaveNulls()) { + return 0; + } + + /* Prefer the count reported by getNullCount() when it is set. */ if (const auto nullCount = vector.getNullCount()) { + return *nullCount; + } + + switch (vector.encoding()) { + case VectorEncoding::Simple::FLAT: + case VectorEncoding::Simple::ROW: + return BaseVector::countNulls(vector.nulls(), vector.size()); + /* A constant vector is either entirely null or has no nulls, decided by row 0. */ case VectorEncoding::Simple::CONSTANT: + return vector.isNullAt(0) ? vector.size() : 0; + case VectorEncoding::Simple::DICTIONARY: { + vector_size_t nullCount = 0; + for (auto i = 0; i < vector.size(); ++i) { + nullCount += vector.isNullAt(i); + } + return nullCount; + } + default: + return std::nullopt; + } +} + +/// Returns the maximum null-bitmap bytes for totalRows distributed across +/// numPartitionsWithNulls partitions. This occurs when one row is put in each +/// partition first, then one byte is added for every 8 remaining rows. +int64_t maxBitmapBytes(int64_t totalRows, int64_t numPartitionsWithNulls) { + if (numPartitionsWithNulls == 0) { + return 0; + } + /* The formula below assumes at least one row per partition with nulls. */ VELOX_DCHECK_LE(numPartitionsWithNulls, totalRows); + return numPartitionsWithNulls + (totalRows - numPartitionsWithNulls) / 8; +} + +/// Base class for column nodes in the serializer's per-partition accounting. +/// +/// A node tracks exact row, null, and byte counts for one column while +/// appending partitioned vectors.
+class ColumnBufferState { + public: + /* Creates zeroed row, null and byte counters for numPartitions partitions. */ ColumnBufferState(TypePtr type, uint32_t numPartitions) + : type_(std::move(type)), + numPartitions_(numPartitions), + rowsPerPartition_(numPartitions, 0), + nullsPerPartition_(numPartitions, 0), + bytesPerPartition_(numPartitions, 0) {} + + virtual ~ColumnBufferState() = default; + + /* Factory that picks the concrete state class from the type kind; defined after the subclasses. */ static std::unique_ptr create( + const TypePtr& type, + uint32_t numPartitions); + + /* Accounts for one more partitioned vector of this column. */ virtual void append(const PartitionedVectorPtr& partitionedVector) = 0; + + /* Resets every per-partition counter and both partition tallies to zero. */ virtual void clear() { + std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0); + std::fill(nullsPerPartition_.begin(), nullsPerPartition_.end(), 0); + std::fill(bytesPerPartition_.begin(), bytesPerPartition_.end(), 0); + numNonEmptyPartitions_ = 0; + numPartitionsWithNulls_ = 0; + } + + const std::vector& rowsPerPartition() const { + return rowsPerPartition_; + } + + const std::vector& bytesPerPartition() const { + return bytesPerPartition_; + } + + uint32_t numNonEmptyPartitions() const { + return numNonEmptyPartitions_; + } + + uint32_t numPartitionsWithNulls() const { + return numPartitionsWithNulls_; + } + + /* Sums bits::nbytes(rows) over the partitions that currently hold at least one null. */ int64_t nullBitmapBytesBuffered() const { + int64_t total = 0; + /* The index uses uint32_t to match numPartitions_ and avoid a signed/unsigned comparison. */ for (uint32_t p = 0; p < numPartitions_; ++p) { + if (nullsPerPartition_[p] > 0) { + total += bits::nbytes(rowsPerPartition_[p]); + } + } + return total; + } + + protected: + const TypePtr type_; + const uint32_t numPartitions_; + std::vector rowsPerPartition_; + std::vector nullsPerPartition_; + std::vector bytesPerPartition_; + + // count of partitions with at least one buffered row + uint32_t numNonEmptyPartitions_{0}; + + // count of partitions that require a null bitmap + uint32_t numPartitionsWithNulls_{0}; +}; + +/// Buffer state for one fixed-width column.
+class FixedWidthBufferState : public ColumnBufferState { + public: + FixedWidthBufferState(TypePtr type, uint32_t numPartitions) + : ColumnBufferState(std::move(type), numPartitions) {} + + /* Adds the per-partition row and null counts of partitionedVector and recomputes each touched partition's byte total with simpleColumnBytes(). */ void append(const PartitionedVectorPtr& partitionedVector) override { + for (auto p = 0; p < numPartitions_; ++p) { + const auto numRows = partitionedVector->numRowsAt(p); + if (numRows == 0) { + continue; + } + + const auto numNulls = partitionedVector->numNullsAt(p); + auto& rows = rowsPerPartition_[p]; + auto& nulls = nullsPerPartition_[p]; + + /* Both tallies are bumped on the first transition from zero, before the counters are updated. */ if (rows == 0) { + ++numNonEmptyPartitions_; + } + if (nulls == 0 && numNulls > 0) { + ++numPartitionsWithNulls_; + } + rows += numRows; + nulls += numNulls; + bytesPerPartition_[p] = simpleColumnBytes(type_, rows, nulls); + } + } +}; + +/// Buffer state for one VARCHAR or VARBINARY column. +class VariableWidthBufferState : public ColumnBufferState { + public: + VariableWidthBufferState(TypePtr type, uint32_t numPartitions) + : ColumnBufferState(std::move(type), numPartitions) {} + + /* Placeholder: always raises VELOX_NYI, so the state can be constructed but not appended to. */ void append(const PartitionedVectorPtr& partitionedVector) override { + VELOX_NYI( + "Variable-width columns are not yet supported by " + "PrestoIterativePartitioningSerializer::append"); + } +}; + +/* Selects the state class by type kind: fixed-width kinds and VARCHAR/VARBINARY get a state object, TIMESTAMP/ROW/ARRAY/MAP raise VELOX_NYI, anything else raises VELOX_UNSUPPORTED. */ std::unique_ptr ColumnBufferState::create( + const TypePtr& type, + uint32_t numPartitions) { + switch (type->kind()) { + case TypeKind::BOOLEAN: + case TypeKind::TINYINT: + case TypeKind::SMALLINT: + case TypeKind::INTEGER: + case TypeKind::BIGINT: + case TypeKind::REAL: + case TypeKind::DOUBLE: + case TypeKind::HUGEINT: + return std::make_unique(type, numPartitions); + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return std::make_unique(type, numPartitions); + case TypeKind::TIMESTAMP: + case TypeKind::ROW: + case TypeKind::ARRAY: + case TypeKind::MAP: + /* NOTE(review): both messages name createColumnBufferState although this function is ColumnBufferState::create. */ VELOX_NYI( + "Unsupported type kind for createColumnBufferState: {}", + type->kind()); + default: + VELOX_UNSUPPORTED( + "Unsupported type kind for createColumnBufferState: {}", + type->kind()); + }
+} + +} // namespace + +/// Top-level buffer state for one output page. +/// +/// For each partition, tracks page-level headers and aggregates child column +/// sizes. +class BufferState { + public: + BufferState( + uint32_t numPartitions, + std::vector> children) + : numPartitions_(numPartitions), + rowsPerPartition_(numPartitions, 0), + bytesPerPartition_(numPartitions, 0), + children_(std::move(children)) {} + + static std::unique_ptr create( + const RowTypePtr& type, + uint32_t numPartitions); + + /* Accounts for one partitioned row vector: forwards each mapped child to its column state, then refreshes the per-partition row and byte totals. */ void append( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputToInputChannels) { + auto rowVector = + std::dynamic_pointer_cast(partitionedVector); + VELOX_CHECK_NOT_NULL(rowVector); + + rowsBuffered_ += partitionedVector->baseVector()->size(); + + /* An empty outputToInputChannels means output column i reads input column i. */ for (column_index_t column = 0; column < children_.size(); ++column) { + const auto inputColumn = outputToInputChannels.empty() + ? column + : outputToInputChannels[column]; + children_[column]->append(rowVector->childAt(inputColumn)); + } + + for (auto p = 0; p < numPartitions_; ++p) { + const auto numRows = partitionedVector->numRowsAt(p); + if (numRows == 0) { + continue; + } + if (rowsPerPartition_[p] == 0) { + ++numNonEmptyPartitions_; + } + rowsPerPartition_[p] += numRows; + + /* Page header plus the 4-byte column count that flushStart() writes, followed by each child column's bytes for this partition. */ int64_t partitionBytes = kHeaderSize + 4; + for (const auto& child : children_) { + partitionBytes += child->bytesPerPartition()[p]; + } + /* Replace this partition's previous contribution to the running total. */ bytesBuffered_ += partitionBytes - bytesPerPartition_[p]; + bytesPerPartition_[p] = partitionBytes; + } + } + + /* Zeroes all counters and clears every child column state. */ void clear() { + std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0); + std::fill(bytesPerPartition_.begin(), bytesPerPartition_.end(), 0); + numNonEmptyPartitions_ = 0; + rowsBuffered_ = 0; + bytesBuffered_ = 0; + for (auto& child : children_) { + child->clear(); + } + } + + const std::vector& rowsPerPartition() const { + return rowsPerPartition_; + } + + const std::vector& bytesPerPartition() const { + return bytesPerPartition_; + } + + uint32_t
numNonEmptyPartitions() const { + return numNonEmptyPartitions_; + } + + vector_size_t rowsBuffered() const { + return rowsBuffered_; + } + + int64_t bytesBuffered() const { + return bytesBuffered_; + } + + const std::vector>& children() const { + return children_; + } + + private: + const uint32_t numPartitions_; + std::vector rowsPerPartition_; + std::vector bytesPerPartition_; + uint32_t numNonEmptyPartitions_{0}; + vector_size_t rowsBuffered_{0}; + int64_t bytesBuffered_{0}; + std::vector> children_; +}; + +/* Builds one ColumnBufferState per column of `type` and wraps them in a BufferState. */ std::unique_ptr BufferState::create( + const RowTypePtr& type, + uint32_t numPartitions) { + std::vector> children; + children.reserve(type->size()); + for (auto column = 0; column < type->size(); ++column) { + children.push_back( + ColumnBufferState::create(type->childAt(column), numPartitions)); + } + return std::make_unique(numPartitions, std::move(children)); +} + +/* NOTE(review): numColumns_ and bufferState_ read outputType_ and numPartitions_ inside the initializer list, so those members must be declared earlier in the header - confirm. The checks in the body run only after the buffer state has been built. */ PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer( + RowTypePtr outputType, + uint32_t numPartitions, + const SerdeOpts& opts, + memory::MemoryPool* pool, + std::vector outputToInputChannels, + std::function()> listenerFactory) + : outputType_(std::move(outputType)), + outputToInputChannels_(std::move(outputToInputChannels)), + numPartitions_(numPartitions), + opts_(opts), + pool_(pool), + listenerFactory_(std::move(listenerFactory)), + numColumns_(outputType_->size()), + bufferState_(BufferState::create(outputType_, numPartitions_)) { + VELOX_CHECK_GT(numPartitions_, 0); + VELOX_CHECK_NOT_NULL(pool_); + /* The channel mapping is optional; when present it needs one entry per output column. */ VELOX_CHECK( + outputToInputChannels_.empty() || + outputToInputChannels_.size() == outputType_->size(), + "outputToInputChannels size must match output column count"); +} + +PrestoIterativePartitioningSerializer:: + ~PrestoIterativePartitioningSerializer() = default; + +int64_t PrestoIterativePartitioningSerializer::bytesBuffered() const { + return bufferState_->bytesBuffered(); +} + +vector_size_t PrestoIterativePartitioningSerializer::rowsBuffered() const { +
return bufferState_->rowsBuffered(); +} + +/* Drops the buffered partitioned vectors and resets the accounting state. */ void PrestoIterativePartitioningSerializer::clear() { + partitionedRowVectors_.clear(); + bufferState_->clear(); +} + +/* Checks, for every output column, that the mapped input column index is in range, that the child is non-null, and that its type is equivalent to the output column type. */ void PrestoIterativePartitioningSerializer::validateOutputInputMapping( + const RowVectorPtr& input) const { + const auto numInputColumns = input->childrenSize(); + for (column_index_t outputColumn = 0; outputColumn < numColumns_; + ++outputColumn) { + const auto inputColumn = outputToInputChannel(outputColumn); + VELOX_CHECK_LT( + inputColumn, + numInputColumns, + "Output column {} maps to invalid input column {}", + outputColumn, + inputColumn); + + const auto& child = input->childAt(inputColumn); + VELOX_CHECK_NOT_NULL( + child, + "Output column {} maps to null input column {}", + outputColumn, + inputColumn); + + const auto type = outputType_->childAt(outputColumn); + VELOX_CHECK( + child->type()->equivalent(*type), + "Output column {} expects {}, got {} from input column {}", + outputColumn, + type->toString(), + child->type()->toString(), + inputColumn); + } +} + +/* Const estimate of bytesBuffered() after `input` is appended, built from worst-case assumptions about how rows spread over partitions. An empty input returns the current value. */ int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend( + const RowVectorPtr& input) const { + VELOX_CHECK_NOT_NULL(input); + validateOutputInputMapping(input); + + if (input->size() == 0) { + return bytesBuffered(); + } + + const auto numRows = input->size(); + + // Worst case: each input row lands in a distinct empty partition, capped by + // the number of empty partitions. + const auto numNewPartitions = std::min( + numRows, numPartitions_ - bufferState_->numNonEmptyPartitions()); + // One page header per newly non-empty partition. + auto estimatedBytes = + bufferState_->bytesBuffered() + numNewPartitions * (kHeaderSize + 4); + + // Cache per input column. If multiple output columns map to the same input + // column, reuse the already computed incremental bytes.
+ std::vector> estimatedIncrementalBytes( + input->childrenSize()); + for (column_index_t column = 0; column < numColumns_; ++column) { + const auto inputColumn = outputToInputChannel(column); + if (estimatedIncrementalBytes[inputColumn].has_value()) { + estimatedBytes += *estimatedIncrementalBytes[inputColumn]; + continue; + } + const auto& columnType = outputType_->childAt(column); + if (columnType->isUnknown()) { + VELOX_UNSUPPORTED( + "Unsupported type kind for " + "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}", + columnType->kind()); + } else if (columnType->isFixedWidth()) { + const auto* columnState = bufferState_->children()[column].get(); + const auto inputNulls = countNulls(*input->childAt(inputColumn)); + /* When the null count is unknown it is taken as numRows for the partition bound here and as 0 for the value bytes below, so both terms err on the high side. */ const auto partitionsWithNulls = std::min( + bufferState_->numNonEmptyPartitions() + numNewPartitions, + columnState->numPartitionsWithNulls() + inputNulls.value_or(numRows)); + const auto nullBitmapBytes = maxBitmapBytes( + bufferState_->rowsBuffered() + numRows, partitionsWithNulls); + auto nullBitmapBytesBuffered = columnState->nullBitmapBytesBuffered(); + VELOX_DCHECK_GE(nullBitmapBytes, nullBitmapBytesBuffered); + + /* simpleColumnBytes(columnType, 0, 0) is the per-partition cost of a column with no rows, charged once per newly non-empty partition. */ estimatedIncrementalBytes[inputColumn] = numNewPartitions * + simpleColumnBytes(columnType, 0, 0) + // header growth + nullBitmapBytes - + nullBitmapBytesBuffered + // null bitmap growth + static_cast(numRows - inputNulls.value_or(0)) * + fixedTypeWidth(columnType->kind()); // value bytes growth + estimatedBytes += *estimatedIncrementalBytes[inputColumn]; + } else { + /* Every remaining kind throws: the listed kinds as not-yet-implemented, all others as unsupported. */ switch (columnType->kind()) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + case TypeKind::ROW: + case TypeKind::ARRAY: + case TypeKind::MAP: + VELOX_NYI( + "Unsupported type kind for " + "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}", + columnType->kind()); + default: + VELOX_UNSUPPORTED( + "Unsupported type kind for " + "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}", + columnType->kind()); + }
+ } + } + return estimatedBytes; +} + +/* Partitions `input` according to `partitions`, updates the accounting state, and keeps the partitioned vector until the next flush. An empty input is validated and then ignored. */ void PrestoIterativePartitioningSerializer::append( + const RowVectorPtr& input, + const std::vector& partitions) { + VELOX_CHECK_NOT_NULL(input); + VELOX_CHECK_EQ( + input->size(), + partitions.size(), + "partitions.size() must equal input->size()"); + + validateOutputInputMapping(input); + + if (input->size() == 0) { + return; + } + + PartitionBuildContext ctx; + auto partitionedRowVector = PartitionedVector::create( + std::static_pointer_cast(input), + partitions, + numPartitions_, + ctx, + pool_); + + /* Account first, then store; the vector is moved into the buffer list last. */ bufferState_->append(partitionedRowVector, outputToInputChannels_); + partitionedRowVectors_.push_back(std::move(partitionedRowVector)); +} + +// --------------------------------------------------------------------------- +// Top-level flush +// --------------------------------------------------------------------------- + +/* Produces the pages through the uncompressed path when compressionKind is CompressionKind_NONE and through flushCompressed() otherwise, then clears the buffer. NOTE(review): clear() is not reached if the flush call throws. */ std::map, vector_size_t>> +PrestoIterativePartitioningSerializer::flush() { + auto pages = + (opts_.compressionKind == common::CompressionKind::CompressionKind_NONE) + ? flushUncompressed() + : flushCompressed(); + + clear(); + + return pages; +} + +/* Serializes every buffered vector into one page per non-empty partition; returns an empty map when nothing is buffered. */ std::map, vector_size_t>> +PrestoIterativePartitioningSerializer::flushUncompressed() { + if (partitionedRowVectors_.empty()) { + return {}; + } + + // 1. Determine non-empty partitions. + std::vector nonEmptyPartitions; + for (uint32_t p = 0; p < numPartitions_; ++p) { + if (bufferState_->rowsPerPartition()[p] > 0) { + nonEmptyPartitions.push_back(p); + } + } + const auto& rowSchema = outputType_->asRow(); + + // 2. Create per-partition listeners first so the codec mask can be derived + // from whether the factory actually produced a listener. The factory may + // return nullptr (e.g. when OutputBufferManager has no listener factory + // set), in which case checksumming is skipped and the checksum bit must not + // be set in the codec byte.
+ std::vector> listeners(numPartitions_); + for (uint32_t p : nonEmptyPartitions) { + if (listenerFactory_) { + listeners[p] = listenerFactory_(); + } + } + /* NOTE(review): only the first non-empty partition's listener is inspected, which assumes the factory behaves the same on every call. flushFinish() also writes a zero CRC when the listener is not a Presto listener, even though the checksum bit would be set - confirm. */ const bool checksumEnabled = !nonEmptyPartitions.empty() && + listeners[nonEmptyPartitions[0]] != nullptr; + const char codecMask = getCodecMarker(checksumEnabled); + + // 3. Create output streams sized to the exact bytes each partition will need, + // so that the entire payload fits. This avoids multiple resizing and copying. + /* All three vectors are indexed by partition number; entries for empty partitions keep their default value. */ std::vector> outputStreams(numPartitions_); + std::vector rawOutputStreams(numPartitions_); + std::vector beginStreamPositions(numPartitions_); + + for (uint32_t p : nonEmptyPartitions) { + outputStreams[p] = std::make_unique( + *pool_, listeners[p].get(), bufferState_->bytesPerPartition()[p]); + rawOutputStreams[p] = outputStreams[p].get(); + /* Remember where the page starts so flushFinish() can seek back to the header. */ beginStreamPositions[p] = outputStreams[p]->tellp(); + + flushStart(*outputStreams[p], p, codecMask); + } + + // 4. Flush column data. + flushRowChildren( + partitionedRowVectors_, rowSchema, nonEmptyPartitions, rawOutputStreams); + + // 5. Finalize the page by seeking back to fill in sizes and CRC, and get the + // IOBuf and numOfRows from each stream.
+ std::map, vector_size_t>> + result; + /* Each non-empty partition contributes one entry: its finished buffer paired with its buffered row count. */ for (uint32_t p : nonEmptyPartitions) { + flushFinish( + *outputStreams[p], + p, + beginStreamPositions[p], + codecMask, + listeners[p].get()); + result[p] = std::make_pair( + outputStreams[p]->getIOBuf(), bufferState_->rowsPerPartition()[p]); + } + + return result; +} + +/* Compressed output is not implemented; this always raises VELOX_NYI. */ std::map, vector_size_t>> +PrestoIterativePartitioningSerializer::flushCompressed() { + VELOX_NYI(); +} + +// --------------------------------------------------------------------------- +// Second level functions: start, columns and finish +// --------------------------------------------------------------------------- + +/* Writes the page header for `partition` while the listener is paused, then writes the column count with the listener active again. */ void PrestoIterativePartitioningSerializer::flushStart( + IOBufOutputStream& out, + uint32_t partition, + char codecMask) const { + auto* listener = dynamic_cast(out.listener()); + if (listener) { + listener->pause(); + } + + // Write 21-byte Presto page header; sizes and CRC are filled in later. + const int32_t numRows = + static_cast(bufferState_->rowsPerPartition()[partition]); + /* Only the row count (bytes 0-3) and codec byte (byte 4) are set now; the remaining header bytes stay zero until flushFinish(). */ char header[kHeaderSize] = {}; + std::memcpy(&header[0], &numRows, 4); + std::memcpy(&header[4], &codecMask, 1); + out.write(header, kHeaderSize); + + if (listener) { + listener->resume(); + } + + // Number of columns is included in the CRC.
+ const int32_t numCols = static_cast(numColumns_); + out.write(reinterpret_cast(&numCols), 4); +} + +void PrestoIterativePartitioningSerializer::flushRowChildren( + const std::vector& partitionedVectors, + const RowType& rowSchema, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + for (uint32_t col = 0; col < rowSchema.size(); ++col) { + std::vector column; + column.reserve(partitionedVectors.size()); + for (const auto& partitionedVector : partitionedVectors) { + const auto& partitionedRowVector = + std::dynamic_pointer_cast(partitionedVector); + VELOX_DCHECK_NOT_NULL(partitionedRowVector.get()); + column.push_back( + partitionedRowVector->childAt(outputToInputChannel(col))); + } + + flushColumn( + column, rowSchema.childAt(col), nonEmptyPartitions, outputStreams); + } +} + +void PrestoIterativePartitioningSerializer::flushFinish( + IOBufOutputStream& out, + uint32_t partition, + std::streampos beginOffset, + char codecMask, + OutputStreamListener* listener) const { + auto* prestoListener = dynamic_cast(listener); + if (prestoListener) { + prestoListener->pause(); + } + + const std::streampos totalSize = + static_cast(out.tellp() - beginOffset); + const std::streampos uncompressedSize = totalSize - kHeaderSize; + int64_t crc = 0; + if (prestoListener) { + crc = computeChecksum( + *prestoListener, + static_cast(codecMask), + static_cast(bufferState_->rowsPerPartition()[partition]), + uncompressedSize); + } + + out.seekp(beginOffset + kUncompressedSizeOffset); + writeInt32(&out, uncompressedSize); + writeInt32(&out, uncompressedSize); // TODO: compressedSize + writeInt64(&out, crc); + out.seekp(beginOffset + totalSize); +} + +// --------------------------------------------------------------------------- +// Column-level dispatch +// --------------------------------------------------------------------------- + +void PrestoIterativePartitioningSerializer::flushColumn( + const std::vector& partitionedVectors, + const TypePtr& 
colType, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + VELOX_CHECK_GT(partitionedVectors.size(), 0); + + auto typeKind = partitionedVectors[0]->baseVector()->typeKind(); + switch (typeKind) { + case TypeKind::BOOLEAN: + case TypeKind::TINYINT: + case TypeKind::SMALLINT: + case TypeKind::INTEGER: + case TypeKind::BIGINT: + case TypeKind::REAL: + case TypeKind::DOUBLE: + case TypeKind::HUGEINT: + flushSimpleColumn( + partitionedVectors, colType, nonEmptyPartitions, outputStreams); + break; + + case TypeKind::TIMESTAMP: + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + case TypeKind::ROW: + case TypeKind::ARRAY: + case TypeKind::MAP: + VELOX_NYI( + "Unsupported vector type kind for PrestoIterativePartitioningSerializer: {}", + typeKind); + + default: + VELOX_UNSUPPORTED( + "Invalid vector type kind for PrestoIterativePartitioningSerializer: {}", + typeKind); + } +} + +void PrestoIterativePartitioningSerializer::flushSimpleColumn( + const std::vector& partitionedVectors, + const TypePtr& colType, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + flushHeader(typeToEncodingName(colType), nonEmptyPartitions, outputStreams); + flushRowCounts(nonEmptyPartitions, outputStreams); + flushNulls(partitionedVectors, nonEmptyPartitions, outputStreams); + + for (size_t i = 0; i < partitionedVectors.size(); i++) { + flushSingleSimpleVector(partitionedVectors[i], outputStreams); + } +} + +template +void PrestoIterativePartitioningSerializer::flushSingleFlatVector( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const { + using T = typename TypeTraits::NativeType; + auto* flatVector = partitionedVector->as>(); + VELOX_DCHECK_NOT_NULL(flatVector); + + const auto* rawValues = + flatVector->baseVector()->template as>()->rawValues(); + const auto* rawNulls = flatVector->baseVector()->rawNulls(); + const auto* partitionOffsets = flatVector->rawPartitionOffsets(); + + 
flushFlatValues(rawValues, rawNulls, partitionOffsets, outputStreams); +} + +// BOOLEAN columns use kByteArray encoding: FlatVector stores bits +// packed, so rawValues() is unsupported. Each non-null value is written as +// one byte (0x00 or 0x01). +template <> +void PrestoIterativePartitioningSerializer::flushSingleFlatVector< + TypeKind::BOOLEAN>( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const { + auto* flatVector = partitionedVector->as>(); + VELOX_DCHECK_NOT_NULL(flatVector); + + const auto* rawBoolValues = + flatVector->baseVector()->as>()->rawValues(); + const auto* rawNulls = flatVector->baseVector()->rawNulls(); + const auto* partitionOffsets = flatVector->rawPartitionOffsets(); + + // TODO: Improve performance + vector_size_t lastOffset = 0; + for (uint32_t p = 0; p < numPartitions_; ++p) { + const auto offset = partitionOffsets[p]; + const auto numValues = offset - lastOffset; + const auto numNulls = partitionedVector->numNullsAt(p); + if (outputStreams[p] != nullptr && numValues > 0) { + if (numNulls == 0) { + for (vector_size_t i = lastOffset; i < offset; ++i) { + const int8_t val = bits::isBitSet(rawBoolValues, i) ? 1 : 0; + outputStreams[p]->write(reinterpret_cast(&val), 1); + } + } else { + VELOX_DCHECK_NOT_NULL(rawNulls); + for (vector_size_t i = lastOffset; i < offset; ++i) { + if (!bits::isBitNull(rawNulls, i)) { + const int8_t val = bits::isBitSet(rawBoolValues, i) ? 
1 : 0; + outputStreams[p]->write(reinterpret_cast(&val), 1); + } + } + } + } + lastOffset = offset; + } +} + +template +void PrestoIterativePartitioningSerializer::flushSingleConstantVector( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const { + if constexpr ( + kind == TypeKind::VARCHAR || kind == TypeKind::VARBINARY || + kind == TypeKind::TIMESTAMP) { + VELOX_NYI( + "flushSingleConstantVector does not support variable-length type: {}", + kind); + } + + using T = typename TypeTraits::NativeType; + auto* constantVector = + partitionedVector->baseVector()->template as>(); + VELOX_DCHECK_NOT_NULL(constantVector); + + if (constantVector->isNullAt(0)) { + return; + } + + const auto value = constantVector->valueAtFast(0); + const auto* partitionOffsets = partitionedVector->rawPartitionOffsets(); + + Scratch scratch; + ScratchPtr values(scratch); + const auto numRowsPerChunk = + std::max(1, kChunkBytes / sizeof(T)); + const char* chunkBytes = nullptr; + + vector_size_t lastOffset = 0; + for (uint32_t p = 0; p < numPartitions_; ++p) { + const auto offset = partitionOffsets[p]; + auto numRows = offset - lastOffset; + if (numRows > 0) { + VELOX_DCHECK_NOT_NULL(outputStreams[p]); + + if (chunkBytes == nullptr) { + auto* ptr = values.get(numRowsPerChunk); + std::fill_n(ptr, numRowsPerChunk, value); + chunkBytes = reinterpret_cast(ptr); + } + + while (numRows > 0) { + auto n = std::min(numRowsPerChunk, numRows); + outputStreams[p]->write(chunkBytes, n * sizeof(T)); + numRows -= n; + } + } + lastOffset = offset; + } +} + +void PrestoIterativePartitioningSerializer::flushSingleSimpleVector( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const { + auto encoding = partitionedVector->baseVector()->encoding(); + auto typeKind = partitionedVector->baseVector()->typeKind(); + + switch (encoding) { + case VectorEncoding::Simple::FLAT: + VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + flushSingleFlatVector, typeKind, 
partitionedVector, outputStreams); + break; + case VectorEncoding::Simple::CONSTANT: + VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + flushSingleConstantVector, + typeKind, + partitionedVector, + outputStreams); + break; + case VectorEncoding::Simple::BIASED: + case VectorEncoding::Simple::DICTIONARY: + case VectorEncoding::Simple::SEQUENCE: + VELOX_NYI( + "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}", + encoding); + default: + VELOX_UNSUPPORTED( + "Invalid vector encoding for PrestoIterativePartitioningSerializer:flushSingleSimpleVector: {}", + encoding); + } +} + +// --------------------------------------------------------------------------- +// Column building blocks +// --------------------------------------------------------------------------- + +void PrestoIterativePartitioningSerializer::flushHeader( + std::string_view name, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + const int32_t nameLen = static_cast(name.size()); + for (uint32_t p : nonEmptyPartitions) { + writeInt32(outputStreams[p], nameLen); + outputStreams[p]->write(name.data(), nameLen); + } +} + +void PrestoIterativePartitioningSerializer::flushRowCounts( + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + for (uint32_t p : nonEmptyPartitions) { + writeInt32( + outputStreams[p], + static_cast(bufferState_->rowsPerPartition()[p])); + } +} + +void PrestoIterativePartitioningSerializer::flushNulls( + const std::vector& partitionedVectors, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + std::vector nullCounts(numPartitions_, 0); + for (uint32_t p : nonEmptyPartitions) { + for (const auto& pv : partitionedVectors) { + nullCounts[p] += pv->numNullsAt(p); + } + const char flagByte = nullCounts[p] > 0 ? 
1 : 0; + outputStreams[p]->write(&flagByte, 1); + } + + const bool hasAnyNulls = std::any_of( + nonEmptyPartitions.begin(), nonEmptyPartitions.end(), [&](uint32_t p) { + return nullCounts[p] > 0; + }); + if (!hasAnyNulls) { + return; + } + + // Build each partition's null bitmap in a temporary buffer, accumulating + // bits across all batches. Writing via write() correctly handles range + // boundaries in the output stream without requiring seekp(). + // TODO: Avoid this extra memory allocation and copy + std::vector> bitmaps(numPartitions_); + for (uint32_t p : nonEmptyPartitions) { + if (nullCounts[p] > 0) { + bitmaps[p].assign( + bits::nbytes(bufferState_->rowsPerPartition()[p]), + bits::kNotNullByte); + } + } + + std::vector destBitOffsets(numPartitions_, 0); + for (const auto& pv : partitionedVectors) { + auto encoding = pv->baseVector()->encoding(); + switch (encoding) { + case VectorEncoding::Simple::FLAT: + flushSimpleVectorNulls(pv, nonEmptyPartitions, bitmaps, destBitOffsets); + break; + case VectorEncoding::Simple::CONSTANT: + flushConstantVectorNulls( + pv, nonEmptyPartitions, bitmaps, destBitOffsets); + break; + case VectorEncoding::Simple::BIASED: + case VectorEncoding::Simple::DICTIONARY: + case VectorEncoding::Simple::SEQUENCE: + VELOX_NYI( + "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}", + encoding); + default: + VELOX_UNSUPPORTED( + "Invalid vector encoding for PrestoIterativePartitioningSerializer: {}", + encoding); + } + } + + for (uint32_t p : nonEmptyPartitions) { + if (nullCounts[p] == 0) { + continue; + } + + // Convert Velox format (LSB-first, 1=not-null) to Presto wire format + // (MSB-first, 1=null) in-place. 
+    const int32_t numBytes = bits::nbytes(bufferState_->rowsPerPartition()[p]);
+    // Per byte: `~` flips the null polarity, then reverseBits() over a single
+    // byte flips the bit order inside that byte.
+    for (int32_t i = 0; i < numBytes; ++i) {
+      bitmaps[p][i] = ~bitmaps[p][i];
+      bits::reverseBits(&bitmaps[p][i], 1);
+    }
+
+    outputStreams[p]->write(
+        reinterpret_cast(bitmaps[p].data()), numBytes);
+  }
+}
+
+// Appends the null bits of one FLAT partitioned vector to the per-partition
+// bitmaps built by flushNulls().
+//
+// For every partition p in `nonEmptyPartitions`, the rows
+// [startBit, rawPartitionOffsets[p]) of the base vector's null buffer are
+// copied into bitmaps[p] starting at bit destBitOffsets[p], and
+// destBitOffsets[p] is advanced by that row count so the next batch is
+// appended right after it.
+//
+// Partitions whose bitmap is empty (flushNulls() only allocates a bitmap for
+// partitions that contain nulls) are skipped, but `startBit` still moves past
+// their rows. If the base vector has no null buffer nothing is copied and the
+// destination bits keep the not-null value flushNulls() initialized them to.
+//
+// NOTE(review): this relies on rawPartitionOffsets holding cumulative end
+// offsets and on `nonEmptyPartitions` being in ascending order, with every
+// skipped partition contributing zero rows to this vector - confirm against
+// PartitionedVector and the caller.
+void PrestoIterativePartitioningSerializer::flushSimpleVectorNulls(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector& nonEmptyPartitions,
+    std::vector>& bitmaps,
+    std::vector& destBitOffsets) {
+  const uint64_t* rawNulls = partitionedVector->baseVector()->rawNulls();
+  const auto* rawPartitionOffsets = partitionedVector->rawPartitionOffsets();
+  // First row of the current partition inside the partitioned vector.
+  vector_size_t startBit = 0;
+  for (uint32_t p : nonEmptyPartitions) {
+    vector_size_t numBits = rawPartitionOffsets[p] - startBit;
+    if (rawNulls && numBits > 0 && !bitmaps[p].empty()) {
+      bits::copyBits(
+          rawNulls,
+          startBit,
+          reinterpret_cast(bitmaps[p].data()),
+          destBitOffsets[p],
+          numBits);
+    }
+    // Advance the write cursor even when nothing was copied, so rows of this
+    // batch still occupy their slots in the partition's bitmap.
+    if (!bitmaps[p].empty()) {
+      destBitOffsets[p] += numBits;
+    }
+    startBit = rawPartitionOffsets[p];
+  }
+}
+
+// CONSTANT-encoded counterpart of flushSimpleVectorNulls().
+//
+// Nullness is read once from row 0 of the base vector. When the constant is
+// null, the bit range [destBitOffsets[p], destBitOffsets[p] + numBits) of
+// bitmaps[p] is filled with bits::kNull for each partition that has a bitmap
+// and at least one row in this vector; otherwise the bits are left untouched.
+// destBitOffsets[p] advances by the partition's row count in either case.
+void PrestoIterativePartitioningSerializer::flushConstantVectorNulls(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector& nonEmptyPartitions,
+    std::vector>& bitmaps,
+    std::vector& destBitOffsets) {
+  const bool isNullConstant = partitionedVector->baseVector()->isNullAt(0);
+  const auto* rawPartitionOffsets = partitionedVector->rawPartitionOffsets();
+  // First row of the current partition inside the partitioned vector.
+  vector_size_t startBit = 0;
+  for (uint32_t p : nonEmptyPartitions) {
+    vector_size_t numBits = rawPartitionOffsets[p] - startBit;
+    if (isNullConstant && numBits > 0 && !bitmaps[p].empty()) {
+      bits::fillBits(
+          reinterpret_cast(bitmaps[p].data()),
+          destBitOffsets[p],
+          destBitOffsets[p] + numBits,
+          bits::kNull);
+    }
+    if (!bitmaps[p].empty()) {
+      destBitOffsets[p] += numBits;
+    }
+    startBit = rawPartitionOffsets[p];
+  }
+}
+
+template
+void PrestoIterativePartitioningSerializer::flushFlatValues(
+    const T* partitionedValues,
+    const
uint64_t* rawNulls, + const vector_size_t* partitionOffsets, + const std::vector& outputStreams) const { + const auto typeWidth = sizeof(T); + vector_size_t lastOffset = 0; + for (uint32_t p = 0; p < numPartitions_; ++p) { + const auto offset = partitionOffsets[p]; + const auto numValues = offset - lastOffset; + if (outputStreams[p] != nullptr && numValues > 0) { + if (!rawNulls) { + outputStreams[p]->write( + reinterpret_cast(&partitionedValues[lastOffset]), + numValues * typeWidth); + } else { + // Presto writes only non-null values; null slots are omitted. + // TODO: Improve performance + for (vector_size_t i = lastOffset; i < offset; ++i) { + if (!bits::isBitNull(rawNulls, i)) { + outputStreams[p]->write( + reinterpret_cast(&partitionedValues[i]), + typeWidth); + } + } + } + } + lastOffset = offset; + } +} + +void PrestoIterativePartitioningSerializer::flushSequentialOffsets( + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const { + for (uint32_t p : nonEmptyPartitions) { + const int32_t numRows = + static_cast(bufferState_->rowsPerPartition()[p]); + for (int32_t i = 0; i <= numRows; ++i) { + writeInt32(outputStreams[p], i); + } + } +} + +} // namespace facebook::velox::serializer::presto diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h new file mode 100644 index 00000000000..8ab7d31dc7e --- /dev/null +++ b/velox/serializers/PrestoIterativePartitioningSerializer.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include "velox/common/memory/ByteStream.h" +#include "velox/serializers/PrestoSerializer.h" +#include "velox/type/Type.h" +#include "velox/vector/PartitionedVector.h" + +namespace facebook::velox::serializer::presto { + +/// Convenience alias matching PrestoSerializer.cpp convention. +using SerdeOpts = PrestoVectorSerde::PrestoOptions; + +class BufferState; + +/// Serializes a stream of RowVectors into per-partition Presto pages. +/// +/// Each call to append() routes rows to their assigned partition. flush() +/// produces one Presto-format IOBuf per non-empty partition and resets the +/// internal state so the serializer can be reused for the next cycle. +class PrestoIterativePartitioningSerializer { + public: + PrestoIterativePartitioningSerializer( + RowTypePtr outputType, + uint32_t numPartitions, + const SerdeOpts& opts, + memory::MemoryPool* pool) + : PrestoIterativePartitioningSerializer( + std::move(outputType), + numPartitions, + opts, + pool, + {}, + nullptr) {} + + /// Constructs the serializer. If `listenerFactory` is non-null it is called + /// once per non-empty partition on each flush to create an + /// OutputStreamListener that accumulates the CRC32 checksum; the checksum + /// bit is then set in the Presto page codec byte and the computed value is + /// written into the page header. Pass nullptr to skip checksum computation, + /// which matches the behavior of kNormal PartitionedOutput when + /// OutputBufferManager has no listener factory set. 
+ PrestoIterativePartitioningSerializer( + RowTypePtr outputType, + uint32_t numPartitions, + const SerdeOpts& opts, + memory::MemoryPool* pool, + std::function()> listenerFactory) + : PrestoIterativePartitioningSerializer( + std::move(outputType), + numPartitions, + opts, + pool, + {}, + std::move(listenerFactory)) {} + + /// Constructs the serializer with an explicit output-column to input-column + /// mapping. `outputToInputChannels[i]` indicates which child of the RowVector + /// passed to append() should be serialized for output column i. When empty, + /// output column i uses input child i. + PrestoIterativePartitioningSerializer( + RowTypePtr outputType, + uint32_t numPartitions, + const SerdeOpts& opts, + memory::MemoryPool* pool, + std::vector outputToInputChannels, + std::function()> listenerFactory = + nullptr); + + ~PrestoIterativePartitioningSerializer(); + + /// Returns a conservative estimate of bytesBuffered() after appending + /// `input`. The partition assignment of the input is not known at the time of + /// the call, so this assumes worst-case growth from new non-empty partitions + /// and may overestimate. + int64_t estimateBytesAfterAppend(const RowVectorPtr& input) const; + + /// Routes each row in `input` to the partition indicated by `partitions`. + /// `partitions.size()` must equal `input->size()`. + void append( + const RowVectorPtr& input, + const std::vector& partitions); + + /// Serializes all buffered data into one Presto page per non-empty partition + /// and resets internal state. Returns an empty map if nothing has been + /// appended since the last flush. + std::map, vector_size_t>> + flush(); + + /// Returns the serialized bytes buffered across all partitions since the last + /// flush. + int64_t bytesBuffered() const; + + /// Returns the total number of rows appended since the last flush. 
+ vector_size_t rowsBuffered() const; + + private: + void validateOutputInputMapping(const RowVectorPtr&) const; + + column_index_t outputToInputChannel(column_index_t outputColumn) const { + return outputToInputChannels_.empty() + ? outputColumn + : outputToInputChannels_[outputColumn]; + } + + std::map, vector_size_t>> + flushUncompressed(); + std::map, vector_size_t>> + flushCompressed(); + + void clear(); + + void flushStart(IOBufOutputStream& out, uint32_t partition, char codecMask) + const; + + void flushFinish( + IOBufOutputStream& out, + uint32_t partition, + std::streampos beginOffset, + char codecMask, + OutputStreamListener* listener) const; + + void flushRowChildren( + const std::vector& partitionedVectors, + const RowType& rowSchema, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + void flushColumn( + const std::vector& partitionedVectors, + const TypePtr& colType, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + void flushSimpleColumn( + const std::vector& partitionedVectors, + const TypePtr& colType, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + void flushSingleSimpleVector( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const; + + template + void flushSingleFlatVector( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const; + + template + void flushSingleConstantVector( + const PartitionedVectorPtr& partitionedVector, + const std::vector& outputStreams) const; + + void flushHeader( + std::string_view name, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + void flushRowCounts( + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + void flushNulls( + const std::vector& partitionedVectors, + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + static void 
flushSimpleVectorNulls( + const PartitionedVectorPtr& partitionedVector, + const std::vector& nonEmptyPartitions, + std::vector>& bitmaps, + std::vector& destBitOffsets); + + static void flushConstantVectorNulls( + const PartitionedVectorPtr& partitionedVector, + const std::vector& nonEmptyPartitions, + std::vector>& bitmaps, + std::vector& destBitOffsets); + + template + void flushFlatValues( + const T* partitionedValues, + const uint64_t* rawNulls, + const vector_size_t* partitionOffsets, + const std::vector& outputStreams) const; + + void flushSequentialOffsets( + const std::vector& nonEmptyPartitions, + const std::vector& outputStreams) const; + + RowTypePtr outputType_; + std::vector outputToInputChannels_; + uint32_t numPartitions_; + SerdeOpts opts_; + memory::MemoryPool* pool_; + + std::function()> listenerFactory_; + + /// Number of top-level columns in `outputType_`. + uint32_t numColumns_{0}; + + std::vector partitionedRowVectors_; + + /// Accumulated state for all batches buffered since the last + /// flush. 
+ std::unique_ptr bufferState_; +}; + +} // namespace facebook::velox::serializer::presto diff --git a/velox/serializers/benchmarks/CMakeLists.txt b/velox/serializers/benchmarks/CMakeLists.txt index 7d1044e4367..a81530595e8 100644 --- a/velox/serializers/benchmarks/CMakeLists.txt +++ b/velox/serializers/benchmarks/CMakeLists.txt @@ -21,3 +21,17 @@ target_link_libraries( Folly::folly Folly::follybenchmark ) + +add_executable( + velox_presto_iterative_partitioning_serializer_benchmark + PrestoIterativePartitioningSerializerBenchmark.cpp +) + +target_link_libraries( + velox_presto_iterative_partitioning_serializer_benchmark + velox_presto_serializer + velox_vector_test_lib + velox_memory + Folly::folly + Folly::follybenchmark +) diff --git a/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp new file mode 100644 index 00000000000..ec6330f42ed --- /dev/null +++ b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp @@ -0,0 +1,277 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include "velox/serializers/PrestoIterativePartitioningSerializer.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook::velox; +using namespace facebook::velox::serializer::presto; + +constexpr int64_t kBufferSize = 2 * 1024 * 1024; + +namespace { + +class PrestoIterativePartitioningSerializerBenchmark + : public test::VectorTestBase { + public: + /// Creates a flat vector of type T with deterministic null pattern. + /// Rows where (row % 100) < nullPct are null. + template + VectorPtr makeFlatColumnOfType(vector_size_t size, int32_t nullPct) { + if (nullPct == 0) { + return makeFlatVector( + size, [](auto row) { return static_cast(row); }); + } + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + [nullPct](auto row) { return (row % 100) < nullPct; }); + } + + /// Creates a flat vector of the given TypeKind with the given null ratio. + VectorPtr + makeFlatColumn(vector_size_t size, TypeKind colKind, int32_t nullPct) { + switch (colKind) { + case TypeKind::BOOLEAN: + return makeFlatColumnOfType(size, nullPct); + case TypeKind::INTEGER: + return makeFlatColumnOfType(size, nullPct); + case TypeKind::BIGINT: + return makeFlatColumnOfType(size, nullPct); + case TypeKind::HUGEINT: + return makeFlatColumnOfType(size, nullPct); + default: + VELOX_UNSUPPORTED( + "Unsupported TypeKind: {}", TypeKindName::toName(colKind)); + } + } + + VectorPtr + makeConstantColumn(vector_size_t size, TypeKind colKind, bool nullConstant) { + if (nullConstant) { + return makeNullConstant(colKind, size); + } + switch (colKind) { + case TypeKind::BOOLEAN: + return makeConstant(true, size); + case TypeKind::INTEGER: + return makeConstant(42, size); + case TypeKind::BIGINT: + return makeConstant(1000, size); + case TypeKind::HUGEINT: + return makeConstant(10000, size); + default: + VELOX_UNSUPPORTED( + "Unsupported TypeKind: {}", TypeKindName::toName(colKind)); + } + } + + /// Creates a RowVector with numCols 
columns of the given TypeKind. + RowVectorPtr makeInput( + vector_size_t size, + VectorEncoding::Simple encoding, + TypeKind colKind, + uint32_t numCols, + int32_t nullPct, + bool nullConstant = false) { + std::vector names; + std::vector children; + names.reserve(numCols); + children.reserve(numCols); + for (uint32_t i = 0; i < numCols; ++i) { + names.push_back(fmt::format("c{}", i)); + } + switch (encoding) { + case VectorEncoding::Simple::FLAT: { + for (uint32_t i = 0; i < numCols; ++i) { + children.push_back(makeFlatColumn(size, colKind, nullPct)); + } + break; + } + case VectorEncoding::Simple::CONSTANT: { + for (uint32_t i = 0; i < numCols; ++i) { + children.push_back(makeConstantColumn(size, colKind, nullConstant)); + } + break; + } + default: + VELOX_UNSUPPORTED("Unsupported encoding: {}", encoding); + } + return makeRowVector(names, children); + } + + std::vector makePartitions( + vector_size_t size, + uint32_t numPartitions) { + std::vector partitions(size); + for (vector_size_t i = 0; i < size; ++i) { + partitions[i] = i % numPartitions; + } + return partitions; + } + + std::unique_ptr makeSerializer( + const RowTypePtr& type, + uint32_t numPartitions) { + SerdeOpts opts; + return std::make_unique( + type, numPartitions, opts, pool_.get()); + } +}; + +} // namespace + +/// Single benchmark function parameterized by (encoding, colKind, numCols, +/// nullPct, nullConstant, numPartitions). Registered via BENCHMARK_NAMED_PARAM +/// below. +/// +/// All runs use 10'000 rows. Setup (input creation, serializer construction, +/// append) is excluded from the measured time. 
+void benchmarkFlush(
+    VectorEncoding::Simple encoding,
+    TypeKind colKind,
+    uint32_t numCols,
+    int32_t nullPct,
+    bool nullConstant,
+    uint32_t numPartitions,
+    uint32_t iters = 1) {
+  // folly divides the measured time by the iteration count it hands to a
+  // parameterized benchmark, so exactly `iters` flushes must be timed.
+  // Everything else (input creation, serializer construction, append,
+  // teardown) runs while the suspender is active.
+  folly::BenchmarkSuspender suspender;
+  PrestoIterativePartitioningSerializerBenchmark benchmark;
+  auto input = benchmark.makeInput(
+      10'000, encoding, colKind, numCols, nullPct, nullConstant);
+  auto parts = benchmark.makePartitions(10'000, numPartitions);
+  auto serializer = benchmark.makeSerializer(
+      std::static_pointer_cast<const RowType>(input->type()), numPartitions);
+
+  for (uint32_t i = 0; i < iters; ++i) {
+    // flush() resets the serializer, so refill it (untimed) up to the target
+    // buffer size before every measured flush.
+    while (serializer->bytesBuffered() < kBufferSize) {
+      serializer->append(input, parts);
+    }
+
+    suspender.dismiss();
+    auto result = serializer->flush();
+    folly::doNotOptimizeAway(result);
+    suspender.rehire();
+  }
+}
+
+// Adapter for BENCHMARK_NAMED_PARAM: forwards folly's iteration count so the
+// reported per-iteration time is meaningful.
+void benchmarkFlushFlat(
+    uint32_t iters,
+    TypeKind colKind,
+    uint32_t numCols,
+    int32_t nullPct,
+    uint32_t numPartitions) {
+  benchmarkFlush(
+      VectorEncoding::Simple::FLAT,
+      colKind,
+      numCols,
+      nullPct,
+      false,
+      numPartitions,
+      iters);
+}
+
+// Adapter for BENCHMARK_NAMED_PARAM: forwards folly's iteration count so the
+// reported per-iteration time is meaningful.
+void benchmarkFlushConstant(
+    uint32_t iters,
+    TypeKind colKind,
+    uint32_t numCols,
+    bool nullConstant,
+    uint32_t numPartitions) {
+  benchmarkFlush(
+      VectorEncoding::Simple::CONSTANT,
+      colKind,
+      numCols,
+      0,
+      nullConstant,
+      numPartitions,
+      iters);
+}
+
+// clang-format off
+// Dimensions:
+//   col type:       {bool, int, bigint, hugeint}
+//   num cols:       {1, 4, 16, 64}
+//   null pct:       {0, 25, 50, 75, 100}
+//   num partitions: {1, 4, 16, 64, 256, 1024}
+//
+// Naming: flat_<type>_<num_cols>cols_<null_pct>pct_<num_parts>parts
+#define FLUSH_FLAT_PARAM(type_name, kind, num_cols, null_pct, num_parts) \
+  BENCHMARK_NAMED_PARAM( \
+      benchmarkFlushFlat, \
+      type_name##_##num_cols##cols_##null_pct##pct_##num_parts##parts, \
+      TypeKind::kind, \
+      num_cols, \
+      null_pct, \
+      num_parts)
+
+// Dimensions:
+//   col type:       {bool, int, bigint, hugeint}
+//   num cols:       {1, 4, 16, 64}
+//   null constant:  {false, true}
+//   num partitions: {1, 4, 16, 64, 256, 1024}
+//
+// Naming: constant_<type>_<num_cols>cols_[not]null_<num_parts>parts
+#define FLUSH_CONSTANT_PARAM(type_name, kind, num_cols, num_parts) \
+  BENCHMARK_NAMED_PARAM( \
+      benchmarkFlushConstant, \
+      type_name##_##num_cols##cols_##notnull_##num_parts##parts, \
+      TypeKind::kind, \
+      num_cols, \
+      false, \
+      num_parts)
+
+#define FLUSH_NULL_CONSTANT_PARAM(type_name, kind, num_cols, num_parts) \
+  BENCHMARK_NAMED_PARAM( \
+      benchmarkFlushConstant, \
+      type_name##_##num_cols##cols_##null_##num_parts##parts, \
+      TypeKind::kind, \
+      num_cols, \
+      true, \
+      num_parts)
+
+#define FLUSH_FOR_NULLS(type_name, kind, num_cols, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 0, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 25, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 50, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 75, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 100, num_parts) \
+  FLUSH_CONSTANT_PARAM(type_name, kind, num_cols, num_parts) \
+  FLUSH_NULL_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)
+
+#define FLUSH_FOR_PARTS(type_name, kind, num_cols) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 1) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 4) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 16) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 64) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 256) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 1024)
+
+#define FLUSH_FOR_COLS(type_name, kind) \
+  FLUSH_FOR_PARTS(type_name, kind, 1) \
+  FLUSH_FOR_PARTS(type_name, kind, 4) \
+ 
FLUSH_FOR_PARTS(type_name, kind, 16) \ + FLUSH_FOR_PARTS(type_name, kind, 64) + +FLUSH_FOR_COLS(bool, BOOLEAN) +FLUSH_FOR_COLS(int, INTEGER) +FLUSH_FOR_COLS(bigint, BIGINT) +FLUSH_FOR_COLS(ldec, HUGEINT) +// clang-format on + +int main(int argc, char** argv) { + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize(memory::MemoryManager::Options{}); + PrestoVectorSerde::registerVectorSerde(); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/serializers/tests/CMakeLists.txt b/velox/serializers/tests/CMakeLists.txt index f7f69461ef4..2d1a40275b5 100644 --- a/velox/serializers/tests/CMakeLists.txt +++ b/velox/serializers/tests/CMakeLists.txt @@ -36,6 +36,7 @@ target_link_libraries( set( VELOX_SERIALIZER_TEST_SOURCES CompactRowSerializerTest.cpp + PrestoIterativePartitioningSerializerTest.cpp PrestoOutputStreamListenerTest.cpp PrestoSerializerTest.cpp SerializedPageFileTest.cpp @@ -51,6 +52,7 @@ set( velox_row_fast GTest::gtest GTest::gtest_main + GTest::gmock glog::glog ) diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp new file mode 100644 index 00000000000..4116632f762 --- /dev/null +++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp @@ -0,0 +1,1120 @@ +/* + * Copyright (c) International Business Machines Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include "velox/common/base/BitUtil.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/serializers/PrestoIterativePartitioningSerializer.h" + +#include "velox/serializers/PrestoSerializerSerializationUtils.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook::velox; +using namespace facebook::velox::serializer::presto; +using namespace facebook::velox::test; + +namespace { + +int64_t simpleColumnPageBytes( + std::string_view encodingName, + int64_t numRows, + int64_t numNulls, + int64_t valueWidth) { + return serializer::presto::detail::kHeaderSize + 4 // page header + num cols + + 4 + static_cast(encodingName.size()) // column header + + 4 // num rows + + 1 + (numNulls > 0 ? bits::nbytes(numRows) : 0) // null flags + + (numRows - numNulls) * valueWidth; // values +} + +} // namespace + +// --------------------------------------------------------------------------- +// Shared base fixture +// --------------------------------------------------------------------------- + +class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase { + protected: + static void SetUpTestSuite() { + memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); + if (!isRegisteredVectorSerde()) { + PrestoVectorSerde::registerVectorSerde(); + } + } + + /// Deserializes an IOBuf produced by PartitioningSerializer::flush(). + RowVectorPtr deserialize(folly::IOBuf& iobuf, const RowTypePtr& type) { + auto ranges = byteRangesFromIOBuf(&iobuf); + BufferInputStream stream(std::move(ranges)); + RowVectorPtr result; + serde_.deserialize(&stream, pool_.get(), type, &result, nullptr); + return result; + } + + /// Extracts flat values from a column into a sorted vector. 
+ template + std::vector sortedValues(const RowVectorPtr& row, int column) { + auto* flat = row->childAt(column)->as>(); + std::vector vals(flat->rawValues(), flat->rawValues() + row->size()); + std::sort(vals.begin(), vals.end()); + return vals; + } + + /// Extracts values from a nullable column, preserving order and nulls. + template + std::vector> nullableValues( + const RowVectorPtr& row, + int column) { + auto* vec = row->childAt(column).get(); + std::vector> result; + result.reserve(row->size()); + for (int i = 0; i < row->size(); ++i) { + if (vec->isNullAt(i)) { + result.push_back(std::nullopt); + } else { + result.push_back(vec->as>()->valueAt(i)); + } + } + return result; + } + + /// Builds a PrestoIterativePartitioningSerializer with default serde options. + std::unique_ptr makeSerializer( + const RowTypePtr& type, + uint32_t numPartitions) { + SerdeOpts opts; + return std::make_unique( + type, + numPartitions, + opts, + pool_.get(), + []() -> std::unique_ptr { + return std::make_unique(); + }); + } + + /// Builds a serializer that computes a CRC32 checksum on each flush via a + /// PrestoOutputStreamListener factory, matching the kOptimized path when + /// OutputBufferManager has a listener factory set. + std::unique_ptr + makeSerializerWithListener(const RowTypePtr& type, uint32_t numPartitions) { + SerdeOpts opts; + return std::make_unique( + type, + numPartitions, + opts, + pool_.get(), + []() -> std::unique_ptr { + return std::make_unique(); + }); + } + + // Presto page header layout: [numRows:4][codec:1][uncompressedSize:4] + // [compressedSize:4][checksum:8] + static constexpr int kCodecByteOffset = 4; + static constexpr int kChecksumOffset = 13; + static constexpr int8_t kChecksumBitMask = 4; + + /// Returns the codec byte from the Presto page header in `iobuf`. 
+ static int8_t codecByte(const folly::IOBuf& iobuf) { + VELOX_CHECK_GE(iobuf.length(), kChecksumOffset + 8); + return reinterpret_cast(iobuf.data())[kCodecByteOffset]; + } + + /// Returns the 8-byte checksum field from the Presto page header in `iobuf`. + static int64_t checksumField(const folly::IOBuf& iobuf) { + VELOX_CHECK_GE(iobuf.length(), kChecksumOffset + 8); + int64_t value; + std::memcpy(&value, iobuf.data() + kChecksumOffset, sizeof(value)); + return value; + } + + int64_t totalFlushedBytes( + std::map< + uint32_t, + std::pair, vector_size_t>>& pages) + const { + int64_t totalBytes = 0; + for (const auto& [_, page] : pages) { + totalBytes += page.first->computeChainDataLength(); + } + return totalBytes; + } + + PrestoVectorSerde serde_; +}; + +template <> +std::vector PrestoIterativePartitioningSerializerTestBase::sortedValues< + bool>(const RowVectorPtr& row, int column) { + auto* flat = row->childAt(column)->as>(); + std::vector vals; + vals.reserve(row->size()); + for (int i = 0; i < row->size(); ++i) { + vals.push_back(flat->valueAtFast(i)); + } + std::sort(vals.begin(), vals.end()); + return vals; +} + +// --------------------------------------------------------------------------- +// Value-parameterized fixture — routing, null-handling over scalar TypePtrs. +// Uses BaseVector::create() + setNull() so no C++ type dispatch is needed. +// --------------------------------------------------------------------------- + +class PrestoIterativePartitioningSerializerParamTest + : public ::testing::TestWithParam, + public PrestoIterativePartitioningSerializerTestBase { + public: + static void SetUpTestSuite() { + PrestoIterativePartitioningSerializerTestBase::SetUpTestSuite(); + } +}; + +// Short lowercase names for test output, matching the benchmark convention. 
+std::string scalarTypeName(const TypePtr& type) { + if (type->kind() == TypeKind::BOOLEAN) + return "bool"; + if (type->kind() == TypeKind::INTEGER) + return "int"; + if (type->kind() == TypeKind::BIGINT) + return "bigint"; + if (type->kind() == TypeKind::HUGEINT) + return "hugeint"; + return type->toString(); +} + +INSTANTIATE_TEST_SUITE_P( + ScalarTypes, + PrestoIterativePartitioningSerializerParamTest, + ::testing::Values(BOOLEAN(), INTEGER(), BIGINT(), HUGEINT()), + [](const ::testing::TestParamInfo& info) { + return scalarTypeName(info.param); + }); + +// ── Routing ────────────────────────────────────────────────────────────────── + +// Single append, two equal-sized partitions; also verifies rowsBuffered and +// bytesBuffered lifecycle counters. +TEST_P(PrestoIterativePartitioningSerializerParamTest, basicTwoPartitions) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 6, pool_.get()); + auto input = makeRowVector({"a"}, {col}); + + // Even rows → partition 0, odd rows → partition 1. + auto serializer = makeSerializer(type, 2); + serializer->append(input, {0, 1, 0, 1, 0, 1}); + + EXPECT_EQ(serializer->rowsBuffered(), 6); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + EXPECT_EQ(serializer->rowsBuffered(), 0); + EXPECT_EQ(serializer->bytesBuffered(), 0); + + auto p0 = deserialize(*ioBufs.at(0).first, type); + auto p1 = deserialize(*ioBufs.at(1).first, type); + + EXPECT_EQ(p0->size(), 3); + EXPECT_EQ(p1->size(), 3); +} + +// All rows routed to one non-zero partition; other partitions are absent. 
+TEST_P(PrestoIterativePartitioningSerializerParamTest, allRowsToOnePartition) { + auto colType = GetParam(); + auto type = ROW({"x"}, {colType}); + auto col = BaseVector::create(colType, 5, pool_.get()); + auto input = makeRowVector({"x"}, {col}); + + auto serializer = makeSerializer(type, 4); + serializer->append(input, {2, 2, 2, 2, 2}); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 1); + ASSERT_TRUE(ioBufs.count(2)); + EXPECT_EQ(deserialize(*ioBufs.at(2).first, type)->size(), 5); +} + +// Single partition (numPartitions=1): all rows go to partition 0. +TEST_P(PrestoIterativePartitioningSerializerParamTest, singlePartition) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 5, pool_.get()); + auto input = makeRowVector({"a"}, {col}); + + auto serializer = makeSerializer(type, 1); + serializer->append(input, std::vector(5, 0)); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 1); + EXPECT_EQ(deserialize(*ioBufs.at(0).first, type)->size(), 5); +} + +// Multiple columns of the same type: each is serialized independently by +// flushRowChildren. 
+TEST_P(PrestoIterativePartitioningSerializerParamTest, multipleColumns) { + auto colType = GetParam(); + auto type = ROW({"a", "b"}, {colType, colType}); + auto colA = BaseVector::create(colType, 4, pool_.get()); + auto colB = BaseVector::create(colType, 4, pool_.get()); + auto input = makeRowVector({"a", "b"}, {colA, colB}); + + auto serializer = makeSerializer(type, 2); + serializer->append(input, {0, 0, 1, 1}); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + EXPECT_EQ(r0->size(), 2); + EXPECT_EQ(r0->childAt(0)->size(), 2); + EXPECT_EQ(r0->childAt(1)->size(), 2); + + auto r1 = deserialize(*ioBufs.at(1).first, type); + EXPECT_EQ(r1->size(), 2); + EXPECT_EQ(r1->childAt(0)->size(), 2); + EXPECT_EQ(r1->childAt(1)->size(), 2); +} + +// ── Null handling +// ───────────────────────────────────────────────────────────── + +// Nulls appear only in one partition; the other partition is null-free. +// Rows 0,1,2 → p0; rows 3,4 → p1. Row 1 is null. +// p0: [not-null, null, not-null]; p1: [not-null, not-null]. +TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsInOnePartition) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 5, pool_.get()); + col->setNull(1, true); + auto input = makeRowVector({"a"}, {col}); + + auto serializer = makeSerializer(type, 2); + serializer->append(input, {0, 0, 0, 1, 1}); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + ASSERT_EQ(r0->size(), 3); + EXPECT_FALSE(r0->childAt(0)->isNullAt(0)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(1)); + EXPECT_FALSE(r0->childAt(0)->isNullAt(2)); + + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r1->size(), 2); + EXPECT_FALSE(r1->childAt(0)->isNullAt(0)); + EXPECT_FALSE(r1->childAt(0)->isNullAt(1)); +} + +// Nulls contributed by different appends to the same partition. 
+// Append 1: rows 0,1 → p0 (row 1 null); row 2 → p1. +// Append 2: row 0 → p0 (null); row 1 → p1. +// p0: [not-null, null, null]; p1: [not-null, not-null]. +TEST_P( + PrestoIterativePartitioningSerializerParamTest, + nullsAcrossMultipleAppends) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto serializer = makeSerializer(type, 2); + + auto col1 = BaseVector::create(colType, 3, pool_.get()); + col1->setNull(1, true); + serializer->append(makeRowVector({"a"}, {col1}), {0, 0, 1}); + + auto col2 = BaseVector::create(colType, 2, pool_.get()); + col2->setNull(0, true); + serializer->append(makeRowVector({"a"}, {col2}), {0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + ASSERT_EQ(r0->size(), 3); + EXPECT_FALSE(r0->childAt(0)->isNullAt(0)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(1)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(2)); + + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r1->size(), 2); + EXPECT_FALSE(r1->childAt(0)->isNullAt(0)); + EXPECT_FALSE(r1->childAt(0)->isNullAt(1)); +} + +// Partition boundary falls in the middle of a null-bitmap byte, exercising the +// bit-extraction carry-over logic. 5 rows → p0, 4 rows → p1. The boundary at +// bit 5 is inside the first byte of the null bitmap. Rows 1,3,5,7 are null. +// p0: [not-null, null, not-null, null, not-null]. +// p1: [null, not-null, null, not-null]. 
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsUnalignedBoundary) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 9, pool_.get()); + col->setNull(1, true); + col->setNull(3, true); + col->setNull(5, true); + col->setNull(7, true); + auto input = makeRowVector({"a"}, {col}); + + auto serializer = makeSerializer(type, 2); + serializer->append(input, {0, 0, 0, 0, 0, 1, 1, 1, 1}); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + ASSERT_EQ(r0->size(), 5); + EXPECT_FALSE(r0->childAt(0)->isNullAt(0)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(1)); + EXPECT_FALSE(r0->childAt(0)->isNullAt(2)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(3)); + EXPECT_FALSE(r0->childAt(0)->isNullAt(4)); + + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r1->size(), 4); + EXPECT_TRUE(r1->childAt(0)->isNullAt(0)); + EXPECT_FALSE(r1->childAt(0)->isNullAt(1)); + EXPECT_TRUE(r1->childAt(0)->isNullAt(2)); + EXPECT_FALSE(r1->childAt(0)->isNullAt(3)); +} + +// Both partitions contain nulls. +// Input: 4 rows, rows 1 and 2 null; rows 0,1 → p0; rows 2,3 → p1. +// p0: [not-null, null]; p1: [null, not-null]. 
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsInBothPartitions) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 4, pool_.get()); + col->setNull(1, true); + col->setNull(2, true); + auto input = makeRowVector({"a"}, {col}); + + auto serializer = makeSerializer(type, 2); + serializer->append(input, {0, 0, 1, 1}); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + ASSERT_EQ(r0->size(), 2); + EXPECT_FALSE(r0->childAt(0)->isNullAt(0)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(1)); + + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r1->size(), 2); + EXPECT_TRUE(r1->childAt(0)->isNullAt(0)); + EXPECT_FALSE(r1->childAt(0)->isNullAt(1)); +} + +// All rows in one partition are null; the other partition is non-null. +// Input: 3 rows, rows 0,1 null; rows 0,1 → p0; row 2 → p1. +TEST_P(PrestoIterativePartitioningSerializerParamTest, allNullsInPartition) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 3, pool_.get()); + col->setNull(0, true); + col->setNull(1, true); + auto input = makeRowVector({"a"}, {col}); + + auto serializer = makeSerializer(type, 2); + serializer->append(input, {0, 0, 1}); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + ASSERT_EQ(r0->size(), 2); + EXPECT_TRUE(r0->childAt(0)->isNullAt(0)); + EXPECT_TRUE(r0->childAt(0)->isNullAt(1)); + + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r1->size(), 1); + EXPECT_FALSE(r1->childAt(0)->isNullAt(0)); +} + +// A null batch followed by a null-free batch for the same partition. +// Regression: bitmaps must be initialized to all-not-null so that rows from +// the null-free batch (rawNulls == nullptr) are not decoded as null. 
+TEST_P( + PrestoIterativePartitioningSerializerParamTest, + nullBatchFollowedByNullFreeBatch) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto serializer = makeSerializer(type, 2); + + // Append 1: row 0 → p0 (null); row 1 → p1 (not-null). rawNulls non-null. + auto col1 = BaseVector::create(colType, 2, pool_.get()); + col1->setNull(0, true); + serializer->append(makeRowVector({"a"}, {col1}), {0, 1}); + + // Append 2: all not-null (rawNulls == nullptr). row 0 → p0; row 1 → p1. + auto col2 = BaseVector::create(colType, 2, pool_.get()); + serializer->append(makeRowVector({"a"}, {col2}), {0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + // p0: [null (append 1), not-null (append 2)] + auto r0 = deserialize(*ioBufs.at(0).first, type); + ASSERT_EQ(r0->size(), 2); + EXPECT_TRUE(r0->childAt(0)->isNullAt(0)); + EXPECT_FALSE(r0->childAt(0)->isNullAt(1)); + + // p1: [not-null (append 1), not-null (append 2)] + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r1->size(), 2); + EXPECT_FALSE(r1->childAt(0)->isNullAt(0)); + EXPECT_FALSE(r1->childAt(0)->isNullAt(1)); +} + +// --------------------------------------------------------------------------- +// Non-typed fixture (TEST_F) — lifecycle, structural, regression +// --------------------------------------------------------------------------- + +class PrestoIterativePartitioningSerializerTest + : public ::testing::Test, + public PrestoIterativePartitioningSerializerTestBase { + public: + static void SetUpTestSuite() { + PrestoIterativePartitioningSerializerTestBase::SetUpTestSuite(); + } +}; + +// Appending an empty RowVector produces no ioBufs on flush. 
+TEST_F(PrestoIterativePartitioningSerializerTest, appendEmptyVector) { + auto type = ROW({"a"}, {BIGINT()}); + auto serializer = makeSerializer(type, 2); + serializer->append(makeRowVector({"a"}, {makeFlatVector({})}), {}); + EXPECT_TRUE(serializer->flush().empty()); +} + +// ── Lifecycle +// ───────────────────────────────────────────────────────────────── + +// Multiple append() calls accumulate correctly before flush. +TEST_F(PrestoIterativePartitioningSerializerTest, multipleAppends) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 3); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({100, 200, 300})}), + {0, 1, 2}); + serializer->append( + makeRowVector({"v"}, {makeFlatVector({400, 500, 600})}), + {2, 0, 1}); + + EXPECT_EQ(serializer->rowsBuffered(), 6); + const auto bufferedBytes = serializer->bytesBuffered(); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 3); + EXPECT_EQ(bufferedBytes, totalFlushedBytes(ioBufs)); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + auto r2 = deserialize(*ioBufs.at(2).first, type); + + ASSERT_EQ(r0->size(), 2); + ASSERT_EQ(r1->size(), 2); + ASSERT_EQ(r2->size(), 2); + + EXPECT_EQ(sortedValues(r0, 0), (std::vector{100, 500})); + EXPECT_EQ(sortedValues(r1, 0), (std::vector{200, 600})); + EXPECT_EQ(sortedValues(r2, 0), (std::vector{300, 400})); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + bytesBufferedPartitionGrowth) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 2); + + const auto singleRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 1, 0, 8); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({10})}), {0}); + EXPECT_EQ(serializer->bytesBuffered(), singleRowPageBytes); + + auto input = makeRowVector({"v"}, {makeFlatVector({20})}); + EXPECT_EQ(serializer->bytesBuffered(), singleRowPageBytes); + + serializer->append(input, {1}); + const auto 
bytesBuffered = serializer->bytesBuffered(); + EXPECT_EQ(serializer->bytesBuffered(), 2 * singleRowPageBytes); + + auto ioBufs = serializer->flush(); + EXPECT_EQ(serializer->bytesBuffered(), 0); + EXPECT_EQ(bytesBuffered, totalFlushedBytes(ioBufs)); +} + +TEST_F(PrestoIterativePartitioningSerializerTest, bytesBufferedNullFlagGrowth) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 1); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({1, 2, 3, 4, 5, 6, 7, 8})}), + std::vector(8, 0)); + EXPECT_EQ( + serializer->bytesBuffered(), + simpleColumnPageBytes("LONG_ARRAY", 8, 0, 8)); + + auto input = + makeRowVector({"v"}, {makeNullableFlatVector({std::nullopt})}); + EXPECT_EQ( + serializer->bytesBuffered(), + simpleColumnPageBytes("LONG_ARRAY", 8, 0, 8)); + + serializer->append(input, {0}); + const auto bytesBuffered = serializer->bytesBuffered(); + EXPECT_EQ(bytesBuffered, simpleColumnPageBytes("LONG_ARRAY", 9, 1, 8)); + + auto ioBufs = serializer->flush(); + EXPECT_EQ(serializer->bytesBuffered(), 0); + EXPECT_EQ(bytesBuffered, totalFlushedBytes(ioBufs)); +} + +// A flush-time output mapping serializes one input column into multiple output +// columns. 
+TEST_F( + PrestoIterativePartitioningSerializerTest, + duplicateOutputColumnAtFlush) { + auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()}); + SerdeOpts opts; + auto serializer = std::make_unique( + outputType, 2, opts, pool_.get(), std::vector{0, 0}); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({10, 11, 12, 13})}), + {0, 1, 0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, outputType); + auto r1 = deserialize(*ioBufs.at(1).first, outputType); + + ASSERT_EQ(r0->size(), 2); + ASSERT_EQ(r1->size(), 2); + + EXPECT_EQ(sortedValues(r0, 0), (std::vector{10, 12})); + EXPECT_EQ(sortedValues(r0, 1), (std::vector{10, 12})); + EXPECT_EQ(sortedValues(r1, 0), (std::vector{11, 13})); + EXPECT_EQ(sortedValues(r1, 1), (std::vector{11, 13})); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + outputInputMappingOutOfRange) { + auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()}); + SerdeOpts opts; + auto serializer = std::make_unique( + outputType, 2, opts, pool_.get(), std::vector{0, 1}); + + VELOX_ASSERT_THROW( + serializer->append( + makeRowVector({"v"}, {makeFlatVector({10, 11})}), {0, 1}), + "Output column 1 maps to invalid input column 1"); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + outputInputMappingTypeMismatch) { + auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()}); + SerdeOpts opts; + auto serializer = std::make_unique( + outputType, 2, opts, pool_.get(), std::vector{0, 1}); + + VELOX_ASSERT_THROW( + serializer->append( + makeRowVector( + {"v1", "v2"}, + { + makeFlatVector({10, 11}), + makeFlatVector({12, 13}), + }), + {0, 1}), + "Output column 1 expects BIGINT, got INTEGER from input column 1"); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + estimateBytesAfterAppendExactForSinglePartition) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 1); + + serializer->append( + makeRowVector({"v"}, 
{makeFlatVector({1, 2, 3, 4, 5, 6, 7, 8})}), + std::vector(8, 0)); + + auto input = + makeRowVector({"v"}, {makeNullableFlatVector({std::nullopt})}); + const auto estimatedAfter = serializer->estimateBytesAfterAppend(input); + + serializer->append(input, {0}); + EXPECT_EQ(estimatedAfter, serializer->bytesBuffered()); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + estimateBytesAfterAppendExactForConstant) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 1); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({1, 2, 3, 4})}), + std::vector(4, 0)); + + auto input = makeRowVector({"v"}, {makeConstant(7, 2)}); + const auto estimatedAfter = serializer->estimateBytesAfterAppend(input); + + serializer->append(input, std::vector(2, 0)); + EXPECT_EQ(estimatedAfter, serializer->bytesBuffered()); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + estimateBytesAfterAppendExactForNullConstant) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 1); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({1, 2, 3, 4, 5, 6, 7, 8})}), + std::vector(8, 0)); + + auto input = makeRowVector({"v"}, {makeConstant(std::nullopt, 80)}); + const auto estimatedAfter = serializer->estimateBytesAfterAppend(input); + + serializer->append(input, std::vector(80, 0)); + EXPECT_EQ(estimatedAfter, serializer->bytesBuffered()); +} + +TEST_F( + PrestoIterativePartitioningSerializerTest, + estimateBytesAfterAppendOverestimatesPartitionedAppend) { + auto type = ROW({"a", "b"}, {BIGINT(), INTEGER()}); + auto serializer = makeSerializer(type, 3); + + serializer->append( + makeRowVector( + {"a", "b"}, + { + makeFlatVector({10, 20}), + makeFlatVector({100, 200}), + }), + {0, 1}); + + auto input = makeRowVector( + {"a", "b"}, + { + makeNullableFlatVector({30, std::nullopt, 50, 60}), + makeNullableFlatVector({300, 400, std::nullopt, 600}), + }); + + // All rows land in an already non-empty partition, but + // 
estimateBytesAfterAppend still assumes this input could go to the last empty + // partition before the real distribution is known. + const std::vector partitions{1, 1, 1, 1}; + + const auto estimatedAfter = serializer->estimateBytesAfterAppend(input); + + serializer->append(input, partitions); + EXPECT_GT(estimatedAfter, serializer->bytesBuffered()); +} + +// Flush twice: second flush on empty state returns an empty map. +TEST_F(PrestoIterativePartitioningSerializerTest, flushTwice) { + auto type = ROW({"a"}, {BIGINT()}); + auto serializer = makeSerializer(type, 2); + serializer->append( + makeRowVector({"a"}, {makeFlatVector({10, 20})}), {0, 1}); + + auto ioBufs1 = serializer->flush(); + ASSERT_EQ(ioBufs1.size(), 2); + + EXPECT_TRUE(serializer->flush().empty()); +} + +// Append and flush multiple independent cycles. +TEST_F(PrestoIterativePartitioningSerializerTest, multipleCycles) { + auto type = ROW({"a"}, {INTEGER()}); + auto serializer = makeSerializer(type, 2); + + for (int cycle = 0; cycle < 3; ++cycle) { + serializer->append( + makeRowVector( + {"a"}, {makeFlatVector({cycle * 2, cycle * 2 + 1})}), + {0, 1}); + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2) << "cycle " << cycle; + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + ASSERT_EQ(r0->size(), 1) << "cycle " << cycle; + ASSERT_EQ(r1->size(), 1) << "cycle " << cycle; + EXPECT_EQ(r0->childAt(0)->as>()->valueAt(0), cycle * 2); + EXPECT_EQ( + r1->childAt(0)->as>()->valueAt(0), cycle * 2 + 1); + } +} + +// ── Encoding +// ───────────────────────────────────────────────────────────────── + +// Constant vectors are flattened across append() calls. 
+TEST_F(PrestoIterativePartitioningSerializerTest, constantColumnAcrossAppends) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 3); + + serializer->append( + makeRowVector({"v"}, {makeConstant(11, 4)}), {0, 1, 0, 2}); + serializer->append( + makeRowVector({"v"}, {makeConstant(22, 5)}), {2, 0, 1, 1, 2}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 3); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + auto r2 = deserialize(*ioBufs.at(2).first, type); + + EXPECT_EQ(sortedValues(r0, 0), (std::vector{11, 11, 22})); + EXPECT_EQ(sortedValues(r1, 0), (std::vector{11, 22, 22})); + EXPECT_EQ(sortedValues(r2, 0), (std::vector{11, 22, 22})); +} + +// Boolean constant vectors are flattened across append() calls. +TEST_F( + PrestoIterativePartitioningSerializerTest, + booleanConstantColumnAcrossAppends) { + auto type = ROW({"v"}, {BOOLEAN()}); + auto serializer = makeSerializer(type, 2); + + serializer->append( + makeRowVector({"v"}, {makeConstant(true, 4)}), {0, 1, 0, 1}); + serializer->append( + makeRowVector({"v"}, {makeConstant(false, 3)}), {1, 0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + + EXPECT_EQ(sortedValues(r0, 0), (std::vector{false, true, true})); + EXPECT_EQ( + sortedValues(r1, 0), (std::vector{false, false, true, true})); +} + +// Null constant vectors contribute only nulls but still advance row positions. 
+TEST_F( + PrestoIterativePartitioningSerializerTest, + nullConstantColumnAcrossAppends) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 2); + + serializer->append( + makeRowVector({"v"}, {makeConstant(std::nullopt, 3)}), + {0, 1, 0}); + serializer->append( + makeRowVector({"v"}, {makeConstant(7, 3)}), {1, 0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + + auto actual0 = nullableValues(r0, 0); + std::sort(actual0.begin(), actual0.end()); + auto expected0 = + std::vector>{std::nullopt, std::nullopt, 7}; + EXPECT_EQ(actual0, expected0); + + auto actual1 = nullableValues(r1, 0); + std::sort(actual1.begin(), actual1.end()); + auto expected1 = std::vector>{std::nullopt, 7, 7}; + EXPECT_EQ(actual1, expected1); +} + +// Constant and flat vectors are flattened and serialized correctly across +// append() calls. +TEST_F(PrestoIterativePartitioningSerializerTest, mixedConstantFlatVector) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 2); + + serializer->append( + makeRowVector({"v"}, {makeConstant(7, 3)}), {0, 1, 0}); + serializer->append( + makeRowVector({"v"}, {makeFlatVector({1, 2, 3})}), {1, 1, 0}); + serializer->append( + makeRowVector({"v"}, {makeConstant(8, 2)}), {0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + + EXPECT_EQ(sortedValues(r0, 0), (std::vector{3, 7, 7, 8})); + EXPECT_EQ(sortedValues(r1, 0), (std::vector{1, 2, 7, 8})); +} + +// Null constant rows are preserved and serialized correctly with flat and +// nullable flat vectors across append() calls. 
+TEST_F(PrestoIterativePartitioningSerializerTest, mixedNullConstantFlatVector) { + auto type = ROW({"v"}, {BIGINT()}); + auto serializer = makeSerializer(type, 2); + + serializer->append( + makeRowVector({"v"}, {makeFlatVector({1, 2, 3, 4})}), + {0, 1, 1, 0}); + serializer->append( + makeRowVector({"v"}, {makeConstant(std::nullopt, 3)}), + {0, 1, 0}); + serializer->append( + makeRowVector( + {"v"}, {makeNullableFlatVector({std::nullopt, 7, 3})}), + {1, 0, 1}); + + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + auto r0 = deserialize(*ioBufs.at(0).first, type); + auto r1 = deserialize(*ioBufs.at(1).first, type); + + auto actual0 = nullableValues(r0, 0); + std::sort(actual0.begin(), actual0.end()); + auto expected0 = + std::vector>{std::nullopt, std::nullopt, 1, 4, 7}; + EXPECT_EQ(actual0, expected0); + + auto actual1 = nullableValues(r1, 0); + std::sort(actual1.begin(), actual1.end()); + auto expected1 = + std::vector>{std::nullopt, std::nullopt, 2, 3, 3}; + EXPECT_EQ(actual1, expected1); +} + +// ── Scale and regression +// ─────────────────────────────────────────────────────── + +// 1024 partitions with random int64 values: verify every value reaches +// exactly the right partition and nothing is lost or duplicated. +TEST_F(PrestoIterativePartitioningSerializerTest, manyPartitionsRandom) { + constexpr uint32_t kNumPartitions = 1024; + constexpr int32_t kNumRows = 64'000; + + std::mt19937_64 rng(42); + std::uniform_int_distribution valueDist; + std::uniform_int_distribution partDist(0, kNumPartitions - 1); + + std::vector inputValues(kNumRows); + std::vector partitions(kNumRows); + // expected[p] holds the sorted values assigned to partition p. 
+ std::vector> expected(kNumPartitions); + + for (int i = 0; i < kNumRows; ++i) { + inputValues[i] = valueDist(rng); + partitions[i] = partDist(rng); + expected[partitions[i]].push_back(inputValues[i]); + } + for (auto& v : expected) { + std::sort(v.begin(), v.end()); + } + + auto type = ROW({"v"}, {BIGINT()}); + auto input = makeRowVector({"v"}, {makeFlatVector(inputValues)}); + + auto serializer = makeSerializer(type, kNumPartitions); + serializer->append(input, partitions); + auto ioBufs = serializer->flush(); + + // Every non-empty partition must have a page; empty partitions must not. + for (uint32_t p = 0; p < kNumPartitions; ++p) { + if (expected[p].empty()) { + EXPECT_EQ(ioBufs.count(p), 0) << "partition " << p; + } else { + ASSERT_EQ(ioBufs.count(p), 1) << "partition " << p; + auto result = deserialize(*ioBufs.at(p).first, type); + ASSERT_EQ(result->size(), static_cast(expected[p].size())) + << "partition " << p; + EXPECT_EQ(sortedValues(result, 0), expected[p]) + << "partition " << p; + } + } +} + +// 1024 partitions with random int64 values and ~25% nulls: verify every +// value and null reaches exactly the right partition and that nothing is +// lost or duplicated. Row order within a partition is not checked: contents +// are compared after sorting. +TEST_F( + PrestoIterativePartitioningSerializerTest, + manyPartitionsRandomWithNulls) { + constexpr uint32_t kNumPartitions = 1024; + constexpr int32_t kNumRows = 64'000; + constexpr int32_t kNullPct = 25; + + std::mt19937_64 rng(43); + std::uniform_int_distribution valueDist; + std::uniform_int_distribution partDist(0, kNumPartitions - 1); + std::uniform_int_distribution nullDist(0, 99); + + std::vector> inputValues(kNumRows); + std::vector partitions(kNumRows); + // expected[p] holds the sequence of (value-or-null) assigned to partition p + // in input order. 
+ std::vector>> expected(kNumPartitions); + + for (int i = 0; i < kNumRows; ++i) { + partitions[i] = partDist(rng); + if (nullDist(rng) < kNullPct) { + inputValues[i] = std::nullopt; + } else { + inputValues[i] = valueDist(rng); + } + expected[partitions[i]].push_back(inputValues[i]); + } + + auto type = ROW({"v"}, {BIGINT()}); + auto input = + makeRowVector({"v"}, {makeNullableFlatVector(inputValues)}); + + auto serializer = makeSerializer(type, kNumPartitions); + serializer->append(input, partitions); + auto ioBufs = serializer->flush(); + + // Partition rearranges values within each partition, so compare sorted. + // std::optional sorts with nullopt < any value, preserving null count. + for (uint32_t p = 0; p < kNumPartitions; ++p) { + if (expected[p].empty()) { + EXPECT_EQ(ioBufs.count(p), 0) << "partition " << p; + } else { + ASSERT_EQ(ioBufs.count(p), 1) << "partition " << p; + auto result = deserialize(*ioBufs.at(p).first, type); + ASSERT_EQ(result->size(), static_cast(expected[p].size())) + << "partition " << p; + + auto expectedSorted = expected[p]; + std::sort(expectedSorted.begin(), expectedSorted.end()); + + auto actual = nullableValues(result, 0); + std::sort(actual.begin(), actual.end()); + + EXPECT_EQ(actual, expectedSorted) << "partition " << p; + } + } +} + +// ── Checksum (CRC32) +// ────────────────────────────────────────────────────── + +// Verify the checksum bit is set and a non-zero checksum is written when a +// PrestoOutputStreamListener factory is provided, and that the standard +// deserializer (which validates the checksum) accepts the page. 
+TEST_P(PrestoIterativePartitioningSerializerParamTest, checksumRoundTrip) { + auto colType = GetParam(); + auto type = ROW({"a"}, {colType}); + auto col = BaseVector::create(colType, 6, pool_.get()); + col->setNull(1, true); + col->setNull(4, true); + + auto serializer = makeSerializerWithListener(type, 2); + serializer->append(makeRowVector({"a"}, {col}), {0, 1, 0, 1, 0, 1}); + auto ioBufs = serializer->flush(); + ASSERT_EQ(ioBufs.size(), 2); + + for (auto& [partition, pageData] : ioBufs) { + auto& iobuf = *pageData.first; + EXPECT_NE(codecByte(iobuf) & kChecksumBitMask, 0) + << "checksum bit must be set in codec byte"; + EXPECT_NE(checksumField(iobuf), 0) << "checksum field must be non-zero"; + // Deserializer validates the checksum internally; throws if wrong. + auto result = deserialize(iobuf, type); + EXPECT_GT(result->size(), 0); + } +} + +// --------------------------------------------------------------------------- +// Non-typed fixture (TEST_F) — lifecycle, structural, regression +// --------------------------------------------------------------------------- + +// Regression: flushNulls previously wrote null bitmaps by obtaining a raw +// pointer via writePosition() then advancing the stream via seekp(). This +// assumed the pre-allocated IOBufOutputStream had a single contiguous range, +// but StreamArena::newRange caps each range at the size of one allocator run, +// which can be smaller than the requested size. seekp() then failed because +// the target position exceeded the end of the first (and only) range. +// +// Reproducing condition: 16 columns × 10'000 rows × 50% nulls in one +// partition generates enough output (~100 KB) to trigger the run-size cap. 
+TEST_F( + PrestoIterativePartitioningSerializerTest, + flushNullsBitmapManyColumnsLargeRowCount) { + constexpr int32_t kNumCols = 16; + constexpr int32_t kNumRows = 10'000; + + std::vector names; + std::vector children; + names.reserve(kNumCols); + children.reserve(kNumCols); + + for (int col = 0; col < kNumCols; ++col) { + names.push_back(fmt::format("c{}", col)); + // Rows where (row % 2 == 0) are null; the rest hold (row * kNumCols + col). + children.push_back( + makeFlatVector( + kNumRows, + [col](auto row) { + return static_cast(row * kNumCols + col); + }, + [](auto row) { return (row % 2) == 0; })); + } + + auto input = makeRowVector(names, children); + auto rowType = std::static_pointer_cast(input->type()); + + auto serializer = makeSerializer(rowType, 1); + serializer->append(input, std::vector(kNumRows, 0)); + auto ioBufs = serializer->flush(); + + ASSERT_EQ(ioBufs.size(), 1); + + auto result = deserialize(*ioBufs.at(0).first, rowType); + ASSERT_EQ(result->size(), kNumRows); + + for (int col = 0; col < kNumCols; ++col) { + auto* flat = result->childAt(col)->as>(); + for (int row = 0; row < kNumRows; ++row) { + if ((row % 2) == 0) { + EXPECT_TRUE(result->childAt(col)->isNullAt(row)) + << "col=" << col << " row=" << row; + } else { + ASSERT_FALSE(result->childAt(col)->isNullAt(row)) + << "col=" << col << " row=" << row; + EXPECT_EQ( + flat->valueAt(row), static_cast(row * kNumCols + col)) + << "col=" << col << " row=" << row; + } + } + } +} diff --git a/velox/vector/CMakeLists.txt b/velox/vector/CMakeLists.txt index 9fd4f2ca9ea..6f76bc9bfb9 100644 --- a/velox/vector/CMakeLists.txt +++ b/velox/vector/CMakeLists.txt @@ -22,6 +22,7 @@ velox_add_library( FlatVector.cpp LazyVector.cpp MapConcat.cpp + PartitionedVector.cpp SelectivityVector.cpp SequenceVector.cpp SimpleVector.cpp diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp new file mode 100644 index 00000000000..233c932fee9 --- /dev/null +++ 
b/velox/vector/PartitionedVector.cpp @@ -0,0 +1,492 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/vector/PartitionedVector.h" + +#include "velox/vector/FlatVector.h" + +namespace facebook::velox { + +using Byte = uint8_t; +using BitIndex = uint8_t; + +namespace { + +inline void countPartitionSizes( + const std::vector& partitions, + vector_size_t* rowCounts) { + VELOX_DCHECK_NOT_NULL(rowCounts); + + for (vector_size_t i = 0; i < partitions.size(); i++) { + rowCounts[partitions[i]]++; + } +} + +inline void prefixSum(vector_size_t* offsets, uint32_t numPartitions) { + for (uint32_t i = 1; i < numPartitions; i++) { + offsets[i] += offsets[i - 1]; + } +} + +inline void calculateOffsets( + const std::vector& partitions, + uint32_t numPartitions, + vector_size_t* endPartitionOffsets) { + VELOX_DCHECK_NOT_NULL(endPartitionOffsets); + + if (numPartitions > 1) { + std::fill_n(endPartitionOffsets, numPartitions, 0); + countPartitionSizes(partitions, endPartitionOffsets); + prefixSum(endPartitionOffsets, numPartitions); + } else { + endPartitionOffsets[0] = static_cast(partitions.size()); + } +} + +// endPartitionOffsets is an array of length numPartitions where each entry i is +// the exclusive end position of partition i. 
cursorPartitionOffsets is +// initialized such that cursorPartitionOffsets[0] = 0 and for i>0, +// cursorPartitionOffsets[i] = endPartitionOffsets[i-1], i.e., the inclusive +// begin positions. +void initializeCursorPartitionOffsets( + BufferPtr& cursorPartitionOffsets, + const BufferPtr& endPartitionOffsets, + uint32_t numPartitions, + velox::memory::MemoryPool* pool) { + VELOX_DCHECK_NOT_NULL(endPartitionOffsets); + VELOX_DCHECK_EQ( + endPartitionOffsets->size(), numPartitions * sizeof(vector_size_t)); + + ensureCapacity(cursorPartitionOffsets, numPartitions, pool); + cursorPartitionOffsets->asMutable()[0] = 0; + std::memcpy( + &cursorPartitionOffsets->asMutable()[1], + endPartitionOffsets->as(), + sizeof(vector_size_t) * (numPartitions - 1)); + cursorPartitionOffsets->setSize(numPartitions * sizeof(vector_size_t)); +} + +// In-place partitioning algorithm for fixed-width values +// This algorithm rearranges elements so that each element ends up in its target +// partition by repeatedly swapping elements until the current element belongs +// to the current partition +template +void partitionFixedWidthValuesInPlace( + T* values, + const std::vector& partitions, + uint32_t numPartitions, + const BufferPtr& endPartitionOffsets, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + VELOX_DCHECK_NOT_NULL(values); + VELOX_DCHECK_NOT_NULL(endPartitionOffsets); + initializeCursorPartitionOffsets( + ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool); + auto* rawCursorOffsets = + ctx.cursorPartitionOffsets->asMutable(); + const auto* rawEndOffsets = endPartitionOffsets->as(); + + for (auto currentPartition = 0; currentPartition < numPartitions; + currentPartition++) { + auto& offset = rawCursorOffsets[currentPartition]; + auto endOffset = rawEndOffsets[currentPartition]; + + while (offset < endOffset) { + uint32_t targetPartition = partitions[offset]; + + while (targetPartition != currentPartition) { + auto destinationOffset = 
rawCursorOffsets[targetPartition]++; + std::swap(values[destinationOffset], values[offset]); + targetPartition = partitions[destinationOffset]; + } + offset = ++rawCursorOffsets[currentPartition]; + } + } +} + +// Swap two bits between two bytes +void swapBit(Byte& byte1, BitIndex bit1, Byte& byte2, BitIndex bit2) { + // Calculate the difference between the bits + char bitDiff = ((byte1 >> bit1) & 1) ^ ((byte2 >> bit2) & 1); + + // Apply the difference to toggle the bits + byte1 ^= (bitDiff << bit1); + byte2 ^= (bitDiff << bit2); +} + +void partitionBitsInPlace( + Byte* bits, + const std::vector& partitions, + uint32_t numPartitions, + PartitionBuildContext& ctx, + const BufferPtr& endPartitionOffsets, + velox::memory::MemoryPool* pool) { + initializeCursorPartitionOffsets( + ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool); + + auto* rawCursorOffsets = + ctx.cursorPartitionOffsets->asMutable(); + const auto* rawEndOffsets = endPartitionOffsets->as(); + + for (uint32_t partition = 0; partition < numPartitions; partition++) { + auto& offset = rawCursorOffsets[partition]; + auto endOffset = rawEndOffsets[partition]; + while (offset < endOffset) { + uint32_t p = partitions[offset]; + while (p != partition) { + vector_size_t destinationOffset = rawCursorOffsets[p]++; + + // Calculate the byte address and bit index within the byte for the + // source and destination bits. Since each byte contains 8 bits, we + // divide the offset by 8 to get the byte address and take the modulus + // by 8 to get the bit index within that byte. 
+ vector_size_t destinationAddr = destinationOffset >> 3; + int8_t destinationBitInByte = destinationOffset & 7; + vector_size_t fromAddr = offset >> 3; + int8_t fromBitInByte = offset & 7; + + swapBit( + bits[destinationAddr], + destinationBitInByte, + bits[fromAddr], + fromBitInByte); + p = partitions[destinationOffset]; + } + offset = ++rawCursorOffsets[partition]; + } + } +} + +template +void partitionFixedWidthValues( + BufferPtr& inputBuffer, + const std::vector& partitions, + const BufferPtr& endPartitionOffsets, + uint32_t numPartitions, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + VELOX_DCHECK_NOT_NULL(inputBuffer); + + auto input = inputBuffer->asMutable(); + partitionFixedWidthValuesInPlace( + input, partitions, numPartitions, endPartitionOffsets, ctx, pool); +} + +template <> +void partitionFixedWidthValues( + BufferPtr& inputBuffer, + const std::vector& partitions, + const BufferPtr& endPartitionOffsets, + uint32_t numPartitions, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + VELOX_DCHECK_NOT_NULL(inputBuffer); + + auto input = inputBuffer->asMutable(); + partitionBitsInPlace( + input, partitions, numPartitions, ctx, endPartitionOffsets, pool); +} + +template +PartitionedVectorPtr createPartitionedFlatVector( + VectorPtr vector, + const std::vector& partitions, + uint32_t numPartitions, + const BufferPtr& endPartitionOffsets, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + using T = typename TypeTraits::NativeType; + auto flatVector = std::dynamic_pointer_cast>(vector); + VELOX_CHECK_NOT_NULL(flatVector); + + auto partitionedFlatVector = std::make_shared>( + flatVector, numPartitions, endPartitionOffsets, pool); + + // Always call partition() so that numNullsPerPartition_ is populated, + // even when numPartitions == 1 and no data movement is required. 
+ partitionedFlatVector->partition(partitions, ctx); + + return partitionedFlatVector; +} + +PartitionedVectorPtr createPartitionedRowVector( + VectorPtr vector, + const std::vector& partitions, + uint32_t numPartitions, + const BufferPtr& endPartitionOffsets, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + auto rowVector = std::dynamic_pointer_cast(vector); + VELOX_CHECK_NOT_NULL(rowVector); + + auto partitionedRowVector = std::make_shared( + rowVector, numPartitions, endPartitionOffsets, pool); + + // Always call partition() to initialize partitionedChildren_, even when + // numPartitions == 1, so that partitionAt() can reconstruct the RowVector. + partitionedRowVector->partition(partitions, ctx); + + return partitionedRowVector; +} + +} // namespace + +PartitionedVector::~PartitionedVector() = default; + +PartitionedVectorPtr PartitionedVector::create( + const VectorPtr& vector, + const std::vector& partitions, + uint32_t numPartitions, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + VELOX_CHECK_NOT_NULL(vector); + VELOX_CHECK_EQ(vector->size(), partitions.size()); + VELOX_CHECK_GT(numPartitions, 0); + VELOX_CHECK_NOT_NULL(pool); + + // Calculate the end offsets for each partition. For example, if there are 3 + // partitions with 2, 3, and 1 rows respectively, then endPartitionOffsets[0] + // = 2, endPartitionOffsets[1] = 5, and endPartitionOffsets[2] = 6. 
+ BufferPtr endPartitionOffsets; + ensureCapacity(endPartitionOffsets, numPartitions, pool); + calculateOffsets( + partitions, + numPartitions, + endPartitionOffsets->asMutable()); + endPartitionOffsets->setSize(numPartitions * sizeof(vector_size_t)); + + auto raw = endPartitionOffsets->as(); + VELOX_DCHECK_EQ(raw[numPartitions - 1], partitions.size()); + + return create( + vector, partitions, numPartitions, endPartitionOffsets, ctx, pool); +} + +PartitionedVectorPtr PartitionedVector::create( + const VectorPtr& vector, + const std::vector& partitions, + uint32_t numPartitions, + const BufferPtr& endPartitionOffsets, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool) { + VELOX_CHECK_NOT_NULL(endPartitionOffsets); + VELOX_CHECK_EQ( + endPartitionOffsets->size(), numPartitions * sizeof(vector_size_t)); + + auto encoding = vector->encoding(); + auto typeKind = vector->typeKind(); + + switch (encoding) { + case VectorEncoding::Simple::FLAT: { + auto partitionedFlatVector = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + createPartitionedFlatVector, + typeKind, + vector, + partitions, + numPartitions, + endPartitionOffsets, + ctx, + pool); + return partitionedFlatVector; + } + + case VectorEncoding::Simple::ROW: { + return createPartitionedRowVector( + vector, partitions, numPartitions, endPartitionOffsets, ctx, pool); + } + + case VectorEncoding::Simple::CONSTANT: { + auto partitionedConstantVector = + std::make_shared( + vector, numPartitions, endPartitionOffsets, pool); + partitionedConstantVector->partition(partitions, ctx); + return partitionedConstantVector; + } + + case VectorEncoding::Simple::ARRAY: + case VectorEncoding::Simple::MAP: + case VectorEncoding::Simple::DICTIONARY: + case VectorEncoding::Simple::BIASED: + case VectorEncoding::Simple::SEQUENCE: + case VectorEncoding::Simple::LAZY: + VELOX_UNSUPPORTED( + "Unsupported vector encoding for PartitionedVector: {}", + mapSimpleToName(encoding)); + default: + VELOX_UNREACHABLE( + "Invalid vector encoding 
for PartitionedVector: {}", encoding); + } +} + +VectorPtr PartitionedVector::baseVector() const { + return vector_; +} + +std::string PartitionedVector::toString() const { + std::string offsets; + for (vector_size_t i = 0; i < numPartitions_; ++i) { + if (i > 0) { + offsets += ','; + } + offsets += fmt::format("{}", rawEndPartitionOffsets_[i]); + } + + return fmt::format( + "PartitionedVector[numPartitions: {}, offsets: {}]", + numPartitions_, + offsets); +} + +template +void PartitionedFlatVector::partition( + const std::vector& partitions, + PartitionBuildContext& ctx) { + if (vector_->rawNulls()) { + Byte* rawNulls = reinterpret_cast(vector_->mutableRawNulls()); + partitionBitsInPlace( + rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_); + } + + auto valuesBuffer = vector_->as>()->values(); + partitionFixedWidthValues( + valuesBuffer, + partitions, + endPartitionOffsets_, + numPartitions_, + ctx, + pool_); + + // Count nulls per partition from the now-partitioned null bitmap. + if (const uint64_t* rawNulls = vector_->rawNulls()) { + for (uint32_t p = 0; p < numPartitions_; ++p) { + const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1]; + const vector_size_t end = rawEndPartitionOffsets_[p]; + if (begin < end) { + numNullsPerPartition_[p] = + static_cast(bits::countNulls(rawNulls, begin, end)); + } + } + } +} + +template +VectorPtr PartitionedFlatVector::partitionAt(uint32_t partition) const { + VELOX_CHECK_LT(partition, numPartitions_); + + vector_size_t beginOffset = + partition == 0 ? 
0 : rawEndPartitionOffsets_[partition - 1]; + vector_size_t numRowsInPartition = + rawEndPartitionOffsets_[partition] - beginOffset; + + return vector_->slice(beginOffset, numRowsInPartition); +} + +void PartitionedRowVector::partition( + const std::vector& partitions, + PartitionBuildContext& ctx) { + auto* rowVector = vector_->as(); + partitionedChildren_.reserve(rowVector->childrenSize()); + + for (const auto& child : rowVector->children()) { + partitionedChildren_.push_back( + PartitionedVector::create( + child, + partitions, + numPartitions_, + endPartitionOffsets_, + ctx, + pool_)); + } + + if (numPartitions_ > 1 && vector_->rawNulls()) { + Byte* rawNulls = reinterpret_cast(vector_->mutableRawNulls()); + partitionBitsInPlace( + rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_); + } + + // Count nulls per partition from the now-partitioned null bitmap. + if (const uint64_t* rawNulls = vector_->rawNulls()) { + for (uint32_t p = 0; p < numPartitions_; ++p) { + const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1]; + const vector_size_t end = rawEndPartitionOffsets_[p]; + if (begin < end) { + numNullsPerPartition_[p] = + static_cast(bits::countNulls(rawNulls, begin, end)); + } + } + } +} + +VectorPtr PartitionedRowVector::partitionAt(uint32_t partition) const { + VELOX_CHECK_LT(partition, numPartitions_); + + vector_size_t beginOffset = + partition == 0 ? 
0 : rawEndPartitionOffsets_[partition - 1]; + vector_size_t numRowsInPartition = + rawEndPartitionOffsets_[partition] - beginOffset; + + std::vector children; + children.reserve(partitionedChildren_.size()); + for (const auto& child : partitionedChildren_) { + children.push_back(child->partitionAt(partition)); + } + + BufferPtr nulls = nullptr; + if (numRowsInPartition > 0 && vector_->rawNulls()) { + nulls = AlignedBuffer::allocate(numRowsInPartition, pool_); + bits::copyBits( + vector_->rawNulls(), + beginOffset, + nulls->asMutable(), + 0, + numRowsInPartition); + } + + return std::make_shared( + pool_, + vector_->type(), + std::move(nulls), + numRowsInPartition, + std::move(children)); +} + +void PartitionedConstantVector::partition( + const std::vector& /*partitions*/, + PartitionBuildContext& /*ctx*/) { + if (!vector_->isNullAt(0)) { + return; + } + + for (uint32_t p = 0; p < numPartitions_; ++p) { + const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1]; + const vector_size_t end = rawEndPartitionOffsets_[p]; + if (begin < end) { + numNullsPerPartition_[p] = end - begin; + } + } +} + +VectorPtr PartitionedConstantVector::partitionAt(uint32_t partition) const { + VELOX_CHECK_LT(partition, numPartitions_); + + const vector_size_t beginOffset = + partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1]; + const vector_size_t numRowsInPartition = + rawEndPartitionOffsets_[partition] - beginOffset; + + return vector_->slice(0, numRowsInPartition); +} + +} // namespace facebook::velox diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h new file mode 100644 index 00000000000..24dec3f03fb --- /dev/null +++ b/velox/vector/PartitionedVector.h @@ -0,0 +1,321 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "velox/vector/BaseVector.h" +#include "velox/vector/ComplexVector.h" + +namespace facebook::velox { + +class PartitionedVector; +using PartitionedVectorPtr = std::shared_ptr; + +namespace { + +// TODO: This was copied from dwio::common::BufferUtil.h. However the vector +// module should not depend on dwio. Move this to a common place +template +inline void ensureCapacity( + BufferPtr& data, + size_t numElements, + velox::memory::MemoryPool* pool, + bool preserveOldData = false, + bool clearBits = false) { + size_t oldSize = 0; + size_t newCapacity = BaseVector::byteSize(numElements); + if (!data) { + data = AlignedBuffer::allocate(numElements, pool); + } else { + oldSize = data->size(); + if (!data->isMutable() || data->capacity() < newCapacity) { + auto newData = AlignedBuffer::allocate(numElements, pool); + if (preserveOldData) { + std::memcpy( + newData->template asMutable(), + data->as(), + oldSize); + } + data = newData; + } + } + + if (clearBits && newCapacity > oldSize) { + std::memset( + (void*)(data->asMutable() + oldSize), + 0L, + newCapacity - oldSize); + } +} + +} // namespace + +/// Construction-time context used to build a PartitionedVector. +/// +/// This struct contains only transient execution context needed during +/// construction. None of the fields here define the logical state of +/// PartitionedVector and none are retained after create(). +/// All fields are only valid during the PartitionedVector::create() call. 
+struct PartitionBuildContext { + BufferPtr cursorPartitionOffsets = nullptr; + + PartitionBuildContext() = default; +}; + +/// PartitionedVector provides an in-place, partition-aware layout of a vector +/// based on per-row partition IDs. +/// +/// This is a low-level execution abstraction, analogous to DecodedVector: +/// - it owns partitioning metadata (offsets, indices) +/// - it does not encode operator-specific semantics +/// - it is intended to be reused by multiple exec components +/// (aggregation, sorting, shuffle, etc.) +/// +/// The partitioning operation rearranges rows so that rows belonging to the +/// same partition occupy a contiguous range. +/// +/// Thread-safety: +/// This class is NOT thread-safe. All methods must be called from a single +/// thread. Internal buffers are mutated during create(). +class PartitionedVector { + public: + /// Disable default constructor. + PartitionedVector() = delete; + + /// Disable copy constructor and assignment. + PartitionedVector(const PartitionedVector& other) = delete; + PartitionedVector& operator=(const PartitionedVector& other) = delete; + + // Use default move constructor and move assignment operator. + PartitionedVector(PartitionedVector&&) noexcept = default; + PartitionedVector& operator=(PartitionedVector&&) noexcept = default; + + /// Virtual destructor. + virtual ~PartitionedVector(); + + /// Factory method to create a PartitionedVector. This is the main entry point + /// for constructing a PartitionedVector. The partitioning operation + /// rearranges rows in the base vector so that rows belonging to the same + /// partition occupy a contiguous range. + /// + /// Params: + /// - vector: the base vector to be partitioned. This is modified during + /// partitioning, and becomes the underlying vector of the created + /// PartitionedVector. + /// - partitions: a vector of partition IDs for each row in the base vector. 
+ /// The length of this vector must be the same as the number of rows in the + /// base vector. Each entry must be a value between 0 and numPartitions - 1. + /// - numPartitions: the total number of partitions. This must be greater than + /// 0. + /// - ctx: the context object for building the partitioned vector. This + /// contains transient execution context needed during construction, such as + /// intermediate buffers. None of the fields in this context define the + /// logical state of the PartitionedVector, and none are retained after + /// create(). All fields in this context are only valid during the create() + /// call. + /// - pool: the memory pool for allocating any necessary buffers during the + /// creation of the PartitionedVector. + static PartitionedVectorPtr create( + const VectorPtr& vector, + const std::vector& partitions, + uint32_t numPartitions, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool); + + /// Returns the underlying vector. + VectorPtr baseVector() const; + + /// Returns the partitioned vector at partition p. If the number of rows in + /// that partition is 0, returns an empty vector. + virtual VectorPtr partitionAt(uint32_t partition) const = 0; + + template + T* as() { + static_assert(std::is_base_of_v); + return dynamic_cast(this); + } + + /// Returns the number of null rows in the given partition. + vector_size_t numNullsAt(uint32_t partition) const { + VELOX_DCHECK_LT(partition, numPartitions_); + return numNullsPerPartition_[partition]; + } + + vector_size_t numRowsAt(uint32_t partition) const { + auto beginOffset = + partition == 0 ? 
0 : rawEndPartitionOffsets_[partition - 1]; + auto endOffset = rawEndPartitionOffsets_[partition]; + return endOffset - beginOffset; + } + + TypeKind typeKind() const { + return vector_->typeKind(); + } + + vector_size_t* rawPartitionOffsets() { + return rawEndPartitionOffsets_; + } + + virtual const vector_size_t* rawSizes() = 0; + + /// Returns string representation of the value in the specified row. + virtual std::string toString() const; + + protected: + // Internal create method that accepts pre-computed endPartitionOffsets + // buffer. + static PartitionedVectorPtr create( + const VectorPtr& vector, + const std::vector& partitions, + uint32_t numPartitions, + const BufferPtr& partitionOffsetsBuffer, + PartitionBuildContext& ctx, + velox::memory::MemoryPool* pool); + + PartitionedVector( + const VectorPtr& vector, + uint32_t numPartitions, + const BufferPtr& endPartitionOffsets, + velox::memory::MemoryPool* pool) + : vector_(vector), + numPartitions_(numPartitions), + endPartitionOffsets_(endPartitionOffsets), + numNullsPerPartition_(numPartitions, 0), + pool_(pool) { + VELOX_CHECK_NOT_NULL(vector_); + VELOX_CHECK_GT(numPartitions_, 0); + VELOX_CHECK_NOT_NULL(endPartitionOffsets_); + VELOX_CHECK_EQ( + endPartitionOffsets_->size(), numPartitions_ * sizeof(vector_size_t)); + VELOX_CHECK_NOT_NULL(pool_); + + rawEndPartitionOffsets_ = endPartitionOffsets_->asMutable(); + } + + virtual void partition( + const std::vector& partitions, + PartitionBuildContext& ctx) = 0; + + // The base vector that is being partitioned. This is modified during + // partitioning. + VectorPtr vector_; + + // Total number of partitions. This is set at construction and does not change + // during partitioning. It doesn't have const quantifier because we want to + // allow move assignment operator. + uint32_t numPartitions_; + + // The cumulative end row offsets for each partition. 
For example, if there + // are 3 partitions with 2, 3, and 1 rows respectively, then + // endPartitionOffsets_[0] = 2, endPartitionOffsets_[1] = 5, and + // endPartitionOffsets_[2] = 6. + BufferPtr endPartitionOffsets_; + + // The raw pointer to the endPartitionOffsets_ buffer for easy access during + // partitioning. + vector_size_t* rawEndPartitionOffsets_; + + /// Null row counts per partition, computed during partition(). + std::vector numNullsPerPartition_; + + velox::memory::MemoryPool* pool_; +}; + +using PartitionedVectorPtr = std::shared_ptr; + +template +class PartitionedFlatVector : public PartitionedVector { + public: + PartitionedFlatVector( + const VectorPtr& flatVector, + uint32_t numPartitions, + const BufferPtr& partitionOffsets, + velox::memory::MemoryPool* pool) + : PartitionedVector(flatVector, numPartitions, partitionOffsets, pool) {} + + void partition( + const std::vector& partitions, + PartitionBuildContext& ctx) override; + + VectorPtr partitionAt(uint32_t partition) const override; + + const vector_size_t* rawSizes() override { + VELOX_UNREACHABLE("PartitionedFlatVector does not implement rawSizes()"); + } +}; + +/// Partitions a RowVector in-place so that rows belonging to the same +/// partition occupy a contiguous range. Recursively partitions each child +/// column using PartitionedVector. +class PartitionedRowVector : public PartitionedVector { + public: + PartitionedRowVector( + const VectorPtr& rowVector, + uint32_t numPartitions, + const BufferPtr& partitionOffsets, + velox::memory::MemoryPool* pool) + : PartitionedVector(rowVector, numPartitions, partitionOffsets, pool) {} + + void partition( + const std::vector& partitions, + PartitionBuildContext& ctx) override; + + VectorPtr partitionAt(uint32_t partition) const override; + + /// Returns the partitioned child vector at the given column index. 
+ PartitionedVectorPtr childAt(uint32_t col) const { + VELOX_DCHECK_LT(col, partitionedChildren_.size()); + return partitionedChildren_[col]; + } + + const vector_size_t* rawSizes() override { + VELOX_UNREACHABLE("PartitionedRowVector does not implement rawSizes()"); + } + + private: + /// Partitioned child columns, one per child of the underlying RowVector. + std::vector partitionedChildren_; +}; + +/// Partitions a ConstantVector by reusing the same constant payload and +/// returning constant slices sized to each partition. +class PartitionedConstantVector : public PartitionedVector { + public: + PartitionedConstantVector( + const VectorPtr& constantVector, + uint32_t numPartitions, + const BufferPtr& partitionOffsets, + velox::memory::MemoryPool* pool) + : PartitionedVector( + constantVector, + numPartitions, + partitionOffsets, + pool) {} + + void partition( + const std::vector& partitions, + PartitionBuildContext& ctx) override; + + VectorPtr partitionAt(uint32_t partition) const override; + + const vector_size_t* rawSizes() override { + VELOX_UNREACHABLE( + "PartitionedConstantVector does not implement rawSizes()"); + } +}; + +} // namespace facebook::velox diff --git a/velox/vector/benchmarks/CMakeLists.txt b/velox/vector/benchmarks/CMakeLists.txt index 0cb3c78bfd8..8c1840daa1b 100644 --- a/velox/vector/benchmarks/CMakeLists.txt +++ b/velox/vector/benchmarks/CMakeLists.txt @@ -45,3 +45,13 @@ target_link_libraries( gflags::gflags glog::glog ) + +add_executable(velox_vector_partitioned_vector_benchmark PartitionedVectorBenchmark.cpp) +target_link_libraries( + velox_vector_partitioned_vector_benchmark + velox_dwio_common_test_utils + velox_vector + velox_vector_test_lib + Folly::folly + Folly::follybenchmark +) diff --git a/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp new file mode 100644 index 00000000000..8589bbec0a0 --- /dev/null +++ b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp 
@@ -0,0 +1,271 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include + +#include "velox/vector/PartitionedVector.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +// Add the following definitions to allow Clion runs +DEFINE_bool(gtest_color, false, ""); +DEFINE_string(gtest_filter, "*", ""); + +using namespace facebook::velox; +using namespace facebook::velox::test; + +namespace facebook::velox::test { + +namespace { + +thread_local auto gen = std::mt19937(42); + +const std::function noNulls; + +auto allNulls = [](vector_size_t) { return true; }; + +auto halfNulls = [](vector_size_t row) { return row % 2 == 0; }; + +template +RowTypePtr scalarTypeGenerator(int32_t numColumns) { + return ROW(std::vector(numColumns, createScalarType())); +} + +RowTypePtr dateTypeGenerator(int32_t numColumns) { + return ROW(std::vector(numColumns, DATE())); +} + +RowTypePtr shortDecimalTypeGenerator(int32_t numColumns) { + return ROW(std::vector(numColumns, DECIMAL(10, 2))); +} + +RowTypePtr longDecimalTypeGenerator(int32_t numColumns) { + return ROW(std::vector(numColumns, DECIMAL(20, 3))); +} + +RowTypePtr mixedFlatTypeGenerator(int32_t numColumns) { + const std::vector typeSelection = { + BOOLEAN(), + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + HUGEINT(), + REAL(), + DOUBLE(), + TIMESTAMP(), + DATE(), + DECIMAL(10, 2), + DECIMAL(20, 3), + }; + + std::vector types; + 
types.reserve(numColumns); + + for (int i = 0; i < numColumns; ++i) { + types.push_back(typeSelection[i % typeSelection.size()]); + } + + std::ranges::shuffle(types, gen); + + return ROW(std::move(types)); +} + +auto randomPartitionFunction = [](const RowVectorPtr& vector, + uint32_t numPartitions, + std::vector& partitions) { + partitions.resize(vector->size()); + for (int i = 0; i < vector->size(); ++i) { + partitions[i] = gen() % numPartitions; + } +}; + +/// Builds benchmark row vectors, one column at a time. +class VectorBuilder : public VectorTestBase { + public: + RowVectorPtr makeRowVector( + const RowTypePtr& rowType, + vector_size_t numRows, + const std::function& isNullAt) { + std::vector children; + children.reserve(rowType->size()); + for (auto i = 0; i < rowType->size(); ++i) { + children.push_back(makeColumn(rowType->childAt(i), numRows, isNullAt)); + } + return VectorTestBase::makeRowVector(children); + } + + private: + VectorPtr makeColumn( + const TypePtr& type, + vector_size_t size, + const std::function& isNullAt) { + switch (type->kind()) { + case TypeKind::BOOLEAN: + return makeFlatVector( + size, [](auto row) { return row % 2 == 0; }, isNullAt, type); + case TypeKind::TINYINT: + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + isNullAt, + type); + case TypeKind::SMALLINT: + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + isNullAt, + type); + case TypeKind::INTEGER: + if (type->isDate()) { + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + isNullAt, + type); + } + return makeFlatVector( + size, [](auto row) { return row; }, isNullAt, type); + case TypeKind::BIGINT: + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + isNullAt, + type); + case TypeKind::HUGEINT: + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + isNullAt, + type); + case TypeKind::REAL: + return makeFlatVector( + size, + [](auto row) { 
return static_cast(row); }, + isNullAt, + type); + case TypeKind::DOUBLE: + return makeFlatVector( + size, + [](auto row) { return static_cast(row); }, + isNullAt, + type); + case TypeKind::TIMESTAMP: + return makeFlatVector( + size, + [](auto row) { return Timestamp(row, row * 1'000); }, + isNullAt, + type); + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + // Alternate between short inlined strings (≤12 bytes) and long + // out-of-line strings (>12 bytes) to exercise both StringView paths. + return makeFlatVector( + size, + [](auto row) -> std::string { + if (row % 2 == 0) { + return fmt::format("v-{}", row); + } + return fmt::format("velox_benchmark_string_{:08d}", row); + }, + isNullAt, + type); + default: + VELOX_UNSUPPORTED("Unsupported benchmark type: {}", type->toString()); + } + } +}; + +} // namespace + +/// Constructs all benchmark state and runs the benchmark. Called once per +/// benchmark entry; construction is outside the timed region. +void runBM( + uint32_t iterations, + const std::function& rowTypeGenerator, + int32_t numColumns, + uint32_t numPartitions, + const std::function& isNullAt = noNulls, + vector_size_t numRows = 10'000) { + folly::BenchmarkSuspender suspender; + VectorBuilder vectorBuilder; + auto pool = memory::memoryManager()->addLeafPool(); + PartitionBuildContext ctx; + auto vector = vectorBuilder.makeRowVector( + rowTypeGenerator(numColumns), numRows, isNullAt); + std::vector partitions; + randomPartitionFunction(vector, numPartitions, partitions); + for (uint32_t i = 0; i < iterations; ++i) { + const auto vectorCopy = std::static_pointer_cast( + BaseVector::copy(*vector, pool.get())); + suspender.dismiss(); + PartitionedVector::create( + vectorCopy, partitions, numPartitions, ctx, pool.get()); + suspender.rehire(); + } +} + +#define BENCHMARK_CONFIG(name, generator, numCols, nulls, numParts) \ + BENCHMARK_NAMED_PARAM( \ + runBM, \ + name##_##numCols##Cols_##nulls##_P##numParts, \ + generator, \ + numCols, \ + numParts, \ + 
nulls); + +#define BENCHMARK_PARTITIONS(name, generator, numCols, nulls) \ + BENCHMARK_CONFIG(name, generator, numCols, nulls, 4) \ + BENCHMARK_CONFIG(name, generator, numCols, nulls, 16) \ + BENCHMARK_CONFIG(name, generator, numCols, nulls, 64) \ + BENCHMARK_CONFIG(name, generator, numCols, nulls, 256) \ + BENCHMARK_CONFIG(name, generator, numCols, nulls, 1024) + +#define BENCHMARK_SIZES(name, generator, nulls) \ + BENCHMARK_PARTITIONS(name, generator, 1, nulls) \ + BENCHMARK_PARTITIONS(name, generator, 10, nulls) \ + BENCHMARK_PARTITIONS(name, generator, 100, nulls) \ + BENCHMARK_PARTITIONS(name, generator, 1000, nulls) + +#define BENCHMARK_TYPE(name, generator) \ + BENCHMARK_SIZES(name, generator, noNulls) \ + BENCHMARK_SIZES(name, generator, allNulls) \ + BENCHMARK_SIZES(name, generator, halfNulls) + +BENCHMARK_TYPE(BOOLEAN, scalarTypeGenerator); +BENCHMARK_TYPE(SMALLINT, scalarTypeGenerator); +BENCHMARK_TYPE(INTEGER, scalarTypeGenerator); +BENCHMARK_TYPE(BIGINT, scalarTypeGenerator); +BENCHMARK_TYPE(HUGEINT, scalarTypeGenerator); +BENCHMARK_TYPE(REAL, scalarTypeGenerator); +BENCHMARK_TYPE(DOUBLE, scalarTypeGenerator); +BENCHMARK_TYPE(TIMESTAMP, scalarTypeGenerator); +BENCHMARK_TYPE(VARCHAR, scalarTypeGenerator); +BENCHMARK_TYPE(VARBINARY, scalarTypeGenerator); +BENCHMARK_TYPE(DATE, dateTypeGenerator); +BENCHMARK_TYPE(ShortDecimal, shortDecimalTypeGenerator); +BENCHMARK_TYPE(LongDecimal, longDecimalTypeGenerator); +BENCHMARK_TYPE(Mixed, mixedFlatTypeGenerator); + +} // namespace facebook::velox::test + +int main(int argc, char** argv) { + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize(memory::MemoryManager::Options{}); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/vector/tests/CMakeLists.txt b/velox/vector/tests/CMakeLists.txt index 24478b9c8e5..08277820124 100644 --- a/velox/vector/tests/CMakeLists.txt +++ b/velox/vector/tests/CMakeLists.txt @@ -25,6 +25,7 @@ add_executable( LazyVectorTest.cpp MapConcatTest.cpp 
MayHaveNullsRecursiveTest.cpp + PartitionedVectorTest.cpp SelectivityVectorTest.cpp StringVectorBufferTest.cpp VariantToVectorTest.cpp diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp new file mode 100644 index 00000000000..569a6e6ae9f --- /dev/null +++ b/velox/vector/tests/PartitionedVectorTest.cpp @@ -0,0 +1,416 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +#include "vector/tests/utils/VectorTestBase.h" +#include "velox/vector/PartitionedVector.h" +#include "velox/vector/tests/utils/PartitionedVectorTestBase.h" + +namespace facebook::velox::test { + +class PartitioningVectorTest : public testing::TestWithParam, + public test::PartitionedVectorTestBase { + protected: + std::mt19937 gen_ = std::mt19937(std::random_device{}()); + + PartitionBuildContext ctx_; + BufferPtr partitionOffsets_; + + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + void testPartitionedVector( + VectorPtr vector, + const std::vector& partitions, + uint32_t numPartitions) { + // Back up the vector before calling PartitionedVector::create() + VectorPtr vectorCopy = BaseVector::copy(*vector); + // Build the expected vector using the reference implementation + std::vector expectedVectors = + partitionVectorByWrapping(vectorCopy, partitions, numPartitions); + + // Initialize buffers needed for 
PartitionedVector::create() + ensureCapacity( + ctx_.cursorPartitionOffsets, numPartitions, pool_.get()); + + // Calculate the number of values for each partition + std::vector partitionRowCounts(numPartitions, 0); + for (auto partition : partitions) { + partitionRowCounts[partition]++; + } + + // Create the partitioned vector using the actual implementation + auto partitionedVector = PartitionedVector::create( + vector, + partitions, + numPartitions, + // partitionOffsets_, + ctx_, + pool_.get()); + VELOX_CHECK_NOT_NULL(partitionedVector); + + // Extract each partition and compare with expected results + std::vector partitionedVectors; + for (uint32_t i = 0; i < numPartitions; ++i) { + auto partition = partitionedVector->partitionAt(i); + partitionedVectors.push_back(partition); + } + + for (uint32_t i = 0; i < numPartitions; ++i) { + test::assertEqualVectors( + expectedVectors[i], canonicalize(partitionedVectors[i])); + } + } + + void testVectorPartitioning(VectorPtr vector) { + auto numRows = vector->size(); + std::vector partitions(numRows); + + // Test with single partition + std::fill(partitions.begin(), partitions.end(), 0); + auto vectorCopy = BaseVector::copy(*vector, pool_.get()); + testPartitionedVector(vectorCopy, partitions, 1); + + // Test with two partitions + if (vector->size() >= 3) { + for (uint32_t i = 0; i < partitions.size(); ++i) { + partitions[i] = i % 2; + } + vectorCopy = BaseVector::copy(*vector, pool_.get()); + testPartitionedVector(vectorCopy, partitions, 2); + } + + // Test with three partitions + for (uint32_t i = 0; i < partitions.size(); ++i) { + partitions[i] = i % 3; + } + vectorCopy = BaseVector::copy(*vector, pool_.get()); + testPartitionedVector(vectorCopy, partitions, 3); + + if (vector->size() > 4) { + // Test with four partitions where the first partition is empty + for (uint32_t i = 0; i < partitions.size(); ++i) { + partitions[i] = i % 3 + 1; + } + vectorCopy = BaseVector::copy(*vector, pool_.get()); + 
testPartitionedVector(vectorCopy, partitions, 4); + + // Test with four partitions where the last partition is empty + for (uint32_t i = 0; i < partitions.size(); ++i) { + partitions[i] = i % 3; + } + vectorCopy = BaseVector::copy(*vector, pool_.get()); + testPartitionedVector(vectorCopy, partitions, 4); + } + + // Test with one value per partition + if (vector->size() > 0) { + std::iota(partitions.begin(), partitions.end(), 0); + vectorCopy = BaseVector::copy(*vector, pool_.get()); + testPartitionedVector(vectorCopy, partitions, numRows); + } + + // Test with random partitions (number of partitions <= number of values) + std::uniform_int_distribution<> dis(0, numRows - 1); + uint32_t maxPartition = 0; + for (uint32_t i = 0; i < numRows; ++i) { + partitions[i] = dis(gen_); + maxPartition = std::max(maxPartition, partitions[i]); + } + vectorCopy = BaseVector::copy(*vector, pool_.get()); + testPartitionedVector(vectorCopy, partitions, maxPartition + 1); + } +}; + +TEST_P(PartitioningVectorTest, testFlatVector) { + // Number of values in the vector to be partitioned. This is passed as a test + // parameter and is used to test different vector sizes, including edge cases + // like 0 and 1. 
+ const int numValues = GetParam(); + + // Random values, no nulls + testVectorPartitioning( + makeFlatVector(numValues, [](auto row) { return row; })); + + // Random values, with half number of nulls + testVectorPartitioning( + makeFlatVector( + numValues, [](auto row) { return row; }, nullEvery(2, 1))); + + // All nulls + testVectorPartitioning(makeAllNullFlatVector(numValues)); +} + +TEST_P(PartitioningVectorTest, testFlatBoolVector) { + const int numValues = GetParam(); + + // Random values, no nulls + testVectorPartitioning( + makeFlatVector(numValues, [](auto row) { return row % 2 == 0; })); + + // Random values, with half number of nulls + testVectorPartitioning( + makeFlatVector( + numValues, [](auto row) { return row % 2 == 0; }, nullEvery(2, 1))); + + // All nulls + testVectorPartitioning(makeAllNullFlatVector(numValues)); +} + +TEST_P(PartitioningVectorTest, testRowVector) { + const int numValues = GetParam(); + + // Two flat columns, no nulls at any level. + testVectorPartitioning(makeRowVector({ + makeFlatVector(numValues, [](auto row) { return row; }), + makeFlatVector(numValues, [](auto row) { return row * 10; }), + })); + + // Two flat columns with nullable children. + testVectorPartitioning(makeRowVector({ + makeFlatVector( + numValues, [](auto row) { return row; }, nullEvery(2)), + makeFlatVector( + numValues, [](auto row) { return row * 10; }, nullEvery(3)), + })); + + // Row-level nulls with no child nulls. + testVectorPartitioning(makeRowVector( + {makeFlatVector(numValues, [](auto row) { return row; })}, + nullEvery(2))); + + // Row-level nulls combined with nullable children. + testVectorPartitioning(makeRowVector( + {makeFlatVector( + numValues, [](auto row) { return row; }, nullEvery(3))}, + nullEvery(2))); + + // All rows null. + testVectorPartitioning(makeRowVector( + {makeFlatVector(numValues, [](auto row) { return row; })}, + [](auto /*row*/) { return true; })); + + // Nested RowVector. 
+ testVectorPartitioning(makeRowVector({ + makeFlatVector(numValues, [](auto row) { return row; }), + makeRowVector({ + makeFlatVector(numValues, [](auto row) { return row; }), + }), + })); +} + +TEST_P(PartitioningVectorTest, testConstantVector) { + const int numValues = GetParam(); + + testVectorPartitioning(makeConstant(7, numValues)); + testVectorPartitioning(makeConstant(std::nullopt, numValues)); + testVectorPartitioning(makeConstantRow( + ROW({"c0", "c1"}, {INTEGER(), VARCHAR()}), + variant::row({variant(11), variant("constant")}), + numValues)); +} + +// Partitioning a null-free vector must not allocate a null buffer. +TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeFlat) { + const int numValues = GetParam(); + if (numValues == 0) { + return; + } + + auto flat = makeFlatVector(numValues, [](auto row) { return row; }); + ASSERT_FALSE(flat->mayHaveNulls()); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 2; + } + + auto pv = PartitionedVector::create(flat, partitions, 2, ctx_, pool_.get()); + EXPECT_FALSE(pv->baseVector()->mayHaveNulls()) + << "partition() must not allocate a null buffer for a null-free FlatVector"; +} + +// Partitioning a null-free RowVector must not allocate null buffers on the +// row vector or any of its children. 
+TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeRow) { + const int numValues = GetParam(); + if (numValues == 0) { + return; + } + + auto row = makeRowVector({ + makeFlatVector(numValues, [](auto row) { return row; }), + makeFlatVector(numValues, [](auto row) { return row * 10; }), + }); + ASSERT_FALSE(row->mayHaveNulls()); + ASSERT_FALSE(row->childAt(0)->mayHaveNulls()); + ASSERT_FALSE(row->childAt(1)->mayHaveNulls()); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 2; + } + + auto pv = PartitionedVector::create(row, partitions, 2, ctx_, pool_.get()); + auto* base = pv->baseVector()->as(); + EXPECT_FALSE(base->mayHaveNulls()) + << "partition() must not allocate a null buffer for a null-free RowVector"; + EXPECT_FALSE(base->childAt(0)->mayHaveNulls()) + << "partition() must not allocate a null buffer for null-free child 0"; + EXPECT_FALSE(base->childAt(1)->mayHaveNulls()) + << "partition() must not allocate a null buffer for null-free child 1"; +} + +// numNullsAt() tests +// --------------------------------------------------------------------------- + +// A null-free flat vector must report zero nulls for every partition. +TEST_P(PartitioningVectorTest, numNullsAtFlatNoNulls) { + const int numValues = GetParam(); + auto flat = makeFlatVector(numValues, [](auto row) { return row; }); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 3; + } + auto pv = PartitionedVector::create(flat, partitions, 3, ctx_, pool_.get()); + for (uint32_t p = 0; p < 3; ++p) { + EXPECT_EQ(pv->numNullsAt(p), 0) << "partition " << p; + } +} + +// A flat vector with every other row null must report the exact per-partition +// null count. The sum across all partitions must equal the total null count. 
+TEST_P(PartitioningVectorTest, numNullsAtFlatSomeNulls) { + const int numValues = GetParam(); + auto flat = makeFlatVector( + numValues, [](auto row) { return row; }, nullEvery(2)); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 3; + } + auto pv = PartitionedVector::create(flat, partitions, 3, ctx_, pool_.get()); + + // Per-partition counts must agree with manual bit-scan of the base vector. + const auto* rawNulls = pv->baseVector()->rawNulls(); + const auto* rawOffsets = pv->rawPartitionOffsets(); + for (uint32_t p = 0; p < 3; ++p) { + const vector_size_t begin = p == 0 ? 0 : rawOffsets[p - 1]; + const vector_size_t end = rawOffsets[p]; + const vector_size_t expected = rawNulls + ? BaseVector::countNulls(pv->baseVector()->nulls(), begin, end) + : 0; + EXPECT_EQ(pv->numNullsAt(p), expected) << "partition " << p; + } + + // Sum across partitions must equal the total null count in the source vector. + const vector_size_t total = + pv->numNullsAt(0) + pv->numNullsAt(1) + pv->numNullsAt(2); + EXPECT_EQ(total, BaseVector::countNulls(flat->nulls(), 0, numValues)); +} + +// An all-null flat vector must report numNullsAt(p) == rows in that partition. +TEST_P(PartitioningVectorTest, numNullsAtFlatAllNulls) { + const int numValues = GetParam(); + auto flat = makeAllNullFlatVector(numValues); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 3; + } + auto pv = PartitionedVector::create(flat, partitions, 3, ctx_, pool_.get()); + + const auto* rawOffsets = pv->rawPartitionOffsets(); + for (uint32_t p = 0; p < 3; ++p) { + const vector_size_t begin = p == 0 ? 0 : rawOffsets[p - 1]; + const vector_size_t numRowsInPartition = rawOffsets[p] - begin; + EXPECT_EQ(pv->numNullsAt(p), numRowsInPartition) << "partition " << p; + } +} + +// A row vector with no row-level nulls must report zero per-partition nulls at +// the row level, even when child columns have nulls. 
+TEST_P(PartitioningVectorTest, numNullsAtRowNoRowLevelNulls) { + const int numValues = GetParam(); + auto row = makeRowVector({ + makeFlatVector( + numValues, [](auto row) { return row; }, nullEvery(2)), + }); + ASSERT_FALSE(row->mayHaveNulls()); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 3; + } + auto pv = PartitionedVector::create(row, partitions, 3, ctx_, pool_.get()); + for (uint32_t p = 0; p < 3; ++p) { + EXPECT_EQ(pv->numNullsAt(p), 0) + << "Row-level numNullsAt() must not count child nulls, partition " << p; + } +} + +// A row vector with row-level nulls must report per-partition counts that match +// a manual bit-scan. Child null counts must be counted independently. +TEST_P(PartitioningVectorTest, numNullsAtRowRowLevelNulls) { + const int numValues = GetParam(); + auto row = makeRowVector( + {makeFlatVector( + numValues, [](auto row) { return row; }, nullEvery(3))}, + nullEvery(2)); + + std::vector partitions(numValues); + for (int i = 0; i < numValues; ++i) { + partitions[i] = i % 3; + } + auto pv = PartitionedVector::create(row, partitions, 3, ctx_, pool_.get()); + + const auto* rawOffsets = pv->rawPartitionOffsets(); + for (uint32_t p = 0; p < 3; ++p) { + const vector_size_t begin = p == 0 ? 0 : rawOffsets[p - 1]; + const vector_size_t end = rawOffsets[p]; + const vector_size_t expected = + BaseVector::countNulls(pv->baseVector()->nulls(), begin, end); + EXPECT_EQ(pv->numNullsAt(p), expected) + << "Row-level null count mismatch, partition " << p; + } + + // Child null counts must be tracked independently of row-level nulls. + auto* prv = dynamic_cast(pv.get()); + ASSERT_NE(prv, nullptr); + auto child = prv->childAt(0); + const auto* childOffsets = child->rawPartitionOffsets(); + for (uint32_t p = 0; p < 3; ++p) { + const vector_size_t begin = p == 0 ? 
0 : childOffsets[p - 1]; + const vector_size_t end = childOffsets[p]; + const vector_size_t expected = + BaseVector::countNulls(child->baseVector()->nulls(), begin, end); + EXPECT_EQ(child->numNullsAt(p), expected) + << "Child null count mismatch, partition " << p; + } +} + +// Test with different vector sizes, including edge cases like 0 and 1. +INSTANTIATE_TEST_SUITE_P( + FlatVectorSizes, + PartitioningVectorTest, + ::testing::Values(0, 1, 10, 10000)); + +} // namespace facebook::velox::test diff --git a/velox/vector/tests/utils/CMakeLists.txt b/velox/vector/tests/utils/CMakeLists.txt index 9e7fbae65b6..35a56901ccf 100644 --- a/velox/vector/tests/utils/CMakeLists.txt +++ b/velox/vector/tests/utils/CMakeLists.txt @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_vector_test_lib VectorMaker.cpp VectorTestBase.cpp) +add_library(velox_vector_test_lib PartitionedVectorTestBase.cpp VectorMaker.cpp VectorTestBase.cpp) velox_add_test_headers( velox_vector_test_lib VectorMaker-inl.h diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.cpp b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp new file mode 100644 index 00000000000..e9191ba0b8f --- /dev/null +++ b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/vector/tests/utils/PartitionedVectorTestBase.h" + +namespace facebook::velox::test { + +VectorPtr PartitionedVectorTestBase::canonicalize(VectorPtr vector) { + auto numRows = vector->size(); + + auto indices = makeIndices(numRows, [&](auto row) { return row; }); + vector_size_t* indicesRange = indices->asMutable(); + + // Sort the indices based on the vector values + std::stable_sort( + indicesRange, + indicesRange + numRows, + [&](vector_size_t left, vector_size_t right) { + return vector->compare(vector.get(), left, right) < 0; + }); + + auto sortedVector = wrapInDictionary(indices, numRows, vector); + return sortedVector; +} + +std::vector PartitionedVectorTestBase::partitionVectorByWrapping( + VectorPtr vector, + const std::vector& partitions, + uint32_t numPartitions) { + auto numRows = vector->size(); + + // Count the number of rows in each partition + std::vector partitionRowCounts(numPartitions, 0); + for (int i = 0; i < numRows; i++) { + partitionRowCounts[partitions[i]]++; + } + + std::vector partitionedVectors(numPartitions, nullptr); + + for (int p = 0; p < numPartitions; p++) { + auto numRowsInPartition = partitionRowCounts[p]; + + if (numRowsInPartition == 0) { + partitionedVectors[p] = + BaseVector::create(vector->type(), 0, pool_.get()); + continue; + } + + // Create an indices buffer for each partition, and fill it with the row + // indices for that partition. 
+ std::vector rowIdsInPartition(numRowsInPartition); + vector_size_t offset = 0; + for (vector_size_t i = 0; i < numRows; ++i) { + if (partitions[i] == p) { + VELOX_DCHECK_LT(offset, numRowsInPartition); + rowIdsInPartition[offset++] = i; + } + } + VELOX_CHECK_EQ(offset, numRowsInPartition); + auto indices = makeIndices(partitionRowCounts[p], [&](auto row) { + return rowIdsInPartition[row]; + }); + + // Simulate partitioning by building the DictionaryVector with the + // partitioned indices + // Copy firsts because wrapInDictionary would take the ownership of the + // vector + VectorPtr vectorCopy = BaseVector::copy(*vector, pool_.get()); + auto dictionaryVector = BaseVector::wrapInDictionary( + nullptr, indices, numRowsInPartition, vectorCopy); + partitionedVectors[p] = canonicalize(dictionaryVector); + } + return partitionedVectors; +} + +std::vector PartitionedVectorTestBase::partitionRowVectors( + const std::vector& rowVectors, + int32_t numPartitions, + core::PartitionFunction* partitionFunction) { + // RowVectorPtr mergedRowVector = mergeRowVectors(rowVectors); + VectorPtr mergedRowVector = + mergeVectors((const std::vector&)rowVectors); + auto totalNumRows = mergedRowVector->size(); + + std::vector partitions(totalNumRows, 0); + if (numPartitions > 1) { + auto rowType = asRowType(mergedRowVector->type()); + std::optional singlePartition = partitionFunction->partition( + *mergedRowVector->as(), partitions); + if (singlePartition.has_value()) { + // All rows go to the same partition + std::fill(partitions.begin(), partitions.end(), singlePartition.value()); + } + } + + std::vector partitionedVectors = + partitionVectorByWrapping(mergedRowVector, partitions, numPartitions); + + for (auto& vector : partitionedVectors) { + vector = canonicalize(vector); + } + return partitionedVectors; +} + +VectorPtr PartitionedVectorTestBase::mergeVectors( + const std::vector& vectors) { + // We have to count the total number of rows first in order to allocate the + // 
mergedRowVector. + auto mergedVector = BaseVector::copy(*vectors[0]); + for (auto i = 1; i < vectors.size(); ++i) { + mergedVector->append(vectors[i].get()); + } + + return mergedVector; +} + +} // namespace facebook::velox::test diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.h b/velox/vector/tests/utils/PartitionedVectorTestBase.h new file mode 100644 index 00000000000..b2c50761edc --- /dev/null +++ b/velox/vector/tests/utils/PartitionedVectorTestBase.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/core/PlanNode.h" +#include "velox/vector/PartitionedVector.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +namespace facebook::velox::test { + +class PartitionedVectorTestBase : public VectorTestBase { + protected: + std::vector partitionVectorByWrapping( + VectorPtr vector, + const std::vector& partitions, + uint32_t numPartitions); + + std::vector partitionRowVectors( + const std::vector& rowVectors, + int32_t numPartitions, + core::PartitionFunction* partitionFunction); + + VectorPtr canonicalize(VectorPtr vector); + + VectorPtr mergeVectors(const std::vector& vectors); +}; + +} // namespace facebook::velox::test