Skip to content

Commit da33c72

Browse files
infvg authored and Lakehouse Engine Bot committed
Fix iceberg min max statistics for decimal type when encoded as int32
Signed-off-by: Hazmi <ialhazmim@gmail.com>
Alchemy-item: (ID = 1203) Fix iceberg min max statistics for decimal type when encoded as int32 commit 1/1 - 0ac9930
1 parent f1ea61d commit da33c72

2 files changed

Lines changed: 45 additions & 1 deletion

File tree

velox/dwio/parquet/writer/arrow/Statistics.cpp

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -555,7 +555,9 @@ TypedComparatorImpl<false, ByteArrayType>::getMinMax(
555555
template <typename T>
556556
std::string encodeDecimalToBigEndian(T value) {
557557
uint8_t buffer[sizeof(T)];
558-
if constexpr (std::is_same_v<T, int64_t>) {
558+
if constexpr (std::is_same_v<T, int32_t>) {
559+
*reinterpret_cast<int32_t*>(buffer) = ::arrow::bit_util::ToBigEndian(value);
560+
} else if constexpr (std::is_same_v<T, int64_t>) {
559561
*reinterpret_cast<int64_t*>(buffer) = ::arrow::bit_util::ToBigEndian(value);
560562
} else if constexpr (std::is_same_v<T, int128_t>) {
561563
*reinterpret_cast<int128_t*>(buffer) = DecimalUtil::bigEndian(value);
@@ -813,6 +815,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
813815
}
814816

815817
std::string MinValue() const override {
818+
if constexpr (std::is_same_v<T, int32_t>) {
819+
if (descr_->logicalType()->isDecimal()) {
820+
return encodeDecimalToBigEndian(min_);
821+
}
822+
}
816823
if constexpr (std::is_same_v<T, int64_t>) {
817824
if (descr_->logicalType()->isDecimal()) {
818825
return encodeDecimalToBigEndian(min_);
@@ -841,6 +848,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
841848
}
842849

843850
std::string MaxValue() const override {
851+
if constexpr (std::is_same_v<T, int32_t>) {
852+
if (descr_->logicalType()->isDecimal()) {
853+
return encodeDecimalToBigEndian(max_);
854+
}
855+
}
844856
if constexpr (std::is_same_v<T, int64_t>) {
845857
if (descr_->logicalType()->isDecimal()) {
846858
return encodeDecimalToBigEndian(max_);

velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2188,6 +2188,38 @@ TEST(IcebergStatistics, unboundedUpperBound) {
21882188
}
21892189
}
21902190

2191+
TEST(IcebergStatistics, maxValueWithNulls) {
2192+
const NodePtr node = PrimitiveNode::make(
2193+
"decimal_col",
2194+
Repetition::kOptional,
2195+
LogicalType::decimal(7, 2),
2196+
Type::kInt32);
2197+
ColumnDescriptor descr(node, 1, 1);
2198+
2199+
auto stats = makeStatistics<Int32Type>(&descr);
2200+
2201+
std::vector<int32_t> values = {19900, 20000};
2202+
stats->update(values.data(), values.size(), 1);
2203+
2204+
ASSERT_TRUE(stats->hasMinMax());
2205+
EXPECT_EQ(stats->min(), 19900);
2206+
EXPECT_EQ(stats->max(), 20000);
2207+
2208+
const auto maxValue = stats->MaxValue();
2209+
EXPECT_FALSE(maxValue.empty()) << "MaxValue() should not be empty";
2210+
2211+
int32_t decodedMax = ::arrow::bit_util::FromBigEndian(
2212+
*reinterpret_cast<const int32_t*>(maxValue.data()));
2213+
EXPECT_EQ(decodedMax, 20000) << "MaxValue() should return 20000";
2214+
2215+
const auto minValue = stats->MinValue();
2216+
EXPECT_FALSE(minValue.empty()) << "MinValue() should not be empty";
2217+
2218+
int32_t decodedMin = ::arrow::bit_util::FromBigEndian(
2219+
*reinterpret_cast<const int32_t*>(minValue.data()));
2220+
EXPECT_EQ(decodedMin, 19900) << "MinValue() should return 19900";
2221+
}
2222+
21912223
TEST(StatisticsComparison, withInt64) {
21922224
NodePtr Node =
21932225
PrimitiveNode::make("int_col", Repetition::kRequired, Type::kInt64);

0 commit comments

Comments
 (0)