Skip to content

Commit 5e5bacb

Browse files
infvgLakehouse Engine Bot
authored andcommitted
Fix iceberg min max statistics for decimal type when encoded as int32
Signed-off-by: Hazmi <ialhazmim@gmail.com> Alchemy-item: (ID = 1203) Fix iceberg min max statistics for decimal type when encoded as int32 commit 1/1 - 0ac9930
1 parent a5a6fd1 commit 5e5bacb

2 files changed

Lines changed: 45 additions & 1 deletion

File tree

velox/dwio/parquet/writer/arrow/Statistics.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,9 @@ TypedComparatorImpl<false, ByteArrayType>::getMinMax(
559559
template <typename T>
560560
std::string encodeDecimalToBigEndian(T value) {
561561
uint8_t buffer[sizeof(T)];
562-
if constexpr (std::is_same_v<T, int64_t>) {
562+
if constexpr (std::is_same_v<T, int32_t>) {
563+
*reinterpret_cast<int32_t*>(buffer) = ::arrow::bit_util::ToBigEndian(value);
564+
} else if constexpr (std::is_same_v<T, int64_t>) {
563565
*reinterpret_cast<int64_t*>(buffer) = ::arrow::bit_util::ToBigEndian(value);
564566
} else if constexpr (std::is_same_v<T, int128_t>) {
565567
*reinterpret_cast<int128_t*>(buffer) = DecimalUtil::bigEndian(value);
@@ -809,6 +811,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
809811
}
810812

811813
std::string MinValue() const override {
814+
if constexpr (std::is_same_v<T, int32_t>) {
815+
if (descr_->logicalType()->isDecimal()) {
816+
return encodeDecimalToBigEndian(min_);
817+
}
818+
}
812819
if constexpr (std::is_same_v<T, int64_t>) {
813820
if (descr_->logicalType()->isDecimal()) {
814821
return encodeDecimalToBigEndian(min_);
@@ -835,6 +842,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
835842
}
836843

837844
std::string MaxValue() const override {
845+
if constexpr (std::is_same_v<T, int32_t>) {
846+
if (descr_->logicalType()->isDecimal()) {
847+
return encodeDecimalToBigEndian(max_);
848+
}
849+
}
838850
if constexpr (std::is_same_v<T, int64_t>) {
839851
if (descr_->logicalType()->isDecimal()) {
840852
return encodeDecimalToBigEndian(max_);

velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2152,6 +2152,38 @@ TEST(IcebergStatistics, unboundedUpperBound) {
21522152
}
21532153
}
21542154

2155+
TEST(IcebergStatistics, maxValueWithNulls) {
2156+
const NodePtr node = PrimitiveNode::make(
2157+
"decimal_col",
2158+
Repetition::kOptional,
2159+
LogicalType::decimal(7, 2),
2160+
Type::kInt32);
2161+
ColumnDescriptor descr(node, 1, 1);
2162+
2163+
auto stats = makeStatistics<Int32Type>(&descr);
2164+
2165+
std::vector<int32_t> values = {19900, 20000};
2166+
stats->update(values.data(), values.size(), 1);
2167+
2168+
ASSERT_TRUE(stats->hasMinMax());
2169+
EXPECT_EQ(stats->min(), 19900);
2170+
EXPECT_EQ(stats->max(), 20000);
2171+
2172+
const auto maxValue = stats->MaxValue();
2173+
EXPECT_FALSE(maxValue.empty()) << "MaxValue() should not be empty";
2174+
2175+
int32_t decodedMax = ::arrow::bit_util::FromBigEndian(
2176+
*reinterpret_cast<const int32_t*>(maxValue.data()));
2177+
EXPECT_EQ(decodedMax, 20000) << "MaxValue() should return 20000";
2178+
2179+
const auto minValue = stats->MinValue();
2180+
EXPECT_FALSE(minValue.empty()) << "MinValue() should not be empty";
2181+
2182+
int32_t decodedMin = ::arrow::bit_util::FromBigEndian(
2183+
*reinterpret_cast<const int32_t*>(minValue.data()));
2184+
EXPECT_EQ(decodedMin, 19900) << "MinValue() should return 19900";
2185+
}
2186+
21552187
TEST(StatisticsComparison, withInt64) {
21562188
NodePtr Node =
21572189
PrimitiveNode::make("int_col", Repetition::kRequired, Type::kInt64);

0 commit comments

Comments
 (0)