Skip to content

Commit d7c033f

Browse files
authored
[Fix](variance) Fix sample variance/stddev to return NaN for a single value (#63605)
Problem Summary: Fix `VAR_SAMP`, `VARIANCE_SAMP`, and `STDDEV_SAMP` to return `NaN` when the number of valid input values is less than or equal to 1. Sample variance and sample standard deviation are undefined for `n <= 1`, so returning `0.0` is misleading.

Before:
```sql
CREATE TABLE t (id INT, v DOUBLE) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES('replication_num'='1');
INSERT INTO t VALUES (1, 5.0); -- single row
SELECT VAR_SAMP(v), STDDEV_SAMP(v) FROM t;
+-------------+----------------+
| VAR_SAMP(v) | STDDEV_SAMP(v) |
+-------------+----------------+
|           0 |              0 |
+-------------+----------------+
```

Now:
```sql
SELECT VAR_SAMP(v), STDDEV_SAMP(v) FROM t;
+-------------+----------------+
| VAR_SAMP(v) | STDDEV_SAMP(v) |
+-------------+----------------+
|         NaN |            NaN |
+-------------+----------------+
```

Doc: apache/doris-website#3765
1 parent 3c9c40f commit d7c033f

11 files changed

Lines changed: 988 additions & 1006 deletions

File tree

be/src/exprs/aggregate/aggregate_function_stddev.h

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,11 @@
2121
#include <cmath>
2222
#include <cstddef>
2323
#include <cstdint>
24+
#include <limits>
2425
#include <memory>
25-
#include <type_traits>
2626

2727
#include "core/assert_cast.h"
2828
#include "core/column/column.h"
29-
#include "core/column/column_nullable.h"
30-
#include "core/data_type/data_type_decimal.h"
3129
#include "core/data_type/data_type_number.h"
3230
#include "core/types.h"
3331
#include "exprs/aggregate/aggregate_function.h"
@@ -37,8 +35,6 @@ class Arena;
3735
class BufferReadable;
3836
class BufferWritable;
3937
template <PrimitiveType T>
40-
class ColumnDecimal;
41-
template <PrimitiveType T>
4238
class ColumnVector;
4339

4440
template <PrimitiveType T, bool is_stddev>
@@ -71,7 +67,7 @@ struct BaseData {
7167
// In MySQL, this will directly result in an error due to exceeding the double range.
7268
// For performance reasons, we are uniformly changing it to nan
7369
if (std::isinf(val)) {
74-
return std::nan("");
70+
return std::numeric_limits<double>::quiet_NaN();
7571
}
7672
return val;
7773
};
@@ -125,14 +121,9 @@ struct BaseData {
125121

126122
template <PrimitiveType T, typename Name, bool is_stddev>
127123
struct PopData : BaseData<T, is_stddev>, Name {
128-
using ColVecResult = std::conditional_t<is_decimal(T), ColumnDecimal128V2, ColumnFloat64>;
129124
void insert_result_into(IColumn& to) const {
130-
auto& col = assert_cast<ColVecResult&>(to);
131-
if constexpr (is_decimal(T)) {
132-
col.get_data().push_back(this->get_pop_result().value());
133-
} else {
134-
col.get_data().push_back(this->get_pop_result());
135-
}
125+
auto& col = assert_cast<ColumnFloat64&>(to);
126+
col.get_data().push_back(this->get_pop_result());
136127
}
137128

138129
static DataTypePtr get_return_type() { return std::make_shared<DataTypeFloat64>(); }
@@ -144,17 +135,12 @@ struct PopData : BaseData<T, is_stddev>, Name {
144135

145136
template <PrimitiveType T, typename Name, bool is_stddev>
146137
struct SampData : BaseData<T, is_stddev>, Name {
147-
using ColVecResult = std::conditional_t<is_decimal(T), ColumnDecimal128V2, ColumnFloat64>;
148138
void insert_result_into(IColumn& to) const {
149-
auto& col = assert_cast<ColVecResult&>(to);
139+
auto& col = assert_cast<ColumnFloat64&>(to);
150140
if (this->count == 1 || this->count == 0) {
151-
col.insert_default();
141+
col.get_data().push_back(std::numeric_limits<double>::quiet_NaN());
152142
} else {
153-
if constexpr (is_decimal(T)) {
154-
col.get_data().push_back(this->get_samp_result().value());
155-
} else {
156-
col.get_data().push_back(this->get_samp_result());
157-
}
143+
col.get_data().push_back(this->get_samp_result());
158144
}
159145
}
160146

0 commit comments

Comments
 (0)