Skip to content

Commit 651fbbf

Browse files
author
Rafał Hibner
committed
Merge branch 'tdigest_map' into combined3
2 parents 0692de1 + 22b3f6c commit 651fbbf

6 files changed

Lines changed: 850 additions & 75 deletions

File tree

cpp/src/arrow/compute/api_aggregate.cc

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,17 @@ static auto kTDigestOptionsType = GetFunctionOptionsType<TDigestOptions>(
141141
DataMember("skip_nulls", &TDigestOptions::skip_nulls),
142142
DataMember("min_count", &TDigestOptions::min_count),
143143
DataMember("scaler", &TDigestOptions::scaler));
144+
static auto kTDigestMapOptionsType = GetFunctionOptionsType<TDigestMapOptions>(
145+
DataMember("delta", &TDigestMapOptions::delta),
146+
DataMember("buffer_size", &TDigestMapOptions::buffer_size),
147+
DataMember("skip_nulls", &TDigestMapOptions::skip_nulls),
148+
DataMember("scaler", &TDigestMapOptions::scaler));
149+
static auto kTDigestReduceOptionsType = GetFunctionOptionsType<TDigestReduceOptions>(
150+
DataMember("scaler", &TDigestReduceOptions::scaler));
151+
static auto kTDigestQuantileOptionsType = GetFunctionOptionsType<TDigestQuantileOptions>(
152+
DataMember("q", &TDigestQuantileOptions::q),
153+
DataMember("min_count", &TDigestQuantileOptions::min_count),
154+
DataMember("scaler", &TDigestQuantileOptions::scaler));
144155
static auto kPivotOptionsType = GetFunctionOptionsType<PivotWiderOptions>(
145156
DataMember("key_names", &PivotWiderOptions::key_names),
146157
DataMember("unexpected_key_behavior", &PivotWiderOptions::unexpected_key_behavior));
@@ -216,6 +227,34 @@ TDigestOptions::TDigestOptions(std::vector<double> q, uint32_t delta,
216227
scaler{scaler} {}
217228
constexpr char TDigestOptions::kTypeName[];
218229

230+
TDigestMapOptions::TDigestMapOptions(uint32_t delta, uint32_t buffer_size,
231+
bool skip_nulls, Scaler scaler)
232+
: FunctionOptions(internal::kTDigestMapOptionsType),
233+
delta{delta},
234+
buffer_size{buffer_size},
235+
skip_nulls{skip_nulls},
236+
scaler{scaler} {}
237+
constexpr char TDigestMapOptions::kTypeName[];
238+
239+
TDigestReduceOptions::TDigestReduceOptions(Scaler scaler)
240+
: FunctionOptions(internal::kTDigestReduceOptionsType), scaler{scaler} {}
241+
constexpr char TDigestReduceOptions::kTypeName[];
242+
243+
TDigestQuantileOptions::TDigestQuantileOptions(double q, uint32_t min_count,
244+
Scaler scaler)
245+
: FunctionOptions(internal::kTDigestQuantileOptionsType),
246+
q{q},
247+
min_count{min_count},
248+
scaler{scaler} {}
249+
250+
TDigestQuantileOptions::TDigestQuantileOptions(std::vector<double> q, uint32_t min_count,
251+
Scaler scaler)
252+
: FunctionOptions(internal::kTDigestQuantileOptionsType),
253+
q{std::move(q)},
254+
min_count{min_count},
255+
scaler{scaler} {}
256+
constexpr char TDigestReduceOptions::kTypeName[];
257+
219258
PivotWiderOptions::PivotWiderOptions(std::vector<std::string> key_names,
220259
UnexpectedKeyBehavior unexpected_key_behavior)
221260
: FunctionOptions(internal::kPivotOptionsType),
@@ -237,6 +276,9 @@ void RegisterAggregateOptions(FunctionRegistry* registry) {
237276
DCHECK_OK(registry->AddFunctionOptionsType(kSkewOptionsType));
238277
DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
239278
DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
279+
DCHECK_OK(registry->AddFunctionOptionsType(kTDigestMapOptionsType));
280+
DCHECK_OK(registry->AddFunctionOptionsType(kTDigestReduceOptionsType));
281+
DCHECK_OK(registry->AddFunctionOptionsType(kTDigestQuantileOptionsType));
240282
DCHECK_OK(registry->AddFunctionOptionsType(kPivotOptionsType));
241283
DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
242284
}
@@ -321,6 +363,21 @@ Result<Datum> TDigest(const Datum& value, const TDigestOptions& options,
321363
return CallFunction("tdigest", {value}, &options, ctx);
322364
}
323365

366+
Result<Datum> TDigestMap(const Datum& value, const TDigestMapOptions& options,
367+
ExecContext* ctx) {
368+
return CallFunction("tdigest_map", {value}, &options, ctx);
369+
}
370+
371+
Result<Datum> TDigestReduce(const Datum& value, const TDigestReduceOptions& options,
372+
ExecContext* ctx) {
373+
return CallFunction("tdigest_reduce", {value}, &options, ctx);
374+
}
375+
376+
Result<Datum> TDigestQuantile(const Datum& value, const TDigestQuantileOptions& options,
377+
ExecContext* ctx) {
378+
return CallFunction("tdigest_quantile", {value}, &options, ctx);
379+
}
380+
324381
Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
325382
return CallFunction("index", {value}, &options, ctx);
326383
}

cpp/src/arrow/compute/api_aggregate.h

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,10 +179,10 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions {
179179

180180
explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
181181
uint32_t buffer_size = 500, bool skip_nulls = true,
182-
uint32_t min_count = 0, enum Scaler scaler = K0);
182+
uint32_t min_count = 0, enum Scaler scaler = K1);
183183
explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
184184
uint32_t buffer_size = 500, bool skip_nulls = true,
185-
uint32_t min_count = 0, enum Scaler scaler = K0);
185+
uint32_t min_count = 0, enum Scaler scaler = K1);
186186
static constexpr char const kTypeName[] = "TDigestOptions";
187187
static TDigestOptions Defaults() { return TDigestOptions{}; }
188188

@@ -201,6 +201,66 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions {
201201
enum Scaler scaler;
202202
};
203203

204+
/// \brief Control TDigest controid calculation
205+
///
206+
/// By default, returns the median value.
207+
class ARROW_EXPORT TDigestMapOptions : public FunctionOptions {
208+
public:
209+
using Scaler = TDigestOptions::Scaler;
210+
211+
explicit TDigestMapOptions(uint32_t delta = 100, uint32_t buffer_size = 500,
212+
bool skip_nulls = true, Scaler scaler = Scaler::K1);
213+
static constexpr char const kTypeName[] = "TDigestMapOptions";
214+
static TDigestMapOptions Defaults() { return TDigestMapOptions{}; }
215+
216+
/// compression parameter, default 100
217+
uint32_t delta;
218+
/// input buffer size, default 500
219+
uint32_t buffer_size;
220+
/// If true (the default), null values are ignored. Otherwise, if any value is null,
221+
/// emit null.
222+
bool skip_nulls;
223+
/// select scaler implementation
224+
Scaler scaler;
225+
};
226+
227+
/// \brief Control TDigest reduce behavior
228+
///
229+
/// By default, returns the median value.
230+
class ARROW_EXPORT TDigestReduceOptions : public FunctionOptions {
231+
public:
232+
using Scaler = TDigestOptions::Scaler;
233+
234+
explicit TDigestReduceOptions(Scaler scaler = Scaler::K1);
235+
static constexpr char const kTypeName[] = "TDigestReduceOptions";
236+
static TDigestReduceOptions Defaults() { return TDigestReduceOptions{}; }
237+
238+
/// select scaler implementation
239+
Scaler scaler;
240+
};
241+
242+
/// \brief Control TDigest approximate quantile kernel behavior
243+
///
244+
/// By default, returns the median value.
245+
class ARROW_EXPORT TDigestQuantileOptions : public FunctionOptions {
246+
public:
247+
using Scaler = TDigestOptions::Scaler;
248+
249+
explicit TDigestQuantileOptions(double q = 0.5, uint32_t min_count = 0,
250+
Scaler scaler = Scaler::K1);
251+
explicit TDigestQuantileOptions(std::vector<double> q, uint32_t min_count = 0,
252+
Scaler scaler = Scaler::K1);
253+
static constexpr char const kTypeName[] = "TDigestQuantileOptions";
254+
static TDigestQuantileOptions Defaults() { return TDigestQuantileOptions{}; }
255+
256+
/// probability level of quantile must be between 0 and 1 inclusive
257+
std::vector<double> q;
258+
/// If less than this many non-null values are observed, emit null.
259+
uint32_t min_count;
260+
/// select scaler implementation
261+
Scaler scaler;
262+
};
263+
204264
/// \brief Control Pivot kernel behavior
205265
///
206266
/// These options apply to the "pivot_wider" and "hash_pivot_wider" functions.
@@ -586,6 +646,50 @@ Result<Datum> TDigest(const Datum& value,
586646
const TDigestOptions& options = TDigestOptions::Defaults(),
587647
ExecContext* ctx = NULLPTR);
588648

649+
/// \brief Calculate centroids of a numeric array with T-Digest algorithm
650+
///
651+
/// \param[in] value input datum, expecting Array or ChunkedArray
652+
/// \param[in] options see TDigestMapOptions for more information
653+
/// \param[in] ctx the function execution context, optional
654+
/// \return resulting struct of mean and weight arrays
655+
///
656+
/// \since 22.0.0
657+
/// \note API not yet finalized
658+
ARROW_EXPORT
659+
Result<Datum> TDigestMap(const Datum& value,
660+
const TDigestMapOptions& options = TDigestMapOptions::Defaults(),
661+
ExecContext* ctx = NULLPTR);
662+
663+
/// \brief Merge multiple centroid sets into one
664+
///
665+
/// \param[in] value input centroid sets, expecting Scalar, Array or ChunkedArray of
666+
/// centroid structs \param[in] options see TDigestReduceOptions for more information
667+
/// \param[in] ctx the function execution context, optional
668+
/// \return resulting struct of mean and weight arrays
669+
///
670+
/// \since 22.0.0
671+
/// \note API not yet finalized
672+
ARROW_EXPORT
673+
Result<Datum> TDigestReduce(
674+
const Datum& value,
675+
const TDigestReduceOptions& options = TDigestReduceOptions::Defaults(),
676+
ExecContext* ctx = NULLPTR);
677+
678+
/// \brief Calculate the approximate quantiles using centroids with T-Digest algorithm
679+
///
680+
/// \param[in] value input centroid sets, expecting Scalar, Array or ChunkedArray of
681+
/// centroid structs \param[in] options see TDigestQuantileOptions for more information
682+
/// \param[in] ctx the function execution context, optional
683+
/// \return resulting struct of mean and weight arrays
684+
///
685+
/// \since 22.0.0
686+
/// \note API not yet finalized
687+
ARROW_EXPORT
688+
Result<Datum> TDigestQuantile(
689+
const Datum& value,
690+
const TDigestQuantileOptions& options = TDigestQuantileOptions::Defaults(),
691+
ExecContext* ctx = NULLPTR);
692+
589693
/// \brief Find the first index of a value in an array.
590694
///
591695
/// \param[in] value The array to search.

0 commit comments

Comments
 (0)