Skip to content

Commit 9c5d10e

Browse files
authored
[FEA] Add GroupBy.collectList and GroupBy.collectSet (#370)
* add collect_list and collect_set aggregations * add DataFrame and ListSeries flatten * Propagate nested child field names through DataFrame operations
1 parent dd1459e commit 9c5d10e

19 files changed

Lines changed: 878 additions & 347 deletions

File tree

modules/cudf/src/data_frame.ts

Lines changed: 130 additions & 80 deletions
Large diffs are not rendered by default.

modules/cudf/src/groupby.cpp

Lines changed: 80 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020-2021, NVIDIA CORPORATION.
1+
// Copyright (c) 2021-2022, NVIDIA CORPORATION.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
#include "node_cudf/groupby.hpp"
16-
#include "node_cudf/table.hpp"
17-
#include "node_cudf/utilities/error.hpp"
18-
#include "node_cudf/utilities/napi_to_cpp.hpp"
15+
#include <node_cudf/groupby.hpp>
16+
#include <node_cudf/table.hpp>
17+
#include <node_cudf/utilities/error.hpp>
18+
#include <node_cudf/utilities/napi_to_cpp.hpp>
1919

2020
#include <cudf/groupby.hpp>
2121
#include <cudf/types.hpp>
@@ -48,6 +48,8 @@ Napi::Function GroupBy::Init(Napi::Env const& env, Napi::Object exports) {
4848
InstanceMethod<&GroupBy::sum>("_sum"),
4949
InstanceMethod<&GroupBy::var>("_var"),
5050
InstanceMethod<&GroupBy::quantile>("_quantile"),
51+
InstanceMethod<&GroupBy::collect_list>("_collect_list"),
52+
InstanceMethod<&GroupBy::collect_set>("_collect_set"),
5153
});
5254
}
5355

@@ -111,168 +113,147 @@ Napi::Value GroupBy::get_groups(Napi::CallbackInfo const& info) {
111113
}
112114

113115
Napi::Value GroupBy::argmax(Napi::CallbackInfo const& info) {
114-
auto args = _get_basic_args(info);
115-
auto values = args.first;
116-
auto mr = args.second;
116+
auto [values, mr] = _get_basic_args(info);
117117
return _single_aggregation(
118-
[&]() { return cudf::make_argmax_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
118+
info, values, mr, []() { return cudf::make_argmax_aggregation<cudf::groupby_aggregation>(); });
119119
}
120120

121121
Napi::Value GroupBy::argmin(Napi::CallbackInfo const& info) {
122-
auto args = _get_basic_args(info);
123-
auto values = args.first;
124-
auto mr = args.second;
122+
auto [values, mr] = _get_basic_args(info);
125123
return _single_aggregation(
126-
[&]() { return cudf::make_argmin_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
124+
info, values, mr, []() { return cudf::make_argmin_aggregation<cudf::groupby_aggregation>(); });
127125
}
128126

129127
Napi::Value GroupBy::count(Napi::CallbackInfo const& info) {
130-
auto args = _get_basic_args(info);
131-
auto values = args.first;
132-
auto mr = args.second;
128+
auto [values, mr] = _get_basic_args(info);
133129
return _single_aggregation(
134-
[&]() { return cudf::make_count_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
130+
info, values, mr, []() { return cudf::make_count_aggregation<cudf::groupby_aggregation>(); });
135131
}
136132

137133
Napi::Value GroupBy::max(Napi::CallbackInfo const& info) {
138-
auto args = _get_basic_args(info);
139-
auto values = args.first;
140-
auto mr = args.second;
134+
auto [values, mr] = _get_basic_args(info);
141135
return _single_aggregation(
142-
[&]() { return cudf::make_max_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
136+
info, values, mr, []() { return cudf::make_max_aggregation<cudf::groupby_aggregation>(); });
143137
}
144138

145139
Napi::Value GroupBy::mean(Napi::CallbackInfo const& info) {
146-
auto args = _get_basic_args(info);
147-
auto values = args.first;
148-
auto mr = args.second;
140+
auto [values, mr] = _get_basic_args(info);
149141
return _single_aggregation(
150-
[&]() { return cudf::make_mean_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
142+
info, values, mr, []() { return cudf::make_mean_aggregation<cudf::groupby_aggregation>(); });
151143
}
152144

153145
Napi::Value GroupBy::median(Napi::CallbackInfo const& info) {
154-
auto args = _get_basic_args(info);
155-
auto values = args.first;
156-
auto mr = args.second;
146+
auto [values, mr] = _get_basic_args(info);
157147
return _single_aggregation(
158-
[&]() { return cudf::make_median_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
148+
info, values, mr, []() { return cudf::make_median_aggregation<cudf::groupby_aggregation>(); });
159149
}
160150

161151
Napi::Value GroupBy::min(Napi::CallbackInfo const& info) {
162-
auto args = _get_basic_args(info);
163-
auto values = args.first;
164-
auto mr = args.second;
152+
auto [values, mr] = _get_basic_args(info);
165153
return _single_aggregation(
166-
[&]() { return cudf::make_min_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
154+
info, values, mr, []() { return cudf::make_min_aggregation<cudf::groupby_aggregation>(); });
167155
}
168156

169157
Napi::Value GroupBy::nth(Napi::CallbackInfo const& info) {
170158
CallbackArgs args{info};
171-
172-
cudf::size_type n = args[0];
173-
174-
auto values = args[1];
175-
NODE_CUDA_EXPECT(Table::IsInstance(values),
176-
"aggregation expects options to have a 'values' table");
177-
nv::Table* values_table = Table::Unwrap(values.ToObject());
178-
179-
auto mr = MemoryResource::IsInstance(info[2]) ? *MemoryResource::Unwrap(info[2].ToObject())
180-
: rmm::mr::get_current_device_resource();
181-
182-
return _single_aggregation(
183-
[&]() { return cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(n); },
184-
values_table,
185-
mr,
186-
info);
159+
auto [values, mr] = _get_basic_args(info);
160+
cudf::size_type n = args[2];
161+
auto include_nulls =
162+
info[3].ToBoolean() ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
163+
return _single_aggregation(info, values, mr, [&]() {
164+
return cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(n, include_nulls);
165+
});
187166
}
188167

189168
Napi::Value GroupBy::nunique(Napi::CallbackInfo const& info) {
190-
auto args = _get_basic_args(info);
191-
auto values = args.first;
192-
auto mr = args.second;
193-
return _single_aggregation(
194-
[&]() { return cudf::make_nunique_aggregation<cudf::groupby_aggregation>(); },
195-
values,
196-
mr,
197-
info);
169+
auto [values, mr] = _get_basic_args(info);
170+
auto include_nulls =
171+
info[3].ToBoolean() ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
172+
return _single_aggregation(info, values, mr, [&]() {
173+
return cudf::make_nunique_aggregation<cudf::groupby_aggregation>(include_nulls);
174+
});
198175
}
199176

200177
Napi::Value GroupBy::std(Napi::CallbackInfo const& info) {
201-
auto args = _get_basic_args(info);
202-
auto values = args.first;
203-
auto mr = args.second;
204-
return _single_aggregation(
205-
[&]() { return cudf::make_std_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
178+
auto [values, mr] = _get_basic_args(info);
179+
cudf::size_type ddof = info[3].IsNumber() ? info[3].ToNumber() : 1;
180+
return _single_aggregation(info, values, mr, [&]() {
181+
return cudf::make_std_aggregation<cudf::groupby_aggregation>(ddof);
182+
});
206183
}
207184

208185
Napi::Value GroupBy::sum(Napi::CallbackInfo const& info) {
209-
auto args = _get_basic_args(info);
210-
auto values = args.first;
211-
auto mr = args.second;
186+
auto [values, mr] = _get_basic_args(info);
212187
return _single_aggregation(
213-
[&]() { return cudf::make_sum_aggregation<cudf::groupby_aggregation>(); }, values, mr, info);
188+
info, values, mr, []() { return cudf::make_sum_aggregation<cudf::groupby_aggregation>(); });
214189
}
215190

216191
Napi::Value GroupBy::var(Napi::CallbackInfo const& info) {
217-
auto args = _get_basic_args(info);
218-
auto values = args.first;
219-
auto mr = args.second;
220-
return _single_aggregation(
221-
[&]() { return cudf::make_variance_aggregation<cudf::groupby_aggregation>(); },
222-
values,
223-
mr,
224-
info);
192+
auto [values, mr] = _get_basic_args(info);
193+
cudf::size_type ddof = info[3].IsNumber() ? info[3].ToNumber() : 1;
194+
return _single_aggregation(info, values, mr, [&]() {
195+
return cudf::make_variance_aggregation<cudf::groupby_aggregation>(ddof);
196+
});
225197
}
226198

227199
Napi::Value GroupBy::quantile(Napi::CallbackInfo const& info) {
228200
CallbackArgs args{info};
201+
auto [values, mr] = _get_basic_args(info);
202+
std::vector<double> quantiles{args[2]};
203+
cudf::interpolation interp = args[3];
204+
return _single_aggregation(info, values, mr, [&]() {
205+
return cudf::make_quantile_aggregation<cudf::groupby_aggregation>(quantiles, interp);
206+
});
207+
}
229208

230-
double q = args[0];
231-
std::vector<double> qs{q};
232-
233-
auto values = args[1];
234-
NODE_CUDA_EXPECT(Table::IsInstance(values),
235-
"GroupBy quantile_agg expects options to have a 'values' table");
236-
nv::Table* values_table = Table::Unwrap(values.ToObject());
237-
238-
cudf::interpolation interpolation = args[2];
239-
240-
auto mr = MemoryResource::IsInstance(info[3]) ? *MemoryResource::Unwrap(info[3].ToObject())
241-
: rmm::mr::get_current_device_resource();
209+
Napi::Value GroupBy::collect_list(Napi::CallbackInfo const& info) {
210+
auto [values, mr] = _get_basic_args(info);
211+
auto include_nulls =
212+
info[2].ToBoolean() ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
213+
return _single_aggregation(info, values, mr, [&]() {
214+
return cudf::make_collect_list_aggregation<cudf::groupby_aggregation>(include_nulls);
215+
});
216+
}
242217

243-
return _single_aggregation(
244-
[&]() { return cudf::make_quantile_aggregation<cudf::groupby_aggregation>(qs, interpolation); },
245-
values_table,
246-
mr,
247-
info);
218+
Napi::Value GroupBy::collect_set(Napi::CallbackInfo const& info) {
219+
auto [values, mr] = _get_basic_args(info);
220+
auto include_nulls =
221+
info[2].ToBoolean() ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
222+
auto nulls_equal =
223+
info[3].ToBoolean() ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
224+
auto nans_equal =
225+
info[4].ToBoolean() ? cudf::nan_equality::UNEQUAL : cudf::nan_equality::ALL_EQUAL;
226+
return _single_aggregation(info, values, mr, [&]() {
227+
return cudf::make_collect_set_aggregation<cudf::groupby_aggregation>(
228+
include_nulls, nulls_equal, nans_equal);
229+
});
248230
}
249231

250-
std::pair<nv::Table*, rmm::mr::device_memory_resource*> GroupBy::_get_basic_args(
232+
std::pair<Table::wrapper_t, rmm::mr::device_memory_resource*> GroupBy::_get_basic_args(
251233
Napi::CallbackInfo const& info) {
252234
CallbackArgs args{info};
253235

254236
auto values = args[0];
255237
NODE_CUDA_EXPECT(Table::IsInstance(values), "aggregation expects to have a 'values' table");
256238

257-
rmm::mr::device_memory_resource* mr = args[1];
258-
259-
return std::pair<Table*, rmm::mr::device_memory_resource*>(Table::Unwrap(values.ToObject()), mr);
239+
return std::make_pair(values.ToObject(), args[1]);
260240
}
261241

262242
template <typename MakeAggregation>
263-
Napi::Value GroupBy::_single_aggregation(MakeAggregation const& make_aggregation,
264-
const nv::Table* const values_table,
243+
Napi::Value GroupBy::_single_aggregation(Napi::CallbackInfo const& info,
244+
Table::wrapper_t const& values_table,
265245
rmm::mr::device_memory_resource* const mr,
266-
Napi::CallbackInfo const& info) {
246+
MakeAggregation const& make_aggregation) {
267247
auto env = info.Env();
268248

269249
std::vector<cudf::groupby::aggregation_request> requests;
250+
requests.reserve(values_table->num_columns());
270251

271252
for (cudf::size_type i = 0; i < values_table->num_columns(); ++i) {
272253
auto request = cudf::groupby::aggregation_request();
273254
request.values = values_table->get_column(i).view();
274255
request.aggregations.push_back(std::move(make_aggregation()));
275-
requests.emplace_back(std::move(request));
256+
requests.push_back(std::move(request));
276257
}
277258

278259
std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::groupby::aggregation_result>> result;

modules/cudf/src/groupby/base.ts

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020-2021, NVIDIA CORPORATION.
1+
// Copyright (c) 2020-2022, NVIDIA CORPORATION.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -41,22 +41,57 @@ export type Groups<KeysMap extends TypeMap, ValuesMap extends TypeMap> = {
4141

4242
interface CudfGroupBy {
4343
_getGroups(values?: Table,
44-
memoryResource?: MemoryResource): {keys: Table, offsets: Int32Array, values?: Table};
45-
46-
_argmax(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
47-
_argmin(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
48-
_count(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
49-
_max(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
50-
_mean(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
51-
_median(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
52-
_min(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
53-
_nth(n: number, values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
54-
_nunique(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
55-
_std(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
56-
_sum(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
57-
_var(values: Table, memoryResource?: MemoryResource): {keys: Table, cols: Column[]};
58-
_quantile(q: number, values: Table, interpolation?: number, memoryResource?: MemoryResource):
44+
memoryResource?: MemoryResource): //
45+
{keys: Table, offsets: Int32Array, values?: Table};
46+
47+
_argmax(values: Table, memoryResource?: MemoryResource): //
48+
{keys: Table, cols: Column[]};
49+
50+
_argmin(values: Table, memoryResource?: MemoryResource): //
51+
{keys: Table, cols: Column[]};
52+
53+
_count(values: Table, memoryResource?: MemoryResource): //
54+
{keys: Table, cols: Column[]};
55+
56+
_max(values: Table, memoryResource?: MemoryResource): //
57+
{keys: Table, cols: Column[]};
58+
59+
_mean(values: Table, memoryResource?: MemoryResource): //
60+
{keys: Table, cols: Column[]};
61+
62+
_median(values: Table, memoryResource?: MemoryResource): //
63+
{keys: Table, cols: Column[]};
64+
65+
_min(values: Table, memoryResource?: MemoryResource): //
66+
{keys: Table, cols: Column[]};
67+
68+
_nth(values: Table, memoryResource?: MemoryResource, n?: number, include_nulls?: boolean):
69+
{keys: Table, cols: Column[]};
70+
71+
_nunique(values: Table, memoryResource?: MemoryResource, include_nulls?: boolean):
72+
{keys: Table, cols: Column[]};
73+
74+
_std(values: Table, memoryResource?: MemoryResource, ddof?: number):
75+
{keys: Table, cols: Column[]};
76+
77+
_sum(values: Table, memoryResource?: MemoryResource): //
78+
{keys: Table, cols: Column[]};
79+
80+
_var(values: Table, memoryResource?: MemoryResource, ddof?: number):
81+
{keys: Table, cols: Column[]};
82+
83+
_quantile(values: Table, memoryResource?: MemoryResource, q?: number, interpolation?: number):
5984
{keys: Table, cols: [Column]};
85+
86+
_collect_list(values: Table, memoryResource?: MemoryResource, include_nulls?: boolean):
87+
{keys: Table, cols: Column[]};
88+
89+
_collect_set(values: Table,
90+
memoryResource?: MemoryResource,
91+
include_nulls?: boolean,
92+
nulls_equal?: boolean,
93+
nans_equal?: boolean): //
94+
{keys: Table, cols: Column[]};
6095
}
6196

6297
export class GroupByBase<T extends TypeMap, R extends keyof T> {

0 commit comments

Comments
 (0)