Skip to content

Commit 11b0f24

Browse files
author
Rafał Hibner
committed
Merge branch 'SchedulerKeepAlive' into combined2
2 parents 9433ea8 + 4fc235f commit 11b0f24

61 files changed

Lines changed: 918 additions & 447 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/comment_bot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ jobs:
4949
run: pip install -e arrow/dev/archery[bot]
5050
- name: Handle GitHub comment event
5151
env:
52-
ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
52+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
5353
CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }}
5454
run: |
5555
archery --debug trigger-bot \

.github/workflows/dev.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,9 @@ jobs:
107107
gem install test-unit
108108
pip install "cython>=3" setuptools pytest requests setuptools-scm
109109
- name: Run Release Test
110-
env:
111-
ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
112110
shell: bash
113111
run: |
112+
echo "GH_TOKEN=${{ secrets.GITHUB_TOKEN }}" > dev/release/.env
114113
ci/scripts/release_test.sh $(pwd)
115114
- name: Run Merge Script Test
116115
shell: bash

.github/workflows/pr_bot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ jobs:
8989
run: pip install -e arrow/dev/archery[bot]
9090
- name: Handle PR workflow event
9191
env:
92-
ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
92+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
9393
run: |
9494
if [ "${GITHUB_EVENT_NAME}" = "workflow_run" ]; then
9595
# workflow_run is executed on PR review. Update to original event.

c_glib/arrow-glib/basic-array.cpp

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include <arrow/c/bridge.h>
2929

30+
#include <cmath>
3031
#include <sstream>
3132

3233
G_BEGIN_DECLS
@@ -491,6 +492,22 @@ garrow_array_statistics_has_distinct_count(GArrowArrayStatistics *statistics)
491492
return priv->statistics.distinct_count.has_value();
492493
}
493494

495+
/**
496+
* garrow_array_statistics_is_distinct_count_exact:
497+
* @statistics: A #GArrowArrayStatistics.
498+
*
499+
* Returns: %TRUE if the distinct count is available and exact, %FALSE otherwise.
500+
*
501+
* Since: 22.0.0
502+
*/
503+
gboolean
504+
garrow_array_statistics_is_distinct_count_exact(GArrowArrayStatistics *statistics)
505+
{
506+
auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
507+
return priv->statistics.distinct_count.has_value() &&
508+
std::holds_alternative<int64_t>(priv->statistics.distinct_count.value());
509+
}
510+
494511
/**
495512
* garrow_array_statistics_get_distinct_count:
496513
* @statistics: A #GArrowArrayStatistics.
@@ -499,19 +516,59 @@ garrow_array_statistics_has_distinct_count(GArrowArrayStatistics *statistics)
499516
* -1 otherwise.
500517
*
501518
* Since: 21.0.0
519+
*
520+
* Deprecated: 22.0.0. Use garrow_array_statistics_is_distinct_count_exact(),
521+
* garrow_array_statistics_get_distinct_count_exact() and
522+
* garrow_array_statistics_get_distinct_count_approximate() instead.
502523
*/
503524
gint64
504525
garrow_array_statistics_get_distinct_count(GArrowArrayStatistics *statistics)
526+
{
527+
return garrow_array_statistics_get_distinct_count_exact(statistics);
528+
}
529+
530+
/**
531+
* garrow_array_statistics_get_distinct_count_exact:
532+
* @statistics: A #GArrowArrayStatistics.
533+
*
534+
* Returns: 0 or larger value if @statistics has a valid exact distinct count
535+
* value, -1 otherwise.
536+
*
537+
* Since: 22.0.0
538+
*/
539+
gint64
540+
garrow_array_statistics_get_distinct_count_exact(GArrowArrayStatistics *statistics)
505541
{
506542
auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
507543
const auto &distinct_count = priv->statistics.distinct_count;
508-
if (distinct_count) {
509-
return distinct_count.value();
544+
if (distinct_count && std::holds_alternative<int64_t>(distinct_count.value())) {
545+
return std::get<int64_t>(distinct_count.value());
510546
} else {
511547
return -1;
512548
}
513549
}
514550

551+
/**
552+
* garrow_array_statistics_get_distinct_count_approximate:
553+
* @statistics: A #GArrowArrayStatistics.
554+
*
555+
* Returns: Non `NaN` value if @statistics has a valid approximate distinct count
556+
* value, `NaN` otherwise.
557+
*
558+
* Since: 22.0.0
559+
*/
560+
gdouble
561+
garrow_array_statistics_get_distinct_count_approximate(GArrowArrayStatistics *statistics)
562+
{
563+
auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
564+
const auto &distinct_count = priv->statistics.distinct_count;
565+
if (distinct_count && std::holds_alternative<double>(distinct_count.value())) {
566+
return std::get<double>(distinct_count.value());
567+
} else {
568+
return std::nan("");
569+
}
570+
}
571+
515572
typedef struct GArrowArrayPrivate_
516573
{
517574
std::shared_ptr<arrow::Array> array;

c_glib/arrow-glib/basic-array.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,21 @@ garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics);
6161
GARROW_AVAILABLE_IN_21_0
6262
gboolean
6363
garrow_array_statistics_has_distinct_count(GArrowArrayStatistics *statistics);
64+
GARROW_AVAILABLE_IN_22_0
65+
gboolean
66+
garrow_array_statistics_is_distinct_count_exact(GArrowArrayStatistics *statistics);
67+
#ifndef GARROW_DISABLE_DEPRECATED
6468
GARROW_AVAILABLE_IN_21_0
69+
GARROW_DEPRECATED_IN_22_0_FOR(garrow_array_statistics_get_distinct_count_exact)
6570
gint64
6671
garrow_array_statistics_get_distinct_count(GArrowArrayStatistics *statistics);
72+
#endif
73+
GARROW_AVAILABLE_IN_22_0
74+
gint64
75+
garrow_array_statistics_get_distinct_count_exact(GArrowArrayStatistics *statistics);
76+
GARROW_AVAILABLE_IN_22_0
77+
gdouble
78+
garrow_array_statistics_get_distinct_count_approximate(GArrowArrayStatistics *statistics);
6779

6880
GARROW_AVAILABLE_IN_6_0
6981
GArrowArray *

c_glib/test/test-array-statistics.rb

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,23 @@ def setup
5151

5252
test("#has_distinct_count?") do
5353
assert do
54-
!@statistics.has_distinct_count?
54+
not @statistics.has_distinct_count?
5555
end
5656
end
5757

58-
test ("#distinct_count") do
59-
assert_equal(-1, @statistics.distinct_count)
58+
test("#distinct_count_exact?") do
59+
assert do
60+
not @statistics.distinct_count_exact?
61+
end
62+
end
63+
64+
test ("#distinct_count_exact") do
65+
assert_equal(-1, @statistics.distinct_count_exact)
66+
end
67+
68+
test ("#distinct_count_approximate") do
69+
assert do
70+
@statistics.distinct_count_approximate.nan?
71+
end
6072
end
6173
end

cpp/cmake_modules/ThirdpartyToolchain.cmake

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1767,6 +1767,10 @@ function(build_thrift)
17671767
set(WITH_QT5 OFF)
17681768
set(WITH_ZLIB OFF)
17691769

1770+
# Apache Thrift may change CMAKE_DEBUG_POSTFIX. So we'll restore the
1771+
# original CMAKE_DEBUG_POSTFIX later.
1772+
set(CMAKE_DEBUG_POSTFIX_KEEP ${CMAKE_DEBUG_POSTFIX})
1773+
17701774
# Remove Apache Arrow's CMAKE_MODULE_PATH to ensure using Apache
17711775
# Thrift's cmake_modules/.
17721776
#
@@ -1775,6 +1779,12 @@ function(build_thrift)
17751779
list(POP_FRONT CMAKE_MODULE_PATH)
17761780
fetchcontent_makeavailable(thrift)
17771781

1782+
# Apache Thrift may change CMAKE_DEBUG_POSTFIX. So we restore
1783+
# CMAKE_DEBUG_POSTFIX.
1784+
set(CMAKE_DEBUG_POSTFIX
1785+
${CMAKE_DEBUG_POSTFIX_KEEP}
1786+
CACHE BOOL "" FORCE)
1787+
17781788
if(CMAKE_VERSION VERSION_LESS 3.28)
17791789
set_property(DIRECTORY ${thrift_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE)
17801790
endif()
@@ -2549,7 +2559,7 @@ if(ARROW_USE_XSIMD)
25492559
IS_RUNTIME_DEPENDENCY
25502560
FALSE
25512561
REQUIRED_VERSION
2552-
"8.1.0")
2562+
"13.0.0")
25532563

25542564
if(xsimd_SOURCE STREQUAL "BUNDLED")
25552565
set(ARROW_XSIMD arrow::xsimd)

cpp/src/arrow/acero/exec_plan.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,11 @@ struct ExecPlanImpl : public ExecPlan {
127127
// call.
128128
auto scope = START_SCOPED_SPAN(span_, "ExecPlan", {{"plan", ToString()}});
129129
Future<> scheduler_finished = arrow::util::AsyncTaskScheduler::Make(
130-
[this](arrow::util::AsyncTaskScheduler* async_scheduler) {
130+
[this](std::shared_ptr<arrow::util::AsyncTaskScheduler> async_scheduler) {
131131
QueryContext* ctx = query_context();
132-
RETURN_NOT_OK(ctx->Init(async_scheduler));
133132

133+
RETURN_NOT_OK(ctx->Init(async_scheduler.get()));
134+
async_scheduler_ = async_scheduler;
134135
#ifdef ARROW_WITH_OPENTELEMETRY
135136
if (HasMetadata()) {
136137
auto pairs = metadata().get()->sorted_pairs();
@@ -231,8 +232,8 @@ struct ExecPlanImpl : public ExecPlan {
231232
// If an error occurs during StopProducing then we submit a task to fail. If we
232233
// have already aborted then this will be ignored. This way the failing status
233234
// will get communicated to finished_.
234-
query_context()->async_scheduler()->AddSimpleTask(
235-
[st] { return st; }, "ExecPlan::StopProducingErrorReporter"sv);
235+
async_scheduler_->AddSimpleTask([st] { return st; },
236+
"ExecPlan::StopProducingErrorReporter"sv);
236237
}
237238
}
238239
}
@@ -341,6 +342,7 @@ struct ExecPlanImpl : public ExecPlan {
341342
arrow::util::tracing::Span span_;
342343
std::shared_ptr<const KeyValueMetadata> metadata_;
343344
QueryContext query_context_;
345+
std::shared_ptr<arrow::util::AsyncTaskScheduler> async_scheduler_;
344346
// This field only exists for backwards compatibility. Remove once the deprecated
345347
// ExecPlan::Make overloads have been removed.
346348
std::shared_ptr<ThreadPool> owned_thread_pool_;

cpp/src/arrow/array/array_test.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3900,6 +3900,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39003900
void SetUp() {
39013901
valids_ = {1, 0, 1, 1};
39023902
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
3903+
distinct_count_ = 3.0;
39033904
average_byte_width_ = 4.0;
39043905
null_buffer_ = *internal::BytesToBits(valids_);
39053906
values_ = {1, 0, 3, -4};
@@ -3910,6 +3911,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39103911
null_count_);
39113912
data_->statistics = std::make_shared<ArrayStatistics>();
39123913
data_->statistics->null_count = null_count_;
3914+
data_->statistics->distinct_count = distinct_count_;
39133915
data_->statistics->average_byte_width = average_byte_width_;
39143916
data_->statistics->is_average_byte_width_exact = true;
39153917
data_->statistics->min = min_;
@@ -3921,6 +3923,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39213923
protected:
39223924
std::vector<uint8_t> valids_;
39233925
size_t null_count_;
3926+
double distinct_count_;
39243927
double average_byte_width_;
39253928
std::shared_ptr<Buffer> null_buffer_;
39263929
std::vector<int32_t> values_;
@@ -3937,6 +3940,10 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
39373940
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
39383941
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
39393942

3943+
ASSERT_TRUE(moved_data.statistics->distinct_count.has_value());
3944+
ASSERT_DOUBLE_EQ(distinct_count_,
3945+
std::get<double>(moved_data.statistics->distinct_count.value()));
3946+
39403947
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
39413948
ASSERT_DOUBLE_EQ(average_byte_width_,
39423949
moved_data.statistics->average_byte_width.value());
@@ -3959,6 +3966,10 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
39593966
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
39603967
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
39613968

3969+
ASSERT_TRUE(copied_data.statistics->distinct_count.has_value());
3970+
ASSERT_DOUBLE_EQ(distinct_count_,
3971+
std::get<double>(copied_data.statistics->distinct_count.value()));
3972+
39623973
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
39633974
ASSERT_DOUBLE_EQ(average_byte_width_,
39643975
copied_data.statistics->average_byte_width.value());
@@ -3983,6 +3994,10 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
39833994
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
39843995
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
39853996

3997+
ASSERT_TRUE(moved_data.statistics->distinct_count.has_value());
3998+
ASSERT_DOUBLE_EQ(distinct_count_,
3999+
std::get<double>(moved_data.statistics->distinct_count.value()));
4000+
39864001
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
39874002
ASSERT_DOUBLE_EQ(average_byte_width_,
39884003
moved_data.statistics->average_byte_width.value());
@@ -4006,6 +4021,10 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
40064021
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
40074022
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
40084023

4024+
ASSERT_TRUE(copied_data.statistics->distinct_count.has_value());
4025+
ASSERT_DOUBLE_EQ(distinct_count_,
4026+
std::get<double>(copied_data.statistics->distinct_count.value()));
4027+
40094028
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
40104029
ASSERT_DOUBLE_EQ(average_byte_width_,
40114030
copied_data.statistics->average_byte_width.value());

cpp/src/arrow/array/statistics.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ struct ARROW_EXPORT ArrayStatistics {
3939
/// value exists, one of them is used. `std::nullopt` is used
4040
/// otherwise.
4141
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
42+
using NumericType = std::variant<int64_t, double>;
43+
using CountType = NumericType;
4244

4345
static const std::shared_ptr<DataType>& ValueToArrowType(
4446
const std::optional<ValueType>& value,
@@ -76,7 +78,9 @@ struct ARROW_EXPORT ArrayStatistics {
7678
std::optional<int64_t> null_count = std::nullopt;
7779

7880
/// \brief The number of distinct values, may not be set
79-
std::optional<int64_t> distinct_count = std::nullopt;
81+
/// Note: when set to `int64_t`, it represents `exact_distinct_count`,
82+
/// and when set to `double`, it represents `approximate_distinct_count`.
83+
std::optional<CountType> distinct_count = std::nullopt;
8084

8185
/// \brief The average size in bytes of a row in an array, may not be set.
8286
std::optional<double> average_byte_width = std::nullopt;

0 commit comments

Comments
 (0)