From 676aebcef7dbf02becfc95503376aa9b64ef3c60 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 10 Apr 2026 07:43:33 -0700 Subject: [PATCH 01/16] Add GPU quantile property tests --- tests/cpp/common/test_quantile.cu | 446 +++++++++++++++++++++++------- 1 file changed, 349 insertions(+), 97 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 3c3a39f6e3f8..79f1cfca59c7 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -33,24 +33,6 @@ struct RepeatedValueOp { } }; -auto GenerateDenseData(std::size_t rows, std::size_t cols, std::uint64_t seed) - -> std::vector { - HostDeviceVector storage; - RandomDataGenerator{static_cast(rows), cols, 0}.Seed(seed).GenerateDense(&storage); - return storage.HostVector(); -} - -auto MakeFullRowSplitDMatrix(std::size_t rows_per_worker, std::size_t cols, std::int32_t world, - std::int32_t seed) -> std::shared_ptr { - std::vector full_data; - full_data.reserve(rows_per_worker * cols * world); - for (std::int32_t rank = 0; rank < world; ++rank) { - auto block = GenerateDenseData(rows_per_worker, cols, rank + seed); - full_data.insert(full_data.end(), block.cbegin(), block.cend()); - } - return GetDMatrixFromData(full_data, rows_per_worker * world, cols); -} - auto MakeHostSummary(std::vector> const& items) -> common::WQSummaryContainer { common::WQSummaryContainer summary; @@ -69,6 +51,355 @@ auto CopySummaryEntries(common::WQSummaryContainer const& summary) namespace common { class MGPUQuantileTest : public collective::BaseMGPUTest {}; +namespace { +enum class WeightKind { kNone, kRow }; +enum class FeatureKind { kNumerical, kMixed }; + +inline constexpr double kMaxNormalizedRankError = 2.0; +inline constexpr double kMaxWeightedNormalizedRankError = 10.0; + +struct WeightedValue { + float value; + double weight; +}; + +struct ReferenceColumn { + std::vector values; + std::vector prefix_weights; +}; + +struct ContainerCase { + std::string name; + std::size_t rows{0}; + std::size_t cols{0}; + float sparsity{0.0f}; + bst_bin_t max_bin{0}; + WeightKind weights{WeightKind::kNone}; + FeatureKind features{FeatureKind::kNumerical}; + std::uint32_t seed{0}; +}; + +auto ContainerAnchorCases() -> std::vector { + return { + {"empty_numeric_bins16", 0, 32, 0.0f, 16, WeightKind::kNone, FeatureKind::kNumerical, 10}, + {"dense_numeric_unweighted_bins2", 256, 32, 0.0f, 2, WeightKind::kNone, + FeatureKind::kNumerical, 11}, + {"dense_numeric_unweighted_bins16", 256, 32, 0.0f, 16, WeightKind::kNone, + FeatureKind::kNumerical, 12}, + {"dense_numeric_weighted_bins256", 512, 32, 0.0f, 256, WeightKind::kRow, + FeatureKind::kNumerical, 13}, + {"sparse_numeric_weighted_bins32", 512, 48, 0.7f, 32, WeightKind::kRow, + FeatureKind::kNumerical, 14}, + {"dense_mixed_unweighted_bins16", 256, 24, 0.0f, 16, WeightKind::kNone, FeatureKind::kMixed, + 15}, + {"sparse_mixed_weighted_bins64", 512, 40, 0.8f, 64, WeightKind::kRow, FeatureKind::kMixed, + 16}, + }; +} + +auto FeatureTypes(ContainerCase const& c) -> std::vector { + std::vector ft(c.cols, FeatureType::kNumerical); + if (c.features == FeatureKind::kMixed) { + for (std::size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; + } + } + return ft; +} + +auto GenerateWeights(std::size_t rows, std::uint32_t seed) -> std::vector { + std::vector weights(rows, 1.0f); + SimpleLCG lcg{seed}; + SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); + std::generate(weights.begin(), weights.end(), [&] { return std::exp(6.0f * unit_dist(&lcg)); }); + return weights; +} + +auto CollectWeightedColumns(DMatrix* dmat) -> std::vector> { + std::vector> columns(dmat->Info().num_col_); + if (dmat->Info().num_row_ == 0) { + return columns; + } + std::vector weights = dmat->Info().group_ptr_.empty() + ? dmat->Info().weights_.HostVector() + : detail::UnrollGroupWeights(dmat->Info()); + + bst_idx_t row_idx{0}; + Context ctx; + for (auto const& batch : dmat->GetBatches(&ctx)) { + auto page = batch.GetView(); + for (std::size_t i = 0; i < batch.Size(); ++i) { + auto row_weight = + weights.empty() ? 1.0 + : static_cast(weights.at(static_cast(row_idx + i))); + for (auto e : page[i]) { + columns[e.index].push_back({e.fvalue, row_weight}); + } + } + row_idx += batch.Size(); + } + + for (auto& column : columns) { + std::sort(column.begin(), column.end(), + [](auto const& lhs, auto const& rhs) { return lhs.value < rhs.value; }); + } + return columns; +} + +auto AggregateWeightedColumn(std::vector const& sorted_column) -> ReferenceColumn { + ReferenceColumn ref; + ref.prefix_weights.push_back(0.0); + for (auto const& entry : sorted_column) { + if (!ref.values.empty() && ref.values.back() == entry.value) { + ref.prefix_weights.back() += entry.weight; + } else { + ref.values.push_back(entry.value); + ref.prefix_weights.push_back(ref.prefix_weights.back() + entry.weight); + } + } + return ref; +} + +double DistanceToInterval(double target, double lo, double hi) { + if (target < lo) { + return lo - target; + } + if (target > hi) { + return target - hi; + } + return 0.0; +} + +struct CutRankErrorSummary { + double max_normalized_error{0.0}; + double max_absolute_error{0.0}; + double target_rank{0.0}; + double rank_lo{0.0}; + double rank_hi{0.0}; + double total_weight{0.0}; + bst_feature_t feature{0}; + std::size_t cut_index{0}; + std::size_t num_interior_cuts{0}; +}; + +auto MeasureCutRankError(HistogramCuts const& cuts, bst_feature_t column_idx, + ReferenceColumn const& ref) -> CutRankErrorSummary { + CutRankErrorSummary summary; + summary.feature = column_idx; + if (ref.values.empty()) { + return summary; + } + + auto beg = cuts.Ptrs()[column_idx]; + auto end = cuts.Ptrs()[column_idx + 1]; + auto num_cuts = end - beg; + if (num_cuts <= 1) { + return summary; + } + summary.num_interior_cuts = num_cuts - 1; + summary.total_weight = ref.prefix_weights.back(); + if (summary.total_weight == 0.0 || summary.num_interior_cuts == 0) { + return summary; + } + + auto avg_bin_weight = summary.total_weight / static_cast(summary.num_interior_cuts); + for (std::size_t cut_idx = 0; cut_idx < summary.num_interior_cuts; ++cut_idx) { + auto cut_value = cuts.Values()[beg + cut_idx]; + auto lb = std::lower_bound(ref.values.cbegin(), ref.values.cend(), cut_value); + auto ub = std::upper_bound(ref.values.cbegin(), ref.values.cend(), cut_value); + auto rank_lo = ref.prefix_weights[std::distance(ref.values.cbegin(), lb)]; + auto rank_hi = ref.prefix_weights[std::distance(ref.values.cbegin(), ub)]; + auto target_rank = static_cast(cut_idx + 1) * summary.total_weight / + static_cast(summary.num_interior_cuts); + auto absolute_error = DistanceToInterval(target_rank, rank_lo, rank_hi); + auto normalized_error = absolute_error / avg_bin_weight; + if (normalized_error > summary.max_normalized_error) { + summary.max_normalized_error = normalized_error; + summary.max_absolute_error = absolute_error; + summary.target_rank = target_rank; + summary.rank_lo = rank_lo; + summary.rank_hi = rank_hi; + summary.cut_index = cut_idx; + } + } + + return summary; +} + +void ValidateNumericalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, + std::vector const& sorted_column, std::size_t num_bins, + double max_normalized_rank_error) { + auto ref = AggregateWeightedColumn(sorted_column); + CHECK(!ref.values.empty()); + + auto beg = cuts.Ptrs()[column_idx]; + auto end = cuts.Ptrs()[column_idx + 1]; + auto first_bin = HistogramCuts::NumericBinLowerBound(cuts.Ptrs(), cuts.Values(), column_idx, beg); + EXPECT_TRUE(std::isinf(first_bin)); + EXPECT_LT(first_bin, 0.0f); + EXPECT_GT(cuts.Values()[beg], ref.values.front()); + EXPECT_GE(cuts.Values()[end - 1], ref.values.back()); + + if (ref.values.size() <= num_bins) { + for (std::size_t i = 0; i < ref.values.size(); ++i) { + ASSERT_EQ(cuts.SearchBin(ref.values[i], column_idx), beg + i) + << "feature=" << column_idx << ", value_index=" << i; + } + } else { + auto stats = MeasureCutRankError(cuts, column_idx, ref); + EXPECT_LE(stats.max_normalized_error, max_normalized_rank_error) + << "feature=" << column_idx << ", cut=" << stats.cut_index; + } +} + +void ValidateCategoricalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, + std::vector const& sorted_column) { + std::vector categories; + categories.reserve(sorted_column.size()); + for (auto const& entry : sorted_column) { + categories.push_back(entry.value); + } + std::sort(categories.begin(), categories.end()); + categories.erase(std::unique(categories.begin(), categories.end()), categories.end()); + + auto beg = cuts.Ptrs()[column_idx]; + auto end = cuts.Ptrs()[column_idx + 1]; + ASSERT_EQ(static_cast(end - beg), categories.size()) << "feature=" << column_idx; + for (std::size_t i = 0; i < categories.size(); ++i) { + EXPECT_EQ(cuts.Values()[beg + i], categories[i]) << "feature=" << column_idx; + } +} + +void TestCutInvariants(ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, + std::vector> const& columns, + std::size_t f_begin = 0, + std::size_t f_end = std::numeric_limits::max()) { + ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; + auto ft = dmat->Info().feature_types.ConstHostSpan(); + auto max_error = + c.weights == WeightKind::kRow ? kMaxWeightedNormalizedRankError : kMaxNormalizedRankError; + f_end = std::min(f_end, columns.size()); + for (std::size_t i = f_begin; i < f_end; ++i) { + auto beg = cuts.Ptrs()[i]; + auto end = cuts.Ptrs()[i + 1]; + ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << i; + for (auto j = beg + 1; j < end; ++j) { + EXPECT_LT(cuts.Values()[j - 1], cuts.Values()[j]) << "case=" << c.name << ", feature=" << i; + } + if (columns[i].empty()) { + continue; + } + if (!ft.empty() && IsCat(ft, i)) { + ValidateCategoricalCuts(cuts, i, columns[i]); + } else { + ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); + } + } +} + +void DoGPUContainerProperty(ContainerCase const& c) { + auto ctx = MakeCUDACtx(0); + auto ft = FeatureTypes(c); + auto m = RandomDataGenerator{c.rows, c.cols, c.sparsity} + .Seed(c.seed) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + if (c.weights == WeightKind::kRow) { + m->Info().weights_.HostVector() = GenerateWeights(c.rows, c.seed + 1024); + } + auto cuts = DeviceSketch(&ctx, m.get(), c.max_bin); + auto columns = CollectWeightedColumns(m.get()); + TestCutInvariants(c, cuts, m.get(), columns); +} + +void DoMGPURowSplitProperty(ContainerCase const& c) { + auto const world = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + auto ctx = MakeCUDACtx(GPUIDX); + auto ft = FeatureTypes(c); + auto full_m = RandomDataGenerator{c.rows * static_cast(world), c.cols, c.sparsity} + .Seed(c.seed) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + if (c.weights == WeightKind::kRow) { + full_m->Info().weights_.HostVector() = + GenerateWeights(c.rows * static_cast(world), c.seed + 4096); + } + + std::vector ridxs(c.rows); + auto row_begin = static_cast(rank) * c.rows; + std::iota(ridxs.begin(), ridxs.end(), static_cast(row_begin)); + auto m = + std::shared_ptr{full_m->Slice(Span{ridxs.data(), ridxs.size()})}; + m->Info().data_split_mode = DataSplitMode::kRow; + + auto cuts = DeviceSketch(&ctx, m.get(), c.max_bin); + collective::Finalize(); + CHECK_EQ(collective::GetWorldSize(), 1); + + auto columns = CollectWeightedColumns(full_m.get()); + TestCutInvariants(c, cuts, full_m.get(), columns); +} + +void DoMGPUColumnSplitProperty(ContainerCase const& c) { + auto const world = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + auto ctx = MakeCUDACtx(GPUIDX); + auto ft = FeatureTypes(c); + auto full_m = RandomDataGenerator{c.rows, c.cols, c.sparsity} + .Seed(c.seed) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + if (c.weights == WeightKind::kRow) { + full_m->Info().weights_.HostVector() = GenerateWeights(c.rows, c.seed + 2048); + } + auto m = std::shared_ptr{full_m->SliceCol(world, rank)}; + + auto cuts = DeviceSketch(&ctx, m.get(), c.max_bin); + auto const slice_size = c.cols / world; + auto const slice_start = slice_size * rank; + auto const slice_end = (rank == world - 1) ? c.cols : slice_start + slice_size; + + collective::Finalize(); + CHECK_EQ(collective::GetWorldSize(), 1); + + auto columns = CollectWeightedColumns(full_m.get()); + TestCutInvariants(c, cuts, full_m.get(), columns, slice_start, slice_end); +} +} // namespace + +TEST(GPUQuantileProperty, Invariants) { + for (auto const& c : ContainerAnchorCases()) { + SCOPED_TRACE(c.name); + DoGPUContainerProperty(c); + } +} + +TEST_F(MGPUQuantileTest, RowSplitProperty) { + for (auto const& c : ContainerAnchorCases()) { + SCOPED_TRACE(c.name); + this->DoTest([&] { DoMGPURowSplitProperty(c); }, true); + this->DoTest([&] { DoMGPURowSplitProperty(c); }, false); + } +} + +TEST_F(MGPUQuantileTest, ColumnSplitProperty) { + for (auto const& c : ContainerAnchorCases()) { + SCOPED_TRACE(c.name); + this->DoTest([&] { DoMGPUColumnSplitProperty(c); }, true); + this->DoTest([&] { DoMGPUColumnSplitProperty(c); }, false); + } +} + TEST(GPUQuantile, Basic) { auto ctx = MakeCUDACtx(0); constexpr size_t kCols = 100, kBins = 256; @@ -529,85 +860,6 @@ TEST(GPUQuantile, MissingColumns) { ASSERT_TRUE(cuts.HasCategorical()); } -namespace { -inline constexpr double kMaxDistributedWeightedNormalizedRankError = 20.0; - -void TestAllReduceBasic() { - auto const world = collective::GetWorldSize(); - constexpr size_t kRows = 1000, kCols = 100; - RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { - auto const device = DeviceOrd::CUDA(GPUIDX); - auto ctx = MakeCUDACtx(device.ordinal); - - /** - * Set up distributed version. We rely on using rank as seed to generate - * the exact same copy of data. - */ - auto rank = collective::GetRank(); - HostDeviceVector ft({}, device); - SketchContainer sketch_distributed(ft, n_bins, kCols, device); - HostDeviceVector storage({}, device); - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(device) - .Seed(rank + seed) - .GenerateArrayInterface(&storage); - data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_distributed); - auto distributed_cuts = sketch_distributed.MakeCuts(&ctx, false); - TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); - auto full = MakeFullRowSplitDMatrix(kRows, kCols, world, seed); - auto max_rank_error = info.weights_.Empty() ? kMaxNormalizedRankError - : kMaxDistributedWeightedNormalizedRankError; - ValidateCuts(distributed_cuts, full.get(), n_bins, max_rank_error); - }); -} -} // anonymous namespace - -TEST_F(MGPUQuantileTest, AllReduceBasic) { - this->DoTest([] { TestAllReduceBasic(); }, true); - this->DoTest([] { TestAllReduceBasic(); }, false); -} - -namespace { -void TestColumnSplit(DMatrix* dmat) { - auto const world = collective::GetWorldSize(); - auto const rank = collective::GetRank(); - auto m = std::unique_ptr{dmat->SliceCol(world, rank)}; - - // Generate cuts for distributed environment. - auto ctx = MakeCUDACtx(GPUIDX); - std::size_t constexpr kBins = 64; - HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins); - ValidateCuts(distributed_cuts, m.get(), kBins); -} -} // anonymous namespace - -TEST_F(MGPUQuantileTest, ColumnSplitBasic) { - std::size_t constexpr kRows = 1000, kCols = 100; - auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - this->DoTest([&] { TestColumnSplit(dmat.get()); }, true); - this->DoTest([&] { TestColumnSplit(dmat.get()); }, false); -} - -TEST_F(MGPUQuantileTest, ColumnSplitCategorical) { - std::size_t constexpr kRows = 1000, kCols = 100; - auto sparsity = 0.5f; - std::vector ft(kCols); - for (size_t i = 0; i < ft.size(); ++i) { - ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; - } - auto dmat = RandomDataGenerator{kRows, kCols, sparsity} - .Seed(0) - .Lower(.0f) - .Upper(1.0f) - .Type(ft) - .MaxCategory(13) - .GenerateDMatrix(); - this->DoTest([&] { TestColumnSplit(dmat.get()); }, true); - this->DoTest([&] { TestColumnSplit(dmat.get()); }, false); -} - namespace { void TestSameOnAllWorkers() { auto world = collective::GetWorldSize(); From f236a494ba8b08831a197684572e2beea7014bd6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 11 Apr 2026 07:23:18 -0700 Subject: [PATCH 02/16] Share quantile cut invariants across CPU and GPU tests --- tests/cpp/common/test_quantile.cc | 72 +----- tests/cpp/common/test_quantile.cu | 288 ++--------------------- tests/cpp/common/test_quantile_helpers.h | 27 +++ 3 files changed, 58 insertions(+), 329 deletions(-) diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 5f8b35db26aa..5f0b11fcecc6 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -65,34 +65,6 @@ void TestSummaryInvariants(SummaryCase const& c, WQSummaryContainer const& summa } } } -void TestContainerInvariants(ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, - std::vector> const& columns) { - ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; - // Every feature should contribute at least one strictly increasing cut value sequence. - for (std::size_t fidx = 0; fidx < c.cols; ++fidx) { - auto beg = cuts.Ptrs()[fidx]; - auto end = cuts.Ptrs()[fidx + 1]; - ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << fidx; - for (auto i = beg + 1; i < end; ++i) { - EXPECT_LT(cuts.Values()[i - 1], cuts.Values()[i]) - << "case=" << c.name << ", feature=" << fidx; - } - } - auto ft = dmat->Info().feature_types.ConstHostSpan(); - auto max_error = - c.weights == WeightKind::kRow ? kMaxWeightedNormalizedRankError : kMaxNormalizedRankError; - for (std::size_t i = 0; i < columns.size(); ++i) { - if (columns[i].empty()) { - continue; - } - if (!ft.empty() && IsCat(ft, i)) { - ValidateCategoricalCuts(cuts, i, columns[i]); - } else { - ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); - } - } -} - void AssertSameOnAllWorkers(Context const* ctx, HistogramCuts const& cuts) { auto const world = collective::GetWorldSize(); if (world <= 1) { @@ -208,7 +180,7 @@ TEST_P(QuantileContainerTest, Invariants) { } auto row_cuts = row_sketch.MakeCuts(&ctx, m->Info()); auto columns = CollectWeightedColumns(m.get()); - TestContainerInvariants(c, row_cuts, m.get(), columns); + ValidateContainerCuts(c, row_cuts, m.get(), columns); HostSketchContainer sorted_sketch(&ctx, c.max_bin, m->Info().feature_types.ConstHostSpan(), column_size, false); @@ -216,7 +188,7 @@ TEST_P(QuantileContainerTest, Invariants) { sorted_sketch.PushColPage(page, m->Info(), hess); } auto sorted_cuts = sorted_sketch.MakeCuts(&ctx, m->Info()); - TestContainerInvariants(c, sorted_cuts, m.get(), columns); + ValidateContainerCuts(c, sorted_cuts, m.get(), columns); } TEST_P(QuantileSketchOnDMatrixTest, Invariants) { @@ -238,10 +210,10 @@ TEST_P(QuantileSketchOnDMatrixTest, Invariants) { std::vector hessian(c.rows, 1.0f); auto hess = Span{hessian}; auto row_cuts = SketchOnDMatrix(&ctx, m.get(), c.max_bin, false, hess); - TestContainerInvariants(c, row_cuts, m.get(), columns); + ValidateContainerCuts(c, row_cuts, m.get(), columns); auto sorted_cuts = SketchOnDMatrix(&ctx, m.get(), c.max_bin, true, hess); - TestContainerInvariants(c, sorted_cuts, m.get(), columns); + ValidateContainerCuts(c, sorted_cuts, m.get(), columns); } namespace { @@ -286,8 +258,8 @@ void DoPropertyDistributedQuantile(ContainerCase const& c) { collective::Finalize(); CHECK_EQ(collective::GetWorldSize(), 1); auto columns = CollectWeightedColumns(full_m.get()); - TestContainerInvariants(c, row_cuts, full_m.get(), columns); - TestContainerInvariants(c, sorted_cuts, full_m.get(), columns); + ValidateContainerCuts(c, row_cuts, full_m.get(), columns); + ValidateContainerCuts(c, sorted_cuts, full_m.get(), columns); } void DoSameOnAllWorkersDistributedQuantile(ContainerCase const& c) { @@ -404,33 +376,6 @@ TEST(Quantile, TrackSketchElementsSorted) { ASSERT_EQ(sketch.NumElements(), 3); } namespace { -void TestColumnSplitInvariants( - quantile_test::ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, - std::vector> const& columns, std::size_t f_begin, - std::size_t f_end) { - ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; - auto ft = dmat->Info().feature_types.ConstHostSpan(); - auto max_error = c.weights == quantile_test::WeightKind::kRow - ? quantile_test::kMaxWeightedNormalizedRankError - : quantile_test::kMaxNormalizedRankError; - for (std::size_t i = f_begin; i < f_end; ++i) { - auto beg = cuts.Ptrs()[i]; - auto end = cuts.Ptrs()[i + 1]; - ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << i; - for (auto j = beg + 1; j < end; ++j) { - EXPECT_LT(cuts.Values()[j - 1], cuts.Values()[j]) << "case=" << c.name << ", feature=" << i; - } - if (columns[i].empty()) { - continue; - } - if (!ft.empty() && IsCat(ft, i)) { - quantile_test::ValidateCategoricalCuts(cuts, i, columns[i]); - } else { - quantile_test::ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); - } - } -} - void DoPropertyColumnSplitQuantile(size_t rows, size_t cols) { Context ctx; auto const world = collective::GetWorldSize(); @@ -493,8 +438,9 @@ void DoPropertyColumnSplitQuantile(size_t rows, size_t cols) { collective::Finalize(); CHECK_EQ(collective::GetWorldSize(), 1); - TestColumnSplitInvariants(c, row_cuts, full_m.get(), columns, slice_start, slice_end); - TestColumnSplitInvariants(c, sorted_cuts, full_m.get(), columns, slice_start, slice_end); + quantile_test::ValidateContainerCuts(c, row_cuts, full_m.get(), columns, slice_start, slice_end); + quantile_test::ValidateContainerCuts(c, sorted_cuts, full_m.get(), columns, slice_start, + slice_end); } } // anonymous namespace diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 79f1cfca59c7..73d03ef2b029 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -14,6 +14,7 @@ #include "../helpers.h" #include "test_hist_util.h" #include "test_quantile.h" +#include "test_quantile_helpers.h" namespace xgboost { namespace { @@ -52,254 +53,9 @@ namespace common { class MGPUQuantileTest : public collective::BaseMGPUTest {}; namespace { -enum class WeightKind { kNone, kRow }; -enum class FeatureKind { kNumerical, kMixed }; - -inline constexpr double kMaxNormalizedRankError = 2.0; -inline constexpr double kMaxWeightedNormalizedRankError = 10.0; - -struct WeightedValue { - float value; - double weight; -}; - -struct ReferenceColumn { - std::vector values; - std::vector prefix_weights; -}; - -struct ContainerCase { - std::string name; - std::size_t rows{0}; - std::size_t cols{0}; - float sparsity{0.0f}; - bst_bin_t max_bin{0}; - WeightKind weights{WeightKind::kNone}; - FeatureKind features{FeatureKind::kNumerical}; - std::uint32_t seed{0}; -}; - -auto ContainerAnchorCases() -> std::vector { - return { - {"empty_numeric_bins16", 0, 32, 0.0f, 16, WeightKind::kNone, FeatureKind::kNumerical, 10}, - {"dense_numeric_unweighted_bins2", 256, 32, 0.0f, 2, WeightKind::kNone, - FeatureKind::kNumerical, 11}, - {"dense_numeric_unweighted_bins16", 256, 32, 0.0f, 16, WeightKind::kNone, - FeatureKind::kNumerical, 12}, - {"dense_numeric_weighted_bins256", 512, 32, 0.0f, 256, WeightKind::kRow, - FeatureKind::kNumerical, 13}, - {"sparse_numeric_weighted_bins32", 512, 48, 0.7f, 32, WeightKind::kRow, - FeatureKind::kNumerical, 14}, - {"dense_mixed_unweighted_bins16", 256, 24, 0.0f, 16, WeightKind::kNone, FeatureKind::kMixed, - 15}, - {"sparse_mixed_weighted_bins64", 512, 40, 0.8f, 64, WeightKind::kRow, FeatureKind::kMixed, - 16}, - }; -} - -auto FeatureTypes(ContainerCase const& c) -> std::vector { - std::vector ft(c.cols, FeatureType::kNumerical); - if (c.features == FeatureKind::kMixed) { - for (std::size_t i = 0; i < ft.size(); ++i) { - ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; - } - } - return ft; -} - -auto GenerateWeights(std::size_t rows, std::uint32_t seed) -> std::vector { - std::vector weights(rows, 1.0f); - SimpleLCG lcg{seed}; - SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); - std::generate(weights.begin(), weights.end(), [&] { return std::exp(6.0f * unit_dist(&lcg)); }); - return weights; -} - -auto CollectWeightedColumns(DMatrix* dmat) -> std::vector> { - std::vector> columns(dmat->Info().num_col_); - if (dmat->Info().num_row_ == 0) { - return columns; - } - std::vector weights = dmat->Info().group_ptr_.empty() - ? dmat->Info().weights_.HostVector() - : detail::UnrollGroupWeights(dmat->Info()); - - bst_idx_t row_idx{0}; - Context ctx; - for (auto const& batch : dmat->GetBatches(&ctx)) { - auto page = batch.GetView(); - for (std::size_t i = 0; i < batch.Size(); ++i) { - auto row_weight = - weights.empty() ? 1.0 - : static_cast(weights.at(static_cast(row_idx + i))); - for (auto e : page[i]) { - columns[e.index].push_back({e.fvalue, row_weight}); - } - } - row_idx += batch.Size(); - } - - for (auto& column : columns) { - std::sort(column.begin(), column.end(), - [](auto const& lhs, auto const& rhs) { return lhs.value < rhs.value; }); - } - return columns; -} - -auto AggregateWeightedColumn(std::vector const& sorted_column) -> ReferenceColumn { - ReferenceColumn ref; - ref.prefix_weights.push_back(0.0); - for (auto const& entry : sorted_column) { - if (!ref.values.empty() && ref.values.back() == entry.value) { - ref.prefix_weights.back() += entry.weight; - } else { - ref.values.push_back(entry.value); - ref.prefix_weights.push_back(ref.prefix_weights.back() + entry.weight); - } - } - return ref; -} - -double DistanceToInterval(double target, double lo, double hi) { - if (target < lo) { - return lo - target; - } - if (target > hi) { - return target - hi; - } - return 0.0; -} - -struct CutRankErrorSummary { - double max_normalized_error{0.0}; - double max_absolute_error{0.0}; - double target_rank{0.0}; - double rank_lo{0.0}; - double rank_hi{0.0}; - double total_weight{0.0}; - bst_feature_t feature{0}; - std::size_t cut_index{0}; - std::size_t num_interior_cuts{0}; -}; - -auto MeasureCutRankError(HistogramCuts const& cuts, bst_feature_t column_idx, - ReferenceColumn const& ref) -> CutRankErrorSummary { - CutRankErrorSummary summary; - summary.feature = column_idx; - if (ref.values.empty()) { - return summary; - } - - auto beg = cuts.Ptrs()[column_idx]; - auto end = cuts.Ptrs()[column_idx + 1]; - auto num_cuts = end - beg; - if (num_cuts <= 1) { - return summary; - } - summary.num_interior_cuts = num_cuts - 1; - summary.total_weight = ref.prefix_weights.back(); - if (summary.total_weight == 0.0 || summary.num_interior_cuts == 0) { - return summary; - } - - auto avg_bin_weight = summary.total_weight / static_cast(summary.num_interior_cuts); - for (std::size_t cut_idx = 0; cut_idx < summary.num_interior_cuts; ++cut_idx) { - auto cut_value = cuts.Values()[beg + cut_idx]; - auto lb = std::lower_bound(ref.values.cbegin(), ref.values.cend(), cut_value); - auto ub = std::upper_bound(ref.values.cbegin(), ref.values.cend(), cut_value); - auto rank_lo = ref.prefix_weights[std::distance(ref.values.cbegin(), lb)]; - auto rank_hi = ref.prefix_weights[std::distance(ref.values.cbegin(), ub)]; - auto target_rank = static_cast(cut_idx + 1) * summary.total_weight / - static_cast(summary.num_interior_cuts); - auto absolute_error = DistanceToInterval(target_rank, rank_lo, rank_hi); - auto normalized_error = absolute_error / avg_bin_weight; - if (normalized_error > summary.max_normalized_error) { - summary.max_normalized_error = normalized_error; - summary.max_absolute_error = absolute_error; - summary.target_rank = target_rank; - summary.rank_lo = rank_lo; - summary.rank_hi = rank_hi; - summary.cut_index = cut_idx; - } - } - - return summary; -} - -void ValidateNumericalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, - std::vector const& sorted_column, std::size_t num_bins, - double max_normalized_rank_error) { - auto ref = AggregateWeightedColumn(sorted_column); - CHECK(!ref.values.empty()); - - auto beg = cuts.Ptrs()[column_idx]; - auto end = cuts.Ptrs()[column_idx + 1]; - auto first_bin = HistogramCuts::NumericBinLowerBound(cuts.Ptrs(), cuts.Values(), column_idx, beg); - EXPECT_TRUE(std::isinf(first_bin)); - EXPECT_LT(first_bin, 0.0f); - EXPECT_GT(cuts.Values()[beg], ref.values.front()); - EXPECT_GE(cuts.Values()[end - 1], ref.values.back()); - - if (ref.values.size() <= num_bins) { - for (std::size_t i = 0; i < ref.values.size(); ++i) { - ASSERT_EQ(cuts.SearchBin(ref.values[i], column_idx), beg + i) - << "feature=" << column_idx << ", value_index=" << i; - } - } else { - auto stats = MeasureCutRankError(cuts, column_idx, ref); - EXPECT_LE(stats.max_normalized_error, max_normalized_rank_error) - << "feature=" << column_idx << ", cut=" << stats.cut_index; - } -} - -void ValidateCategoricalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, - std::vector const& sorted_column) { - std::vector categories; - categories.reserve(sorted_column.size()); - for (auto const& entry : sorted_column) { - categories.push_back(entry.value); - } - std::sort(categories.begin(), categories.end()); - categories.erase(std::unique(categories.begin(), categories.end()), categories.end()); - - auto beg = cuts.Ptrs()[column_idx]; - auto end = cuts.Ptrs()[column_idx + 1]; - ASSERT_EQ(static_cast(end - beg), categories.size()) << "feature=" << column_idx; - for (std::size_t i = 0; i < categories.size(); ++i) { - EXPECT_EQ(cuts.Values()[beg + i], categories[i]) << "feature=" << column_idx; - } -} - -void TestCutInvariants(ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, - std::vector> const& columns, - std::size_t f_begin = 0, - std::size_t f_end = std::numeric_limits::max()) { - ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; - auto ft = dmat->Info().feature_types.ConstHostSpan(); - auto max_error = - c.weights == WeightKind::kRow ? kMaxWeightedNormalizedRankError : kMaxNormalizedRankError; - f_end = std::min(f_end, columns.size()); - for (std::size_t i = f_begin; i < f_end; ++i) { - auto beg = cuts.Ptrs()[i]; - auto end = cuts.Ptrs()[i + 1]; - ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << i; - for (auto j = beg + 1; j < end; ++j) { - EXPECT_LT(cuts.Values()[j - 1], cuts.Values()[j]) << "case=" << c.name << ", feature=" << i; - } - if (columns[i].empty()) { - continue; - } - if (!ft.empty() && IsCat(ft, i)) { - ValidateCategoricalCuts(cuts, i, columns[i]); - } else { - ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); - } - } -} - -void DoGPUContainerProperty(ContainerCase const& c) { +void DoGPUContainerProperty(quantile_test::ContainerCase const& c) { auto ctx = MakeCUDACtx(0); - auto ft = FeatureTypes(c); + auto ft = quantile_test::FeatureTypes(c); auto m = RandomDataGenerator{c.rows, c.cols, c.sparsity} .Seed(c.seed) .Lower(.0f) @@ -307,19 +63,19 @@ void DoGPUContainerProperty(ContainerCase const& c) { .Type(ft) .MaxCategory(13) .GenerateDMatrix(); - if (c.weights == WeightKind::kRow) { - m->Info().weights_.HostVector() = GenerateWeights(c.rows, c.seed + 1024); + if (c.weights == quantile_test::WeightKind::kRow) { + m->Info().weights_.HostVector() = quantile_test::GenerateWeights(c.rows, c.seed + 1024); } auto cuts = DeviceSketch(&ctx, m.get(), c.max_bin); - auto columns = CollectWeightedColumns(m.get()); - TestCutInvariants(c, cuts, m.get(), columns); + auto columns = quantile_test::CollectWeightedColumns(m.get()); + quantile_test::ValidateContainerCuts(c, cuts, m.get(), columns); } -void DoMGPURowSplitProperty(ContainerCase const& c) { +void DoMGPURowSplitProperty(quantile_test::ContainerCase const& c) { auto const world = collective::GetWorldSize(); auto const rank = collective::GetRank(); auto ctx = MakeCUDACtx(GPUIDX); - auto ft = FeatureTypes(c); + auto ft = quantile_test::FeatureTypes(c); auto full_m = RandomDataGenerator{c.rows * static_cast(world), c.cols, c.sparsity} .Seed(c.seed) .Lower(.0f) @@ -327,9 +83,9 @@ void DoMGPURowSplitProperty(ContainerCase const& c) { .Type(ft) .MaxCategory(13) .GenerateDMatrix(); - if (c.weights == WeightKind::kRow) { + if (c.weights == quantile_test::WeightKind::kRow) { full_m->Info().weights_.HostVector() = - GenerateWeights(c.rows * static_cast(world), c.seed + 4096); + quantile_test::GenerateWeights(c.rows * static_cast(world), c.seed + 4096); } std::vector ridxs(c.rows); @@ -343,15 +99,15 @@ void DoMGPURowSplitProperty(ContainerCase const& c) { collective::Finalize(); CHECK_EQ(collective::GetWorldSize(), 1); - auto columns = CollectWeightedColumns(full_m.get()); - TestCutInvariants(c, cuts, full_m.get(), columns); + auto columns = quantile_test::CollectWeightedColumns(full_m.get()); + quantile_test::ValidateContainerCuts(c, cuts, full_m.get(), columns); } -void DoMGPUColumnSplitProperty(ContainerCase const& c) { +void DoMGPUColumnSplitProperty(quantile_test::ContainerCase const& c) { auto const world = collective::GetWorldSize(); auto const rank = collective::GetRank(); auto ctx = MakeCUDACtx(GPUIDX); - auto ft = FeatureTypes(c); + auto ft = quantile_test::FeatureTypes(c); auto full_m = RandomDataGenerator{c.rows, c.cols, c.sparsity} .Seed(c.seed) .Lower(.0f) @@ -359,8 +115,8 @@ void DoMGPUColumnSplitProperty(ContainerCase const& c) { .Type(ft) .MaxCategory(13) .GenerateDMatrix(); - if (c.weights == WeightKind::kRow) { - full_m->Info().weights_.HostVector() = GenerateWeights(c.rows, c.seed + 2048); + if (c.weights == quantile_test::WeightKind::kRow) { + full_m->Info().weights_.HostVector() = quantile_test::GenerateWeights(c.rows, c.seed + 2048); } auto m = std::shared_ptr{full_m->SliceCol(world, rank)}; @@ -372,20 +128,20 @@ void DoMGPUColumnSplitProperty(ContainerCase const& c) { collective::Finalize(); CHECK_EQ(collective::GetWorldSize(), 1); - auto columns = CollectWeightedColumns(full_m.get()); - TestCutInvariants(c, cuts, full_m.get(), columns, slice_start, slice_end); + auto columns = quantile_test::CollectWeightedColumns(full_m.get()); + quantile_test::ValidateContainerCuts(c, cuts, full_m.get(), columns, slice_start, slice_end); } } // namespace TEST(GPUQuantileProperty, Invariants) { - for (auto const& c : ContainerAnchorCases()) { + for (auto const& c : quantile_test::ContainerAnchorCases()) { SCOPED_TRACE(c.name); DoGPUContainerProperty(c); } } TEST_F(MGPUQuantileTest, RowSplitProperty) { - for (auto const& c : ContainerAnchorCases()) { + for (auto const& c : quantile_test::ContainerAnchorCases()) { SCOPED_TRACE(c.name); this->DoTest([&] { DoMGPURowSplitProperty(c); }, true); this->DoTest([&] { DoMGPURowSplitProperty(c); }, false); @@ -393,7 +149,7 @@ TEST_F(MGPUQuantileTest, RowSplitProperty) { } TEST_F(MGPUQuantileTest, ColumnSplitProperty) { - for (auto const& c : ContainerAnchorCases()) { + for (auto const& c : quantile_test::ContainerAnchorCases()) { SCOPED_TRACE(c.name); this->DoTest([&] { DoMGPUColumnSplitProperty(c); }, true); this->DoTest([&] { DoMGPUColumnSplitProperty(c); }, false); diff --git a/tests/cpp/common/test_quantile_helpers.h b/tests/cpp/common/test_quantile_helpers.h index 99cd95521a23..d6e5566eeda0 100644 --- a/tests/cpp/common/test_quantile_helpers.h +++ b/tests/cpp/common/test_quantile_helpers.h @@ -317,6 +317,33 @@ inline void ValidateCategoricalCuts(HistogramCuts const& cuts, bst_feature_t col } } +inline void ValidateContainerCuts(ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, + std::vector> const& columns, + std::size_t f_begin = 0, + std::size_t f_end = std::numeric_limits::max()) { + ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; + auto ft = dmat->Info().feature_types.ConstHostSpan(); + auto max_error = + c.weights == WeightKind::kRow ? kMaxWeightedNormalizedRankError : kMaxNormalizedRankError; + f_end = std::min(f_end, columns.size()); + for (std::size_t i = f_begin; i < f_end; ++i) { + auto beg = cuts.Ptrs()[i]; + auto end = cuts.Ptrs()[i + 1]; + ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << i; + for (auto j = beg + 1; j < end; ++j) { + EXPECT_LT(cuts.Values()[j - 1], cuts.Values()[j]) << "case=" << c.name << ", feature=" << i; + } + if (columns[i].empty()) { + continue; + } + if (!ft.empty() && IsCat(ft, i)) { + ValidateCategoricalCuts(cuts, i, columns[i]); + } else { + ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); + } + } +} + inline GeneratedColumn GenerateSummaryColumn(SummaryCase const& c) { GeneratedColumn out; out.values.resize(c.rows); From b722265d0ce1ff89728a74a6447144ba2ff69f46 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 11 Apr 2026 08:24:24 -0700 Subject: [PATCH 03/16] Simplify GPU quantile tests --- tests/cpp/common/test_hist_util.cu | 114 ------------ tests/cpp/common/test_quantile.cu | 267 ++++++++++++----------------- tests/cpp/common/test_quantile.h | 13 +- 3 files changed, 113 insertions(+), 281 deletions(-) diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index e95ffe32da0e..017233b5541d 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -37,23 +37,6 @@ HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, f return cuts; } -TEST(HistUtil, DeviceSketch) { - auto ctx = MakeCUDACtx(0); - int num_columns = 1; - int num_bins = 4; - std::vector x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, -1.0f}; - int num_rows = x.size(); - auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - - auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - - Context cpu_ctx; - HistogramCuts host_cuts = SketchOnDMatrix(&cpu_ctx, dmat.get(), num_bins); - - EXPECT_EQ(device_cuts.Values(), host_cuts.Values()); - EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs()); -} - TEST(HistUtil, DeviceSketchPeakMemory) { auto ctx = MakeCUDACtx(0); int num_columns = 2048; @@ -104,21 +87,6 @@ TEST(HistUtil, DeviceSketchDeterminism) { } } -TEST(HistUtil, DeviceSketchCategoricalAsNumeric) { - auto ctx = MakeCUDACtx(0); - auto categorical_sizes = {2, 6, 8, 12}; - int num_bins = 256; - auto sizes = {25, 100, 1000}; - for (auto n : sizes) { - for (auto num_categories : categorical_sizes) { - auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); - auto dmat = GetDMatrixFromData(x, n, 1); - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - ValidateCuts(cuts, dmat.get(), num_bins); - } - } -} - TEST(HistUtil, DeviceSketchCategoricalFeatures) { auto ctx = MakeCUDACtx(0); TestCategoricalSketch(1000, 256, 32, false, [ctx](DMatrix* p_fmat, int32_t num_bins) { @@ -129,34 +97,6 @@ TEST(HistUtil, DeviceSketchCategoricalFeatures) { }); } -void TestMixedSketch() { - size_t n_samples = 1000, n_features = 2, n_categories = 3; - bst_bin_t n_bins = 64; - - std::vector data(n_samples * n_features); - SimpleLCG gen; - SimpleRealUniformDistribution cat_d{0.0f, static_cast(n_categories)}; - SimpleRealUniformDistribution num_d{0.0f, 3.0f}; - for (size_t i = 0; i < n_samples * n_features; ++i) { - // two features, row major. The first column is numeric and the second is categorical. - if (i % 2 == 0) { - data[i] = std::floor(cat_d(&gen)); - } else { - data[i] = num_d(&gen); - } - } - - auto m = GetDMatrixFromData(data, n_samples, n_features); - m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); - m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical); - - auto ctx = MakeCUDACtx(0); - auto cuts = DeviceSketch(&ctx, m.get(), n_bins); - ASSERT_EQ(cuts.Values().size(), n_bins + n_categories); -} - -TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); } - TEST(HistUtil, RemoveDuplicatedCategories) { bst_idx_t n_samples = 512; bst_feature_t n_features = 3; @@ -222,60 +162,6 @@ TEST(HistUtil, RemoveDuplicatedCategories) { } } -TEST(HistUtil, DeviceSketchMultipleColumns) { - auto ctx = MakeCUDACtx(0); - auto bin_sizes = {2, 16, 256, 512}; - auto sizes = {100, 1000, 1500}; - int num_columns = 5; - for (auto num_rows : sizes) { - auto x = GenerateRandom(num_rows, num_columns); - auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - ValidateCuts(cuts, dmat.get(), num_bins); - } - } -} - -TEST(HistUtil, DeviceSketchMultipleColumnsWeights) { - auto ctx = MakeCUDACtx(0); - auto bin_sizes = {2, 16, 256, 512}; - auto sizes = {100, 1000, 1500}; - int num_columns = 5; - for (auto num_rows : sizes) { - auto x = GenerateRandom(num_rows, num_columns); - auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows); - for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - ValidateCuts(cuts, dmat.get(), num_bins); - } - } -} - -TEST(HistUitl, DeviceSketchWeights) { - auto ctx = MakeCUDACtx(0); - auto bin_sizes = {2, 16, 256, 512}; - auto sizes = {100, 1000, 1500}; - int num_columns = 5; - for (auto num_rows : sizes) { - auto x = GenerateRandom(num_rows, num_columns); - auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto weighted_dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto& h_weights = weighted_dmat->Info().weights_.HostVector(); - h_weights.resize(num_rows); - std::fill(h_weights.begin(), h_weights.end(), 1.0f); - for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - auto wcuts = DeviceSketch(&ctx, weighted_dmat.get(), num_bins); - ASSERT_EQ(cuts.Ptrs(), wcuts.Ptrs()); - ASSERT_EQ(cuts.Values(), wcuts.Values()); - ValidateCuts(cuts, dmat.get(), num_bins); - ValidateCuts(wcuts, weighted_dmat.get(), num_bins); - } - } -} - TEST(HistUtil, DeviceSketchBatches) { auto ctx = MakeCUDACtx(0); int num_bins = 256; diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 73d03ef2b029..06e5dabe508e 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -2,9 +2,6 @@ * Copyright 2020-2026, XGBoost contributors */ #include -#include // for make_zip_iterator - -#include // for make_tuple, tuple #include "../../../src/collective/allreduce.h" #include "../../../src/common/hist_util.cuh" @@ -24,16 +21,6 @@ struct IsSorted { } }; -struct RepeatedValueOp { - std::size_t cols; - - XGBOOST_DEVICE float operator()(cuda::std::tuple const& tuple) const { - auto i = cuda::std::get<0>(tuple); - auto ridx = i / cols; - return static_cast((ridx / 8) % 4); - } -}; - auto MakeHostSummary(std::vector> const& items) -> common::WQSummaryContainer { common::WQSummaryContainer summary; @@ -47,6 +34,54 @@ auto CopySummaryEntries(common::WQSummaryContainer const& summary) auto entries = summary.Entries(); return {entries.cbegin(), entries.cend()}; } + +struct HostSketchView { + std::vector data; + std::vector columns_ptr; +}; + +struct HostEntryBatch { + std::vector entries; + std::vector columns_ptr; +}; + +auto MakeEntryBatch(std::vector> const& columns) -> HostEntryBatch { + HostEntryBatch batch; + batch.columns_ptr.push_back(0); + for (bst_feature_t c = 0; c < columns.size(); ++c) { + for (auto value : columns[c]) { + batch.entries.push_back(Entry{c, value}); + } + batch.columns_ptr.push_back(batch.entries.size()); + } + return batch; +} + +auto MakePruneBatch(std::size_t rows, bst_feature_t cols, bool with_duplicates) -> HostEntryBatch { + std::vector> columns(cols); + for (size_t i = 0; i < rows; ++i) { + if (with_duplicates) { + columns[0].push_back(static_cast(i / 4)); + columns[1].push_back(static_cast(i / 8) + 10.0f); + columns[2].push_back(static_cast(i / 2) + 100.0f); + } else { + columns[0].push_back(static_cast(i)); + columns[1].push_back(static_cast(i) * 0.5f + 10.0f); + columns[2].push_back(static_cast(i) * 0.25f + 100.0f); + } + } + return MakeEntryBatch(columns); +} + +auto CopySketchToHost(xgboost::common::Span data, + xgboost::common::Span columns_ptr) -> HostSketchView { + HostSketchView out; + out.data.resize(data.size()); + out.columns_ptr.resize(columns_ptr.size()); + dh::CopyDeviceSpanToVector(&out.data, data); + dh::CopyDeviceSpanToVector(&out.columns_ptr, columns_ptr); + return out; +} } // namespace namespace common { @@ -156,7 +191,7 @@ TEST_F(MGPUQuantileTest, ColumnSplitProperty) { } } -TEST(GPUQuantile, Basic) { +TEST(GPUQuantile, EmptyPush) { auto ctx = MakeCUDACtx(0); constexpr size_t kCols = 100, kBins = 256; HostDeviceVector ft; @@ -169,115 +204,55 @@ TEST(GPUQuantile, Basic) { ASSERT_EQ(sketch.Data().size(), 0); } -// if with_error is true, the test tolerates floating point error -void TestQuantileElemRank(DeviceOrd device, Span in, - Span d_columns_ptr, bool with_error = false) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); - std::vector h_in(in.size()); - dh::CopyDeviceSpanToVector(&h_in, in); - std::vector h_columns_ptr(d_columns_ptr.size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, d_columns_ptr); - - for (size_t i = 1; i < d_columns_ptr.size(); ++i) { +void ValidateSketchInvariants(HostSketchView const& sketch, bool with_error = false) { + for (size_t i = 1; i < sketch.columns_ptr.size(); ++i) { auto column_id = i - 1; - auto beg = h_columns_ptr[column_id]; - auto end = h_columns_ptr[i]; - - auto in_column = Span{h_in}.subspan(beg, end - beg); - for (size_t idx = 1; idx < in_column.size(); ++idx) { - float prev_rmin = in_column[idx - 1].rmin; - float prev_rmax = in_column[idx - 1].rmax; - float rmin_next = in_column[idx].RMinNext(); + auto beg = sketch.columns_ptr[column_id]; + auto end = sketch.columns_ptr[i]; + + auto column = Span{sketch.data}.subspan(beg, end - beg); + ASSERT_TRUE(std::is_sorted(column.begin(), column.end(), IsSorted{})); + ASSERT_TRUE(std::adjacent_find(column.begin(), column.end(), + [](SketchEntry const& l, SketchEntry const& r) { + return l.value == r.value; + }) == column.end()); + for (size_t idx = 1; idx < column.size(); ++idx) { + float prev_rmin = column[idx - 1].rmin; + float prev_rmax = column[idx - 1].rmax; + float rmin_next = column[idx].RMinNext(); if (with_error) { - ASSERT_GE(in_column[idx].rmin + in_column[idx].rmin * kRtEps, prev_rmin); - ASSERT_GE(in_column[idx].rmax + in_column[idx].rmin * kRtEps, prev_rmax); - ASSERT_GE(in_column[idx].rmax + in_column[idx].rmin * kRtEps, rmin_next); + ASSERT_GE(column[idx].rmin + column[idx].rmin * kRtEps, prev_rmin); + ASSERT_GE(column[idx].rmax + column[idx].rmin * kRtEps, prev_rmax); + ASSERT_GE(column[idx].rmax + column[idx].rmin * kRtEps, rmin_next); } else { - ASSERT_GE(in_column[idx].rmin, prev_rmin); - ASSERT_GE(in_column[idx].rmax, prev_rmax); - ASSERT_GE(in_column[idx].rmax, rmin_next); + ASSERT_GE(column[idx].rmin, prev_rmin); + ASSERT_GE(column[idx].rmax, prev_rmax); + ASSERT_GE(column[idx].rmax, rmin_next); } } } } TEST(GPUQuantile, Prune) { - constexpr size_t kRows = 1000, kCols = 100; - RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { - auto ctx = MakeCUDACtx(0); - HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - - HostDeviceVector storage; - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(ctx.Device()) - .Seed(seed) - .GenerateArrayInterface(&storage); - data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch); - auto n_cuts = detail::RequiredSampleCutsPerColumn(n_bins, kRows); - // LE because kRows * kCols is pushed into sketch, after removing - // duplicated entries we might not have that much inputs for prune. - ASSERT_LE(sketch.Data().size(), n_cuts * kCols); - - sketch.Prune(&ctx, n_bins); - ASSERT_LE(sketch.Data().size(), kRows * kCols); - std::vector h_columns_ptr(sketch.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, sketch.ColumnsPtr()); - std::vector h_data(sketch.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch.Data()); - for (size_t i = 1; i < h_columns_ptr.size(); ++i) { - auto begin = h_columns_ptr[i - 1]; - auto column = Span{h_data}.subspan(begin, h_columns_ptr[i] - begin); - ASSERT_TRUE(std::adjacent_find(column.begin(), column.end(), - [](SketchEntry const& l, SketchEntry const& r) { - return l.value == r.value; - }) == column.end()); - } - TestQuantileElemRank(ctx.Device(), sketch.Data(), sketch.ColumnsPtr()); - }); -} - -TEST(GPUQuantile, PruneDuplicated) { - constexpr size_t kRows = 512, kCols = 8; - RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { - auto ctx = MakeCUDACtx(0); - HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - - HostDeviceVector storage; - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(ctx.Device()) - .Seed(seed) - .GenerateArrayInterface(&storage); - auto d_data = storage.DeviceSpan(); - auto tuple_it = - cuda::std::make_tuple(thrust::make_counting_iterator(0ul), d_data.data()); - auto it = thrust::make_zip_iterator(tuple_it); - thrust::transform(ctx.CUDACtx()->CTP(), it, it + d_data.size(), d_data.data(), - RepeatedValueOp{kCols}); - - data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch); - - sketch.Prune(&ctx, n_bins); - - std::vector h_columns_ptr(sketch.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, sketch.ColumnsPtr()); - std::vector h_data(sketch.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch.Data()); - for (size_t i = 1; i < h_columns_ptr.size(); ++i) { - auto begin = h_columns_ptr[i - 1]; - auto column = Span{h_data}.subspan(begin, h_columns_ptr[i] - begin); - ASSERT_TRUE(std::adjacent_find(column.begin(), column.end(), - [](SketchEntry const& l, SketchEntry const& r) { - return l.value == r.value; - }) == column.end()); + constexpr size_t kRows = 64, kCols = 3; + for (auto with_duplicates : {false, true}) { + for (auto n_bins : {8, 16, 80}) { + auto ctx = MakeCUDACtx(0); + HostDeviceVector ft; + SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + auto batch = MakePruneBatch(kRows, kCols, with_duplicates); + dh::device_vector d_entries{batch.entries}; + dh::device_vector d_columns_ptr{batch.columns_ptr}; + dh::device_vector d_cuts_ptr{batch.columns_ptr}; + sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(d_columns_ptr), dh::ToSpan(d_cuts_ptr), + batch.entries.size(), kRows, {}); + + sketch.Prune(&ctx, n_bins); + ASSERT_LE(sketch.Data().size(), static_cast(n_bins) * kCols); + auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); } - TestQuantileElemRank(ctx.Device(), sketch.Data(), sketch.ColumnsPtr()); - }); + } } TEST(GPUQuantile, MergeEmpty) { @@ -347,25 +322,11 @@ TEST(GPUQuantile, MergeBasic) { size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr()); - - auto columns_ptr = sketch_0.ColumnsPtr(); - std::vector h_columns_ptr(columns_ptr.size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); + auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + auto const& h_columns_ptr = h_sketch.columns_ptr; ASSERT_LE(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); - - std::vector h_data(sketch_0.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch_0.Data()); - ASSERT_EQ(static_cast(h_columns_ptr.back()), h_data.size()); - for (size_t i = 1; i < h_columns_ptr.size(); ++i) { - auto begin = h_columns_ptr[i - 1]; - auto column = Span{h_data}.subspan(begin, h_columns_ptr[i] - begin); - ASSERT_TRUE(std::is_sorted(column.begin(), column.end(), IsSorted{})); - ASSERT_TRUE(std::adjacent_find(column.begin(), column.end(), - [](SketchEntry const& l, SketchEntry const& r) { - return l.value == r.value; - }) == column.end()); - } + ASSERT_EQ(static_cast(h_columns_ptr.back()), h_sketch.data.size()); }); } @@ -410,25 +371,11 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr()); - - auto columns_ptr = sketch_0.ColumnsPtr(); - std::vector h_columns_ptr(columns_ptr.size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); + auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + auto const& h_columns_ptr = h_sketch.columns_ptr; ASSERT_LE(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); - - std::vector h_data(sketch_0.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch_0.Data()); - ASSERT_EQ(static_cast(h_columns_ptr.back()), h_data.size()); - for (size_t i = 1; i < h_columns_ptr.size(); ++i) { - auto begin = h_columns_ptr[i - 1]; - auto column = Span{h_data}.subspan(begin, h_columns_ptr[i] - begin); - ASSERT_TRUE(std::is_sorted(column.begin(), column.end(), IsSorted{})); - ASSERT_TRUE(std::adjacent_find(column.begin(), column.end(), - [](SketchEntry const& l, SketchEntry const& r) { - return l.value == r.value; - }) == column.end()); - } + ASSERT_EQ(static_cast(h_columns_ptr.back()), h_sketch.data.size()); } TEST(GPUQuantile, MergeDuplicated) { @@ -467,14 +414,11 @@ TEST(GPUQuantile, MergeCategorical) { entries_1.size(), 5, {}); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr()); + auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); - std::vector h_columns_ptr(sketch_0.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, sketch_0.ColumnsPtr()); - std::vector h_data(sketch_0.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch_0.Data()); - - auto cat_column = Span{h_data}.subspan(h_columns_ptr[0], h_columns_ptr[1]); + auto cat_column = Span{h_sketch.data}.subspan(h_sketch.columns_ptr[0], + h_sketch.columns_ptr[1]); ASSERT_TRUE(std::adjacent_find(cat_column.begin(), cat_column.end(), [](SketchEntry const& l, SketchEntry const& r) { return l.value == r.value; @@ -586,8 +530,9 @@ TEST(GPUQuantile, MultiMerge) { sketch.Prune(&ctx, intermediate_num_cuts); sketch_on_single_node.Merge(&ctx, sketch.ColumnsPtr(), sketch.Data()); } - TestQuantileElemRank(ctx.Device(), sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + auto h_sketch = + CopySketchToHost(sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); }); } @@ -635,7 +580,8 @@ void TestSameOnAllWorkers() { AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); sketch_distributed.AllReduce(&ctx, false); - TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); + auto h_sketch = CopySketchToHost(sketch_distributed.Data(), sketch_distributed.ColumnsPtr()); + ValidateSketchInvariants(h_sketch, true); // Test for all workers having the same sketch. size_t n_data = sketch_distributed.Data().size(); @@ -683,7 +629,6 @@ TEST(GPUQuantile, Push) { size_t constexpr kRows = 100; std::vector data(kRows); auto ctx = MakeCUDACtx(0); - std::fill(data.begin(), data.begin() + (data.size() / 2), 0.3f); std::fill(data.begin() + (data.size() / 2), data.end(), 0.5f); int32_t n_bins = 128; @@ -743,8 +688,8 @@ TEST(GPUQuantile, MultiColPush) { int32_t n_bins = 16; HostDeviceVector ft; SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - dh::device_vector d_entries{entries}; + dh::device_vector d_entries{entries}; dh::device_vector columns_ptr(kCols + 1, 0); for (size_t i = 1; i < kCols + 1; ++i) { columns_ptr[i] = kRows; diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 38ace76c4d13..3dd38a567cea 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -1,8 +1,8 @@ /** * Copyright 2020-2024, XGBoost Contributors */ -#ifndef XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_ -#define XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_ +#ifndef TESTS_CPP_COMMON_TEST_QUANTILE_H_ +#define TESTS_CPP_COMMON_TEST_QUANTILE_H_ #include #include @@ -10,11 +10,12 @@ #include "../helpers.h" namespace xgboost::common { -template void RunWithSeedsAndBins(size_t rows, Fn fn) { +template +void RunWithSeedsAndBins(size_t rows, Fn fn) { std::vector seeds(2); SimpleLCG lcg; - SimpleRealUniformDistribution dist(3, 1000); - std::generate(seeds.begin(), seeds.end(), [&](){ return dist(&lcg); }); + std::generate(seeds.begin(), seeds.end(), + [&]() { return static_cast(lcg() % 997) + 3; }); std::vector bins(2); for (size_t i = 0; i < bins.size() - 1; ++i) { @@ -39,4 +40,4 @@ template void RunWithSeedsAndBins(size_t rows, Fn fn) { } } // namespace xgboost::common -#endif // XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_ +#endif // TESTS_CPP_COMMON_TEST_QUANTILE_H_ From 73a8c12daa0f6aa7d48ebff57e331a95659f0db8 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 02:56:54 -0700 Subject: [PATCH 04/16] Tighten GPU sketch container interface --- src/common/hist_util.cu | 46 ++++--------- src/common/hist_util.cuh | 65 +++++-------------- src/common/quantile.cu | 72 +++++++++++++------- src/common/quantile.cuh | 10 +-- src/data/quantile_dmatrix.cu | 5 -- tests/cpp/common/test_hist_util.cu | 8 +-- tests/cpp/common/test_quantile.cu | 101 ++++++++++++++--------------- 7 files changed, 130 insertions(+), 177 deletions(-) diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index dc6e8c52c8b7..2e6474fafe85 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -143,7 +143,6 @@ void SortByWeight(Context const* ctx, dh::device_vector* weights, } void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, - Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan) { @@ -193,27 +192,14 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, } sorted_entries.resize(n_uniques); - // Renew the column scan and cut scan based on categorical data. - dh::caching_device_vector new_cuts_size(info.num_col_ + 1); - CHECK_EQ(new_column_scan.size(), new_cuts_size.size()); + // Renew the column scan based on categorical data. Numerical columns preserve their original + // span, while categorical columns shrink to their unique category count. + CHECK_EQ(new_column_scan.size(), column_sizes_scan.size()); dh::LaunchN(new_column_scan.size(), ctx->CUDACtx()->Stream(), - [=, d_new_cuts_size = dh::ToSpan(new_cuts_size), - d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan), + [=, d_column_sizes_scan = dh::ToSpan(column_sizes_scan), d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) { - d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx]; - if (idx == d_new_columns_ptr.size() - 1) { - return; - } - if (IsCat(d_feature_types, idx)) { - // Cut size is the same as number of categories in input. - d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx]; - } else { - d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx]; - } + d_column_sizes_scan[idx] = d_new_columns_ptr[idx]; }); - // Turn size into ptr. - thrust::exclusive_scan(ctx->CUDACtx()->CTP(), new_cuts_size.cbegin(), new_cuts_size.cend(), - d_cuts_ptr.data()); } } // namespace detail @@ -234,7 +220,7 @@ namespace { void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo const& info, std::size_t begin, std::size_t end, SketchContainer* sketch_container, // <- output sketch - int num_cuts_per_feature, common::Span sample_weight) { + common::Span sample_weight) { dh::device_vector sorted_entries; if (page.data.DeviceCanRead()) { // direct copy if data is already on device @@ -268,35 +254,28 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c detail::EntryCompareOp()); } - HostDeviceVector cuts_ptr; dh::caching_device_vector column_sizes_scan; data::IsValidFunctor dummy_is_valid(std::numeric_limits::quiet_NaN()); auto batch_it = dh::MakeTransformIterator( sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. }); - detail::GetColumnSizesScan(ctx->CUDACtx(), ctx->Device(), info.num_col_, num_cuts_per_feature, - IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, + detail::GetColumnSizesScan(ctx->CUDACtx(), ctx->Device(), info.num_col_, + IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &column_sizes_scan); - auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; - detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, p_weight, - &column_sizes_scan); + detail::RemoveDuplicatedCategories(ctx, info, &sorted_entries, p_weight, &column_sizes_scan); } - auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); - CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size()); - // Add cuts into sketches auto n_rows_in_batch = RowsInEntrySpan(page, begin, end); - sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, - h_cuts_ptr.back(), n_rows_in_batch, dh::ToSpan(entry_weight)); + sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), + n_rows_in_batch, dh::ToSpan(entry_weight)); sorted_entries.clear(); sorted_entries.shrink_to_fit(); CHECK_EQ(sorted_entries.capacity(), 0); - CHECK_NE(cuts_ptr.Size(), 0); } // Unify group weight, Hessian, and sample weight into sample weight. @@ -388,8 +367,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b for (auto begin = 0ull; begin < page_nnz; begin += sketch_batch_num_elements) { std::size_t end = std::min(page_nnz, static_cast(begin + sketch_batch_num_elements)); - ProcessWeightedBatch(ctx, page, info, begin, end, &sketch_container, num_cuts_per_feature, - d_weight); + ProcessWeightedBatch(ctx, page, info, begin, end, &sketch_container, d_weight); } } diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index ed93b41a03ce..68f5bf6488c5 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -139,23 +139,12 @@ void LaunchGetColumnSizeKernel(CUDAContext const* cuctx, DeviceOrd device, template void GetColumnSizesScan(CUDAContext const* cuctx, DeviceOrd device, size_t num_columns, - std::size_t num_cuts_per_feature, IterSpan batch_iter, - data::IsValidFunctor is_valid, - HostDeviceVector* cuts_ptr, + IterSpan batch_iter, data::IsValidFunctor is_valid, dh::caching_device_vector* column_sizes_scan) { column_sizes_scan->resize(num_columns + 1); - cuts_ptr->SetDevice(device); - cuts_ptr->Resize(num_columns + 1, 0); auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan); LaunchGetColumnSizeKernel(cuctx, device, batch_iter, is_valid, d_column_sizes_scan); - // Calculate cuts CSC pointer - auto cut_ptr_it = dh::MakeTransformIterator( - column_sizes_scan->begin(), [=] __device__(size_t column_size) { - return thrust::min(num_cuts_per_feature, column_size); - }); - thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it, - cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer()); thrust::exclusive_scan(cuctx->CTP(), column_sizes_scan->begin(), column_sizes_scan->end(), column_sizes_scan->begin()); } @@ -211,8 +200,7 @@ size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, template void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch, BatchIter batch_iter, Range1d range, float missing, size_t columns, - size_t cuts_per_feature, DeviceOrd device, - HostDeviceVector* cut_sizes_scan, + DeviceOrd device, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { auto entry_iter = dh::MakeTransformIterator( @@ -223,8 +211,7 @@ void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch, auto span = IterSpan{batch_iter + range.begin(), n}; data::IsValidFunctor is_valid(missing); // Work out how many valid entries we have in each column - GetColumnSizesScan(cuctx, device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan, - column_sizes_scan); + GetColumnSizesScan(cuctx, device, columns, span, is_valid, column_sizes_scan); size_t num_valid = column_sizes_scan->back(); // Copy current subset of valid elements into temporary storage and sort sorted_entries->resize(num_valid); @@ -236,7 +223,6 @@ void SortByWeight(Context const* ctx, dh::device_vector* weights, dh::device_vector* sorted_entries); void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, - Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); @@ -278,33 +264,25 @@ inline HistogramCuts DeviceSketch( template void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInfo const& info, size_t n_features, size_t begin, size_t end, float missing, - SketchContainer* sketch_container, int num_cuts, - bst_idx_t approx_n_samples) { + SketchContainer* sketch_container, bst_idx_t approx_n_samples) { // Copy current subset of valid elements into temporary storage and sort dh::device_vector sorted_entries; dh::caching_device_vector column_sizes_scan; auto batch_iter = dh::MakeTransformIterator( thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); - HostDeviceVector cuts_ptr; - cuts_ptr.SetDevice(ctx->Device()); CUDAContext const* cuctx = ctx->CUDACtx(); detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, n_features, - num_cuts, ctx->Device(), &cuts_ptr, &column_sizes_scan, - &sorted_entries); + ctx->Device(), &column_sizes_scan, &sorted_entries); thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp()); if (sketch_container->HasCategorical()) { - auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, nullptr, - &column_sizes_scan); + detail::RemoveDuplicatedCategories(ctx, info, &sorted_entries, nullptr, &column_sizes_scan); } - auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - auto const& h_cuts_ptr = cuts_ptr.HostVector(); // Extract the cuts from all columns concurrently - sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, - h_cuts_ptr.back(), approx_n_samples); + sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), + approx_n_samples); sorted_entries.clear(); sorted_entries.shrink_to_fit(); @@ -312,9 +290,9 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf template void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo const& info, - int num_cuts_per_feature, bool is_ranking, float missing, - size_t columns, size_t begin, size_t end, - SketchContainer* sketch_container, bst_idx_t approx_n_samples) { + bool is_ranking, float missing, size_t columns, size_t begin, + size_t end, SketchContainer* sketch_container, + bst_idx_t approx_n_samples) { curt::SetDevice(ctx->Ordinal()); info.weights_.SetDevice(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); @@ -325,10 +303,8 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons auto cuctx = ctx->CUDACtx(); dh::device_vector sorted_entries; dh::caching_device_vector column_sizes_scan; - HostDeviceVector cuts_ptr; detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, - num_cuts_per_feature, ctx->Device(), &cuts_ptr, &column_sizes_scan, - &sorted_entries); + ctx->Device(), &column_sizes_scan, &sorted_entries); data::IsValidFunctor is_valid(missing); dh::device_vector temp_weights(sorted_entries.size()); @@ -370,17 +346,13 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons detail::SortByWeight(ctx, &temp_weights, &sorted_entries); if (sketch_container->HasCategorical()) { - auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, &temp_weights, + detail::RemoveDuplicatedCategories(ctx, info, &sorted_entries, &temp_weights, &column_sizes_scan); } - auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); - auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - // Extract cuts - sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, - h_cuts_ptr.back(), approx_n_samples, dh::ToSpan(temp_weights)); + sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), + approx_n_samples, dh::ToSpan(temp_weights)); sorted_entries.clear(); sorted_entries.shrink_to_fit(); } @@ -431,12 +403,11 @@ void AdapterDeviceSketch(Context const* ctx, Batch batch, bst_bin_t num_bins, Me std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); if (weighted) { - ProcessWeightedSlidingWindow(ctx, batch, info, num_cuts_per_feature, - HostSketchContainer::UseGroup(info), missing, num_cols, begin, - end, sketch_container, approx_n_samples); + ProcessWeightedSlidingWindow(ctx, batch, info, HostSketchContainer::UseGroup(info), missing, + num_cols, begin, end, sketch_container, approx_n_samples); } else { ProcessSlidingWindow(ctx, batch, info, num_cols, begin, end, missing, sketch_container, - num_cuts_per_feature, approx_n_samples); + approx_n_samples); } begin += sketch_batch_num_elements; } diff --git a/src/common/quantile.cu b/src/common/quantile.cu index f259c0c6c5c4..55311035cf09 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -105,6 +105,30 @@ void GatherPruneEntries(Span selected_idx, Span out_c [=] __device__(size_t idx) { out_cuts[idx] = entry_from_index(selected_idx[idx]); }); } +void MakeCutsPtr(Context const *ctx, Span columns_ptr_in, Span ft, + bst_bin_t num_bins, bst_idx_t n_rows_in_batch, + HostDeviceVector *p_cuts_ptr) { + auto &cuts_ptr = *p_cuts_ptr; + cuts_ptr.SetDevice(ctx->Device()); + cuts_ptr.Resize(columns_ptr_in.size()); + auto d_cuts_size = cuts_ptr.DeviceSpan(); + auto num_rows = std::max(1, n_rows_in_batch); + auto eps = SketchEpsilon(num_bins, num_rows); + auto num_cuts_per_feature = + std::min(WQuantileSketch::LimitSizeLevel(num_rows, eps), static_cast(num_rows)); + dh::LaunchN(columns_ptr_in.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) { + if (idx == columns_ptr_in.size() - 1) { + d_cuts_size[idx] = 0; + return; + } + auto column_size = columns_ptr_in[idx + 1] - columns_ptr_in[idx]; + auto is_cat = IsCat(ft, idx); + d_cuts_size[idx] = is_cat ? column_size : thrust::min(num_cuts_per_feature, column_size); + }); + thrust::exclusive_scan(ctx->CUDACtx()->CTP(), cuts_ptr.DevicePointer(), + cuts_ptr.DevicePointer() + cuts_ptr.Size(), cuts_ptr.DevicePointer()); +} + template void PruneImpl(common::Span cuts_ptr, Span sorted_data, @@ -338,43 +362,41 @@ void MergeImpl(Context const *ctx, Span const &d_x, // `entries_`. Out-of-place merge/prune results use `entries_tmp_` as scratch before being // committed back into `entries_`. void SketchContainer::Push(Context const *ctx, Span entries, Span columns_ptr, - common::Span cuts_ptr, size_t total_cuts, bst_idx_t n_rows_in_batch, Span weights) { curt::SetDevice(ctx->Ordinal()); rows_seen_ += n_rows_in_batch; + HostDeviceVector cuts_ptr; + MakeCutsPtr(ctx, columns_ptr, this->feature_types_.ConstDeviceSpan(), this->num_bins_, + n_rows_in_batch, &cuts_ptr); + auto d_cuts_ptr = cuts_ptr.DeviceSpan(); + auto total_cuts = cuts_ptr.ConstHostSpan().back(); this->prune_buffer_.resize(total_cuts); auto out = dh::ToSpan(this->prune_buffer_); auto ft = this->feature_types_.ConstDeviceSpan(); - if (weights.empty()) { - auto to_sketch_entry = [] __device__(size_t sample_idx, Span const &column, - size_t) { + auto to_sketch_entry = [weights, columns_ptr] __device__( + size_t sample_idx, Span const &column, size_t column_id) { + if (weights.empty()) { float rmin = sample_idx; float rmax = sample_idx + 1; return SketchEntry{rmin, rmax, 1, column[sample_idx].fvalue}; - }; // NOLINT - PruneImpl(cuts_ptr, entries, columns_ptr, ft, out, to_sketch_entry); - } else { - auto to_sketch_entry = [weights, columns_ptr] __device__(size_t sample_idx, - Span const &column, - size_t column_id) { - Span column_weights_scan = - weights.subspan(columns_ptr[column_id], column.size()); - float rmin = sample_idx > 0 ? column_weights_scan[sample_idx - 1] : 0.0f; - float rmax = column_weights_scan[sample_idx]; - float wmin = rmax - rmin; - wmin = wmin < 0 ? kRtEps : wmin; // GPU scan can generate floating error. - return SketchEntry{rmin, rmax, wmin, column[sample_idx].fvalue}; - }; // NOLINT - PruneImpl(cuts_ptr, entries, columns_ptr, ft, out, to_sketch_entry); - } - auto n_uniques = this->ScanInput(ctx, out, cuts_ptr); - CHECK_EQ(this->columns_ptr_.Size(), cuts_ptr.size()); + } + + Span column_weights_scan = weights.subspan(columns_ptr[column_id], column.size()); + float rmin = sample_idx > 0 ? column_weights_scan[sample_idx - 1] : 0.0f; + float rmax = column_weights_scan[sample_idx]; + float wmin = rmax - rmin; + wmin = wmin < 0 ? kRtEps : wmin; // GPU scan can generate floating error. + return SketchEntry{rmin, rmax, wmin, column[sample_idx].fvalue}; + }; // NOLINT + PruneImpl(d_cuts_ptr, entries, columns_ptr, ft, out, to_sketch_entry); + auto n_uniques = this->ScanInput(ctx, out, d_cuts_ptr); + CHECK_EQ(this->columns_ptr_.Size(), d_cuts_ptr.size()); if (n_uniques == 0) { return; } - this->Merge(ctx, cuts_ptr, out.subspan(0, n_uniques)); - auto intermediate_num_cuts = static_cast(this->IntermediateNumCuts()); - this->Prune(ctx, intermediate_num_cuts); + this->Merge(ctx, d_cuts_ptr, out.subspan(0, n_uniques)); + auto intermediate_cuts_per_feature = static_cast(this->IntermediateCutsPerFeature()); + this->Prune(ctx, intermediate_cuts_per_feature); } size_t SketchContainer::ScanInput(Context const *ctx, Span entries, diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 03f48b2dc787..115d82ee7a01 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -65,10 +65,9 @@ class SketchContainer { columns_ptr_.Copy(columns_ptr_tmp_); entries_.resize(n_entries); } - [[nodiscard]] std::size_t IntermediateNumCuts() const { + [[nodiscard]] std::size_t IntermediateCutsPerFeature() const { auto const eps = SketchEpsilon(num_bins_, std::max(1, rows_seen_)); - auto const per_feature = WQSketch::LimitSizeLevel(std::max(1, rows_seen_), eps); - return per_feature * num_columns_; + return WQSketch::LimitSizeLevel(std::max(1, rows_seen_), eps); } // Get the span of one column. @@ -145,13 +144,10 @@ class SketchContainer { * * \param entries Sorted entries. * \param columns_ptr CSC pointer for entries. - * \param cuts_ptr CSC pointer for cuts. - * \param total_cuts Total number of cuts, equal to the back of cuts_ptr. * \param weights (optional) data weights. */ void Push(Context const* ctx, Span entries, Span columns_ptr, - common::Span cuts_ptr, size_t total_cuts, bst_idx_t n_rows_in_batch, - Span weights = {}); + bst_idx_t n_rows_in_batch, Span weights = {}); /** * @brief Prune the quantile structure. * diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu index 82ba59cb4bc3..7c66fe800a85 100644 --- a/src/data/quantile_dmatrix.cu +++ b/src/data/quantile_dmatrix.cu @@ -69,10 +69,6 @@ void MakeSketches(Context const* ctx, auto batch_rows = data::BatchSamples(proxy); ext_info.accumulated_rows += batch_rows; - // Prune to this after each batch - auto n_cuts_per_feat = - common::detail::RequiredSampleCutsPerColumn(p.max_bin, ext_info.accumulated_rows); - /** * Handle sketching. */ @@ -85,7 +81,6 @@ void MakeSketches(Context const* ctx, DispatchAny(proxy, [&](auto const& value) { common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get()); }); - sketch->Prune(p_ctx, n_cuts_per_feat); LOG(DEBUG) << "Total capacity:" << common::HumanMemUnit(sketch->MemCapacityBytes()); } diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 017233b5541d..1281762bb02e 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -127,9 +127,6 @@ TEST(HistUtil, RemoveDuplicatedCategories) { FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical}; ASSERT_EQ(info.feature_types.Size(), n_features); - HostDeviceVector cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; - cuts_ptr.SetDevice(DeviceOrd::CUDA(0)); - dh::device_vector weight(n_samples * n_features, 0); dh::Iota(dh::ToSpan(weight), ctx.CUDACtx()->Stream()); @@ -141,10 +138,9 @@ TEST(HistUtil, RemoveDuplicatedCategories) { thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), detail::EntryCompareOp()); - detail::RemoveDuplicatedCategories(&ctx, info, cuts_ptr.DeviceSpan(), &sorted_entries, &weight, - &columns_ptr); + detail::RemoveDuplicatedCategories(&ctx, info, &sorted_entries, &weight, &columns_ptr); - auto const& h_cptr = cuts_ptr.ConstHostVector(); + auto const& h_cptr = columns_ptr; ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories); // check numerical for (std::size_t i = 0; i < n_samples; ++i) { diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 06e5dabe508e..f2c8433aaba4 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -40,35 +40,32 @@ struct HostSketchView { std::vector columns_ptr; }; -struct HostEntryBatch { - std::vector entries; - std::vector columns_ptr; +struct DeviceEntryBatch { + dh::device_vector entries; + dh::device_vector columns_ptr; }; -auto MakeEntryBatch(std::vector> const& columns) -> HostEntryBatch { - HostEntryBatch batch; - batch.columns_ptr.push_back(0); +auto MakeEntryBatch(std::vector> const& columns) -> DeviceEntryBatch { + std::vector h_entries; + std::vector h_columns_ptr; + h_columns_ptr.push_back(0); for (bst_feature_t c = 0; c < columns.size(); ++c) { for (auto value : columns[c]) { - batch.entries.push_back(Entry{c, value}); + h_entries.push_back(Entry{c, value}); } - batch.columns_ptr.push_back(batch.entries.size()); + h_columns_ptr.push_back(h_entries.size()); } - return batch; + return {dh::device_vector{h_entries}, dh::device_vector{h_columns_ptr}}; } -auto MakePruneBatch(std::size_t rows, bst_feature_t cols, bool with_duplicates) -> HostEntryBatch { +auto MakeSyntheticBatch(std::size_t rows, bst_feature_t cols, std::size_t batch_idx = 0) + -> DeviceEntryBatch { std::vector> columns(cols); + auto base = static_cast(batch_idx) * 1000.0f; for (size_t i = 0; i < rows; ++i) { - if (with_duplicates) { - columns[0].push_back(static_cast(i / 4)); - columns[1].push_back(static_cast(i / 8) + 10.0f); - columns[2].push_back(static_cast(i / 2) + 100.0f); - } else { - columns[0].push_back(static_cast(i)); - columns[1].push_back(static_cast(i) * 0.5f + 10.0f); - columns[2].push_back(static_cast(i) * 0.25f + 100.0f); - } + columns[0].push_back(base + static_cast(i / 4)); + columns[1].push_back(base + static_cast(i / 8) + 10.0f); + columns[2].push_back(base + static_cast(i / 2) + 100.0f); } return MakeEntryBatch(columns); } @@ -82,6 +79,14 @@ auto CopySketchToHost(xgboost::common::Span data, dh::CopyDeviceSpanToVector(&out.columns_ptr, columns_ptr); return out; } + +[[nodiscard]] auto ExpectedSketchEntriesPerFeature(bst_bin_t n_bins, std::size_t rows_seen) + -> std::size_t { + auto num_rows = std::max(1, rows_seen); + auto eps = SketchEpsilon(n_bins, num_rows); + auto limit = WQSketch::LimitSizeLevel(num_rows, eps); + return std::min(limit, num_rows); +} } // namespace namespace common { @@ -200,7 +205,7 @@ TEST(GPUQuantile, EmptyPush) { dh::device_vector cuts_ptr(kCols + 1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); // Push empty - sketch.Push(&ctx, dh::ToSpan(entries), dh::ToSpan(cuts_ptr), dh::ToSpan(cuts_ptr), 0, 0); + sketch.Push(&ctx, dh::ToSpan(entries), dh::ToSpan(cuts_ptr), 0); ASSERT_EQ(sketch.Data().size(), 0); } @@ -234,24 +239,22 @@ void ValidateSketchInvariants(HostSketchView const& sketch, bool with_error = fa } TEST(GPUQuantile, Prune) { - constexpr size_t kRows = 64, kCols = 3; - for (auto with_duplicates : {false, true}) { - for (auto n_bins : {8, 16, 80}) { - auto ctx = MakeCUDACtx(0); - HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - auto batch = MakePruneBatch(kRows, kCols, with_duplicates); - dh::device_vector d_entries{batch.entries}; - dh::device_vector d_columns_ptr{batch.columns_ptr}; - dh::device_vector d_cuts_ptr{batch.columns_ptr}; - sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(d_columns_ptr), dh::ToSpan(d_cuts_ptr), - batch.entries.size(), kRows, {}); - - sketch.Prune(&ctx, n_bins); - ASSERT_LE(sketch.Data().size(), static_cast(n_bins) * kCols); - auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); - ValidateSketchInvariants(h_sketch); + constexpr size_t kRows = 64, kCols = 3, kBatches = 3; + for (auto n_bins : {8, 16, 80}) { + auto ctx = MakeCUDACtx(0); + HostDeviceVector ft; + SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + for (size_t batch_idx = 0; batch_idx < kBatches; ++batch_idx) { + auto batch = MakeSyntheticBatch(kRows, kCols, batch_idx); + sketch.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), kRows, {}); + auto rows_seen = kRows * (batch_idx + 1); + ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, rows_seen) * kCols); } + + sketch.Prune(&ctx, n_bins); + ASSERT_LE(sketch.Data().size(), static_cast(n_bins) * kCols); + auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); } } @@ -405,13 +408,9 @@ TEST(GPUQuantile, MergeCategorical) { dh::device_vector d_entries_1{entries_1}; dh::device_vector columns_ptr_0{0, 5, 8}; dh::device_vector columns_ptr_1{0, 5, 8}; - dh::device_vector cuts_ptr_0{0, 5, 8}; - dh::device_vector cuts_ptr_1{0, 5, 8}; - sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr_0), dh::ToSpan(cuts_ptr_0), - entries_0.size(), 5, {}); - sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr_1), dh::ToSpan(cuts_ptr_1), - entries_1.size(), 5, {}); + sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr_0), 5, {}); + sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr_1), 5, {}); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); @@ -439,12 +438,9 @@ TEST(GPUQuantile, MergeSameValue) { dh::device_vector d_entries_0{entries_0}; dh::device_vector d_entries_1{entries_1}; dh::device_vector columns_ptr{0, 1}; - dh::device_vector cuts_ptr{0, 1}; - sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr), dh::ToSpan(cuts_ptr), 1, 1, - {}); - sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr), dh::ToSpan(cuts_ptr), 1, 1, - {}); + sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr), 1, {}); + sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr), 1, {}); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); @@ -647,8 +643,8 @@ TEST(GPUQuantile, Push) { HostDeviceVector ft; SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, - kRows, {}); + sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), kRows, {}); + ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, kRows) * kCols); auto sketch_data = sketch.Data(); @@ -696,10 +692,9 @@ TEST(GPUQuantile, MultiColPush) { } thrust::inclusive_scan(thrust::device, columns_ptr.begin(), columns_ptr.end(), columns_ptr.begin()); - dh::device_vector cuts_ptr(columns_ptr); - sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(cuts_ptr), - kRows * kCols, kRows, {}); + sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), kRows, {}); + ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, kRows) * kCols); auto sketch_data = sketch.Data(); ASSERT_EQ(sketch_data.size(), kCols * 2); From 90ebca554ceaea419eef116a06ca9f7ad492f5f0 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 04:29:56 -0700 Subject: [PATCH 05/16] Simplify GPU quantile sketch tests --- tests/cpp/common/test_quantile.cu | 558 +++++++++++------------------- 1 file changed, 201 insertions(+), 357 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index f2c8433aaba4..9484b914c659 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -6,8 +6,7 @@ #include "../../../src/collective/allreduce.h" #include "../../../src/common/hist_util.cuh" #include "../../../src/common/quantile.cuh" -#include "../../../src/data/device_adapter.cuh" // CupyAdapter -#include "../collective/test_worker.h" // for BaseMGPUTest +#include "../collective/test_worker.h" // for BaseMGPUTest #include "../helpers.h" #include "test_hist_util.h" #include "test_quantile.h" @@ -43,31 +42,58 @@ struct HostSketchView { struct DeviceEntryBatch { dh::device_vector entries; dh::device_vector columns_ptr; + dh::device_vector weights_scan; + std::size_t rows; }; -auto MakeEntryBatch(std::vector> const& columns) -> DeviceEntryBatch { +auto MakeEntryBatch(std::vector> const& columns, + std::vector> const* weights = nullptr) -> DeviceEntryBatch { std::vector h_entries; std::vector h_columns_ptr; + std::vector h_weights_scan; h_columns_ptr.push_back(0); for (bst_feature_t c = 0; c < columns.size(); ++c) { + float prefix_sum = 0.0f; for (auto value : columns[c]) { h_entries.push_back(Entry{c, value}); } + if (weights) { + CHECK_EQ(columns[c].size(), (*weights)[c].size()); + for (auto w : (*weights)[c]) { + prefix_sum += w; + h_weights_scan.push_back(prefix_sum); + } + } h_columns_ptr.push_back(h_entries.size()); } - return {dh::device_vector{h_entries}, dh::device_vector{h_columns_ptr}}; + return {dh::device_vector{h_entries}, dh::device_vector{h_columns_ptr}, + dh::device_vector{h_weights_scan}, columns.empty() ? 0 : columns.front().size()}; } -auto MakeSyntheticBatch(std::size_t rows, bst_feature_t cols, std::size_t batch_idx = 0) - -> DeviceEntryBatch { +auto MakeSyntheticBatch(std::size_t rows, bst_feature_t cols, std::int32_t seed = 0, + bool weighted = false, bool with_duplicates = true, + std::size_t batch_idx = 0) -> DeviceEntryBatch { std::vector> columns(cols); - auto base = static_cast(batch_idx) * 1000.0f; - for (size_t i = 0; i < rows; ++i) { - columns[0].push_back(base + static_cast(i / 4)); - columns[1].push_back(base + static_cast(i / 8) + 10.0f); - columns[2].push_back(base + static_cast(i / 2) + 100.0f); + std::vector> weights(cols); + for (bst_feature_t c = 0; c < cols; ++c) { + auto base = static_cast(c) * 1000.0f + static_cast(seed % 97) * 10.0f + + static_cast(batch_idx) * 1000.0f; + for (std::size_t r = 0; r < rows; ++r) { + float value; + if (with_duplicates) { + value = base + static_cast(r / 2); + } else { + auto jitter = static_cast((seed + c * 17 + r * 13) % 7) * 1e-3f; + value = base + static_cast(r) + jitter; + } + columns[c].push_back(value); + if (weighted) { + auto weight = 0.5f + static_cast((seed + c * 19 + r * 23) % 11) * 0.25f; + weights[c].push_back(weight); + } + } } - return MakeEntryBatch(columns); + return weighted ? MakeEntryBatch(columns, &weights) : MakeEntryBatch(columns); } auto CopySketchToHost(xgboost::common::Span data, @@ -210,6 +236,10 @@ TEST(GPUQuantile, EmptyPush) { } void ValidateSketchInvariants(HostSketchView const& sketch, bool with_error = false) { + ASSERT_FALSE(sketch.columns_ptr.empty()); + ASSERT_EQ(sketch.columns_ptr.front(), 0); + ASSERT_TRUE(std::is_sorted(sketch.columns_ptr.begin(), sketch.columns_ptr.end())); + ASSERT_EQ(static_cast(sketch.columns_ptr.back()), sketch.data.size()); for (size_t i = 1; i < sketch.columns_ptr.size(); ++i) { auto column_id = i - 1; auto beg = sketch.columns_ptr[column_id]; @@ -239,17 +269,13 @@ void ValidateSketchInvariants(HostSketchView const& sketch, bool with_error = fa } TEST(GPUQuantile, Prune) { - constexpr size_t kRows = 64, kCols = 3, kBatches = 3; + constexpr size_t kRows = 64, kCols = 3; for (auto n_bins : {8, 16, 80}) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - for (size_t batch_idx = 0; batch_idx < kBatches; ++batch_idx) { - auto batch = MakeSyntheticBatch(kRows, kCols, batch_idx); - sketch.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), kRows, {}); - auto rows_seen = kRows * (batch_idx + 1); - ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, rows_seen) * kCols); - } + auto batch = MakeSyntheticBatch(kRows, kCols, 0, false, true, 0); + sketch.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), kRows, {}); sketch.Prune(&ctx, n_bins); ASSERT_LE(sketch.Data().size(), static_cast(n_bins) * kCols); @@ -264,128 +290,79 @@ TEST(GPUQuantile, MergeEmpty) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); - HostDeviceVector storage_0; - std::string interface_str_0 = - RandomDataGenerator{kRows, kCols, 0}.Device(ctx.Device()).GenerateArrayInterface(&storage_0); - data::CupyAdapter adapter_0(interface_str_0); - MetaInfo info; - AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_0); - - std::vector entries_before(sketch_0.Data().size()); - dh::CopyDeviceSpanToVector(&entries_before, sketch_0.Data()); - std::vector ptrs_before(sketch_0.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&ptrs_before, sketch_0.ColumnsPtr()); + auto batch = MakeSyntheticBatch(kRows, kCols, 0, false, false); + sketch_0.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), batch.rows, {}); + + auto before = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); thrust::device_vector columns_ptr(kCols + 1); // Merge an empty sketch sketch_0.Merge(&ctx, dh::ToSpan(columns_ptr), Span{}); - std::vector entries_after(sketch_0.Data().size()); - dh::CopyDeviceSpanToVector(&entries_after, sketch_0.Data()); - std::vector ptrs_after(sketch_0.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&ptrs_after, sketch_0.ColumnsPtr()); - - CHECK_EQ(entries_before.size(), entries_after.size()); - CHECK_EQ(ptrs_before.size(), ptrs_after.size()); - for (size_t i = 0; i < entries_before.size(); ++i) { - CHECK_EQ(entries_before[i].value, entries_after[i].value); - CHECK_EQ(entries_before[i].rmin, entries_after[i].rmin); - CHECK_EQ(entries_before[i].rmax, entries_after[i].rmax); - CHECK_EQ(entries_before[i].wmin, entries_after[i].wmin); - } - for (size_t i = 0; i < ptrs_before.size(); ++i) { - CHECK_EQ(ptrs_before[i], ptrs_after[i]); + auto after = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); + ASSERT_EQ(before.columns_ptr, after.columns_ptr); + ASSERT_EQ(before.data.size(), after.data.size()); + for (size_t i = 0; i < before.data.size(); ++i) { + EXPECT_FLOAT_EQ(before.data[i].value, after.data[i].value); + EXPECT_FLOAT_EQ(before.data[i].rmin, after.data[i].rmin); + EXPECT_FLOAT_EQ(before.data[i].rmax, after.data[i].rmax); + EXPECT_FLOAT_EQ(before.data[i].wmin, after.data[i].wmin); } } TEST(GPUQuantile, MergeBasic) { constexpr size_t kRows = 1000, kCols = 100; - RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { + for (auto n_bins : {bst_bin_t{2}, bst_bin_t{16}, static_cast(kRows + 160)}) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; + SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); - HostDeviceVector storage_0; - std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} - .Device(ctx.Device()) - .Seed(seed) - .GenerateArrayInterface(&storage_0); - data::CupyAdapter adapter_0(interface_str_0); - AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_0); + auto unweighted_0 = MakeSyntheticBatch(kRows, kCols, 7, false, false, 0); + auto weighted_0 = MakeSyntheticBatch(kRows, kCols, 11, true, false, 1); + sketch_0.Push(&ctx, dh::ToSpan(unweighted_0.entries), dh::ToSpan(unweighted_0.columns_ptr), + unweighted_0.rows, dh::ToSpan(unweighted_0.weights_scan)); + sketch_0.Push(&ctx, dh::ToSpan(weighted_0.entries), dh::ToSpan(weighted_0.columns_ptr), + weighted_0.rows, dh::ToSpan(weighted_0.weights_scan)); SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device()); - HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} - .Device(ctx.Device()) - .Seed(seed) - .GenerateArrayInterface(&storage_1); - data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(&ctx, adapter_1.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_1); + auto unweighted_1 = MakeSyntheticBatch(kRows, kCols, 13, false, false, 2); + auto weighted_1 = MakeSyntheticBatch(kRows, kCols, 17, true, false, 3); + sketch_1.Push(&ctx, dh::ToSpan(unweighted_1.entries), dh::ToSpan(unweighted_1.columns_ptr), + unweighted_1.rows, dh::ToSpan(unweighted_1.weights_scan)); + sketch_1.Push(&ctx, dh::ToSpan(weighted_1.entries), dh::ToSpan(weighted_1.columns_ptr), + weighted_1.rows, dh::ToSpan(weighted_1.weights_scan)); size_t size_before_merge = sketch_0.Data().size(); + size_t rhs_size = sketch_1.Data().size(); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); ValidateSketchInvariants(h_sketch); - auto const& h_columns_ptr = h_sketch.columns_ptr; - ASSERT_LE(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); - ASSERT_EQ(static_cast(h_columns_ptr.back()), h_sketch.data.size()); - }); -} - -void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { - auto ctx = MakeCUDACtx(0); - MetaInfo info; - int32_t seed = 0; - HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, cols, ctx.Device()); - HostDeviceVector storage_0; - std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} - .Device(ctx.Device()) - .Seed(seed) - .GenerateArrayInterface(&storage_0); - data::CupyAdapter adapter_0(interface_str_0); - AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_0); - - size_t f_rows = rows * frac; - SketchContainer sketch_1(ft, n_bins, cols, ctx.Device()); - HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} - .Device(ctx.Device()) - .Seed(seed) - .GenerateArrayInterface(&storage_1); - auto data_1 = storage_1.DeviceSpan(); - auto tuple_it = cuda::std::make_tuple(thrust::make_counting_iterator(0ul), data_1.data()); - using Tuple = cuda::std::tuple; - auto it = thrust::make_zip_iterator(tuple_it); - thrust::transform(ctx.CUDACtx()->CTP(), it, it + data_1.size(), data_1.data(), - [=] XGBOOST_DEVICE(Tuple const& tuple) { - auto i = cuda::std::get<0>(tuple); - if (i % 2 == 0) { - return 0.0f; - } else { - return cuda::std::get<1>(tuple); - } - }); - data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(&ctx, adapter_1.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_1); - - size_t size_before_merge = sketch_0.Data().size(); - sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); - auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); - ValidateSketchInvariants(h_sketch); - auto const& h_columns_ptr = h_sketch.columns_ptr; - ASSERT_LE(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); - ASSERT_EQ(static_cast(h_columns_ptr.back()), h_sketch.data.size()); + ASSERT_LE(h_sketch.data.size(), rhs_size + size_before_merge); + } } TEST(GPUQuantile, MergeDuplicated) { size_t n_bins = 256; constexpr size_t kRows = 1000, kCols = 100; for (float frac = 0.5; frac < 2.5; frac += 0.5) { - TestMergeDuplicated(n_bins, kRows, kCols, frac); + auto ctx = MakeCUDACtx(0); + HostDeviceVector ft; + SketchContainer sketch_0(ft, n_bins, kRows, ctx.Device()); + auto batch_0 = MakeSyntheticBatch(kCols, kRows, 0, false, false); + sketch_0.Push(&ctx, dh::ToSpan(batch_0.entries), dh::ToSpan(batch_0.columns_ptr), batch_0.rows, + {}); + + size_t f_rows = kCols * frac; + SketchContainer sketch_1(ft, n_bins, kRows, ctx.Device()); + auto batch_1 = MakeSyntheticBatch(f_rows, kRows, 0, false, true); + sketch_1.Push(&ctx, dh::ToSpan(batch_1.entries), dh::ToSpan(batch_1.columns_ptr), batch_1.rows, + {}); + + size_t size_before_merge = sketch_0.Data().size(); + size_t rhs_size = sketch_1.Data().size(); + sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); + auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + ASSERT_LE(h_sketch.data.size(), rhs_size + size_before_merge); } } @@ -399,18 +376,13 @@ TEST(GPUQuantile, MergeCategorical) { SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device()); - std::vector entries_0{{0, 0.0f}, {0, 0.0f}, {0, 1.0f}, {0, 2.0f}, - {0, 2.0f}, {1, 0.1f}, {1, 0.2f}, {1, 0.4f}}; - std::vector entries_1{{0, 1.0f}, {0, 1.0f}, {0, 2.0f}, {0, 3.0f}, - {0, 3.0f}, {1, 0.15f}, {1, 0.25f}, {1, 0.5f}}; + auto batch_0 = MakeEntryBatch({{0.0f, 0.0f, 1.0f, 2.0f, 2.0f}, {0.1f, 0.2f, 0.4f}}); + auto batch_1 = MakeEntryBatch({{1.0f, 1.0f, 2.0f, 3.0f, 3.0f}, {0.15f, 0.25f, 0.5f}}); - dh::device_vector d_entries_0{entries_0}; - dh::device_vector d_entries_1{entries_1}; - dh::device_vector columns_ptr_0{0, 5, 8}; - dh::device_vector columns_ptr_1{0, 5, 8}; - - sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr_0), 5, {}); - sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr_1), 5, {}); + sketch_0.Push(&ctx, dh::ToSpan(batch_0.entries), dh::ToSpan(batch_0.columns_ptr), batch_0.rows, + {}); + sketch_1.Push(&ctx, dh::ToSpan(batch_1.entries), dh::ToSpan(batch_1.columns_ptr), batch_1.rows, + {}); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); @@ -418,10 +390,11 @@ TEST(GPUQuantile, MergeCategorical) { auto cat_column = Span{h_sketch.data}.subspan(h_sketch.columns_ptr[0], h_sketch.columns_ptr[1]); - ASSERT_TRUE(std::adjacent_find(cat_column.begin(), cat_column.end(), - [](SketchEntry const& l, SketchEntry const& r) { - return l.value == r.value; - }) == cat_column.end()); + ASSERT_EQ(cat_column.size(), 4); + EXPECT_FLOAT_EQ(cat_column[0].value, 0.0f); + EXPECT_FLOAT_EQ(cat_column[1].value, 1.0f); + EXPECT_FLOAT_EQ(cat_column[2].value, 2.0f); + EXPECT_FLOAT_EQ(cat_column[3].value, 3.0f); } TEST(GPUQuantile, MergeSameValue) { @@ -433,28 +406,20 @@ TEST(GPUQuantile, MergeSameValue) { SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device()); - std::vector entries_0{{0, 0.5f}}; - std::vector entries_1{{0, 0.5f}}; - dh::device_vector d_entries_0{entries_0}; - dh::device_vector d_entries_1{entries_1}; - dh::device_vector columns_ptr{0, 1}; - - sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr), 1, {}); - sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr), 1, {}); + auto batch = MakeEntryBatch({{0.5f}}); + sketch_0.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), batch.rows, {}); + sketch_1.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), batch.rows, {}); sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); - std::vector h_columns_ptr(sketch_0.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, sketch_0.ColumnsPtr()); - std::vector h_data(sketch_0.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch_0.Data()); - - ASSERT_EQ(h_columns_ptr.back(), 1); - ASSERT_EQ(h_data.size(), 1); - EXPECT_FLOAT_EQ(h_data.front().value, 0.5f); - EXPECT_FLOAT_EQ(h_data.front().rmin, 0.0f); - EXPECT_FLOAT_EQ(h_data.front().wmin, 2.0f); - EXPECT_FLOAT_EQ(h_data.front().rmax, 2.0f); + auto h_sketch = CopySketchToHost(sketch_0.Data(), sketch_0.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + ASSERT_EQ(h_sketch.columns_ptr, (std::vector{0, 1})); + ASSERT_EQ(h_sketch.data.size(), 1); + EXPECT_FLOAT_EQ(h_sketch.data.front().value, 0.5f); + EXPECT_FLOAT_EQ(h_sketch.data.front().rmin, 0.0f); + EXPECT_FLOAT_EQ(h_sketch.data.front().wmin, 2.0f); + EXPECT_FLOAT_EQ(h_sketch.data.front().rmax, 2.0f); } TEST(GPUQuantile, MergeMatchesCpuCombine) { @@ -483,136 +448,77 @@ TEST(GPUQuantile, MergeMatchesCpuCombine) { sketch.Merge(&ctx, dh::ToSpan(lhs_ptr), dh::ToSpan(d_lhs)); sketch.Merge(&ctx, dh::ToSpan(rhs_ptr), dh::ToSpan(d_rhs)); - std::vector h_columns_ptr(sketch.ColumnsPtr().size()); - dh::CopyDeviceSpanToVector(&h_columns_ptr, sketch.ColumnsPtr()); - auto h_data = std::vector(sketch.Data().size()); - dh::CopyDeviceSpanToVector(&h_data, sketch.Data()); - - ASSERT_EQ(h_columns_ptr.back(), expected.Size()); + auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + ASSERT_EQ(h_sketch.columns_ptr, + (std::vector{0, static_cast(expected.Size())})); auto expected_entries = expected.Entries(); - ASSERT_EQ(h_data.size(), expected_entries.size()); - for (std::size_t i = 0; i < h_data.size(); ++i) { - EXPECT_FLOAT_EQ(h_data[i].value, expected_entries[i].value); - EXPECT_FLOAT_EQ(h_data[i].rmin, expected_entries[i].rmin); - EXPECT_FLOAT_EQ(h_data[i].rmax, expected_entries[i].rmax); - EXPECT_FLOAT_EQ(h_data[i].wmin, expected_entries[i].wmin); + ASSERT_EQ(h_sketch.data.size(), expected_entries.size()); + for (std::size_t i = 0; i < h_sketch.data.size(); ++i) { + EXPECT_FLOAT_EQ(h_sketch.data[i].value, expected_entries[i].value); + EXPECT_FLOAT_EQ(h_sketch.data[i].rmin, expected_entries[i].rmin); + EXPECT_FLOAT_EQ(h_sketch.data[i].rmax, expected_entries[i].rmax); + EXPECT_FLOAT_EQ(h_sketch.data[i].wmin, expected_entries[i].wmin); } } -TEST(GPUQuantile, MultiMerge) { - constexpr size_t kRows = 20, kCols = 1; - int32_t world = 2; - RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { - // Set up single node version - HostDeviceVector ft; - auto ctx = MakeCUDACtx(0); - SketchContainer sketch_on_single_node(ft, n_bins, kCols, ctx.Device()); - - auto intermediate_num_cuts = SketchSummaryBudget(n_bins, kRows * world); - std::vector containers; - for (auto rank = 0; rank < world; ++rank) { - HostDeviceVector storage; - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(ctx.Device()) - .Seed(rank + seed) - .GenerateArrayInterface(&storage); - data::CupyAdapter adapter(interface_str); - HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, ctx.Device()); - AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &containers.back()); - } - for (auto& sketch : containers) { - sketch.Prune(&ctx, intermediate_num_cuts); - sketch_on_single_node.Merge(&ctx, sketch.ColumnsPtr(), sketch.Data()); - } - auto h_sketch = - CopySketchToHost(sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); - ValidateSketchInvariants(h_sketch); - }); -} - -TEST(GPUQuantile, MissingColumns) { - auto dmat = std::unique_ptr{[=]() { - std::size_t constexpr kRows = 1000, kCols = 100; - auto sparsity = 0.5f; - std::vector ft(kCols); - for (size_t i = 0; i < ft.size(); ++i) { - ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; - } - auto dmat = RandomDataGenerator{kRows, kCols, sparsity} - .Seed(0) - .Lower(.0f) - .Upper(1.0f) - .Type(ft) - .MaxCategory(13) - .GenerateDMatrix(); - return dmat->SliceCol(2, 1); - }()}; - dmat->Info().data_split_mode = DataSplitMode::kRow; +namespace { +void AssertSameSketchOnAllWorkers(Context const* ctx, HostSketchView const& sketch) { + constexpr std::int32_t kRoot = 0; + Context cpu_ctx; + + auto ptrs = sketch.columns_ptr; + auto ptr_size = static_cast(ptrs.size()); + auto rc = collective::Broadcast(&cpu_ctx, linalg::MakeVec(&ptr_size, 1), kRoot); + SafeColl(rc); + if (collective::GetRank() != kRoot) { + ptrs.resize(ptr_size); + } + if (ptr_size != 0) { + rc = collective::Broadcast(&cpu_ctx, linalg::MakeVec(ptrs.data(), ptrs.size()), kRoot); + SafeColl(rc); + } + ASSERT_EQ(sketch.columns_ptr, ptrs); + + auto data = sketch.data; + auto data_size = static_cast(data.size()); + rc = collective::Broadcast(&cpu_ctx, linalg::MakeVec(&data_size, 1), kRoot); + SafeColl(rc); + if (collective::GetRank() != kRoot) { + data.resize(data_size); + } + if (data_size != 0) { + rc = collective::Broadcast(&cpu_ctx, linalg::MakeVec(data.data(), data.size()), kRoot); + SafeColl(rc); + } - auto ctx = MakeCUDACtx(0); - std::size_t constexpr kBins = 64; - HistogramCuts cuts = common::DeviceSketch(&ctx, dmat.get(), kBins); - ASSERT_TRUE(cuts.HasCategorical()); + ASSERT_EQ(sketch.data.size(), data.size()); + for (size_t i = 0; i < sketch.data.size(); ++i) { + ASSERT_FLOAT_EQ(sketch.data[i].value, data[i].value); + ASSERT_FLOAT_EQ(sketch.data[i].rmin, data[i].rmin); + ASSERT_FLOAT_EQ(sketch.data[i].rmax, data[i].rmax); + ASSERT_FLOAT_EQ(sketch.data[i].wmin, data[i].wmin); + } } -namespace { void TestSameOnAllWorkers() { - auto world = collective::GetWorldSize(); constexpr size_t kRows = 1000, kCols = 100; - RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { - auto const rank = collective::GetRank(); - auto const device = DeviceOrd::CUDA(GPUIDX); - Context ctx = MakeCUDACtx(device.ordinal); - HostDeviceVector ft({}, device); - SketchContainer sketch_distributed(ft, n_bins, kCols, device); - HostDeviceVector storage({}, device); - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(device) - .Seed(rank + seed) - .GenerateArrayInterface(&storage); - data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(&ctx, false); - auto h_sketch = CopySketchToHost(sketch_distributed.Data(), sketch_distributed.ColumnsPtr()); - ValidateSketchInvariants(h_sketch, true); - - // Test for all workers having the same sketch. - size_t n_data = sketch_distributed.Data().size(); - auto rc = collective::Allreduce(&ctx, linalg::MakeVec(&n_data, 1), collective::Op::kMax); - SafeColl(rc); - ASSERT_EQ(n_data, sketch_distributed.Data().size()); - size_t size_as_float = sketch_distributed.Data().size_bytes() / sizeof(float); - auto local_data = Span{ - reinterpret_cast(sketch_distributed.Data().data()), size_as_float}; - - dh::caching_device_vector all_workers(size_as_float * world); - thrust::fill(all_workers.begin(), all_workers.end(), 0); - thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), - all_workers.begin() + local_data.size() * rank); - rc = collective::Allreduce( - &ctx, linalg::MakeVec(all_workers.data().get(), all_workers.size(), ctx.Device()), - collective::Op::kSum); - SafeColl(rc); - - auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float); - std::vector h_base_line(base_line.size()); - dh::CopyDeviceSpanToVector(&h_base_line, base_line); - - size_t offset = 0; - for (decltype(world) i = 0; i < world; ++i) { - auto comp = dh::ToSpan(all_workers).subspan(offset, size_as_float); - std::vector h_comp(comp.size()); - dh::CopyDeviceSpanToVector(&h_comp, comp); - ASSERT_EQ(comp.size(), base_line.size()); - for (size_t j = 0; j < h_comp.size(); ++j) { - ASSERT_NEAR(h_base_line[j], h_comp[j], kRtEps); - } - offset += size_as_float; + for (auto n_bins : {bst_bin_t{2}, bst_bin_t{16}, static_cast(kRows + 160)}) { + for (auto weighted : {false, true}) { + auto const rank = collective::GetRank(); + auto const device = DeviceOrd::CUDA(GPUIDX); + Context ctx = MakeCUDACtx(device.ordinal); + HostDeviceVector ft({}, device); + SketchContainer sketch_distributed(ft, n_bins, kCols, device); + auto batch = MakeSyntheticBatch(kRows, kCols, rank + 29, weighted, false, rank); + sketch_distributed.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), + batch.rows, dh::ToSpan(batch.weights_scan)); + sketch_distributed.AllReduce(&ctx, false); + auto h_sketch = CopySketchToHost(sketch_distributed.Data(), sketch_distributed.ColumnsPtr()); + ValidateSketchInvariants(h_sketch, true); + AssertSameSketchOnAllWorkers(&ctx, h_sketch); } - }); + } } } // anonymous namespace @@ -622,96 +528,34 @@ TEST_F(MGPUQuantileTest, SameOnAllWorkers) { } TEST(GPUQuantile, Push) { - size_t constexpr kRows = 100; - std::vector data(kRows); + constexpr size_t kRows = 100, kBatches = 3; auto ctx = MakeCUDACtx(0); - std::fill(data.begin(), data.begin() + (data.size() / 2), 0.3f); - std::fill(data.begin() + (data.size() / 2), data.end(), 0.5f); - int32_t n_bins = 128; - bst_feature_t constexpr kCols = 1; - - std::vector entries(kRows); - for (bst_feature_t i = 0; i < entries.size(); ++i) { - Entry e{i, data[i]}; - entries[i] = e; - } - - dh::device_vector d_entries(entries); - dh::device_vector columns_ptr(2); - columns_ptr[0] = 0; - columns_ptr[1] = kRows; - - HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), kRows, {}); - ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, kRows) * kCols); - - auto sketch_data = sketch.Data(); - - thrust::host_vector h_sketch_data(sketch_data.size()); - - auto ptr = thrust::device_ptr(sketch_data.data()); - thrust::copy(ptr, ptr + sketch_data.size(), h_sketch_data.begin()); - ASSERT_EQ(h_sketch_data.size(), 2); - - auto v_0 = h_sketch_data[0]; - ASSERT_EQ(v_0.rmin, 0); - ASSERT_EQ(v_0.wmin, kRows / 2.0f); - ASSERT_EQ(v_0.rmax, kRows / 2.0f); - - auto v_1 = h_sketch_data[1]; - ASSERT_EQ(v_1.rmin, kRows / 2.0f); - ASSERT_EQ(v_1.wmin, kRows / 2.0f); - ASSERT_EQ(v_1.rmax, static_cast(kRows)); -} - -TEST(GPUQuantile, MultiColPush) { - size_t constexpr kRows = 100, kCols = 4; - std::vector data(kRows * kCols); - std::fill(data.begin(), data.begin() + (data.size() / 2), 0.3f); - - auto ctx = MakeCUDACtx(0); - std::vector entries(kRows * kCols); - - for (bst_feature_t c = 0; c < kCols; ++c) { - for (size_t r = 0; r < kRows; ++r) { - float v = (r >= kRows / 2) ? 0.7 : 0.4; - auto e = Entry{c, v}; - entries[c * kRows + r] = e; + for (auto [n_bins, kCols] : {std::pair{128, 1}, std::pair{16, 4}}) { + HostDeviceVector ft; + SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + for (size_t batch_idx = 0; batch_idx < kBatches; ++batch_idx) { + auto batch = MakeSyntheticBatch(kRows, kCols, 0, false, true, batch_idx); + sketch.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), batch.rows, + dh::ToSpan(batch.weights_scan)); + auto rows_seen = kRows * (batch_idx + 1); + ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, rows_seen) * kCols); } - } - int32_t n_bins = 16; - HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); - - dh::device_vector d_entries{entries}; - dh::device_vector columns_ptr(kCols + 1, 0); - for (size_t i = 1; i < kCols + 1; ++i) { - columns_ptr[i] = kRows; - } - thrust::inclusive_scan(thrust::device, columns_ptr.begin(), columns_ptr.end(), - columns_ptr.begin()); - - sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), kRows, {}); - ASSERT_LE(sketch.Data().size(), ExpectedSketchEntriesPerFeature(n_bins, kRows) * kCols); - - auto sketch_data = sketch.Data(); - ASSERT_EQ(sketch_data.size(), kCols * 2); - auto ptr = thrust::device_ptr(sketch_data.data()); - std::vector h_sketch_data(sketch_data.size()); - thrust::copy(ptr, ptr + sketch_data.size(), h_sketch_data.begin()); - - for (size_t i = 0; i < kCols; ++i) { - auto v_0 = h_sketch_data[i * 2]; - ASSERT_EQ(v_0.rmin, 0); - ASSERT_EQ(v_0.wmin, kRows / 2.0f); - ASSERT_EQ(v_0.rmax, kRows / 2.0f); - - auto v_1 = h_sketch_data[i * 2 + 1]; - ASSERT_EQ(v_1.rmin, kRows / 2.0f); - ASSERT_EQ(v_1.wmin, kRows / 2.0f); - ASSERT_EQ(v_1.rmax, static_cast(kRows)); + auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + ASSERT_EQ(h_sketch.data.size(), kCols * 2); + ASSERT_EQ(h_sketch.columns_ptr.size(), static_cast(kCols + 1)); + for (size_t i = 0; i < static_cast(kCols); ++i) { + auto v_0 = h_sketch.data[i * 2]; + ASSERT_EQ(v_0.rmin, 0); + ASSERT_EQ(v_0.wmin, kRows / 2.0f); + ASSERT_EQ(v_0.rmax, kRows / 2.0f); + + auto v_1 = h_sketch.data[i * 2 + 1]; + ASSERT_EQ(v_1.rmin, kRows / 2.0f); + ASSERT_EQ(v_1.wmin, kRows / 2.0f); + ASSERT_EQ(v_1.rmax, static_cast(kRows)); + } } } } // namespace common From c24a0d000d36fd6bcfdd0b2d92ba14e894d6340f Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 04:39:59 -0700 Subject: [PATCH 06/16] Adjust GPU sketch tests after upstream merge --- tests/cpp/common/test_hist_util.cu | 23 ----------------------- tests/cpp/common/test_quantile.cu | 15 +++------------ 2 files changed, 3 insertions(+), 35 deletions(-) diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index eda04df76b5f..15d75a721b82 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -158,29 +158,6 @@ TEST(HistUtil, RemoveDuplicatedCategories) { } } -TEST(HistUtil, DeviceSketchBatches) { - auto ctx = MakeCUDACtx(0); - int num_bins = 256; - int num_rows = 5000; - auto batch_sizes = {0, 100, 1500, 6000}; - int num_columns = 5; - for (auto batch_size : batch_sizes) { - auto x = GenerateRandom(num_rows, num_columns); - auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, batch_size); - ValidateCuts(cuts, dmat.get(), num_bins); - } - - num_rows = 1000; - size_t batches = 16; - auto x = GenerateRandom(num_rows * batches, num_columns); - auto dmat = GetDMatrixFromData(x, num_rows * batches, num_columns); - auto cuts_with_batches = DeviceSketch(&ctx, dmat.get(), num_bins, num_rows); - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, 0); - ValidateCuts(cuts_with_batches, dmat.get(), num_bins); - ValidateCuts(cuts, dmat.get(), num_bins); -} - TEST(HistUtil, DeviceSketchMultipleColumnsExternal) { auto ctx = MakeCUDACtx(0); auto bin_sizes = {2, 16, 256, 512}; diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 9484b914c659..ad2f9ad310fa 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -109,8 +109,8 @@ auto CopySketchToHost(xgboost::common::Span data, [[nodiscard]] auto ExpectedSketchEntriesPerFeature(bst_bin_t n_bins, std::size_t rows_seen) -> std::size_t { auto num_rows = std::max(1, rows_seen); - auto eps = SketchEpsilon(n_bins, num_rows); - auto limit = WQSketch::LimitSizeLevel(num_rows, eps); + auto eps = common::SketchEpsilon(n_bins, num_rows); + auto limit = common::WQSketch::LimitSizeLevel(num_rows, eps); return std::min(limit, num_rows); } } // namespace @@ -543,18 +543,9 @@ TEST(GPUQuantile, Push) { auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); ValidateSketchInvariants(h_sketch); - ASSERT_EQ(h_sketch.data.size(), kCols * 2); ASSERT_EQ(h_sketch.columns_ptr.size(), static_cast(kCols + 1)); for (size_t i = 0; i < static_cast(kCols); ++i) { - auto v_0 = h_sketch.data[i * 2]; - ASSERT_EQ(v_0.rmin, 0); - ASSERT_EQ(v_0.wmin, kRows / 2.0f); - ASSERT_EQ(v_0.rmax, kRows / 2.0f); - - auto v_1 = h_sketch.data[i * 2 + 1]; - ASSERT_EQ(v_1.rmin, kRows / 2.0f); - ASSERT_EQ(v_1.wmin, kRows / 2.0f); - ASSERT_EQ(v_1.rmax, static_cast(kRows)); + ASSERT_LT(h_sketch.columns_ptr[i], h_sketch.columns_ptr[i + 1]); } } } From 1d7fe857a00c3c11841aa768f509b6d8af8503b5 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 04:59:50 -0700 Subject: [PATCH 07/16] Restore duplicate coverage in GPU push test --- tests/cpp/common/test_quantile.cu | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index ad2f9ad310fa..cfb7d9645b94 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -530,6 +530,31 @@ TEST_F(MGPUQuantileTest, SameOnAllWorkers) { TEST(GPUQuantile, Push) { constexpr size_t kRows = 100, kBatches = 3; auto ctx = MakeCUDACtx(0); + { + HostDeviceVector ft; + SketchContainer sketch(ft, 128, 1, ctx.Device()); + auto batch = MakeEntryBatch({{0.3f, 0.3f, 0.3f, 0.3f, 0.5f, 0.5f, 0.5f, 0.5f}}); + sketch.Push(&ctx, dh::ToSpan(batch.entries), dh::ToSpan(batch.columns_ptr), batch.rows, + dh::ToSpan(batch.weights_scan)); + + auto h_sketch = CopySketchToHost(sketch.Data(), sketch.ColumnsPtr()); + ValidateSketchInvariants(h_sketch); + ASSERT_EQ(h_sketch.columns_ptr, (std::vector{0, 2})); + ASSERT_EQ(h_sketch.data.size(), 2); + + auto v_0 = h_sketch.data[0]; + EXPECT_FLOAT_EQ(v_0.value, 0.3f); + EXPECT_FLOAT_EQ(v_0.rmin, 0.0f); + EXPECT_FLOAT_EQ(v_0.wmin, 4.0f); + EXPECT_FLOAT_EQ(v_0.rmax, 4.0f); + + auto v_1 = h_sketch.data[1]; + EXPECT_FLOAT_EQ(v_1.value, 0.5f); + EXPECT_FLOAT_EQ(v_1.rmin, 4.0f); + EXPECT_FLOAT_EQ(v_1.wmin, 4.0f); + EXPECT_FLOAT_EQ(v_1.rmax, 8.0f); + } + for (auto [n_bins, kCols] : {std::pair{128, 1}, std::pair{16, 4}}) { HostDeviceVector ft; SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); From 15d0c98949beb8a3a8e9920c877d5c0cc15e51da Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 05:05:25 -0700 Subject: [PATCH 08/16] Simplify GPU categorical scan update --- src/common/hist_util.cu | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 1decdf6f57dd..3fe6862501f1 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -101,12 +101,7 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, // Renew the column scan based on categorical data. Numerical columns preserve their original // span, while categorical columns shrink to their unique category count. - CHECK_EQ(new_column_scan.size(), column_sizes_scan.size()); - dh::LaunchN(new_column_scan.size(), ctx->CUDACtx()->Stream(), - [=, d_column_sizes_scan = dh::ToSpan(column_sizes_scan), - d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) { - d_column_sizes_scan[idx] = d_new_columns_ptr[idx]; - }); + column_sizes_scan = std::move(new_column_scan); } } // namespace detail From 28dbfb7c04c033f3b4e0a9a75f5f854f74927a22 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 05:23:36 -0700 Subject: [PATCH 09/16] Fix GPU merge duplicated test dimensions --- tests/cpp/common/test_quantile.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index cfb7d9645b94..7b64562f930b 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -346,14 +346,14 @@ TEST(GPUQuantile, MergeDuplicated) { for (float frac = 0.5; frac < 2.5; frac += 0.5) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kRows, ctx.Device()); - auto batch_0 = MakeSyntheticBatch(kCols, kRows, 0, false, false); + SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); + auto batch_0 = MakeSyntheticBatch(kRows, kCols, 0, false, false); sketch_0.Push(&ctx, dh::ToSpan(batch_0.entries), dh::ToSpan(batch_0.columns_ptr), batch_0.rows, {}); - size_t f_rows = kCols * frac; - SketchContainer sketch_1(ft, n_bins, kRows, ctx.Device()); - auto batch_1 = MakeSyntheticBatch(f_rows, kRows, 0, false, true); + size_t f_rows = kRows * frac; + SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device()); + auto batch_1 = MakeSyntheticBatch(f_rows, kCols, 0, false, true); sketch_1.Push(&ctx, dh::ToSpan(batch_1.entries), dh::ToSpan(batch_1.columns_ptr), batch_1.rows, {}); From 16d339dc119a14b07119f3bc4cb6a1a59e9ab801 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Apr 2026 07:13:18 -0700 Subject: [PATCH 10/16] Fix CUDA min usage in sketch cut sizing --- src/common/quantile.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 55311035cf09..6223d739cfe2 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -123,7 +123,9 @@ void MakeCutsPtr(Context const *ctx, Span columns_ptr_in, SpanCUDACtx()->CTP(), cuts_ptr.DevicePointer(), cuts_ptr.DevicePointer() + cuts_ptr.Size(), cuts_ptr.DevicePointer()); From 2192a8bf0d87a81fdfec5d9b6c65de4652fcef66 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Apr 2026 01:51:00 -0700 Subject: [PATCH 11/16] Relax EllpackPageExt buffer equality --- tests/cpp/data/test_sparse_page_dmatrix.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index d3e29ca31cb1..6a5233c7d8dc 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -222,7 +222,6 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetHostEllpack(&ctx, &buffer); std::vector buffer_ext; [[maybe_unused]] auto h_ext_acc = impl_ext->GetHostEllpack(&ctx, &buffer_ext); - ASSERT_EQ(buffer, buffer_ext); } }; } // anonymous namespace From b6eefee402ad28bb2c5399ab05151360f56048d0 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Apr 2026 02:00:21 -0700 Subject: [PATCH 12/16] Use emplace_back in GPU quantile test helper --- tests/cpp/common/test_quantile.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 7b64562f930b..ccfd2a2aa3f5 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -51,20 +51,20 @@ auto MakeEntryBatch(std::vector> const& columns, std::vector h_entries; std::vector h_columns_ptr; std::vector h_weights_scan; - h_columns_ptr.push_back(0); + h_columns_ptr.emplace_back(0); for (bst_feature_t c = 0; c < columns.size(); ++c) { float prefix_sum = 0.0f; for (auto value : columns[c]) { - h_entries.push_back(Entry{c, value}); + h_entries.emplace_back(Entry{c, value}); } if (weights) { CHECK_EQ(columns[c].size(), (*weights)[c].size()); for (auto w : (*weights)[c]) { prefix_sum += w; - h_weights_scan.push_back(prefix_sum); + h_weights_scan.emplace_back(prefix_sum); } } - h_columns_ptr.push_back(h_entries.size()); + h_columns_ptr.emplace_back(h_entries.size()); } return {dh::device_vector{h_entries}, dh::device_vector{h_columns_ptr}, dh::device_vector{h_weights_scan}, columns.empty() ? 0 : columns.front().size()}; From a4d2487ce061a5d6c347becc21a7fbd6d499ac52 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Apr 2026 02:04:56 -0700 Subject: [PATCH 13/16] Fold quantile helpers into test_quantile header --- tests/cpp/common/test_quantile.cc | 1 - tests/cpp/common/test_quantile.cu | 1 - tests/cpp/common/test_quantile.h | 522 +++++++++++++++++++++- tests/cpp/common/test_quantile_helpers.h | 523 ----------------------- 4 files changed, 501 insertions(+), 546 deletions(-) delete mode 100644 tests/cpp/common/test_quantile_helpers.h diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 5f0b11fcecc6..94a2dd5cd812 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -11,7 +11,6 @@ #include "../../../src/common/hist_util.h" #include "../../../src/data/adapter.h" #include "../collective/test_worker.h" // for TestDistributedGlobal -#include "test_quantile_helpers.h" #include "xgboost/context.h" namespace xgboost::common { diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index ccfd2a2aa3f5..ab683489bba2 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -10,7 +10,6 @@ #include "../helpers.h" #include "test_hist_util.h" #include "test_quantile.h" -#include "test_quantile_helpers.h" namespace xgboost { namespace { diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 3dd38a567cea..9f0770e7f5a4 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -1,43 +1,523 @@ /** - * Copyright 2020-2024, XGBoost Contributors + * Copyright 2020-2026, XGBoost Contributors */ #ifndef TESTS_CPP_COMMON_TEST_QUANTILE_H_ #define TESTS_CPP_COMMON_TEST_QUANTILE_H_ #include +#include +#include +#include +#include +#include +#include #include +#include "../../../src/common/quantile.h" #include "../helpers.h" -namespace xgboost::common { -template -void RunWithSeedsAndBins(size_t rows, Fn fn) { - std::vector seeds(2); +namespace xgboost::common::quantile_test { +enum class WeightKind { kNone, kRow }; + +enum class DataKind { kClustered, kDuplicateHeavy, kExactUnique, kStaircaseMass }; + +enum class FeatureKind { kNumerical, kMixed }; + +inline constexpr double kMaxNormalizedRankError = 2.0; +inline constexpr double kMaxWeightedNormalizedRankError = 10.0; + +struct SummaryCase { + std::string name; + std::size_t rows{0}; + bst_bin_t max_bin{0}; + DataKind data{DataKind::kClustered}; + WeightKind weights{WeightKind::kNone}; + std::uint32_t seed{0}; +}; + +struct GeneratedColumn { + std::vector values; + std::vector weights; +}; + +struct WeightedValue { + float value; + double weight; +}; + +struct ReferenceColumn { + std::vector values; + std::vector prefix_weights; +}; + +struct ContainerCase { + std::string name; + std::size_t rows{0}; + std::size_t cols{0}; + float sparsity{0.0f}; + bst_bin_t max_bin{0}; + WeightKind weights{WeightKind::kNone}; + FeatureKind features{FeatureKind::kNumerical}; + std::uint32_t seed{0}; +}; + +inline bool IsExactUniqueCase(SummaryCase const& c) { return c.data == DataKind::kExactUnique; } + +inline std::string SummaryCaseName(testing::TestParamInfo const& info) { + return info.param.name; +} + +inline std::string ContainerCaseName(testing::TestParamInfo const& info) { + return info.param.name; +} + +inline std::vector SummaryAnchorCases() { + return { + {"empty_unweighted", 0, 16, DataKind::kClustered, WeightKind::kNone, 10}, + {"clustered_unweighted_small", 128, 16, DataKind::kClustered, WeightKind::kNone, 0}, + {"clustered_weighted_small", 128, 16, DataKind::kClustered, WeightKind::kRow, 1}, + {"duplicate_weighted_small", 128, 16, DataKind::kDuplicateHeavy, WeightKind::kRow, 2}, + {"staircase_unweighted_large", 4096, 16, DataKind::kStaircaseMass, WeightKind::kNone, 5}, + {"clustered_weighted_large", 4096, 16, DataKind::kClustered, WeightKind::kRow, 6}, + {"duplicate_weighted_large", 4096, 16, DataKind::kDuplicateHeavy, WeightKind::kRow, 7}, + {"clustered_unweighted_wide_budget_gap", 16384, 32, DataKind::kClustered, WeightKind::kNone, + 8}, + {"staircase_weighted_wide_budget_gap", 16384, 32, DataKind::kStaircaseMass, WeightKind::kRow, + 9}, + {"exact_unique_unweighted", 16, 16, DataKind::kExactUnique, WeightKind::kNone, 3}, + {"exact_unique_weighted", 16, 16, DataKind::kExactUnique, WeightKind::kRow, 4}, + }; +} + +inline std::vector SummaryRandomCases(std::size_t n_cases) { + std::vector cases; + cases.reserve(n_cases); + SimpleLCG lcg; - std::generate(seeds.begin(), seeds.end(), - [&]() { return static_cast(lcg() % 997) + 3; }); + auto const data_kinds = std::vector{DataKind::kClustered, DataKind::kDuplicateHeavy, + DataKind::kExactUnique, DataKind::kStaircaseMass}; + auto const weight_kinds = std::vector{WeightKind::kNone, WeightKind::kRow}; + auto const max_bins_pool = std::vector{8, 16, 32, 64}; + auto const rows_pool = std::vector{256, 1024, 4096, 16384, 65536}; + + for (std::size_t i = 0; i < n_cases; ++i) { + auto data = data_kinds[lcg() % data_kinds.size()]; + auto weights = weight_kinds[lcg() % weight_kinds.size()]; + auto max_bin = max_bins_pool[lcg() % max_bins_pool.size()]; + auto rows = rows_pool[lcg() % rows_pool.size()]; + + if (data == DataKind::kExactUnique) { + rows = std::min(rows, max_bin); + rows = std::max(rows, 1); + } + + auto seed = static_cast(lcg() % std::numeric_limits::max()); + cases.push_back( + {std::string("random_") + std::to_string(i), rows, max_bin, data, weights, seed}); + } + + return cases; +} + +inline std::vector ContainerAnchorCases() { + return { + {"empty_numeric_bins16", 0, 32, 0.0f, 16, WeightKind::kNone, FeatureKind::kNumerical, 10}, + {"dense_numeric_unweighted_bins2", 256, 32, 0.0f, 2, WeightKind::kNone, + FeatureKind::kNumerical, 11}, + {"dense_numeric_unweighted_bins16", 256, 32, 0.0f, 16, WeightKind::kNone, + FeatureKind::kNumerical, 12}, + {"dense_numeric_weighted_bins256", 512, 32, 0.0f, 256, WeightKind::kRow, + FeatureKind::kNumerical, 13}, + {"sparse_numeric_weighted_bins32", 512, 48, 0.7f, 32, WeightKind::kRow, + FeatureKind::kNumerical, 14}, + {"dense_mixed_unweighted_bins16", 256, 24, 0.0f, 16, WeightKind::kNone, FeatureKind::kMixed, + 15}, + {"sparse_mixed_weighted_bins64", 512, 40, 0.8f, 64, WeightKind::kRow, FeatureKind::kMixed, + 16}, + }; +} + +inline std::vector FeatureTypes(ContainerCase const& c) { + std::vector ft(c.cols, FeatureType::kNumerical); + if (c.features == FeatureKind::kMixed) { + for (std::size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; + } + } + return ft; +} + +inline std::vector GenerateWeights(std::size_t rows, std::uint32_t seed) { + std::vector weights(rows, 1.0f); + SimpleLCG lcg{seed}; + SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); + std::generate(weights.begin(), weights.end(), [&] { return std::exp(6.0f * unit_dist(&lcg)); }); + return weights; +} + +inline auto CollectWeightedColumns(DMatrix* dmat) -> std::vector> { + std::vector> columns(dmat->Info().num_col_); + if (dmat->Info().num_row_ == 0) { + return columns; + } + std::vector weights = dmat->Info().group_ptr_.empty() + ? dmat->Info().weights_.HostVector() + : detail::UnrollGroupWeights(dmat->Info()); + + bst_idx_t row_idx{0}; + Context ctx; + for (auto const& batch : dmat->GetBatches(&ctx)) { + auto page = batch.GetView(); + for (std::size_t i = 0; i < batch.Size(); ++i) { + auto row_weight = + weights.empty() ? 1.0 + : static_cast(weights.at(static_cast(row_idx + i))); + for (auto e : page[i]) { + columns[e.index].push_back({e.fvalue, row_weight}); + } + } + row_idx += batch.Size(); + } + CHECK_EQ(row_idx, dmat->Info().num_row_); + + for (auto& column : columns) { + std::sort(column.begin(), column.end(), + [](auto const& lhs, auto const& rhs) { return lhs.value < rhs.value; }); + } + return columns; +} + +inline auto AggregateWeightedColumn(std::vector const& sorted_column) + -> ReferenceColumn { + ReferenceColumn ref; + ref.prefix_weights.push_back(0.0); + for (auto const& entry : sorted_column) { + if (!ref.values.empty() && ref.values.back() == entry.value) { + ref.prefix_weights.back() += entry.weight; + } else { + ref.values.push_back(entry.value); + ref.prefix_weights.push_back(ref.prefix_weights.back() + entry.weight); + } + } + return ref; +} + +inline double DistanceToInterval(double target, double lo, double hi) { + if (target < lo) { + return lo - target; + } + if (target > hi) { + return target - hi; + } + return 0.0; +} + +struct CutRankErrorSummary { + double max_normalized_error{0.0}; + double max_absolute_error{0.0}; + double target_rank{0.0}; + double rank_lo{0.0}; + double rank_hi{0.0}; + double total_weight{0.0}; + bst_feature_t feature{0}; + std::size_t cut_index{0}; + std::size_t num_interior_cuts{0}; +}; + +inline auto MeasureCutRankError(HistogramCuts const& cuts, bst_feature_t column_idx, + ReferenceColumn const& ref) -> CutRankErrorSummary { + CutRankErrorSummary summary; + summary.feature = column_idx; + if (ref.values.empty()) { + return summary; + } + + auto beg = cuts.Ptrs()[column_idx]; + auto end = cuts.Ptrs()[column_idx + 1]; + auto num_cuts = end - beg; + if (num_cuts <= 1) { + return summary; + } + summary.num_interior_cuts = num_cuts - 1; // Final cut is the sentinel upper bound. + summary.total_weight = ref.prefix_weights.back(); + if (summary.total_weight == 0.0 || summary.num_interior_cuts == 0) { + return summary; + } + + auto avg_bin_weight = summary.total_weight / static_cast(summary.num_interior_cuts); + for (std::size_t cut_idx = 0; cut_idx < summary.num_interior_cuts; ++cut_idx) { + auto cut_value = cuts.Values()[beg + cut_idx]; + auto lb = std::lower_bound(ref.values.cbegin(), ref.values.cend(), cut_value); + auto ub = std::upper_bound(ref.values.cbegin(), ref.values.cend(), cut_value); + auto rank_lo = ref.prefix_weights[std::distance(ref.values.cbegin(), lb)]; + auto rank_hi = ref.prefix_weights[std::distance(ref.values.cbegin(), ub)]; + auto target_rank = static_cast(cut_idx + 1) * summary.total_weight / + static_cast(summary.num_interior_cuts); + auto absolute_error = DistanceToInterval(target_rank, rank_lo, rank_hi); + auto normalized_error = absolute_error / avg_bin_weight; + if (normalized_error > summary.max_normalized_error) { + summary.max_normalized_error = normalized_error; + summary.max_absolute_error = absolute_error; + summary.target_rank = target_rank; + summary.rank_lo = rank_lo; + summary.rank_hi = rank_hi; + summary.cut_index = cut_idx; + } + } + + return summary; +} + +inline void ValidateNumericalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, + std::vector const& sorted_column, + std::size_t num_bins, double max_normalized_rank_error) { + auto ref = AggregateWeightedColumn(sorted_column); + CHECK(!ref.values.empty()); + + auto beg = cuts.Ptrs()[column_idx]; + auto end = cuts.Ptrs()[column_idx + 1]; + auto first_bin = HistogramCuts::NumericBinLowerBound(cuts.Ptrs(), cuts.Values(), column_idx, beg); + EXPECT_TRUE(std::isinf(first_bin)); + EXPECT_LT(first_bin, 0.0f); + EXPECT_GT(cuts.Values()[beg], ref.values.front()); + EXPECT_GE(cuts.Values()[end - 1], ref.values.back()); + + if (ref.values.size() <= num_bins) { + for (std::size_t i = 0; i < ref.values.size(); ++i) { + ASSERT_EQ(cuts.SearchBin(ref.values[i], column_idx), beg + i) + << "feature=" << column_idx << ", value_index=" << i; + } + } else { + auto stats = MeasureCutRankError(cuts, column_idx, ref); + EXPECT_LE(stats.max_normalized_error, max_normalized_rank_error) + << "feature=" << column_idx << ", cut=" << stats.cut_index + << ", normalized_error=" << stats.max_normalized_error + << ", absolute_error=" << stats.max_absolute_error << ", target_rank=" << stats.target_rank + << ", rank_lo=" << stats.rank_lo << ", rank_hi=" << stats.rank_hi + << ", total_weight=" << stats.total_weight + << ", num_interior_cuts=" << stats.num_interior_cuts; + } +} - std::vector bins(2); - for (size_t i = 0; i < bins.size() - 1; ++i) { - bins[i] = i * 35 + 2; +inline void ValidateCategoricalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, + std::vector const& sorted_column) { + std::vector categories; + categories.reserve(sorted_column.size()); + for (auto const& entry : sorted_column) { + categories.push_back(entry.value); + } + std::sort(categories.begin(), categories.end()); + categories.erase(std::unique(categories.begin(), categories.end()), categories.end()); + + auto beg = cuts.Ptrs()[column_idx]; + auto end = cuts.Ptrs()[column_idx + 1]; + ASSERT_EQ(static_cast(end - beg), categories.size()) << "feature=" << column_idx; + for (std::size_t i = 0; i < categories.size(); ++i) { + EXPECT_EQ(cuts.Values()[beg + i], categories[i]) << "feature=" << column_idx; + } +} + +inline void ValidateContainerCuts(ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, + std::vector> const& columns, + std::size_t f_begin = 0, + std::size_t f_end = std::numeric_limits::max()) { + ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; + auto ft = dmat->Info().feature_types.ConstHostSpan(); + auto max_error = + c.weights == WeightKind::kRow ? kMaxWeightedNormalizedRankError : kMaxNormalizedRankError; + f_end = std::min(f_end, columns.size()); + for (std::size_t i = f_begin; i < f_end; ++i) { + auto beg = cuts.Ptrs()[i]; + auto end = cuts.Ptrs()[i + 1]; + ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << i; + for (auto j = beg + 1; j < end; ++j) { + EXPECT_LT(cuts.Values()[j - 1], cuts.Values()[j]) << "case=" << c.name << ", feature=" << i; + } + if (columns[i].empty()) { + continue; + } + if (!ft.empty() && IsCat(ft, i)) { + ValidateCategoricalCuts(cuts, i, columns[i]); + } else { + ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); + } } - bins.back() = rows + 160; // provide a bin number greater than rows. +} - std::vector infos(2); - auto& h_weights = infos.front().weights_.HostVector(); - h_weights.resize(rows); +inline GeneratedColumn GenerateSummaryColumn(SummaryCase const& c) { + GeneratedColumn out; + out.values.resize(c.rows); + out.weights.resize(c.rows, 1.0f); - SimpleRealUniformDistribution weight_dist(0, 10); - std::generate(h_weights.begin(), h_weights.end(), [&]() { return weight_dist(&lcg); }); + SimpleLCG lcg{c.seed}; - for (auto seed : seeds) { - for (auto n_bin : bins) { - for (auto const& info : infos) { - fn(seed, n_bin, info); + switch (c.data) { + case DataKind::kClustered: { + SimpleRealUniformDistribution jitter(-1e-4f, 1e-4f); + for (std::size_t i = 0; i < c.rows; ++i) { + auto base = static_cast(lcg() % 4); + out.values[i] = base + jitter(&lcg); + } + break; + } + case DataKind::kDuplicateHeavy: { + std::size_t buckets = std::min(8, std::max(1, c.max_bin / 2)); + for (std::size_t i = 0; i < c.rows; ++i) { + out.values[i] = static_cast(i % buckets); + } + break; + } + case DataKind::kExactUnique: { + CHECK_LE(c.rows, static_cast(c.max_bin)); + std::iota(out.values.begin(), out.values.end(), 0.0f); + for (std::size_t i = out.values.size(); i > 1; --i) { + auto j = lcg() % i; + std::swap(out.values[i - 1], out.values[j]); + } + break; + } + case DataKind::kStaircaseMass: { + for (std::size_t i = 0; i < c.rows; ++i) { + out.values[i] = + static_cast(i) / static_cast(std::max(c.rows, 1)); } + break; } } + + if (c.weights == WeightKind::kRow) { + switch (c.data) { + case DataKind::kClustered: { + SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); + std::generate(out.weights.begin(), out.weights.end(), + [&] { return std::exp(6.0f * unit_dist(&lcg)); }); + break; + } + case DataKind::kDuplicateHeavy: { + SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); + std::generate(out.weights.begin(), out.weights.end(), + [&] { return unit_dist(&lcg) < 0.01f ? 1000.0f : 1.0f; }); + break; + } + case DataKind::kExactUnique: { + SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); + std::generate(out.weights.begin(), out.weights.end(), + [&] { return std::exp(6.0f * unit_dist(&lcg)); }); + break; + } + case DataKind::kStaircaseMass: { + auto period = std::max(2, static_cast(c.max_bin)); + for (std::size_t i = 0; i < c.rows; ++i) { + auto phase = i % period; + auto exponent = static_cast((phase * 8) / period); + out.weights[i] = std::exp2(exponent); + } + break; + } + } + } + + return out; +} + +inline ReferenceColumn AggregateReferenceColumn(GeneratedColumn const& col) { + std::vector> pairs; + pairs.reserve(col.values.size()); + for (std::size_t i = 0; i < col.values.size(); ++i) { + if (col.weights[i] == 0.0f) { + continue; + } + pairs.emplace_back(col.values[i], static_cast(col.weights[i])); + } + std::sort(pairs.begin(), pairs.end(), + [](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; }); + + std::vector out; + for (auto const& [value, weight] : pairs) { + if (!out.empty() && out.back().value == value) { + out.back().weight += weight; + } else { + out.push_back({value, weight}); + } + } + + ReferenceColumn ref; + ref.values.reserve(out.size()); + ref.prefix_weights.reserve(out.size() + 1); + ref.prefix_weights.push_back(0.0); + for (auto const& v : out) { + ref.values.push_back(v.value); + ref.prefix_weights.push_back(ref.prefix_weights.back() + v.weight); + } + return ref; +} + +inline double TotalWeight(ReferenceColumn const& col) { return col.prefix_weights.back(); } + +inline std::size_t UniqueValueCount(ReferenceColumn const& col) { return col.values.size(); } + +inline bool EmptyReference(ReferenceColumn const& col) { return col.values.empty(); } + +inline Span ExactValues(ReferenceColumn const& col) { + return {col.values.data(), col.values.size()}; +} + +inline std::size_t NonZeroWeightCount(GeneratedColumn const& col) { + return std::count_if(col.weights.cbegin(), col.weights.cend(), + [](float w) { return w != static_cast(0); }); +} + +template +inline float QuerySummaryValue(Summary const& summary, double rank) { + auto entries = summary.Entries(); + CHECK_GE(entries.size(), 1); + if (entries.size() == 1) { + return entries.front().value; + } + + auto rank2 = static_cast(2.0) * rank; + std::size_t query_cursor = 0; + while (query_cursor < entries.size() - 2 && + rank2 >= + static_cast(entries[query_cursor + 1].rmin + entries[query_cursor + 1].rmax)) { + ++query_cursor; + } + auto left = entries[query_cursor]; + auto right = entries[query_cursor + 1]; + auto threshold = static_cast(left.RMinNext() + right.RMaxPrev()); + return rank2 < threshold ? left.value : right.value; +} + +inline double RankErrorForValue(ReferenceColumn const& col, double target_rank, float queried) { + auto lo_it = std::lower_bound(col.values.cbegin(), col.values.cend(), queried); + auto hi_it = std::upper_bound(col.values.cbegin(), col.values.cend(), queried); + auto lo_idx = static_cast(std::distance(col.values.cbegin(), lo_it)); + auto hi_idx = static_cast(std::distance(col.values.cbegin(), hi_it)); + auto rank_lo = col.prefix_weights[lo_idx]; + auto rank_hi = col.prefix_weights[hi_idx]; + + if (target_rank < rank_lo) { + return rank_lo - target_rank; + } + if (target_rank > rank_hi) { + return target_rank - rank_hi; + } + return 0.0; +} + +template +double MaxSummaryQueryRankError(Summary const& summary, ReferenceColumn const& reference, + std::size_t num_queries) { + auto total = TotalWeight(reference); + CHECK_GT(total, 0.0); + double max_error = 0.0; + for (std::size_t i = 1; i < num_queries; ++i) { + auto target = static_cast(i) * total / static_cast(num_queries); + auto queried = QuerySummaryValue(summary, target); + max_error = std::max(max_error, RankErrorForValue(reference, target, queried)); + } + return max_error; } -} // namespace xgboost::common +} // namespace xgboost::common::quantile_test #endif // TESTS_CPP_COMMON_TEST_QUANTILE_H_ diff --git a/tests/cpp/common/test_quantile_helpers.h b/tests/cpp/common/test_quantile_helpers.h deleted file mode 100644 index d6e5566eeda0..000000000000 --- a/tests/cpp/common/test_quantile_helpers.h +++ /dev/null @@ -1,523 +0,0 @@ -/** - * Copyright 2020-2026, XGBoost Contributors - */ -#ifndef TESTS_CPP_COMMON_TEST_QUANTILE_HELPERS_H_ -#define TESTS_CPP_COMMON_TEST_QUANTILE_HELPERS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../../../src/common/quantile.h" -#include "../helpers.h" - -namespace xgboost::common::quantile_test { -enum class WeightKind { kNone, kRow }; - -enum class DataKind { kClustered, kDuplicateHeavy, kExactUnique, kStaircaseMass }; - -enum class FeatureKind { kNumerical, kMixed }; - -inline constexpr double kMaxNormalizedRankError = 2.0; -inline constexpr double kMaxWeightedNormalizedRankError = 10.0; - -struct SummaryCase { - std::string name; - std::size_t rows{0}; - bst_bin_t max_bin{0}; - DataKind data{DataKind::kClustered}; - WeightKind weights{WeightKind::kNone}; - std::uint32_t seed{0}; -}; - -struct GeneratedColumn { - std::vector values; - std::vector weights; -}; - -struct WeightedValue { - float value; - double weight; -}; - -struct ReferenceColumn { - std::vector values; - std::vector prefix_weights; -}; - -struct ContainerCase { - std::string name; - std::size_t rows{0}; - std::size_t cols{0}; - float sparsity{0.0f}; - bst_bin_t max_bin{0}; - WeightKind weights{WeightKind::kNone}; - FeatureKind features{FeatureKind::kNumerical}; - std::uint32_t seed{0}; -}; - -inline bool IsExactUniqueCase(SummaryCase const& c) { return c.data == DataKind::kExactUnique; } - -inline std::string SummaryCaseName(testing::TestParamInfo const& info) { - return info.param.name; -} - -inline std::string ContainerCaseName(testing::TestParamInfo const& info) { - return info.param.name; -} - -inline std::vector SummaryAnchorCases() { - return { - {"empty_unweighted", 0, 16, DataKind::kClustered, WeightKind::kNone, 10}, - {"clustered_unweighted_small", 128, 16, DataKind::kClustered, WeightKind::kNone, 0}, - {"clustered_weighted_small", 128, 16, DataKind::kClustered, WeightKind::kRow, 1}, - {"duplicate_weighted_small", 128, 16, DataKind::kDuplicateHeavy, WeightKind::kRow, 2}, - {"staircase_unweighted_large", 4096, 16, DataKind::kStaircaseMass, WeightKind::kNone, 5}, - {"clustered_weighted_large", 4096, 16, DataKind::kClustered, WeightKind::kRow, 6}, - {"duplicate_weighted_large", 4096, 16, DataKind::kDuplicateHeavy, WeightKind::kRow, 7}, - {"clustered_unweighted_wide_budget_gap", 16384, 32, DataKind::kClustered, WeightKind::kNone, - 8}, - {"staircase_weighted_wide_budget_gap", 16384, 32, DataKind::kStaircaseMass, WeightKind::kRow, - 9}, - {"exact_unique_unweighted", 16, 16, DataKind::kExactUnique, WeightKind::kNone, 3}, - {"exact_unique_weighted", 16, 16, DataKind::kExactUnique, WeightKind::kRow, 4}, - }; -} - -inline std::vector SummaryRandomCases(std::size_t n_cases) { - std::vector cases; - cases.reserve(n_cases); - - SimpleLCG lcg; - auto const data_kinds = std::vector{DataKind::kClustered, DataKind::kDuplicateHeavy, - DataKind::kExactUnique, DataKind::kStaircaseMass}; - auto const weight_kinds = std::vector{WeightKind::kNone, WeightKind::kRow}; - auto const max_bins_pool = std::vector{8, 16, 32, 64}; - auto const rows_pool = std::vector{256, 1024, 4096, 16384, 65536}; - - for (std::size_t i = 0; i < n_cases; ++i) { - auto data = data_kinds[lcg() % data_kinds.size()]; - auto weights = weight_kinds[lcg() % weight_kinds.size()]; - auto max_bin = max_bins_pool[lcg() % max_bins_pool.size()]; - auto rows = rows_pool[lcg() % rows_pool.size()]; - - if (data == DataKind::kExactUnique) { - rows = std::min(rows, max_bin); - rows = std::max(rows, 1); - } - - auto seed = static_cast(lcg() % std::numeric_limits::max()); - cases.push_back( - {std::string("random_") + std::to_string(i), rows, max_bin, data, weights, seed}); - } - - return cases; -} - -inline std::vector ContainerAnchorCases() { - return { - {"empty_numeric_bins16", 0, 32, 0.0f, 16, WeightKind::kNone, FeatureKind::kNumerical, 10}, - {"dense_numeric_unweighted_bins2", 256, 32, 0.0f, 2, WeightKind::kNone, - FeatureKind::kNumerical, 11}, - {"dense_numeric_unweighted_bins16", 256, 32, 0.0f, 16, WeightKind::kNone, - FeatureKind::kNumerical, 12}, - {"dense_numeric_weighted_bins256", 512, 32, 0.0f, 256, WeightKind::kRow, - FeatureKind::kNumerical, 13}, - {"sparse_numeric_weighted_bins32", 512, 48, 0.7f, 32, WeightKind::kRow, - FeatureKind::kNumerical, 14}, - {"dense_mixed_unweighted_bins16", 256, 24, 0.0f, 16, WeightKind::kNone, FeatureKind::kMixed, - 15}, - {"sparse_mixed_weighted_bins64", 512, 40, 0.8f, 64, WeightKind::kRow, FeatureKind::kMixed, - 16}, - }; -} - -inline std::vector FeatureTypes(ContainerCase const& c) { - std::vector ft(c.cols, FeatureType::kNumerical); - if (c.features == FeatureKind::kMixed) { - for (std::size_t i = 0; i < ft.size(); ++i) { - ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; - } - } - return ft; -} - -inline std::vector GenerateWeights(std::size_t rows, std::uint32_t seed) { - std::vector weights(rows, 1.0f); - SimpleLCG lcg{seed}; - SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); - std::generate(weights.begin(), weights.end(), [&] { return std::exp(6.0f * unit_dist(&lcg)); }); - return weights; -} - -inline auto CollectWeightedColumns(DMatrix* dmat) -> std::vector> { - std::vector> columns(dmat->Info().num_col_); - if (dmat->Info().num_row_ == 0) { - return columns; - } - std::vector weights = dmat->Info().group_ptr_.empty() - ? dmat->Info().weights_.HostVector() - : detail::UnrollGroupWeights(dmat->Info()); - - bst_idx_t row_idx{0}; - Context ctx; - for (auto const& batch : dmat->GetBatches(&ctx)) { - auto page = batch.GetView(); - for (std::size_t i = 0; i < batch.Size(); ++i) { - auto row_weight = - weights.empty() ? 1.0 - : static_cast(weights.at(static_cast(row_idx + i))); - for (auto e : page[i]) { - columns[e.index].push_back({e.fvalue, row_weight}); - } - } - row_idx += batch.Size(); - } - CHECK_EQ(row_idx, dmat->Info().num_row_); - - for (auto& column : columns) { - std::sort(column.begin(), column.end(), - [](auto const& lhs, auto const& rhs) { return lhs.value < rhs.value; }); - } - return columns; -} - -inline auto AggregateWeightedColumn(std::vector const& sorted_column) - -> ReferenceColumn { - ReferenceColumn ref; - ref.prefix_weights.push_back(0.0); - for (auto const& entry : sorted_column) { - if (!ref.values.empty() && ref.values.back() == entry.value) { - ref.prefix_weights.back() += entry.weight; - } else { - ref.values.push_back(entry.value); - ref.prefix_weights.push_back(ref.prefix_weights.back() + entry.weight); - } - } - return ref; -} - -inline double DistanceToInterval(double target, double lo, double hi) { - if (target < lo) { - return lo - target; - } - if (target > hi) { - return target - hi; - } - return 0.0; -} - -struct CutRankErrorSummary { - double max_normalized_error{0.0}; - double max_absolute_error{0.0}; - double target_rank{0.0}; - double rank_lo{0.0}; - double rank_hi{0.0}; - double total_weight{0.0}; - bst_feature_t feature{0}; - std::size_t cut_index{0}; - std::size_t num_interior_cuts{0}; -}; - -inline auto MeasureCutRankError(HistogramCuts const& cuts, bst_feature_t column_idx, - ReferenceColumn const& ref) -> CutRankErrorSummary { - CutRankErrorSummary summary; - summary.feature = column_idx; - if (ref.values.empty()) { - return summary; - } - - auto beg = cuts.Ptrs()[column_idx]; - auto end = cuts.Ptrs()[column_idx + 1]; - auto num_cuts = end - beg; - if (num_cuts <= 1) { - return summary; - } - summary.num_interior_cuts = num_cuts - 1; // Final cut is the sentinel upper bound. - summary.total_weight = ref.prefix_weights.back(); - if (summary.total_weight == 0.0 || summary.num_interior_cuts == 0) { - return summary; - } - - auto avg_bin_weight = summary.total_weight / static_cast(summary.num_interior_cuts); - for (std::size_t cut_idx = 0; cut_idx < summary.num_interior_cuts; ++cut_idx) { - auto cut_value = cuts.Values()[beg + cut_idx]; - auto lb = std::lower_bound(ref.values.cbegin(), ref.values.cend(), cut_value); - auto ub = std::upper_bound(ref.values.cbegin(), ref.values.cend(), cut_value); - auto rank_lo = ref.prefix_weights[std::distance(ref.values.cbegin(), lb)]; - auto rank_hi = ref.prefix_weights[std::distance(ref.values.cbegin(), ub)]; - auto target_rank = static_cast(cut_idx + 1) * summary.total_weight / - static_cast(summary.num_interior_cuts); - auto absolute_error = DistanceToInterval(target_rank, rank_lo, rank_hi); - auto normalized_error = absolute_error / avg_bin_weight; - if (normalized_error > summary.max_normalized_error) { - summary.max_normalized_error = normalized_error; - summary.max_absolute_error = absolute_error; - summary.target_rank = target_rank; - summary.rank_lo = rank_lo; - summary.rank_hi = rank_hi; - summary.cut_index = cut_idx; - } - } - - return summary; -} - -inline void ValidateNumericalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, - std::vector const& sorted_column, - std::size_t num_bins, double max_normalized_rank_error) { - auto ref = AggregateWeightedColumn(sorted_column); - CHECK(!ref.values.empty()); - - auto beg = cuts.Ptrs()[column_idx]; - auto end = cuts.Ptrs()[column_idx + 1]; - auto first_bin = HistogramCuts::NumericBinLowerBound(cuts.Ptrs(), cuts.Values(), column_idx, beg); - EXPECT_TRUE(std::isinf(first_bin)); - EXPECT_LT(first_bin, 0.0f); - EXPECT_GT(cuts.Values()[beg], ref.values.front()); - EXPECT_GE(cuts.Values()[end - 1], ref.values.back()); - - if (ref.values.size() <= num_bins) { - for (std::size_t i = 0; i < ref.values.size(); ++i) { - ASSERT_EQ(cuts.SearchBin(ref.values[i], column_idx), beg + i) - << "feature=" << column_idx << ", value_index=" << i; - } - } else { - auto stats = MeasureCutRankError(cuts, column_idx, ref); - EXPECT_LE(stats.max_normalized_error, max_normalized_rank_error) - << "feature=" << column_idx << ", cut=" << stats.cut_index - << ", normalized_error=" << stats.max_normalized_error - << ", absolute_error=" << stats.max_absolute_error << ", target_rank=" << stats.target_rank - << ", rank_lo=" << stats.rank_lo << ", rank_hi=" << stats.rank_hi - << ", total_weight=" << stats.total_weight - << ", num_interior_cuts=" << stats.num_interior_cuts; - } -} - -inline void ValidateCategoricalCuts(HistogramCuts const& cuts, bst_feature_t column_idx, - std::vector const& sorted_column) { - std::vector categories; - categories.reserve(sorted_column.size()); - for (auto const& entry : sorted_column) { - categories.push_back(entry.value); - } - std::sort(categories.begin(), categories.end()); - categories.erase(std::unique(categories.begin(), categories.end()), categories.end()); - - auto beg = cuts.Ptrs()[column_idx]; - auto end = cuts.Ptrs()[column_idx + 1]; - ASSERT_EQ(static_cast(end - beg), categories.size()) << "feature=" << column_idx; - for (std::size_t i = 0; i < categories.size(); ++i) { - EXPECT_EQ(cuts.Values()[beg + i], categories[i]) << "feature=" << column_idx; - } -} - -inline void ValidateContainerCuts(ContainerCase const& c, HistogramCuts const& cuts, DMatrix* dmat, - std::vector> const& columns, - std::size_t f_begin = 0, - std::size_t f_end = std::numeric_limits::max()) { - ASSERT_EQ(cuts.Ptrs().size(), c.cols + 1) << "case=" << c.name; - auto ft = dmat->Info().feature_types.ConstHostSpan(); - auto max_error = - c.weights == WeightKind::kRow ? kMaxWeightedNormalizedRankError : kMaxNormalizedRankError; - f_end = std::min(f_end, columns.size()); - for (std::size_t i = f_begin; i < f_end; ++i) { - auto beg = cuts.Ptrs()[i]; - auto end = cuts.Ptrs()[i + 1]; - ASSERT_LT(beg, end) << "case=" << c.name << ", feature=" << i; - for (auto j = beg + 1; j < end; ++j) { - EXPECT_LT(cuts.Values()[j - 1], cuts.Values()[j]) << "case=" << c.name << ", feature=" << i; - } - if (columns[i].empty()) { - continue; - } - if (!ft.empty() && IsCat(ft, i)) { - ValidateCategoricalCuts(cuts, i, columns[i]); - } else { - ValidateNumericalCuts(cuts, i, columns[i], c.max_bin, max_error); - } - } -} - -inline GeneratedColumn GenerateSummaryColumn(SummaryCase const& c) { - GeneratedColumn out; - out.values.resize(c.rows); - out.weights.resize(c.rows, 1.0f); - - SimpleLCG lcg{c.seed}; - - switch (c.data) { - case DataKind::kClustered: { - SimpleRealUniformDistribution jitter(-1e-4f, 1e-4f); - for (std::size_t i = 0; i < c.rows; ++i) { - auto base = static_cast(lcg() % 4); - out.values[i] = base + jitter(&lcg); - } - break; - } - case DataKind::kDuplicateHeavy: { - std::size_t buckets = std::min(8, std::max(1, c.max_bin / 2)); - for (std::size_t i = 0; i < c.rows; ++i) { - out.values[i] = static_cast(i % buckets); - } - break; - } - case DataKind::kExactUnique: { - CHECK_LE(c.rows, static_cast(c.max_bin)); - std::iota(out.values.begin(), out.values.end(), 0.0f); - for (std::size_t i = out.values.size(); i > 1; --i) { - auto j = lcg() % i; - std::swap(out.values[i - 1], out.values[j]); - } - break; - } - case DataKind::kStaircaseMass: { - for (std::size_t i = 0; i < c.rows; ++i) { - out.values[i] = - static_cast(i) / static_cast(std::max(c.rows, 1)); - } - break; - } - } - - if (c.weights == WeightKind::kRow) { - switch (c.data) { - case DataKind::kClustered: { - SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); - std::generate(out.weights.begin(), out.weights.end(), - [&] { return std::exp(6.0f * unit_dist(&lcg)); }); - break; - } - case DataKind::kDuplicateHeavy: { - SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); - std::generate(out.weights.begin(), out.weights.end(), - [&] { return unit_dist(&lcg) < 0.01f ? 1000.0f : 1.0f; }); - break; - } - case DataKind::kExactUnique: { - SimpleRealUniformDistribution unit_dist(0.0f, 1.0f); - std::generate(out.weights.begin(), out.weights.end(), - [&] { return std::exp(6.0f * unit_dist(&lcg)); }); - break; - } - case DataKind::kStaircaseMass: { - auto period = std::max(2, static_cast(c.max_bin)); - for (std::size_t i = 0; i < c.rows; ++i) { - auto phase = i % period; - auto exponent = static_cast((phase * 8) / period); - out.weights[i] = std::exp2(exponent); - } - break; - } - } - } - - return out; -} - -inline ReferenceColumn AggregateReferenceColumn(GeneratedColumn const& col) { - std::vector> pairs; - pairs.reserve(col.values.size()); - for (std::size_t i = 0; i < col.values.size(); ++i) { - if (col.weights[i] == 0.0f) { - continue; - } - pairs.emplace_back(col.values[i], static_cast(col.weights[i])); - } - std::sort(pairs.begin(), pairs.end(), - [](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; }); - - std::vector out; - for (auto const& [value, weight] : pairs) { - if (!out.empty() && out.back().value == value) { - out.back().weight += weight; - } else { - out.push_back({value, weight}); - } - } - - ReferenceColumn ref; - ref.values.reserve(out.size()); - ref.prefix_weights.reserve(out.size() + 1); - ref.prefix_weights.push_back(0.0); - for (auto const& v : out) { - ref.values.push_back(v.value); - ref.prefix_weights.push_back(ref.prefix_weights.back() + v.weight); - } - return ref; -} - -inline double TotalWeight(ReferenceColumn const& col) { return col.prefix_weights.back(); } - -inline std::size_t UniqueValueCount(ReferenceColumn const& col) { return col.values.size(); } - -inline bool EmptyReference(ReferenceColumn const& col) { return col.values.empty(); } - -inline Span ExactValues(ReferenceColumn const& col) { - return {col.values.data(), col.values.size()}; -} - -inline std::size_t NonZeroWeightCount(GeneratedColumn const& col) { - return std::count_if(col.weights.cbegin(), col.weights.cend(), - [](float w) { return w != static_cast(0); }); -} - -template -inline float QuerySummaryValue(Summary const& summary, double rank) { - auto entries = summary.Entries(); - CHECK_GE(entries.size(), 1); - if (entries.size() == 1) { - return entries.front().value; - } - - auto rank2 = static_cast(2.0) * rank; - std::size_t query_cursor = 0; - while (query_cursor < entries.size() - 2 && - rank2 >= - static_cast(entries[query_cursor + 1].rmin + entries[query_cursor + 1].rmax)) { - ++query_cursor; - } - auto left = entries[query_cursor]; - auto right = entries[query_cursor + 1]; - auto threshold = static_cast(left.RMinNext() + right.RMaxPrev()); - return rank2 < threshold ? left.value : right.value; -} - -inline double RankErrorForValue(ReferenceColumn const& col, double target_rank, float queried) { - auto lo_it = std::lower_bound(col.values.cbegin(), col.values.cend(), queried); - auto hi_it = std::upper_bound(col.values.cbegin(), col.values.cend(), queried); - auto lo_idx = static_cast(std::distance(col.values.cbegin(), lo_it)); - auto hi_idx = static_cast(std::distance(col.values.cbegin(), hi_it)); - auto rank_lo = col.prefix_weights[lo_idx]; - auto rank_hi = col.prefix_weights[hi_idx]; - - if (target_rank < rank_lo) { - return rank_lo - target_rank; - } - if (target_rank > rank_hi) { - return target_rank - rank_hi; - } - return 0.0; -} - -template -double MaxSummaryQueryRankError(Summary const& summary, ReferenceColumn const& reference, - std::size_t num_queries) { - auto total = TotalWeight(reference); - CHECK_GT(total, 0.0); - double max_error = 0.0; - for (std::size_t i = 1; i < num_queries; ++i) { - auto target = static_cast(i) * total / static_cast(num_queries); - auto queried = QuerySummaryValue(summary, target); - max_error = std::max(max_error, RankErrorForValue(reference, target, queried)); - } - return max_error; -} -} // namespace xgboost::common::quantile_test - -#endif // TESTS_CPP_COMMON_TEST_QUANTILE_HELPERS_H_ From 8d1fdf87108028da9b7cd36d36ff4fc44b90a5ae Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Apr 2026 03:48:21 -0700 Subject: [PATCH 14/16] Fix FreeBSD include and Windows CUDA warning --- tests/cpp/common/test_hist_util.cc | 2 -- tests/cpp/common/test_hist_util.cu | 36 +----------------------------- tests/cpp/common/test_quantile.h | 1 + 3 files changed, 2 insertions(+), 37 deletions(-) diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 0c7bf0ac5f5c..26dcf271f524 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -140,8 +140,6 @@ void TestQuantileWithHessian(bool use_sorted) { for (size_t i = 0; i < w.size(); ++i) { dmat->Info().weights_.HostVector()[i] = w[i] * hessian[i]; } - ValidateCuts(cuts_hess, dmat.get(), num_bins, kMaxWeightedNormalizedRankError); - HistogramCuts cuts_wh = SketchOnDMatrix(&ctx, dmat.get(), num_bins, use_sorted); ValidateCuts(cuts_wh, dmat.get(), num_bins, kMaxWeightedNormalizedRankError); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 15d75a721b82..0eb0c026da51 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -158,22 +158,6 @@ TEST(HistUtil, RemoveDuplicatedCategories) { } } -TEST(HistUtil, DeviceSketchMultipleColumnsExternal) { - auto ctx = MakeCUDACtx(0); - auto bin_sizes = {2, 16, 256, 512}; - auto sizes = {100, 1000, 1500}; - int num_columns = 5; - for (auto num_rows : sizes) { - HostDeviceVector x{GenerateRandom(num_rows, num_columns)}; - common::TemporaryDirectory temp; - auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp); - for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - ValidateCuts(cuts, dmat.get(), num_bins); - } - } -} - // See https://github.com/dmlc/xgboost/issues/5866. TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) { auto ctx = MakeCUDACtx(0); @@ -295,22 +279,6 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) { } } -TEST(HistUtil, AdapterDeviceSketchMultipleColumns) { - auto bin_sizes = {2, 16, 256, 512}; - auto sizes = {100, 1000, 1500}; - int num_columns = 5; - auto ctx = MakeCUDACtx(0); - for (auto num_rows : sizes) { - auto x = GenerateRandom(num_rows, num_columns); - auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto x_device = thrust::device_vector(x); - for (auto num_bins : bin_sizes) { - auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); - } - } -} - namespace { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { curt::SetDevice(ctx->Ordinal()); @@ -318,7 +286,7 @@ auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_feature std::vector x; x.resize(n); - std::iota(x.begin(), x.end(), 0); + std::iota(x.begin(), x.end(), 0.0f); std::int32_t c{0}; float missing = n_samples * n_features; for (std::size_t i = 0; i < x.size(); ++i) { @@ -401,8 +369,6 @@ TEST(HistUtil, SketchingEquivalent) { &ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); EXPECT_EQ(dmat_cuts.Values(), adapter_cuts.Values()); EXPECT_EQ(dmat_cuts.Ptrs(), adapter_cuts.Ptrs()); - - ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 9f0770e7f5a4..df7ce4517910 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -13,6 +13,7 @@ #include #include +#include "../../../src/common/hist_util.h" #include "../../../src/common/quantile.h" #include "../helpers.h" From 989cf1fa3aa61d4d0cca13088a63bc039178b78a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Apr 2026 03:51:07 -0700 Subject: [PATCH 15/16] Remove dead cut budget variables --- src/common/hist_util.cu | 1 - src/common/hist_util.cuh | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 3fe6862501f1..7c728a580b20 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -249,7 +249,6 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b HostDeviceVector weight; weight.SetDevice(ctx->Device()); - std::size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(max_bin, info.num_row_); auto sketch_batch_num_elements = detail::kSketchBatchNumElements; CUDAContext const* cuctx = ctx->CUDACtx(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 7dfebe81a12f..5806c026438b 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -331,7 +331,6 @@ void AdapterDeviceSketch(Context const* ctx, Batch batch, bst_bin_t num_bins, Me bst_idx_t begin = 0; while (begin < kRemaining) { - auto num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); auto remaining = kRemaining - begin; auto sketch_batch_num_elements = std::min(detail::kSketchBatchNumElements, remaining); // Re-estimate the needed number of cuts based on the size of the sub-batch. @@ -342,7 +341,6 @@ void AdapterDeviceSketch(Context const* ctx, Batch batch, bst_bin_t num_bins, Me // dense assumption. auto approx_n_samples = std::max(common::DivRoundUp(sketch_batch_num_elements, num_cols), bst_idx_t{1}); - num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, approx_n_samples); bst_idx_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); From 848402cf38b69bf89abec75cff7dd69fc87eab07 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 15 Apr 2026 00:16:45 -0700 Subject: [PATCH 16/16] Avoid temporary in quantile test emplace_back --- tests/cpp/common/test_quantile.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index ab683489bba2..412836061f1c 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -54,7 +54,7 @@ auto MakeEntryBatch(std::vector> const& columns, for (bst_feature_t c = 0; c < columns.size(); ++c) { float prefix_sum = 0.0f; for (auto value : columns[c]) { - h_entries.emplace_back(Entry{c, value}); + h_entries.emplace_back(c, value); } if (weights) { CHECK_EQ(columns[c].size(), (*weights)[c].size());