diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc index 5b96d8f5c98e..37567b8e75b3 100644 --- a/plugin/sycl/common/hist_util.cc +++ b/plugin/sycl/common/hist_util.cc @@ -321,7 +321,7 @@ ::sycl::event BuildHistDispatchKernel( GHistRow* hist, bool isDense, GHistRow* hist_buffer, - const tree::DeviceProperties& device_prop, + const DeviceProperties& device_prop, ::sycl::event events_priv, bool force_atomic_use) { const size_t size = row_indices.Size(); @@ -373,7 +373,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu, const GHistIndexMatrix& gmat, const bool isDense, GHistRow* hist, GHistRow* hist_buffer, - const tree::DeviceProperties& device_prop, + const DeviceProperties& device_prop, ::sycl::event event_priv, bool force_atomic_use) { const bool is_dense = isDense; @@ -409,7 +409,7 @@ ::sycl::event GHistBuilder::BuildHist( GHistRowT* hist, bool isDense, GHistRowT* hist_buffer, - const tree::DeviceProperties& device_prop, + const DeviceProperties& device_prop, ::sycl::event event_priv, bool force_atomic_use) { return BuildHistKernel(qu_, gpair, row_indices, gmat, @@ -426,7 +426,7 @@ ::sycl::event GHistBuilder::BuildHist( GHistRow* hist, bool isDense, GHistRow* hist_buffer, - const tree::DeviceProperties& device_prop, + const DeviceProperties& device_prop, ::sycl::event event_priv, bool force_atomic_use); template @@ -437,7 +437,7 @@ ::sycl::event GHistBuilder::BuildHist( GHistRow* hist, bool isDense, GHistRow* hist_buffer, - const tree::DeviceProperties& device_prop, + const DeviceProperties& device_prop, ::sycl::event event_priv, bool force_atomic_use); diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h index c2148c6a612e..3c71a7be20d9 100644 --- a/plugin/sycl/common/hist_util.h +++ b/plugin/sycl/common/hist_util.h @@ -162,7 +162,7 @@ class GHistBuilder { GHistRowT* HistCollection, bool isDense, GHistRowT* hist_buffer, - const tree::DeviceProperties& device_prop, + const DeviceProperties& device_prop, ::sycl::event event, bool force_atomic_use = false); diff --git a/plugin/sycl/device_properties.h b/plugin/sycl/device_properties.h new file mode 100644 index 000000000000..0b0bc90fbff4 --- /dev/null +++ b/plugin/sycl/device_properties.h @@ -0,0 +1,66 @@ +/*! + * Copyright 2017-2025 by Contributors + * \file device_properties.h + */ +#ifndef PLUGIN_SYCL_DEVICE_PROPERTIES_H_ +#define PLUGIN_SYCL_DEVICE_PROPERTIES_H_ + +#include +#include +#include "../../src/common/common.h" // for HumanMemUnit + +namespace xgboost { +namespace sycl { + +class DeviceProperties { + void GetL2Size(const ::sycl::device& device) { + l2_size = device.get_info<::sycl::info::device::global_mem_cache_size>(); + LOG(INFO) << "Detected L2 Size = " << ::xgboost::common::HumanMemUnit(l2_size); + l2_size_per_eu = static_cast(l2_size) / max_compute_units; + } + + void GetSRAMSize(const ::sycl::device& device) { + auto arch = + device.get_info<::sycl::ext::oneapi::experimental::info::device::architecture>(); + size_t eu_per_core = + device.get_info<::sycl::ext::intel::info::device::gpu_eu_count_per_subslice>(); + switch (arch) { + case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc: { + LOG(INFO) << "Xe-HPC (Ponte Vecchio) Architecture. L1 friendly optimization enabled."; + size_t l1_size = 512 * 1024; + size_t registers_size = 64 * 1024; + sram_size_per_eu = l1_size / eu_per_core + registers_size; + break; + } + default: + sram_size_per_eu = 0; + } + } + + public: + bool is_gpu; + bool usm_host_allocations; + size_t max_compute_units; + size_t max_work_group_size; + size_t sub_group_size; + float sram_size_per_eu = 0; + size_t l2_size = 0; + float l2_size_per_eu = 0; + + explicit DeviceProperties(const ::sycl::device& device): + is_gpu(device.is_gpu()), + usm_host_allocations(device.has(::sycl::aspect::usm_host_allocations)), + max_compute_units(device.get_info<::sycl::info::device::max_compute_units>()), + max_work_group_size(device.get_info<::sycl::info::device::max_work_group_size>()), + sub_group_size(device.get_info<::sycl::info::device::sub_group_sizes>().back()) { + GetL2Size(device); + if (is_gpu) { + GetSRAMSize(device); + } + } +}; + +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_DEVICE_PROPERTIES_H_ diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc index bde2d96bd8e8..d1195625d90e 100755 --- a/plugin/sycl/predictor/predictor.cc +++ b/plugin/sycl/predictor/predictor.cc @@ -29,6 +29,7 @@ #include "../../src/gbm/gbtree_model.h" #include "../device_manager.h" +#include "../device_properties.h" namespace xgboost { namespace sycl { @@ -129,20 +130,64 @@ class DeviceModel { } }; -float GetLeafWeight(const Node* nodes, const float* fval_buff, const uint8_t* miss_buff) { - const Node* node = nodes; - while (!node->IsLeaf()) { - if (miss_buff[node->GetFidx()] == 1) { - node = nodes + node->MissingIdx(); +// Binary search +float BinarySearch(const Entry* begin_ptr, const Entry* end_ptr, + size_t col_idx, size_t num_features) { + const size_t n_elems = end_ptr - begin_ptr; + if (n_elems == num_features) { + return (begin_ptr + col_idx)->fvalue; + } + + // Since indexes are in range [0: num_features), + // we can squeeze the search window from [0: n_elems) to [offset_left: offset_right) + const size_t shift = (num_features - 1) - col_idx; + const size_t offset_left = shift > n_elems - 1 ? 0 : std::max(0, (n_elems - 1) - shift); + const size_t offset_right = std::min(col_idx + 1, n_elems); + + end_ptr = begin_ptr + offset_right; + begin_ptr += offset_left; + const Entry* previous_middle = nullptr; + while (end_ptr != begin_ptr) { + const Entry* middle = begin_ptr + (end_ptr - begin_ptr) / 2; + if (middle == previous_middle) { + break; } else { - const float fvalue = fval_buff[node->GetFidx()]; - if (fvalue < node->GetFvalue()) { - node = nodes + node->LeftChildIdx(); - } else { - node = nodes + node->RightChildIdx(); - } + previous_middle = middle; + } + if (middle->index == col_idx) { + return middle->fvalue; + } else if (middle->index < col_idx) { + begin_ptr = middle + 1; + } else { + end_ptr = middle; } } + return std::numeric_limits::quiet_NaN(); +} + +size_t NextNodeIdx(float fvalue, const Node& node) { + if (std::isnan(fvalue)) { + return node.MissingIdx(); + } else { + if (fvalue < node.GetFvalue()) { + return node.LeftChildIdx(); + } else { + return node.RightChildIdx(); + } + } +} + +float GetLeafWeight(const Node* nodes, const Entry* first_entry, + const Entry* last_entry, size_t num_features) { + size_t is_dense = (last_entry - first_entry == num_features); + + const Node* node = nodes; + while (!node->IsLeaf()) { + const float fvalue = is_dense ? + (first_entry + node->GetFidx())->fvalue : + BinarySearch(first_entry, last_entry, node->GetFidx(), num_features); + node = nodes + NextNodeIdx(fvalue, *node); + } return node->GetWeight(); } @@ -150,11 +195,7 @@ float GetLeafWeight(const Node* nodes, const float* fval_buff) { const Node* node = nodes; while (!node->IsLeaf()) { const float fvalue = fval_buff[node->GetFidx()]; - if (fvalue < node->GetFvalue()) { - node = nodes + node->LeftChildIdx(); - } else { - node = nodes + node->RightChildIdx(); - } + node = nodes + NextNodeIdx(fvalue, *node); } return node->GetWeight(); } @@ -191,14 +232,13 @@ class Predictor : public xgboost::Predictor { } out_preds->Fill(base_score); } - needs_buffer_update = true; } explicit Predictor(Context const* context) : xgboost::Predictor::Predictor{context}, - cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)) { - qu_ = device_manager.GetQueue(ctx_->Device()); - } + cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)), + qu_(device_manager.GetQueue(context->Device())), + device_prop_(qu_->get_device()) {} void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model, bst_tree_t tree_begin, @@ -254,7 +294,117 @@ class Predictor : public xgboost::Predictor { } private: - template + // 8KB fits EU registers + static constexpr int kMaxFeatureBufferSize = 2048; + + // Relative cost of reading and writing for discrete and integrated devices. + static constexpr float kCostCalibrationIntegrated = 64; + static constexpr float kCostCalibrationDescrete = 4; + + template + void PredictKernelBufferDispatch(::sycl::event* event, + const Entry* data, + float* out_predictions, + const size_t* row_ptr, + size_t num_rows, + size_t num_features, + size_t num_group, + size_t tree_begin, + size_t tree_end, + float sparsity) const { + if constexpr (kFeatureBufferSize > kMaxFeatureBufferSize) { + LOG(FATAL) << "Unreachable"; + } else { + if (num_features > kFeatureBufferSize) { + PredictKernelBufferDispatch + (event, data, out_predictions, row_ptr, num_rows, + num_features, num_group, tree_begin, tree_end, sparsity); + } else { + PredictKernelBuffer + (event, data, out_predictions, row_ptr, num_rows, + num_features, num_group, tree_begin, tree_end, sparsity); + } + } + } + + size_t GetBlockSize(size_t n_nodes, size_t num_features, size_t num_rows, float sparsity) const { + size_t max_compute_units = device_prop_.max_compute_units; + size_t l2_size = device_prop_.l2_size; + size_t sub_group_size = device_prop_.sub_group_size; + size_t nodes_bytes = n_nodes * sizeof(Node); + bool nodes_fit_l2 = l2_size > 2 * nodes_bytes; + size_t block_size = nodes_fit_l2 + // nodes and data fit L2 + ? 0.8 * (l2_size - nodes_bytes) / (sparsity * num_features * sizeof(Entry)) + // only data fit L2 + : 0.8 * (l2_size) / (sparsity * num_features * sizeof(Entry)); + block_size = (block_size / sub_group_size) * sub_group_size; + if (block_size < max_compute_units * sub_group_size) { + block_size = max_compute_units * sub_group_size; + } + + if (block_size > num_rows) block_size = num_rows; + return block_size; + } + + template + void PredictKernelBuffer(::sycl::event* event, + const Entry* data, + float* out_predictions, + const size_t* row_ptr, + size_t num_rows, + size_t num_features, + size_t num_group, + size_t tree_begin, + size_t tree_end, + float sparsity) const { + const Node* nodes = device_model.nodes.DataConst(); + const size_t* first_node_position = device_model.first_node_position.ConstDevicePointer(); + const int* tree_group = device_model.tree_group.ConstDevicePointer(); + + size_t block_size = GetBlockSize(device_model.nodes.Size(), + num_features, num_rows, sparsity); + size_t n_blocks = num_rows / block_size + (num_rows % block_size > 0); + + for (size_t block = 0; block < n_blocks; ++block) { + *event = qu_->submit([&](::sycl::handler& cgh) { + cgh.depends_on(*event); + cgh.parallel_for<>(::sycl::range<1>(block_size), [=](::sycl::id<1> pid) { + int row_idx = block * block_size + pid[0]; + if (row_idx < num_rows) { + const Entry* first_entry = data + row_ptr[row_idx]; + const Entry* last_entry = data + row_ptr[row_idx + 1]; + + float fvalues[kFeatureBufferSize]; + if constexpr (any_missing) { + for (size_t fid = 0; fid < num_features; ++fid) { + fvalues[fid] = std::numeric_limits::quiet_NaN(); + } + } + + for (const Entry* entry = first_entry; entry < last_entry; entry += 1) { + fvalues[entry->index] = entry->fvalue; + } + if (num_group == 1) { + float& sum = out_predictions[row_idx]; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; + sum += GetLeafWeight(first_node, fvalues); + } + } else { + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; + int out_prediction_idx = row_idx * num_group + tree_group[tree_idx]; + out_predictions[out_prediction_idx] += + GetLeafWeight(first_node, fvalues); + } + } + } + }); + }); + } + } + void PredictKernel(::sycl::event* event, const Entry* data, float* out_predictions, @@ -263,58 +413,70 @@ class Predictor : public xgboost::Predictor { size_t num_features, size_t num_group, size_t tree_begin, - size_t tree_end) const { + size_t tree_end, + float sparsity) const { const Node* nodes = device_model.nodes.DataConst(); const size_t* first_node_position = device_model.first_node_position.ConstDevicePointer(); const int* tree_group = device_model.tree_group.ConstDevicePointer(); - float* fval_buff_ptr = fval_buff.Data(); - uint8_t* miss_buff_ptr = miss_buff.Data(); - bool needs_buffer_update = this->needs_buffer_update; - - *event = qu_->submit([&](::sycl::handler& cgh) { - cgh.depends_on(*event); - cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::id<1> pid) { - int row_idx = pid[0]; - auto* fval_buff_row_ptr = fval_buff_ptr + num_features * row_idx; - auto* miss_buff_row_ptr = miss_buff_ptr + num_features * row_idx; - - if (needs_buffer_update) { - const Entry* first_entry = data + row_ptr[row_idx]; - const Entry* last_entry = data + row_ptr[row_idx + 1]; - for (const Entry* entry = first_entry; entry < last_entry; entry += 1) { - fval_buff_row_ptr[entry->index] = entry->fvalue; - if constexpr (any_missing) { - miss_buff_row_ptr[entry->index] = 0; - } - } - } - - if (num_group == 1) { - float& sum = out_predictions[row_idx]; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; - if constexpr (any_missing) { - sum += GetLeafWeight(first_node, fval_buff_row_ptr, miss_buff_row_ptr); + size_t block_size = GetBlockSize(device_model.nodes.Size(), + num_features, num_rows, sparsity); + size_t n_blocks = num_rows / block_size + (num_rows % block_size > 0); + + for (size_t block = 0; block < n_blocks; ++block) { + *event = qu_->submit([&](::sycl::handler& cgh) { + cgh.depends_on(*event); + cgh.parallel_for<>(::sycl::range<1>(block_size), [=](::sycl::id<1> pid) { + int row_idx = block * block_size + pid[0]; + if (row_idx < num_rows) { + const Entry* first_entry = data + row_ptr[row_idx]; + const Entry* last_entry = data + row_ptr[row_idx + 1]; + + if (num_group == 1) { + float& sum = out_predictions[row_idx]; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; + sum += GetLeafWeight(first_node, first_entry, last_entry, num_features); + } } else { - sum += GetLeafWeight(first_node, fval_buff_row_ptr); + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; + int out_prediction_idx = row_idx * num_group + tree_group[tree_idx]; + out_predictions[out_prediction_idx] += + GetLeafWeight(first_node, first_entry, last_entry, num_features); + } } } - } else { - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; - int out_prediction_idx = row_idx * num_group + tree_group[tree_idx]; - if constexpr (any_missing) { - out_predictions[out_prediction_idx] += - GetLeafWeight(first_node, fval_buff_row_ptr, miss_buff_row_ptr); - } else { - out_predictions[out_prediction_idx] += - GetLeafWeight(first_node, fval_buff_row_ptr); - } - } - } + }); }); - }); + } + } + + template + bool UseFvalueBuffer(size_t tree_begin, + size_t tree_end, + int num_features) const { + size_t n_nodes = device_model.nodes.Size(); + size_t n_trees = tree_end - tree_begin; + float av_depth = std::log2(static_cast(n_nodes) / n_trees); + // the last one is leaf + float av_nodes_per_traversal = av_depth - 1; + // number of reads in case of no-bufer + float n_reads = av_nodes_per_traversal * n_trees; + if (any_missing) { + // we use binary search for sparse + n_reads *= std::log2(static_cast(num_features)); + } + + float cost_callibration = device_prop_.usm_host_allocations + ? kCostCalibrationIntegrated + : kCostCalibrationDescrete; + + // number of writes in local memory. + float n_writes = num_features; + bool use_fvalue_buffer = (num_features <= kMaxFeatureBufferSize) && + (n_reads > cost_callibration * n_writes); + return use_fvalue_buffer; } template @@ -342,30 +504,28 @@ class Predictor : public xgboost::Predictor { if (batch_size > 0) { const auto base_rowid = batch.base_rowid; - if (needs_buffer_update) { - fval_buff.ResizeNoCopy(qu_, num_features * batch_size); - if constexpr (any_missing) { - miss_buff.ResizeAndFill(qu_, num_features * batch_size, 1, &event); - } + float sparsity = static_cast(batch.data.Size()) / (batch_size * num_features); + if (UseFvalueBuffer(tree_begin, tree_end, num_features)) { + PredictKernelBufferDispatch(&event, data, + out_predictions + base_rowid * num_group, + row_ptr, batch_size, num_features, + num_group, tree_begin, tree_end, sparsity); + } else { + PredictKernel(&event, data, + out_predictions + base_rowid * num_group, + row_ptr, batch_size, num_features, + num_group, tree_begin, tree_end, sparsity); } - - PredictKernel(&event, data, out_predictions + base_rowid, - row_ptr, batch_size, num_features, - num_group, tree_begin, tree_end); - needs_buffer_update = (batch_size != out_preds->Size()); } } qu_->wait(); } - mutable USMVector fval_buff; - mutable USMVector miss_buff; mutable DeviceModel device_model; - mutable bool needs_buffer_update = true; + DeviceManager device_manager; mutable ::sycl::queue* qu_ = nullptr; - - DeviceManager device_manager; + DeviceProperties device_prop_; std::unique_ptr cpu_predictor; }; diff --git a/plugin/sycl/tree/hist_dispatcher.h b/plugin/sycl/tree/hist_dispatcher.h index 5552a0799ae2..fe3874a90656 100644 --- a/plugin/sycl/tree/hist_dispatcher.h +++ b/plugin/sycl/tree/hist_dispatcher.h @@ -7,57 +7,13 @@ #include #include -#include -#include "../../../src/common/common.h" // for HumanMemUnit +#include "../device_properties.h" namespace xgboost { namespace sycl { namespace tree { -class DeviceProperties { - void GetL2Size(const ::sycl::device& device) { - size_t l2_size = device.get_info<::sycl::info::device::global_mem_cache_size>(); - LOG(INFO) << "Detected L2 Size = " << ::xgboost::common::HumanMemUnit(l2_size); - l2_size_per_eu = static_cast(l2_size) / max_compute_units; - } - - void GetSRAMSize(const ::sycl::device& device) { - auto arch = - device.get_info<::sycl::ext::oneapi::experimental::info::device::architecture>(); - size_t eu_per_core = - device.get_info<::sycl::ext::intel::info::device::gpu_eu_count_per_subslice>(); - switch (arch) { - case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc: { - LOG(INFO) << "Xe-HPC (Ponte Vecchio) Architecture. L1 friendly optimization enabled."; - size_t l1_size = 512 * 1024; - size_t registers_size = 64 * 1024; - sram_size_per_eu = l1_size / eu_per_core + registers_size; - break; - } - default: - sram_size_per_eu = 0; - } - } - - public: - bool is_gpu; - size_t max_compute_units; - size_t max_work_group_size; - float sram_size_per_eu = 0; - float l2_size_per_eu = 0; - - explicit DeviceProperties(const ::sycl::device& device): - is_gpu(device.is_gpu()), - max_compute_units(device.get_info<::sycl::info::device::max_compute_units>()), - max_work_group_size(device.get_info<::sycl::info::device::max_work_group_size>()) { - GetL2Size(device); - if (is_gpu) { - GetSRAMSize(device); - } - } -}; - struct BlockParams { size_t size, nblocks; }; template diff --git a/tests/cpp/plugin/test_sycl_ghist_builder.cc b/tests/cpp/plugin/test_sycl_ghist_builder.cc index abf48c4c8cdc..95e38a61adf5 100644 --- a/tests/cpp/plugin/test_sycl_ghist_builder.cc +++ b/tests/cpp/plugin/test_sycl_ghist_builder.cc @@ -68,7 +68,7 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) { InitHist(qu, &hist, hist.Size(), &event); InitHist(qu, &hist_buffer, hist_buffer.Size(), &event); - tree::DeviceProperties device_prop(qu->get_device()); + DeviceProperties device_prop(qu->get_device()); event = builder.BuildHist(gpair, row_set_collection[0], gmat_sycl, &hist, sparsity < eps , &hist_buffer, device_prop, event, force_atomic_use); qu->memcpy(hist_host.data(), hist.Data(),