PointKernel
diff --git a/‎cpp/examples/hybrid_scan_io/benchmark.hpp‎
Lines changed: 31 additions & 0 deletions b/‎cpp/examples/hybrid_scan_io/benchmark.hpp‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎cpp/examples/hybrid_scan_io/common_utils.cpp‎
Lines changed: 105 additions & 33 deletions b/‎cpp/examples/hybrid_scan_io/common_utils.cpp‎
Lines changed: 105 additions & 33 deletions
diff --git a/‎cpp/examples/hybrid_scan_io/common_utils.hpp‎
Lines changed: 35 additions & 32 deletions b/‎cpp/examples/hybrid_scan_io/common_utils.hpp‎
Lines changed: 35 additions & 32 deletions
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "timer.hpp"
+
+#include <iostream>
+
+#pragma once
+
+/**
+ * @brief Invokes a callable repeatedly and prints the time of each iteration and the average
+ *
+ * The time of the first iteration is printed but left out of the average. With fewer than two
+ * iterations there is nothing to average, so the summary line says so instead of dividing by
+ * `iterations - 1`.
+ *
+ * @tparam F Type of the callable; must be invocable with no arguments
+ * @param f Callable to time
+ * @param iterations Number of times to invoke `f`
+ */
+template <std::invocable F>
+void benchmark(F&& f, std::size_t iterations)
+{
+  double total_time_millis{0.0};
+  for (std::size_t i = 0; i < iterations; ++i) {
+    // Named differently from its type so that `timer::micros` below is unambiguous to readers
+    timer iteration_timer;
+    iteration_timer.reset();
+
+    f();
+
+    auto const elapsed_time_ms =
+      static_cast<double>(
+        std::chrono::duration_cast<timer::micros>(iteration_timer.elapsed()).count()) /
+      1000.0;
+    std::cout << "Iteration: " << i << ", time: " << elapsed_time_ms << " ms\n";
+    if (i != 0) { total_time_millis += elapsed_time_ms; }
+  }
+
+  // `iterations - 1` is zero for a single iteration and wraps around for zero iterations
+  if (iterations < 2) {
+    std::cout << "Average time (first iteration excluded): n/a (needs at least 2 iterations)\n\n";
+    return;
+  }
+
+  std::cout << "Average time (first iteration excluded): "
+            << total_time_millis / static_cast<double>(iterations - 1) << " ms\n\n";
+}
@@ -8,6 +8,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/concatenate.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/join/filtered_join.hpp>
@@ -19,6 +20,7 @@
 #include <rmm/mr/owning_wrapper.hpp>
 #include <rmm/mr/pool_memory_resource.hpp>
 
+#include <numeric>
 #include <string>
 #include <vector>
 
@@ -27,6 +29,14 @@
  * @brief Definitions for utilities for `hybrid_scan_io` example
  */
 
+// Returns true when `input`, compared case-insensitively, is one of ON, TRUE, YES, Y or T
+bool get_boolean(std::string input)
+{
+  // `toupper` requires a value representable as `unsigned char`; passing a negative plain `char`
+  // (any non-ASCII byte where `char` is signed) is undefined behavior
+  std::transform(input.begin(), input.end(), input.begin(), [](unsigned char c) {
+    return static_cast<char>(::toupper(c));
+  });
+
+  // Check whether the upper-cased input matches any of the accepted keywords
+  return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T";
+}
+
 std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used)
 {
   if (is_pool_used) {
@@ -91,68 +101,130 @@ void check_tables_equal(cudf::table_view const& lhs_table,
   }
 }
 
-cudf::host_span<uint8_t const> fetch_footer_bytes(cudf::host_span<uint8_t const> buffer)
+// Reads the file header and ender from `datasource`, checks both magic numbers and the footer
+// length, and returns a host buffer holding the footer bytes
+std::unique_ptr<cudf::io::datasource::buffer> fetch_footer_bytes(cudf::io::datasource& datasource)
 {
   CUDF_FUNC_RANGE();
 
   using namespace cudf::io::parquet;
 
   constexpr auto header_len = sizeof(file_header_s);
   constexpr auto ender_len  = sizeof(file_ender_s);
-  size_t const len          = buffer.size();
+  size_t const len          = datasource.size();
 
-  auto const header_buffer = cudf::host_span<uint8_t const>(buffer.data(), header_len);
-  auto const header        = reinterpret_cast<file_header_s const*>(header_buffer.data());
-  auto const ender_buffer =
-    cudf::host_span<uint8_t const>(buffer.data() + len - ender_len, ender_len);
-  auto const ender = reinterpret_cast<file_ender_s const*>(ender_buffer.data());
+  // Validate the source size before issuing any reads: `len - ender_len` below is unsigned
+  // arithmetic and would wrap around for a source shorter than the file ender
+  CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source");
+
+  auto header_buffer = datasource.host_read(0, header_len);
+  auto const header  = reinterpret_cast<file_header_s const*>(header_buffer->data());
+  auto ender_buffer  = datasource.host_read(len - ender_len, ender_len);
+  auto const ender   = reinterpret_cast<file_ender_s const*>(ender_buffer->data());
-  CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source");
   constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24));
   CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic,
                "Corrupted header or footer");
   CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len),
                "Incorrect footer length");
 
-  return cudf::host_span<uint8_t const>(buffer.data() + len - ender->footer_len - ender_len,
-                                        ender->footer_len);
+  return datasource.host_read(len - ender->footer_len - ender_len, ender->footer_len);
 }
 
-cudf::host_span<uint8_t const> fetch_page_index_bytes(
-  cudf::host_span<uint8_t const> buffer, cudf::io::text::byte_range_info const page_index_bytes)
+// Reads `page_index_bytes.size()` bytes starting at `page_index_bytes.offset()` from `datasource`
+// via `host_read` and returns the resulting buffer
+std::unique_ptr<cudf::io::datasource::buffer> fetch_page_index_bytes(
+  cudf::io::datasource& datasource, cudf::io::text::byte_range_info const page_index_bytes)
 {
-  return cudf::host_span<uint8_t const>(
-    reinterpret_cast<uint8_t const*>(buffer.data()) + page_index_bytes.offset(),
-    page_index_bytes.size());
+  return datasource.host_read(page_index_bytes.offset(), page_index_bytes.size());
 }
 
-std::vector<rmm::device_buffer> fetch_byte_ranges(
-  cudf::host_span<uint8_t const> host_buffer,
-  cudf::host_span<cudf::io::text::byte_range_info const> byte_ranges,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
+// Builds a host span from the `data()` pointer and `size()` of the referenced buffer. The span
+// only points at the buffer's bytes, so the buffer has to outlive it.
+cudf::host_span<uint8_t const> make_host_span(
+  std::reference_wrapper<cudf::io::datasource::buffer const> buffer)
 {
-  CUDF_FUNC_RANGE();
+  return cudf::host_span<uint8_t const>{static_cast<uint8_t const*>(buffer.get().data()),
+                                        buffer.get().size()};
+}
 
+// Reads `byte_ranges` from `datasource` into one device allocation and returns that allocation,
+// one device span per byte range, and a future to wait on. Device reads are issued here; host
+// reads are created as deferred tasks and only run when the returned future is waited on. The
+// deferred tasks capture `datasource` by reference, so it must stay alive until then.
+std::tuple<std::vector<rmm::device_buffer>,
+           std::vector<cudf::device_span<uint8_t const>>,
+           std::future<void>>
+fetch_byte_ranges(cudf::io::datasource& datasource,
+                  cudf::host_span<cudf::io::text::byte_range_info const> byte_ranges,
+                  rmm::cuda_stream_view stream,
+                  rmm::device_async_resource_ref mr)
+{
   static std::mutex mutex;
 
-  std::vector<rmm::device_buffer> buffers(byte_ranges.size());
+  // Device spans, one per byte range; each will point into the single device buffer below
+  std::vector<cudf::device_span<uint8_t const>> column_chunk_data{};
+  column_chunk_data.reserve(byte_ranges.size());
+
+  // Total number of bytes across all requested ranges
+  auto total_size = std::accumulate(
+    byte_ranges.begin(), byte_ranges.end(), std::size_t{0}, [&](auto acc, auto const& range) {
+      return acc + range.size();
+    });
+
+  // Allocate single device buffer for all column chunks
+  std::vector<rmm::device_buffer> column_chunk_buffers{};
+  column_chunk_buffers.emplace_back(total_size, stream, mr);
+  auto buffer_data = static_cast<uint8_t*>(column_chunk_buffers.back().data());
+  // Lay the ranges out back-to-back in the device buffer; `acc` is the running byte offset
+  std::ignore      = std::accumulate(
+    byte_ranges.begin(), byte_ranges.end(), std::size_t{0}, [&](auto acc, auto const& range) {
+      column_chunk_data.emplace_back(buffer_data + acc, static_cast<size_t>(range.size()));
+      return acc + range.size();
+    });
+
+  std::vector<std::future<size_t>> device_read_tasks{};
+  std::vector<std::future<size_t>> host_read_tasks{};
+  device_read_tasks.reserve(byte_ranges.size());
+  host_read_tasks.reserve(byte_ranges.size());
   {
     std::lock_guard<std::mutex> lock(mutex);
 
-    std::transform(
-      byte_ranges.begin(), byte_ranges.end(), buffers.begin(), [&](auto const& byte_range) {
-        auto const chunk_offset = host_buffer.data() + byte_range.offset();
-        auto const chunk_size   = static_cast<size_t>(byte_range.size());
-        auto buffer             = rmm::device_buffer(chunk_size, stream, mr);
-        cudf::detail::cuda_memcpy_async(
-          cudf::device_span<uint8_t>{static_cast<uint8_t*>(buffer.data()), chunk_size},
-          cudf::host_span<uint8_t const>{chunk_offset, chunk_size},
-          stream);
-        return buffer;
-      });
+    // Merge runs of ranges that are back-to-back in the source into one I/O request; their
+    // destinations are already back-to-back in the single device buffer
+    for (size_t chunk = 0; chunk < byte_ranges.size();) {
+      auto const io_offset = static_cast<size_t>(byte_ranges[chunk].offset());
+      auto io_size         = static_cast<size_t>(byte_ranges[chunk].size());
+      size_t next_chunk    = chunk + 1;
+      while (next_chunk < byte_ranges.size()) {
+        size_t const next_offset = byte_ranges[next_chunk].offset();
+        if (next_offset != io_offset + io_size) { break; }
+        io_size += byte_ranges[next_chunk].size();
+        next_chunk++;
+      }
+
+      if (io_size != 0) {
+        // The spans are exposed as const to callers, but this function fills the memory behind them
+        auto dest = const_cast<uint8_t*>(column_chunk_data[chunk].data());
+        // Directly read the column chunk data to the device
+        // buffer if supported
+        if (datasource.supports_device_read() and datasource.is_device_read_preferred(io_size)) {
+          device_read_tasks.emplace_back(
+            datasource.device_read_async(io_offset, io_size, dest, stream));
+        } else {
+          // Read the column chunk data to the host buffer and
+          // copy it to the device buffer
+          host_read_tasks.emplace_back(
+            std::async(std::launch::deferred, [&datasource, io_offset, io_size, dest, stream]() {
+              auto host_buffer = datasource.host_read(io_offset, io_size);
+              cudf::detail::cuda_memcpy_async(
+                cudf::device_span<uint8_t>{dest, io_size},
+                cudf::host_span<uint8_t const>{host_buffer->data(), io_size},
+                stream);
+              return io_size;
+            }));
+        }
+      }
+      chunk = next_chunk;
+    }
   }
 
-  return buffers;
+  // Waiting on the returned future runs the deferred host reads, then waits on the device reads
+  auto sync_function = [](decltype(host_read_tasks) host_read_tasks,
+                          decltype(device_read_tasks) device_read_tasks) {
+    for (auto& task : host_read_tasks) {
+      task.get();
+    }
+    for (auto& task : device_read_tasks) {
+      task.get();
+    }
+  };
+  return {std::move(column_chunk_buffers),
+          std::move(column_chunk_data),
+          std::async(std::launch::deferred,
+                     sync_function,
+                     std::move(host_read_tasks),
+                     std::move(device_read_tasks))};
 }
 
 std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables,
 
@@ -6,6 +6,7 @@
 #pragma once
 
 #include <cudf/ast/expressions.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/table/table_view.hpp>
@@ -19,6 +20,14 @@
  * @brief Utilities for `hybrid_scan_io` example
  */
 
+/**
+ * @brief Get boolean from the keyword
+ *
+ * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON
+ * @return true or false
+ */
+[[nodiscard]] bool get_boolean(std::string input);
+
 /**
  * @brief Create memory resource for libcudf functions
  *
@@ -60,56 +69,50 @@ void check_tables_equal(cudf::table_view const& lhs_table,
                         rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
- * @brief Fetches a host span of Parquet footer bytes from the input buffer span
+ * @brief Fetches a host buffer of Parquet footer bytes from the input data source
  *
- * @param buffer Input buffer span
- * @return A host span of the footer bytes
+ * @param datasource Input data source
+ * @return Host buffer containing footer bytes
  */
-cudf::host_span<uint8_t const> fetch_footer_bytes(cudf::host_span<uint8_t const> buffer);
+std::unique_ptr<cudf::io::datasource::buffer> fetch_footer_bytes(cudf::io::datasource& datasource);
+
 /**
- * @brief Fetches a host span of Parquet PageIndexbytes from the input buffer span
+ * @brief Fetches a host buffer of Parquet page index from the input data source
  *
- * @param buffer Input buffer span
- * @param page_index_bytes Byte range of `PageIndex` to fetch
- * @return A host span of the PageIndex bytes
+ * @param datasource Input datasource
+ * @param page_index_bytes Byte range of page index
+ * @return Host buffer containing page index bytes
  */
-cudf::host_span<uint8_t const> fetch_page_index_bytes(
-  cudf::host_span<uint8_t const> buffer, cudf::io::text::byte_range_info const page_index_bytes);
+std::unique_ptr<cudf::io::datasource::buffer> fetch_page_index_bytes(
+  cudf::io::datasource& datasource, cudf::io::text::byte_range_info const page_index_bytes);
 
 /**
- * @brief Converts a span of device buffers into a vector of corresponding device spans
+ * @brief Converts a host buffer into a host span
  *
- * @tparam T Type of output device spans
- * @param buffers Host span of device buffers
- * @return Device spans corresponding to the input device buffers
+ * @param buffer Host buffer
+ * @return Host span of input host buffer
  */
-template <typename T>
-std::vector<cudf::device_span<T const>> make_device_spans(
-  cudf::host_span<rmm::device_buffer const> buffers)
-  requires(sizeof(T) == 1)
-{
-  std::vector<cudf::device_span<T const>> device_spans(buffers.size());
-  std::transform(buffers.begin(), buffers.end(), device_spans.begin(), [](auto const& buffer) {
-    return cudf::device_span<T const>{static_cast<T const*>(buffer.data()), buffer.size()};
-  });
-  return device_spans;
-}
+cudf::host_span<uint8_t const> make_host_span(
+  std::reference_wrapper<cudf::io::datasource::buffer const> buffer);
 
 /**
- * @brief Fetches a list of byte ranges from a host buffer into device buffers
+ * @brief Fetches a list of byte ranges from a datasource into device buffers
  *
- * @param host_buffer Host buffer span
+ * @param datasource Input datasource
  * @param byte_ranges Byte ranges to fetch
  * @param stream CUDA stream
  * @param mr Device memory resource
  *
- * @return Device buffers
+ * @return A tuple containing the device buffers, the device spans of the fetched data, and a future
+ * to wait on the read tasks
  */
-std::vector<rmm::device_buffer> fetch_byte_ranges(
-  cudf::host_span<uint8_t const> host_buffer,
-  cudf::host_span<cudf::io::text::byte_range_info const> byte_ranges,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
+std::tuple<std::vector<rmm::device_buffer>,
+           std::vector<cudf::device_span<uint8_t const>>,
+           std::future<void>>
+fetch_byte_ranges(cudf::io::datasource& datasource,
+                  cudf::host_span<cudf::io::text::byte_range_info const> byte_ranges,
+                  rmm::cuda_stream_view stream,
+                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Concatenate a vector of tables and return the resultant table