From 50511c12399d8cb33fc7641fdfa9225608d0c4a8 Mon Sep 17 00:00:00 2001 From: Silia Taider Date: Tue, 7 Apr 2026 16:40:10 +0200 Subject: [PATCH 1/4] [ntuple] Set fNTupleName in RNTupleDS single file constructor --- tree/dataframe/src/RNTupleDS.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx index 261327e29794e..e7a9b1ed7434b 100644 --- a/tree/dataframe/src/RNTupleDS.cxx +++ b/tree/dataframe/src/RNTupleDS.cxx @@ -432,6 +432,7 @@ std::unique_ptr CreatePageSource(std::string_view n ROOT::RDF::RNTupleDS::RNTupleDS(std::string_view ntupleName, std::string_view fileName) : RNTupleDS(CreatePageSource(ntupleName, fileName)) { + fNTupleName = ntupleName; fFileNames = std::vector{std::string{fileName}}; } From 64545861faa8a0903a7b6be87abbe8e0dc38a431 Mon Sep 17 00:00:00 2001 From: Silia Taider Date: Tue, 7 Apr 2026 16:41:04 +0200 Subject: [PATCH 2/4] [ntuple] Add GetDatasetGlobalClusterBoundaries as a friend to RNTupleDS --- tree/dataframe/inc/ROOT/RNTupleDS.hxx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tree/dataframe/inc/ROOT/RNTupleDS.hxx b/tree/dataframe/inc/ROOT/RNTupleDS.hxx index 1019b53953ee4..ca87b581da04a 100644 --- a/tree/dataframe/inc/ROOT/RNTupleDS.hxx +++ b/tree/dataframe/inc/ROOT/RNTupleDS.hxx @@ -63,8 +63,17 @@ class RFieldBase; class RDataFrame; class RNTuple; } // namespace ROOT +namespace ROOT::Detail::RDF { +class RNodeBase; +} +namespace ROOT::RDF { +template +class RInterface; +} namespace ROOT::Internal::RDF { class RNTupleColumnReader; +std::vector> +GetDatasetGlobalClusterBoundaries(const ROOT::RDF::RInterface &node); } namespace ROOT::Internal { class RPageSource; @@ -206,6 +215,10 @@ class RNTupleDS final : public ROOT::RDF::RDataSource { const std::vector &fileNames, const std::pair &range); + // This function needs to acess private members fNTupleName and fFileNames + friend std::vector> ROOT::Internal::RDF::GetDatasetGlobalClusterBoundaries( + const ROOT::RDF::RInterface &node); + explicit RNTupleDS(std::string_view ntupleName, const std::vector &fileNames, const std::pair &range); From df70c5962a5115aa989971ebf384fda2d948859a Mon Sep 17 00:00:00 2001 From: Silia Taider Date: Wed, 8 Apr 2026 18:06:02 +0200 Subject: [PATCH 3/4] [RDF] Add internal utility to retrieve cluster entry ranges --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 3 + tree/dataframe/src/RInterface.cxx | 107 +++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index e2972cd8685da..2ed74de4a926f 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -100,6 +100,7 @@ class GraphCreatorHelper; void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair &&newRange); void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end); void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec); +std::vector> GetDatasetGlobalClusterBoundaries(const RNode &node); void TriggerRun(ROOT::RDF::RNode node); std::string GetDataSourceLabel(const ROOT::RDF::RNode &node); void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline); @@ -134,6 +135,8 @@ class RInterface : public RInterfaceBase { friend void RDFInternal::ChangeEmptyEntryRange(const RNode &node, std::pair &&newRange); friend void RDFInternal::ChangeBeginAndEndEntries(const RNode &node, Long64_t start, Long64_t end); friend void RDFInternal::ChangeSpec(const RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec); + friend std::vector> + RDFInternal::GetDatasetGlobalClusterBoundaries(const RNode &node); friend std::string ROOT::Internal::RDF::GetDataSourceLabel(const RNode &node); friend void ROOT::Internal::RDF::SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline); std::shared_ptr fProxiedPtr; ///< Smart pointer to the graph node encapsulated by this RInterface. diff --git a/tree/dataframe/src/RInterface.cxx b/tree/dataframe/src/RInterface.cxx index 9bac68216cf7b..6c2ec6a7ebad4 100644 --- a/tree/dataframe/src/RInterface.cxx +++ b/tree/dataframe/src/RInterface.cxx @@ -9,6 +9,12 @@ *************************************************************************/ #include "ROOT/RDF/RInterface.hxx" +#include +#include "ROOT/RNTupleDS.hxx" +#include "ROOT/RTTreeDS.hxx" +#ifdef R__USE_IMT +#include +#endif void ROOT::Internal::RDF::ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair &&newRange) @@ -34,6 +40,107 @@ void ROOT::Internal::RDF::ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Ex node.GetLoopManager()->ChangeSpec(std::move(spec)); } +/** + * \brief Retrieve the cluster boundaries for each cluster in the dataset, + * across files, with a global offset. + * + * \param node Any node of the computation graph. + * \return A vector of [begin, end) entry pairs for each cluster in the dataset. + * + * \note When IMT is enabled, files are processed in parallel using a thread pool. + */ +std::vector> +ROOT::Internal::RDF::GetDatasetGlobalClusterBoundaries(const ROOT::RDF::RNode &node) +{ + std::vector> boundaries{}; + + auto *lm = node.GetLoopManager(); + auto *ds = lm->GetDataSource(); + + if (!ds) { + throw std::runtime_error("Cannot retrieve cluster boundaries: no data source available."); + } + + std::string datasetName; + std::vector fileNames; + bool isTTree = false; + + if (auto *ttreeds = dynamic_cast(ds)) { + auto *tree = ttreeds->GetTree(); + assert(tree && "The internal TTree is not available, something went wrong."); + datasetName = tree->GetName(); + fileNames = ROOT::Internal::TreeUtils::GetFileNamesFromTree(*tree); + isTTree = true; + } else if (auto *rntupleds = dynamic_cast(ds)) { + datasetName = rntupleds->fNTupleName; + fileNames = rntupleds->fFileNames; + isTTree = false; + } else { + throw std::runtime_error("Cannot retrieve cluster boundaries: unsupported data source type."); + } + + if (fileNames.empty()) { + return boundaries; + } + + const auto nFiles = fileNames.size(); + + // For each file retrieve the cluster boundaries + the number of entries + using FileResult = std::pair>, std::uint64_t>; + std::vector perFileResults(nFiles); + + // Function to process a single file and return its cluster boundaries + entry count + auto processFile = [&datasetName, isTTree](const std::string &fileName) -> FileResult { + std::vector> clusters; + std::uint64_t nEntries = 0; + + if (isTTree) { + // TTree + auto [clusterBoundaries, entries] = ROOT::Internal::TreeUtils::GetClustersAndEntries(datasetName, fileName); + nEntries = entries; + // [0, 10, 20, ...] --> [(0,10), (10,20), ...] + for (std::size_t i = 0; i + 1 < clusterBoundaries.size(); ++i) { + clusters.emplace_back(clusterBoundaries[i], clusterBoundaries[i + 1]); + } + } else { + // RNTuple + auto [clusterBoundaries, entries] = GetClustersAndEntries(datasetName, fileName); + nEntries = entries; + for (const auto &cluster : clusterBoundaries) { + clusters.emplace_back(cluster.fFirstEntry, cluster.fLastEntryPlusOne); + } + } + + return {clusters, nEntries}; + }; + +#ifdef R__USE_IMT + ROOT::TThreadExecutor pool; + // Distribute the processing of files in parallel across the thread pool, + // each thread takes a file and its index in the fileNames vector as input + // and fills the corresponding position in the perFileResults vector + pool.Foreach([&perFileResults, &fileNames, + &processFile](std::size_t idx) { perFileResults[idx] = processFile(fileNames[idx]); }, + ROOT::TSeq(nFiles)); +#else + // Process files sequentially as a fallback + for (std::size_t idx = 0; idx < nFiles; ++idx) { + perFileResults[idx] = processFile(fileNames[idx]); + } +#endif + // Now that we have the cluster boundaries and entry counts for each file, + // we can compute the global boundaries with offsets (sequentially) + std::uint64_t offset = 0; + for (const auto &[clusters, nEntries] : perFileResults) { + for (const auto &[start, end] : clusters) { + boundaries.emplace_back(offset + start, offset + end); + } + offset += nEntries; + } + + return boundaries; +} + /** * \brief Trigger the execution of an RDataFrame computation graph. * \param[in] node A node of the computation graph (not a result). From 5442dae2987379ef69e6d26b0ea1dacb6cbfe5a6 Mon Sep 17 00:00:00 2001 From: Silia Taider Date: Thu, 2 Apr 2026 11:06:08 +0200 Subject: [PATCH 4/4] [RDF] Add tests for GetDatasetGlobalClusterBoundaries --- tree/dataframe/test/dataframe_utils.cxx | 90 +++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/tree/dataframe/test/dataframe_utils.cxx b/tree/dataframe/test/dataframe_utils.cxx index 3bd7f7162dea5..214c9ca58ba6e 100644 --- a/tree/dataframe/test/dataframe_utils.cxx +++ b/tree/dataframe/test/dataframe_utils.cxx @@ -1,5 +1,6 @@ #include "ROOT/RDataFrame.hxx" #include "ROOT/RDF/Utils.hxx" +#include "ROOT/RNTupleDS.hxx" #include "TTree.h" #include "gtest/gtest.h" @@ -237,3 +238,92 @@ TEST(RDataFrameUtils, TypeName2TypeID) EXPECT_THROW(RDFInt::TypeName2TypeID("float *"), std::runtime_error); EXPECT_THROW(RDFInt::TypeName2TypeID("float &"), std::runtime_error); } + +TEST(RDataFrameUtils, GetClusterRanges) +{ + // Helper RAII class for file cleanup + class FileRAII { + private: + std::string fPath; + + public: + explicit FileRAII(const std::string &path) : fPath(path) {} + FileRAII(const FileRAII &) = delete; + FileRAII &operator=(const FileRAII &) = delete; + ~FileRAII() { std::remove(fPath.c_str()); } + auto GetPath() const { return fPath.c_str(); } + }; + + FileRAII fTTree1("dataframe_interfaceAndUtils_2_tree_1.root"); + FileRAII fTTree2("dataframe_interfaceAndUtils_2_tree_2.root"); + FileRAII fRNTuple1("dataframe_interfaceAndUtils_2_rntuple_1.root"); + FileRAII fRNTuple2("dataframe_interfaceAndUtils_2_rntuple_2.root"); + + const int nEntries = 1000; + const int clusterSize = 250; + + // Test TTree single file + { + ROOT::RDF::RSnapshotOptions opts; + opts.fAutoFlush = clusterSize; + auto df = ROOT::RDataFrame(nEntries).Define("i", "rdfentry_"); + df.Snapshot("t", fTTree1.GetPath(), {"i"}, opts); + df.Snapshot("t", fTTree2.GetPath(), {"i"}, opts); + } + { + ROOT::RDataFrame dfTTree("t", fTTree1.GetPath()); + auto rangesTTree = RDFInt::GetDatasetGlobalClusterBoundaries(dfTTree); + + EXPECT_EQ(rangesTTree.size(), nEntries / clusterSize); + for (size_t i = 0; i < rangesTTree.size(); ++i) { + EXPECT_EQ(rangesTTree[i].first, i * clusterSize); + EXPECT_EQ(rangesTTree[i].second, (i + 1) * clusterSize); + } + } + + // Test TTree multiple files (chain) + { + ROOT::RDataFrame dfTTreeChain("t", {fTTree1.GetPath(), fTTree2.GetPath()}); + auto rangesTTreeChain = RDFInt::GetDatasetGlobalClusterBoundaries(dfTTreeChain); + + EXPECT_EQ(rangesTTreeChain.size(), 2 * nEntries / clusterSize); + for (size_t i = 0; i < rangesTTreeChain.size(); ++i) { + EXPECT_EQ(rangesTTreeChain[i].first, i * clusterSize); + EXPECT_EQ(rangesTTreeChain[i].second, (i + 1) * clusterSize); + } + } + + // Test RNTuple single file + { + ROOT::RDF::RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + auto df = ROOT::RDataFrame(nEntries).Define("i", "rdfentry_"); + df.Snapshot("nt", fRNTuple1.GetPath(), {"i"}, opts); + df.Snapshot("nt", fRNTuple2.GetPath(), {"i"}, opts); + } + { + auto dfRNTuple = ROOT::RDF::FromRNTuple("nt", fRNTuple1.GetPath()); + auto rangesRNTuple = RDFInt::GetDatasetGlobalClusterBoundaries(dfRNTuple); + + EXPECT_FALSE(rangesRNTuple.empty()); + EXPECT_EQ(rangesRNTuple.front().first, 0u); + EXPECT_EQ(rangesRNTuple.back().second, ULong64_t(nEntries)); + for (size_t i = 1; i < rangesRNTuple.size(); ++i) { + EXPECT_EQ(rangesRNTuple[i].first, rangesRNTuple[i - 1].second); + } + } + + // Test RNTuple multiple files + { + auto dfRNTupleChain = + ROOT::RDF::FromRNTuple("nt", std::vector{fRNTuple1.GetPath(), fRNTuple2.GetPath()}); + auto rangesRNTupleChain = RDFInt::GetDatasetGlobalClusterBoundaries(dfRNTupleChain); + + EXPECT_FALSE(rangesRNTupleChain.empty()); + EXPECT_EQ(rangesRNTupleChain.front().first, 0u); + EXPECT_EQ(rangesRNTupleChain.back().second, ULong64_t(2 * nEntries)); + for (size_t i = 1; i < rangesRNTupleChain.size(); ++i) { + EXPECT_EQ(rangesRNTupleChain[i].first, rangesRNTupleChain[i - 1].second); + } + } +}