Skip to content

Commit 4519c08

Browse files
committed
[RDF] Add internal utility to retrieve cluster entry ranges
1 parent 04365d1 commit 4519c08

3 files changed

Lines changed: 77 additions & 4 deletions

File tree

tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ protected:
130130
}
131131
}
132132

133-
RDFDetail::RLoopManager *GetLoopManager() const { return fLoopManager.get(); }
134133
RDataSource *GetDataSource() const { return fLoopManager->GetDataSource(); }
135134

136135
ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
@@ -208,6 +207,8 @@ public:
208207
RInterfaceBase(std::shared_ptr<RDFDetail::RLoopManager> lm);
209208
RInterfaceBase(RDFDetail::RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister);
210209

210+
RDFDetail::RLoopManager *GetLoopManager() const { return fLoopManager.get(); }
211+
211212
ColumnNames_t GetColumnNames();
212213

213214
std::string GetColumnType(std::string_view column);

tree/dataframe/inc/ROOT/RDF/Utils.hxx

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ ROOT::RLogChannel &RDFLogChannel();
5555
// fwd decl for ColumnName2ColumnTypeName
5656
class RDefineBase;
5757

58+
class RLoopManager;
59+
5860
// type used for tag dispatching
5961
struct RInferredType {
6062
};
@@ -339,6 +341,12 @@ auto MakeAliasedSharedPtr(T *rawPtr)
339341
*/
340342
ROOT::RDF::Experimental::RDatasetSpec RetrieveSpecFromJson(const std::string &jsonFile);
341343

344+
/**
345+
* \brief Function to retrieve the entry ranges for each cluster in the dataset,
346+
* across files, with a global offset.
347+
*/
348+
std::vector<std::pair<ULong64_t, ULong64_t>> GetClusterRanges(Detail::RDF::RLoopManager &lm);
349+
342350
/**
343351
* Tag to let data sources use the native data type when creating a column reader.
344352
*

tree/dataframe/src/RDFUtils.cxx

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,17 @@
1616
#include "ROOT/RDF/RSampleInfo.hxx"
1717
#include "ROOT/RDF/Utils.hxx"
1818
#include "ROOT/RLogger.hxx"
19+
#include "ROOT/RTTreeDS.hxx"
1920
#include "RtypesCore.h"
2021
#include "TBranch.h"
2122
#include "TBranchElement.h"
23+
#include <TChain.h>
24+
#include "TChainElement.h"
2225
#include "TClass.h"
2326
#include "TClassEdit.h"
2427
#include "TClassRef.h"
2528
#include "TError.h" // Info
29+
#include "TFile.h"
2630
#include "TInterpreter.h"
2731
#include "TLeaf.h"
2832
#include "TROOT.h" // IsImplicitMTEnabled, GetThreadPoolSize
@@ -632,9 +636,69 @@ ROOT::RDF::Experimental::RDatasetSpec RetrieveSpecFromJson(const std::string &js
632636
return spec;
633637
};
634638

635-
} // end NS RDF
636-
} // end NS Internal
637-
} // end NS ROOT
639+
std::vector<std::pair<ULong64_t, ULong64_t>> GetClusterRanges(ROOT::Detail::RDF::RLoopManager &lm)
640+
{
641+
std::vector<std::pair<ULong64_t, ULong64_t>> ranges{};
642+
643+
auto *ds = lm.GetDataSource();
644+
if (!ds) {
645+
throw std::runtime_error("Cannot retrieve cluster ranges, no data source available");
646+
}
647+
648+
auto *ttreeds = dynamic_cast<RTTreeDS *>(ds);
649+
if (!ttreeds) {
650+
throw std::runtime_error("Cannot retrieve cluster ranges, data source is not a TTree");
651+
}
652+
653+
TTree *tree = ttreeds->GetTree();
654+
if (!tree) {
655+
return ranges;
656+
}
657+
658+
// Collect cluster boundaries from a single tree applying a global offset
659+
auto collectClusters = [&ranges](TTree *t, Long64_t offset) -> Long64_t {
660+
const Long64_t nEntries = t->GetEntries();
661+
auto it = t->GetClusterIterator(0);
662+
Long64_t start = 0;
663+
while ((start = it()) < nEntries)
664+
ranges.emplace_back(offset + start, offset + it.GetNextEntry());
665+
return nEntries;
666+
};
667+
668+
if (auto *chain = dynamic_cast<TChain *>(tree)) {
669+
// If TChain, open each file and iterate over its clusters
670+
TObjArray *files = chain->GetListOfFiles();
671+
if (!files) {
672+
return ranges;
673+
}
674+
675+
Long64_t offset = 0;
676+
for (int i = 0; i < files->GetEntries(); ++i) {
677+
auto *el = static_cast<TChainElement *>(files->At(i));
678+
if (!el) {
679+
continue;
680+
}
681+
std::unique_ptr<TFile> f{TFile::Open(el->GetTitle(), "READ")};
682+
if (!f || f->IsZombie()) {
683+
continue;
684+
}
685+
auto *t = f->Get<TTree>(el->GetName());
686+
if (!t) {
687+
continue;
688+
}
689+
offset += collectClusters(t, offset);
690+
}
691+
} else {
692+
// If single TTree, iterate over its clusters
693+
collectClusters(tree, 0);
694+
}
695+
696+
return ranges;
697+
}
698+
699+
} // namespace RDF
700+
} // namespace Internal
701+
} // namespace ROOT
638702

639703
std::string
640704
ROOT::Internal::RDF::GetTypeNameWithOpts(const ROOT::RDF::RDataSource &df, std::string_view colName, bool vector2RVec)

0 commit comments

Comments
 (0)