Skip to content

Commit b3e6c54

Browse files
committed
[ntuple] Support merging columns with same type and different metadata
1 parent 5ab7637 commit b3e6c54

5 files changed

Lines changed: 108 additions & 34 deletions

File tree

tree/ntuple/inc/ROOT/RColumnElementBase.hxx

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <ROOT/RError.hxx>
1818
#include <ROOT/RFloat16.hxx>
1919
#include <ROOT/RNTupleTypes.hxx>
20+
#include <ROOT/RNTupleDescriptor.hxx>
2021

2122
#include <Byteswap.h>
2223
#include <TError.h>
@@ -147,6 +148,13 @@ std::unique_ptr<RColumnElementBase> RColumnElementBase::Generate(ROOT::ENTupleCo
147148
template <>
148149
std::unique_ptr<RColumnElementBase> RColumnElementBase::Generate<void>(ROOT::ENTupleColumnType onDiskType);
149150

151+
struct RColumnFormat {
152+
ENTupleColumnType fType = ENTupleColumnType::kUnknown;
153+
// 0 means "use default". Only valid for fixed-bitwidth column types.
154+
std::uint16_t fBitWidth = 0;
155+
std::optional<RColumnDescriptor::RValueRange> fValueRange;
156+
};
157+
150158
} // namespace ROOT::Internal
151159

152160
#endif

tree/ntuple/inc/ROOT/RPageStorage.hxx

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -543,16 +543,10 @@ public:
543543
[[nodiscard]] std::unique_ptr<RNTupleModel>
544544
InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters);
545545

546-
struct RColumnReprElement {
547-
ENTupleColumnType fType = ENTupleColumnType::kUnknown;
548-
// 0 means "use default". Only valid for fixed-bitwidth column types.
549-
std::uint16_t fBitWidth = 0;
550-
std::optional<RColumnDescriptor::RValueRange> fValueRange;
551-
};
552546
/// Adds a new column representation to the given field.
553547
/// \return The physical id of the first newly added column.
554-
ROOT::DescriptorId_t
555-
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const RColumnReprElement> newRepresentation);
548+
ROOT::DescriptorId_t AddColumnRepresentation(const ROOT::RFieldDescriptor &field,
549+
std::span<const ROOT::Internal::RColumnFormat> newRepresentation);
556550

557551
/// Adds a new alias column pointing to an existing column with the given physical id to the given field.
558552
void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field,

tree/ntuple/src/RNTupleMerger.cxx

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ struct RColReprMapping {
345345
/// Note that this also adds a mapping for each new representation, which is why it inherits RColReprMapping.
346346
struct RColReprExtension : RColReprMapping {
347347
/// The new representations to be added
348-
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> fSourceRepr;
348+
std::vector<ROOT::Internal::RColumnFormat> fSourceRepr;
349349
};
350350

351351
static std::optional<std::uint32_t>
@@ -500,26 +500,19 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c
500500
const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId);
501501
const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx];
502502
const auto &dstCol = dstDesc.GetColumnDescriptor(dstColId);
503-
if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() ||
503+
if (srcCol.GetType() != dstCol.GetType() ||
504+
srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() ||
504505
srcCol.GetValueRange() != dstCol.GetValueRange()) {
505-
std::stringstream ss;
506-
ss << "Source field `" << srcField.GetFieldName()
507-
<< "` has a matching column representation as its destination field, however one or "
508-
"more "
509-
"of its columns have different column metadata (bit width and/or value range). "
510-
"Merging variable-sized columns is currently only supported if all metadata is "
511-
"identical between source and destination columns."
512-
<< "\n bit width src: " << srcCol.GetBitsOnStorage() << ", dst: " << dstCol.GetBitsOnStorage()
513-
<< ""
514-
<< "\n value range src: " << srcCol.GetValueRange() << ", dst: " << dstCol.GetValueRange();
515-
errors.push_back(ss.str());
506+
matches = false;
516507
break;
517508
}
518509
}
519510

520-
// We found a valid matching representation: break the loop on the dst column representations.
521-
matchingRepr = dstReprIdx;
522-
break;
511+
if (matches) {
512+
// We found a valid matching representation.
513+
matchingRepr = dstReprIdx;
514+
break;
515+
}
523516
}
524517
}
525518

@@ -532,7 +525,7 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c
532525
} else if (matchingRepr < 0) {
533526
// this representation was not found in the destination
534527
assert(dstNColReprs < std::numeric_limits<std::uint32_t>::max());
535-
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> newRepr;
528+
std::vector<ROOT::Internal::RColumnFormat> newRepr;
536529
newRepr.reserve(srcColCardinality);
537530
for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) {
538531
const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx];

tree/ntuple/src/RPageStorage.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1146,7 +1146,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr
11461146

11471147
ROOT::DescriptorId_t
11481148
ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field,
1149-
std::span<const RColumnReprElement> newRepresentation)
1149+
std::span<const RColumnFormat> newRepresentation)
11501150
{
11511151
const auto &descriptor = fDescriptorBuilder.GetDescriptor();
11521152

tree/ntuple/test/ntuple_merger.cxx

Lines changed: 87 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4275,10 +4275,17 @@ TEST(RNTupleMerger, MergeReal32Trunc)
42754275
RNTupleMergeOptions opts;
42764276
opts.fMergingMode = mmode;
42774277
auto res = merger.Merge(sourcePtrs, opts);
4278-
// Currently we're not supporting merging columns with the same type but different metadata.
4279-
// TODO: support this.
4280-
EXPECT_FALSE(bool(res));
4281-
EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
4278+
EXPECT_TRUE(bool(res));
4279+
}
4280+
{
4281+
auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath());
4282+
EXPECT_EQ(reader->GetNEntries(), 20);
4283+
EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2);
4284+
auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4285+
for (auto i : reader->GetEntryRange()) {
4286+
reader->LoadEntry(i);
4287+
EXPECT_NEAR(*pFlt, i, 0.01f);
4288+
}
42824289
}
42834290
}
42844291
}
@@ -4333,10 +4340,17 @@ TEST(RNTupleMerger, MergeReal32Quant)
43334340
RNTupleMergeOptions opts;
43344341
opts.fMergingMode = mmode;
43354342
auto res = merger.Merge(sourcePtrs, opts);
4336-
// Currently we're not supporting merging columns with the same type but different metadata.
4337-
// TODO: support this.
4338-
ASSERT_FALSE(bool(res));
4339-
EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
4343+
EXPECT_TRUE(bool(res));
4344+
}
4345+
{
4346+
auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath());
4347+
EXPECT_EQ(reader->GetNEntries(), 20);
4348+
EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2);
4349+
auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4350+
for (auto i : reader->GetEntryRange()) {
4351+
reader->LoadEntry(i);
4352+
EXPECT_NEAR(*pFlt, i, 0.01f);
4353+
}
43404354
}
43414355
}
43424356
}
@@ -4406,3 +4420,68 @@ TEST(RNTupleMerger, MergeReal32TruncQuantMixed)
44064420
}
44074421
}
44084422
}
4423+
4424+
TEST(RNTupleMerger, MergeRealRegularQuantMixed)
4425+
{
4426+
// Merge two files, both containing the same field, but with the first being a SplitReal64 and the second Real32Quant
4427+
FileRaii fileGuard1("test_ntuple_merge_realregquant_in_1.root");
4428+
{
4429+
auto model = RNTupleModel::Create();
4430+
auto fieldDbl = model->MakeField<double>("dbl");
4431+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
4432+
for (int i = 0; i < 10; ++i) {
4433+
*fieldDbl = i;
4434+
ntuple->Fill();
4435+
}
4436+
}
4437+
FileRaii fileGuard2("test_ntuple_merge_realregquant_in_2.root");
4438+
{
4439+
auto model = RNTupleModel::Create();
4440+
auto field = std::make_unique<RField<double>>("dbl");
4441+
field->SetQuantized(29, {0., 20.});
4442+
model->AddField(std::move(field));
4443+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
4444+
auto fieldDbl = ntuple->GetModel().GetDefaultEntry().GetPtr<double>("dbl");
4445+
for (int i = 0; i < 10; ++i) {
4446+
*fieldDbl = 10 + i;
4447+
ntuple->Fill();
4448+
}
4449+
}
4450+
{
4451+
// Gather the input sources
4452+
std::vector<std::unique_ptr<RPageSource>> sources;
4453+
sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
4454+
sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
4455+
std::vector<RPageSource *> sourcePtrs;
4456+
for (const auto &s : sources) {
4457+
sourcePtrs.push_back(s.get());
4458+
}
4459+
4460+
// Now merge the inputs
4461+
for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
4462+
SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
4463+
FileRaii fileGuardOut("test_ntuple_merge_realregquant_out.root");
4464+
{
4465+
auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
4466+
RNTupleMerger merger{std::move(destination)};
4467+
RNTupleMergeOptions opts;
4468+
opts.fMergingMode = mmode;
4469+
auto res = merger.Merge(sourcePtrs, opts);
4470+
EXPECT_TRUE(bool(res));
4471+
}
4472+
{
4473+
auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath());
4474+
EXPECT_EQ(reader->GetNEntries(), 20);
4475+
EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2);
4476+
auto pDbl = reader->GetModel().GetDefaultEntry().GetPtr<double>("dbl");
4477+
for (auto i : reader->GetEntryRange()) {
4478+
reader->LoadEntry(i);
4479+
if (i < 10)
4480+
EXPECT_DOUBLE_EQ(*pDbl, i);
4481+
else
4482+
EXPECT_NEAR(*pDbl, i, 0.01f);
4483+
}
4484+
}
4485+
}
4486+
}
4487+
}

0 commit comments

Comments
 (0)