Skip to content

Commit 4bb2f69

Browse files
committed
[ntuple] support merging columns with metadata (with different types)
1 parent 2c49b9d commit 4bb2f69

4 files changed

Lines changed: 209 additions & 12 deletions

File tree

tree/ntuple/inc/ROOT/RPageStorage.hxx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,10 +543,16 @@ public:
543543
[[nodiscard]] std::unique_ptr<RNTupleModel>
544544
InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters);
545545

546+
struct RColumnReprElement {
547+
ENTupleColumnType fType = ENTupleColumnType::kUnknown;
548+
// 0 means "use default". Only valid for fixed-bitwidth column types.
549+
std::uint16_t fBitWidth = 0;
550+
std::optional<RColumnDescriptor::RValueRange> fValueRange;
551+
};
546552
/// Adds a new column representation to the given field.
547553
/// \return The physical id of the first newly added column.
548554
ROOT::DescriptorId_t
549-
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const ENTupleColumnType> newRepresentation);
555+
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const RColumnReprElement> newRepresentation);
550556

551557
/// Adds a new alias column pointing to an existing column with the given physical id to the given field.
552558
void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field,

tree/ntuple/src/RNTupleMerger.cxx

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ struct RColReprMapping {
336336
};
337337

338338
struct RColReprExtension : RColReprMapping {
339-
ROOT::RFieldBase::ColumnRepresentation_t fSourceRepr;
339+
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> fSourceRepr;
340340
};
341341

342342
static std::optional<std::uint32_t>
@@ -626,11 +626,15 @@ CompareDescriptorStructure(const ROOT::RNTupleDescriptor &dst, const ROOT::RNTup
626626
} else if (matchingRepr < 0) {
627627
// this representation was not found in the destination
628628
assert(dstNColReprs < std::numeric_limits<std::uint32_t>::max());
629-
ROOT::RFieldBase::ColumnRepresentation_t newRepr;
629+
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> newRepr;
630+
newRepr.reserve(srcColCardinality);
630631
for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) {
631632
const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx];
632633
const auto &srcCol = src.GetColumnDescriptor(srcColId);
633-
newRepr.push_back(srcCol.GetType());
634+
auto &reprElement = newRepr.emplace_back();
635+
reprElement.fType = srcCol.GetType();
636+
reprElement.fBitWidth = srcCol.GetBitsOnStorage();
637+
reprElement.fValueRange = srcCol.GetValueRange();
634638
}
635639
RColReprExtension extension{{srcReprIdx, static_cast<std::uint32_t>(dstNColReprs)}, newRepr};
636640
res.fColReprExtensions[field.fDst].push_back(extension);

tree/ntuple/src/RPageStorage.cxx

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1146,7 +1146,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr
11461146

11471147
ROOT::DescriptorId_t
11481148
ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field,
1149-
std::span<const ENTupleColumnType> newRepresentation)
1149+
std::span<const RColumnReprElement> newRepresentation)
11501150
{
11511151
const auto &descriptor = fDescriptorBuilder.GetDescriptor();
11521152

@@ -1162,10 +1162,15 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD
11621162
fDescriptorBuilder.ShiftAliasColumns(newRepresentation.size());
11631163

11641164
std::uint16_t columnIndex = 0; // index into the representation
1165-
for (auto columnType : newRepresentation) {
1166-
// Extending columns with variable bit width is currently unsupported.
1167-
const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnType);
1168-
R__ASSERT(rangeMin == rangeMax);
1165+
for (auto columnRepr : newRepresentation) {
1166+
std::size_t bitsOnStorage = columnRepr.fBitWidth;
1167+
if (!bitsOnStorage) {
1168+
const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnRepr.fType);
1169+
if (rangeMin != rangeMax) {
1170+
throw ROOT::RException(R__FAIL("bit width must be given for columns of variable bit width"));
1171+
}
1172+
bitsOnStorage = rangeMin;
1173+
}
11691174

11701175
const ROOT::DescriptorId_t firstReprColumnId = field.GetLogicalColumnIds()[columnIndex];
11711176
const auto &firstReprColumnRange = fOpenColumnRanges.at(firstReprColumnId);
@@ -1175,12 +1180,13 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD
11751180
columnBuilder.LogicalColumnId(columnId)
11761181
.PhysicalColumnId(columnId)
11771182
.FieldId(field.GetId())
1178-
.BitsOnStorage(rangeMax)
1179-
.Type(columnType)
1183+
.BitsOnStorage(bitsOnStorage)
1184+
.Type(columnRepr.fType)
11801185
.Index(columnIndex)
11811186
// NOTE: marking this column as suppressed with the minus sign
11821187
.FirstElementIndex(-firstReprColumnRange.GetFirstElementIndex())
1183-
.RepresentationIndex(reprIndex);
1188+
.RepresentationIndex(reprIndex)
1189+
.ValueRange(columnRepr.fValueRange);
11841190
fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap());
11851191

11861192
for (auto parentId = field.GetParentId(); parentId != ROOT::kInvalidDescriptorId;) {

tree/ntuple/test/ntuple_merger.cxx

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4223,3 +4223,184 @@ TEST(RNTupleMerger, MergeNewerVersion)
42234223
}
42244224
}
42254225
}
4226+
4227+
// Checks that merging two inputs whose "flt" field is Real32Trunc-encoded with different bit widths is rejected
// with a "have different column metadata" error in every merging mode.
TEST(RNTupleMerger, MergeReal32Trunc)
{
   // Merge two files, both containing the same Real32Trunc-encoded field, but with different bit widths.
   // First input: "flt" truncated to 14 bits, holding the values 0..9.
   FileRaii fileGuard1("test_ntuple_merge_real32trunc_in_1.root");
   {
      auto model = RNTupleModel::Create();
      auto field = std::make_unique<RField<float>>("flt");
      field->SetTruncated(14);
      model->AddField(std::move(field));
      auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
      auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int i = 0; i < 10; ++i) {
         *fieldFlt = i;
         ntuple->Fill();
      }
   }
   // Second input: "flt" truncated to 24 bits, holding the values 10..19.
   FileRaii fileGuard2("test_ntuple_merge_real32trunc_in_2.root");
   {
      auto model = RNTupleModel::Create();
      auto field = std::make_unique<RField<float>>("flt");
      field->SetTruncated(24);
      model->AddField(std::move(field));
      auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
      auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int i = 0; i < 10; ++i) {
         *fieldFlt = 10 + i;
         ntuple->Fill();
      }
   }
   {
      // Gather the input sources
      std::vector<std::unique_ptr<RPageSource>> sources;
      sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
      sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
      std::vector<RPageSource *> sourcePtrs;
      for (const auto &s : sources) {
         sourcePtrs.push_back(s.get());
      }

      // Now merge the inputs
      for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
         SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
         FileRaii fileGuardOut("test_ntuple_merge_real32trunc_out.root");
         {
            auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
            RNTupleMerger merger{std::move(destination)};
            RNTupleMergeOptions opts;
            opts.fMergingMode = mmode;
            auto res = merger.Merge(sourcePtrs, opts);
            // Currently we're not supporting merging columns with the same type but different metadata.
            // TODO: support this.
            // Fatal assertion: the next statement dereferences the error, which is only safe if the merge failed.
            ASSERT_FALSE(bool(res));
            EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
         }
      }
   }
}
4284+
4285+
// Checks that merging two inputs whose "flt" field is Real32Quant-encoded with different value ranges is rejected
// with a "have different column metadata" error in every merging mode.
TEST(RNTupleMerger, MergeReal32Quant)
{
   // First input: "flt" quantized with range [0, 100], holding the values 0..9.
   FileRaii inputGuardA("test_ntuple_merge_real32quant_in_1.root");
   {
      auto fltField = std::make_unique<RField<float>>("flt");
      fltField->SetQuantized(20, {0., 100.});
      auto writeModel = RNTupleModel::Create();
      writeModel->AddField(std::move(fltField));
      auto writer = RNTupleWriter::Recreate(std::move(writeModel), "ntuple", inputGuardA.GetPath());
      auto pFlt = writer->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int value = 0; value < 10; ++value) {
         *pFlt = value;
         writer->Fill();
      }
   }

   // Second input: same field and bit width, but range [-100, 100], holding the values 10..19.
   FileRaii inputGuardB("test_ntuple_merge_real32quant_in_2.root");
   {
      auto fltField = std::make_unique<RField<float>>("flt");
      fltField->SetQuantized(20, {-100., 100.});
      auto writeModel = RNTupleModel::Create();
      writeModel->AddField(std::move(fltField));
      auto writer = RNTupleWriter::Recreate(std::move(writeModel), "ntuple", inputGuardB.GetPath());
      auto pFlt = writer->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int value = 10; value < 20; ++value) {
         *pFlt = value;
         writer->Fill();
      }
   }

   // Open both inputs as page sources; the merger takes non-owning pointers to them.
   std::vector<std::unique_ptr<RPageSource>> inputs;
   inputs.push_back(RPageSource::Create("ntuple", inputGuardA.GetPath(), RNTupleReadOptions()));
   inputs.push_back(RPageSource::Create("ntuple", inputGuardB.GetPath(), RNTupleReadOptions()));
   std::vector<RPageSource *> rawInputs;
   for (const auto &input : inputs)
      rawInputs.push_back(input.get());

   // Try the merge once per merging mode; each attempt writes to a fresh output file.
   const ENTupleMergingMode allModes[] = {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict,
                                          ENTupleMergingMode::kUnion};
   for (const auto mode : allModes) {
      SCOPED_TRACE(std::string("with merging mode = ") + ToString(mode));
      FileRaii outputGuard("test_ntuple_merge_real32quant_out.root");

      RNTupleMerger merger{std::make_unique<RPageSinkFile>("ntuple", outputGuard.GetPath(), RNTupleWriteOptions())};
      RNTupleMergeOptions mergeOpts;
      mergeOpts.fMergingMode = mode;
      auto res = merger.Merge(rawInputs, mergeOpts);
      // Merging columns of the same type but with different metadata is not supported yet.
      // TODO: support this.
      ASSERT_FALSE(bool(res));
      EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
   }
}
4342+
4343+
// Checks that two inputs whose "flt" field uses different column types (Real32Trunc in the first input,
// Real32Quant in the second) can be merged in every merging mode and that the merged values read back correctly.
TEST(RNTupleMerger, MergeReal32TruncQuantMixed)
{
   // Merge two files, both containing the same field, but with the first being Real32Trunc and the second Real32Quant
   // First input: "flt" truncated to 24 bits, holding the values 0..9.
   FileRaii fileGuard1("test_ntuple_merge_real32truncquant_in_1.root");
   {
      auto model = RNTupleModel::Create();
      auto field = std::make_unique<RField<float>>("flt");
      field->SetTruncated(24);
      model->AddField(std::move(field));
      auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
      auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int i = 0; i < 10; ++i) {
         *fieldFlt = i;
         ntuple->Fill();
      }
   }
   // Second input: "flt" quantized with range [-1, 100], holding the values 10..19.
   FileRaii fileGuard2("test_ntuple_merge_real32truncquant_in_2.root");
   {
      auto model = RNTupleModel::Create();
      auto field = std::make_unique<RField<float>>("flt");
      field->SetQuantized(20, {-1., 100.});
      model->AddField(std::move(field));
      auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
      auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int i = 0; i < 10; ++i) {
         *fieldFlt = 10 + i;
         ntuple->Fill();
      }
   }
   {
      // Gather the input sources
      std::vector<std::unique_ptr<RPageSource>> sources;
      sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
      sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
      std::vector<RPageSource *> sourcePtrs;
      for (const auto &s : sources) {
         sourcePtrs.push_back(s.get());
      }

      // Now merge the inputs
      for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
         SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
         FileRaii fileGuardOut("test_ntuple_merge_real32truncquant_out.root");
         {
            auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
            RNTupleMerger merger{std::move(destination)};
            RNTupleMergeOptions opts;
            opts.fMergingMode = mmode;
            auto res = merger.Merge(sourcePtrs, opts);
            // Fatal assertion: reading back the output below is pointless if the merge failed.
            // The streamed report is only evaluated when the assertion fails, i.e. when an error is present.
            ASSERT_TRUE(bool(res)) << res.GetError()->GetReport();
         }
         {
            auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath());
            // Unsigned literals avoid a signed/unsigned comparison inside EXPECT_EQ.
            EXPECT_EQ(reader->GetNEntries(), 20u);
            // Presumably one physical column per column representation of "flt" (one per input) -- confirm.
            EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2u);
            auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr<float>("flt");
            // The inputs were filled with 0..9 and 10..19, so entry i is expected to hold (approximately) i.
            for (auto i : reader->GetEntryRange()) {
               reader->LoadEntry(i);
               EXPECT_NEAR(*pFlt, i, 0.01f);
            }
         }
      }
   }
}

0 commit comments

Comments
 (0)