Skip to content

Commit 44c939d

Browse files
committed
[ntuple] support merging columns with metadata (with different types)
1 parent 8fcbcfa commit 44c939d

4 files changed

Lines changed: 209 additions & 12 deletions

File tree

tree/ntuple/inc/ROOT/RPageStorage.hxx

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -544,10 +544,16 @@ public:
544544
[[nodiscard]] std::unique_ptr<RNTupleModel>
545545
InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters);
546546

547+
/// Describes a single column of a column representation handed to AddColumnRepresentation():
/// the column type plus optional storage metadata (bit width and value range).
struct RColumnReprElement {
548+
ENTupleColumnType fType = ENTupleColumnType::kUnknown; ///< Column type of this element; kUnknown until set
549+
// 0 means "use default", which is only valid for fixed-bitwidth column types.
550+
std::uint16_t fBitWidth = 0;
551+
std::optional<RColumnDescriptor::RValueRange> fValueRange; ///< Value range of the column; empty unless explicitly set
552+
};
547553
/// Adds a new column representation to the given field.
548554
/// \return The physical id of the first newly added column.
549555
ROOT::DescriptorId_t
550-
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const ENTupleColumnType> newRepresentation);
556+
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const RColumnReprElement> newRepresentation);
551557

552558
/// Adds a new alias column pointing to an existing column with the given physical id to the given field.
553559
void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field,

tree/ntuple/src/RNTupleMerger.cxx

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -337,7 +337,7 @@ struct RColReprMapping {
337337
};
338338

339339
/// A column representation mapping for a source representation that was not found in the destination,
/// together with the description of the source columns that make up that representation.
struct RColReprExtension : RColReprMapping {
340-
ROOT::RFieldBase::ColumnRepresentation_t fSourceRepr;
340+
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> fSourceRepr; ///< One element (type, bit width, value range) per source column
341341
};
342342

343343
static std::optional<std::uint32_t>
@@ -636,11 +636,15 @@ CompareDescriptorStructure(const ROOT::RNTupleDescriptor &dst, const ROOT::RNTup
636636
} else if (matchingRepr < 0) {
637637
// this representation was not found in the destination
638638
assert(dstNColReprs < std::numeric_limits<std::uint32_t>::max());
639-
ROOT::RFieldBase::ColumnRepresentation_t newRepr;
639+
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> newRepr;
640+
newRepr.reserve(srcColCardinality);
640641
for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) {
641642
const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx];
642643
const auto &srcCol = src.GetColumnDescriptor(srcColId);
643-
newRepr.push_back(srcCol.GetType());
644+
auto &reprElement = newRepr.emplace_back();
645+
reprElement.fType = srcCol.GetType();
646+
reprElement.fBitWidth = srcCol.GetBitsOnStorage();
647+
reprElement.fValueRange = srcCol.GetValueRange();
644648
}
645649
RColReprExtension extension{{srcReprIdx, static_cast<std::uint32_t>(dstNColReprs)}, newRepr};
646650
res.fColReprExtensions[field.fDst].push_back(extension);

tree/ntuple/src/RPageStorage.cxx

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1051,7 +1051,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr
10511051

10521052
ROOT::DescriptorId_t
10531053
ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field,
1054-
std::span<const ENTupleColumnType> newRepresentation)
1054+
std::span<const RColumnReprElement> newRepresentation)
10551055
{
10561056
const auto &descriptor = fDescriptorBuilder.GetDescriptor();
10571057

@@ -1067,10 +1067,15 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD
10671067
fDescriptorBuilder.ShiftAliasColumns(newRepresentation.size());
10681068

10691069
std::uint16_t columnIndex = 0; // index into the representation
1070-
for (auto columnType : newRepresentation) {
1071-
// Extending columns with variable bit width is currently unsupported.
1072-
const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnType);
1073-
R__ASSERT(rangeMin == rangeMax);
1070+
for (auto columnRepr : newRepresentation) {
1071+
std::size_t bitsOnStorage = columnRepr.fBitWidth;
1072+
if (!bitsOnStorage) {
1073+
const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnRepr.fType);
1074+
if (rangeMin != rangeMax) {
1075+
throw ROOT::RException(R__FAIL("bit width must be given for columns of variable bit width"));
1076+
}
1077+
bitsOnStorage = rangeMin;
1078+
}
10741079

10751080
const ROOT::DescriptorId_t firstReprColumnId = field.GetLogicalColumnIds()[columnIndex];
10761081
const auto &firstReprColumnRange = fOpenColumnRanges.at(firstReprColumnId);
@@ -1080,12 +1085,13 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD
10801085
columnBuilder.LogicalColumnId(columnId)
10811086
.PhysicalColumnId(columnId)
10821087
.FieldId(field.GetId())
1083-
.BitsOnStorage(rangeMax)
1084-
.Type(columnType)
1088+
.BitsOnStorage(bitsOnStorage)
1089+
.Type(columnRepr.fType)
10851090
.Index(columnIndex)
10861091
// NOTE: marking this column as suppressed with the minus sign
10871092
.FirstElementIndex(-firstReprColumnRange.GetFirstElementIndex())
1088-
.RepresentationIndex(reprIndex);
1093+
.RepresentationIndex(reprIndex)
1094+
.ValueRange(columnRepr.fValueRange);
10891095
fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap());
10901096

10911097
for (auto parentId = field.GetParentId(); parentId != ROOT::kInvalidDescriptorId;) {

tree/ntuple/test/ntuple_merger.cxx

Lines changed: 181 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4136,3 +4136,184 @@ TEST(RNTupleMerger, MergeNewerVersion)
41364136
}
41374137
}
41384138
}
4139+
4140+
TEST(RNTupleMerger, MergeReal32Trunc)
4141+
{
4142+
// Merge two files, both containing the same Real32Trunc-encoded field, but with different bit widths.
4143+
FileRaii fileGuard1("test_ntuple_merge_real32trunc_in_1.root");
4144+
{
4145+
auto model = RNTupleModel::Create();
4146+
auto field = std::make_unique<RField<float>>("flt");
4147+
field->SetTruncated(14); // first input: truncated with argument 14
4148+
model->AddField(std::move(field));
4149+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
4150+
auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4151+
for (int i = 0; i < 10; ++i) {
4152+
*fieldFlt = i;
4153+
ntuple->Fill();
4154+
}
4155+
}
4156+
FileRaii fileGuard2("test_ntuple_merge_real32trunc_in_2.root");
4157+
{
4158+
auto model = RNTupleModel::Create();
4159+
auto field = std::make_unique<RField<float>>("flt");
4160+
field->SetTruncated(24); // second input: same field, but truncated with argument 24
4161+
model->AddField(std::move(field));
4162+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
4163+
auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4164+
for (int i = 0; i < 10; ++i) {
4165+
*fieldFlt = 10 + i;
4166+
ntuple->Fill();
4167+
}
4168+
}
4169+
{
4170+
// Gather the input sources
4171+
std::vector<std::unique_ptr<RPageSource>> sources;
4172+
sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
4173+
sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
4174+
std::vector<RPageSource *> sourcePtrs;
4175+
for (const auto &s : sources) {
4176+
sourcePtrs.push_back(s.get());
4177+
}
4178+
4179+
// Now merge the inputs; the merge is expected to fail in every merging mode
4180+
for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
4181+
SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
4182+
FileRaii fileGuardOut("test_ntuple_merge_real32trunc_out.root");
4183+
{
4184+
auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
4185+
RNTupleMerger merger{std::move(destination)};
4186+
RNTupleMergeOptions opts;
4187+
opts.fMergingMode = mmode;
4188+
auto res = merger.Merge(sourcePtrs, opts);
4189+
// Currently we're not supporting merging columns with the same type but different metadata.
4190+
// TODO: support this.
4191+
ASSERT_FALSE(bool(res)); // fatal on purpose: the next line dereferences res.GetError(), presumably null on success
4192+
EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
4193+
}
4194+
}
4195+
}
4196+
}
4197+
4198+
TEST(RNTupleMerger, MergeReal32Quant)
4199+
{
4200+
// Merge two files, both containing the same Real32Quant-encoded field, but with different value ranges.
4201+
FileRaii fileGuard1("test_ntuple_merge_real32quant_in_1.root");
4202+
{
4203+
auto model = RNTupleModel::Create();
4204+
auto field = std::make_unique<RField<float>>("flt");
4205+
field->SetQuantized(20, {0., 100.}); // first input: value range [0, 100]
4206+
model->AddField(std::move(field));
4207+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
4208+
auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4209+
for (int i = 0; i < 10; ++i) {
4210+
*fieldFlt = i;
4211+
ntuple->Fill();
4212+
}
4213+
}
4214+
FileRaii fileGuard2("test_ntuple_merge_real32quant_in_2.root");
4215+
{
4216+
auto model = RNTupleModel::Create();
4217+
auto field = std::make_unique<RField<float>>("flt");
4218+
field->SetQuantized(20, {-100., 100.}); // second input: same first argument (20), but value range [-100, 100]
4219+
model->AddField(std::move(field));
4220+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
4221+
auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4222+
for (int i = 0; i < 10; ++i) {
4223+
*fieldFlt = 10 + i;
4224+
ntuple->Fill();
4225+
}
4226+
}
4227+
{
4228+
// Gather the input sources
4229+
std::vector<std::unique_ptr<RPageSource>> sources;
4230+
sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
4231+
sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
4232+
std::vector<RPageSource *> sourcePtrs;
4233+
for (const auto &s : sources) {
4234+
sourcePtrs.push_back(s.get());
4235+
}
4236+
4237+
// Now merge the inputs; the merge is expected to fail in every merging mode
4238+
for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
4239+
SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
4240+
FileRaii fileGuardOut("test_ntuple_merge_real32quant_out.root");
4241+
{
4242+
auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
4243+
RNTupleMerger merger{std::move(destination)};
4244+
RNTupleMergeOptions opts;
4245+
opts.fMergingMode = mmode;
4246+
auto res = merger.Merge(sourcePtrs, opts);
4247+
// Currently we're not supporting merging columns with the same type but different metadata.
4248+
// TODO: support this.
4249+
ASSERT_FALSE(bool(res)); // fatal assertion: the error report is only inspected once the merge is known to have failed
4250+
EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
4251+
}
4252+
}
4253+
}
4254+
}
4255+
4256+
TEST(RNTupleMerger, MergeReal32TruncQuantMixed)
4257+
{
4258+
// Merge two files, both containing the same field, but with the first being Real32Trunc and the second Real32Quant
4259+
FileRaii fileGuard1("test_ntuple_merge_real32truncquant_in_1.root");
4260+
{
4261+
auto model = RNTupleModel::Create();
4262+
auto field = std::make_unique<RField<float>>("flt");
4263+
field->SetTruncated(24); // first input: truncated encoding
4264+
model->AddField(std::move(field));
4265+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
4266+
auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4267+
for (int i = 0; i < 10; ++i) {
4268+
*fieldFlt = i;
4269+
ntuple->Fill();
4270+
}
4271+
}
4272+
FileRaii fileGuard2("test_ntuple_merge_real32truncquant_in_2.root");
4273+
{
4274+
auto model = RNTupleModel::Create();
4275+
auto field = std::make_unique<RField<float>>("flt");
4276+
field->SetQuantized(20, {-1., 100.}); // second input: quantized encoding with value range [-1, 100]
4277+
model->AddField(std::move(field));
4278+
auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
4279+
auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4280+
for (int i = 0; i < 10; ++i) {
4281+
*fieldFlt = 10 + i;
4282+
ntuple->Fill();
4283+
}
4284+
}
4285+
{
4286+
// Gather the input sources
4287+
std::vector<std::unique_ptr<RPageSource>> sources;
4288+
sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
4289+
sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
4290+
std::vector<RPageSource *> sourcePtrs;
4291+
for (const auto &s : sources) {
4292+
sourcePtrs.push_back(s.get());
4293+
}
4294+
4295+
// Now merge the inputs; unlike the same-type cases, this merge must succeed in every merging mode
4296+
for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
4297+
SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
4298+
FileRaii fileGuardOut("test_ntuple_merge_real32truncquant_out.root");
4299+
{
4300+
auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
4301+
RNTupleMerger merger{std::move(destination)};
4302+
RNTupleMergeOptions opts;
4303+
opts.fMergingMode = mmode;
4304+
auto res = merger.Merge(sourcePtrs, opts);
4305+
ASSERT_TRUE(bool(res)); // fatal on purpose: reading the output back below is meaningless if the merge failed
4306+
}
4307+
{
4308+
auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath());
4309+
EXPECT_EQ(reader->GetNEntries(), 20); // 10 entries from each input
4310+
EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2); // presumably one physical column per representation
4311+
auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr<float>("flt");
4312+
for (auto i : reader->GetEntryRange()) {
4313+
reader->LoadEntry(i);
4314+
EXPECT_NEAR(*pFlt, i, 0.01f); // inputs hold 0..9 and 10..19, so entry i should read back as roughly i
4315+
}
4316+
}
4317+
}
4318+
}
4319+
}

0 commit comments

Comments
 (0)