Skip to content

Commit 5ab7637

Browse files
committed
[ntuple] support merging columns with metadata (with different types)
1 parent 353fe6f commit 5ab7637

4 files changed

Lines changed: 211 additions & 14 deletions

File tree

tree/ntuple/inc/ROOT/RPageStorage.hxx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,10 +543,16 @@ public:
543543
[[nodiscard]] std::unique_ptr<RNTupleModel>
544544
InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters);
545545

546+
struct RColumnReprElement {
547+
ENTupleColumnType fType = ENTupleColumnType::kUnknown;
548+
// 0 means "use default". Only valid for fixed-bitwidth column types.
549+
std::uint16_t fBitWidth = 0;
550+
std::optional<RColumnDescriptor::RValueRange> fValueRange;
551+
};
546552
/// Adds a new column representation to the given field.
547553
/// \return The physical id of the first newly added column.
548554
ROOT::DescriptorId_t
549-
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const ENTupleColumnType> newRepresentation);
555+
AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span<const RColumnReprElement> newRepresentation);
550556

551557
/// Adds a new alias column pointing to an existing column with the given physical id to the given field.
552558
void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field,

tree/ntuple/src/RNTupleMerger.cxx

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -342,10 +342,10 @@ struct RColReprMapping {
342342
};
343343

344344
/// A column extension that needs to be added to an output field.
345-
/// Note that this also adds a mapping for the new representation, which is why this inherits RColReprMapping.
345+
/// Note that this also adds a mapping for each new representation, which is why it inherits RColReprMapping.
346346
struct RColReprExtension : RColReprMapping {
347-
/// The new representation to be added
348-
ROOT::RFieldBase::ColumnRepresentation_t fSourceRepr;
347+
/// The new representations to be added
348+
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> fSourceRepr;
349349
};
350350

351351
static std::optional<std::uint32_t>
@@ -532,11 +532,15 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c
532532
} else if (matchingRepr < 0) {
533533
// this representation was not found in the destination
534534
assert(dstNColReprs < std::numeric_limits<std::uint32_t>::max());
535-
ROOT::RFieldBase::ColumnRepresentation_t newRepr;
535+
std::vector<ROOT::Internal::RPagePersistentSink::RColumnReprElement> newRepr;
536+
newRepr.reserve(srcColCardinality);
536537
for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) {
537538
const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx];
538539
const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId);
539-
newRepr.push_back(srcCol.GetType());
540+
auto &reprElement = newRepr.emplace_back();
541+
reprElement.fType = srcCol.GetType();
542+
reprElement.fBitWidth = srcCol.GetBitsOnStorage();
543+
reprElement.fValueRange = srcCol.GetValueRange();
540544
}
541545
RColReprExtension extension{{srcReprIdx, static_cast<std::uint32_t>(dstNColReprs)}, newRepr};
542546
result.fColReprExtensions[&dstField].push_back(extension);

tree/ntuple/src/RPageStorage.cxx

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1146,7 +1146,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr
11461146

11471147
ROOT::DescriptorId_t
11481148
ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field,
1149-
std::span<const ENTupleColumnType> newRepresentation)
1149+
std::span<const RColumnReprElement> newRepresentation)
11501150
{
11511151
const auto &descriptor = fDescriptorBuilder.GetDescriptor();
11521152

@@ -1162,10 +1162,15 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD
11621162
fDescriptorBuilder.ShiftAliasColumns(newRepresentation.size());
11631163

11641164
std::uint16_t columnIndex = 0; // index into the representation
1165-
for (auto columnType : newRepresentation) {
1166-
// Extending columns with variable bit width is currently unsupported.
1167-
const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnType);
1168-
R__ASSERT(rangeMin == rangeMax);
1165+
for (auto columnRepr : newRepresentation) {
1166+
std::size_t bitsOnStorage = columnRepr.fBitWidth;
1167+
if (!bitsOnStorage) {
1168+
const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnRepr.fType);
1169+
if (rangeMin != rangeMax) {
1170+
throw ROOT::RException(R__FAIL("bit width must be given for columns of variable bit width"));
1171+
}
1172+
bitsOnStorage = rangeMin;
1173+
}
11691174

11701175
const ROOT::DescriptorId_t firstReprColumnId = field.GetLogicalColumnIds()[columnIndex];
11711176
const auto &firstReprColumnRange = fOpenColumnRanges.at(firstReprColumnId);
@@ -1175,12 +1180,13 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD
11751180
columnBuilder.LogicalColumnId(columnId)
11761181
.PhysicalColumnId(columnId)
11771182
.FieldId(field.GetId())
1178-
.BitsOnStorage(rangeMax)
1179-
.Type(columnType)
1183+
.BitsOnStorage(bitsOnStorage)
1184+
.Type(columnRepr.fType)
11801185
.Index(columnIndex)
11811186
// NOTE: marking this column as suppressed with the minus sign
11821187
.FirstElementIndex(-firstReprColumnRange.GetFirstElementIndex())
1183-
.RepresentationIndex(reprIndex);
1188+
.RepresentationIndex(reprIndex)
1189+
.ValueRange(columnRepr.fValueRange);
11841190
fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap());
11851191

11861192
for (auto parentId = field.GetParentId(); parentId != ROOT::kInvalidDescriptorId;) {

tree/ntuple/test/ntuple_merger.cxx

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4225,3 +4225,184 @@ TEST(RNTupleMerger, MergeNewerVersion)
42254225
}
42264226
}
42274227
}
4228+
4229+
TEST(RNTupleMerger, MergeReal32Trunc)
{
   // Merge two files, both containing the same Real32Trunc-encoded field, but with different bit widths
   // (14 bits in the first input, 24 bits in the second). The merge is expected to be rejected.
   FileRaii fileGuard1("test_ntuple_merge_real32trunc_in_1.root");
   {
      // First input: field "flt" truncated to 14 bits, filled with the values 0..9.
      auto model = RNTupleModel::Create();
      auto field = std::make_unique<RField<float>>("flt");
      field->SetTruncated(14);
      model->AddField(std::move(field));
      auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath());
      auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int i = 0; i < 10; ++i) {
         *fieldFlt = i;
         ntuple->Fill();
      }
   }
   FileRaii fileGuard2("test_ntuple_merge_real32trunc_in_2.root");
   {
      // Second input: same field name, truncated to 24 bits, filled with the values 10..19.
      auto model = RNTupleModel::Create();
      auto field = std::make_unique<RField<float>>("flt");
      field->SetTruncated(24);
      model->AddField(std::move(field));
      auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath());
      auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int i = 0; i < 10; ++i) {
         *fieldFlt = 10 + i;
         ntuple->Fill();
      }
   }
   {
      // Gather the input sources
      std::vector<std::unique_ptr<RPageSource>> sources;
      sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions()));
      sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions()));
      std::vector<RPageSource *> sourcePtrs;
      for (const auto &s : sources) {
         sourcePtrs.push_back(s.get());
      }

      // Now merge the inputs
      for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) {
         SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode));
         FileRaii fileGuardOut("test_ntuple_merge_real32trunc_out.root");
         {
            auto destination = std::make_unique<RPageSinkFile>("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions());
            RNTupleMerger merger{std::move(destination)};
            RNTupleMergeOptions opts;
            opts.fMergingMode = mmode;
            auto res = merger.Merge(sourcePtrs, opts);
            // Currently we're not supporting merging columns with the same type but different metadata.
            // TODO: support this.
            // Fatal assertion on purpose: the error object dereferenced below is only expected to exist
            // when the merge failed, so we must not continue if it unexpectedly succeeded.
            ASSERT_FALSE(bool(res));
            EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
         }
      }
   }
}
4286+
4287+
TEST(RNTupleMerger, MergeReal32Quant)
{
   // Both inputs hold the same Real32Quant-encoded field "flt" with 20 bits, but their value ranges
   // differ ([0, 100] vs. [-100, 100]). Merging them is expected to fail in every merging mode.
   FileRaii firstInput("test_ntuple_merge_real32quant_in_1.root");
   {
      // First input: values 0..9
      auto quantField = std::make_unique<RField<float>>("flt");
      quantField->SetQuantized(20, {0., 100.});
      auto inputModel = RNTupleModel::Create();
      inputModel->AddField(std::move(quantField));
      auto writer = RNTupleWriter::Recreate(std::move(inputModel), "ntuple", firstInput.GetPath());
      auto fltPtr = writer->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int value = 0; value != 10; ++value) {
         *fltPtr = value;
         writer->Fill();
      }
   }
   FileRaii secondInput("test_ntuple_merge_real32quant_in_2.root");
   {
      // Second input: values 10..19
      auto quantField = std::make_unique<RField<float>>("flt");
      quantField->SetQuantized(20, {-100., 100.});
      auto inputModel = RNTupleModel::Create();
      inputModel->AddField(std::move(quantField));
      auto writer = RNTupleWriter::Recreate(std::move(inputModel), "ntuple", secondInput.GetPath());
      auto fltPtr = writer->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int value = 10; value != 20; ++value) {
         *fltPtr = value;
         writer->Fill();
      }
   }
   {
      // Open both inputs; the merger works on non-owning pointers to the page sources.
      std::vector<std::unique_ptr<RPageSource>> ownedSources;
      for (auto *input : {&firstInput, &secondInput}) {
         ownedSources.push_back(RPageSource::Create("ntuple", input->GetPath(), RNTupleReadOptions()));
      }
      std::vector<RPageSource *> rawSources;
      rawSources.reserve(ownedSources.size());
      for (const auto &owned : ownedSources) {
         rawSources.push_back(owned.get());
      }

      // Run the merge once per merging mode, each time into a fresh output file.
      const ENTupleMergingMode allModes[] = {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict,
                                             ENTupleMergingMode::kUnion};
      for (const auto mode : allModes) {
         SCOPED_TRACE(std::string("with merging mode = ") + ToString(mode));
         FileRaii outputFile("test_ntuple_merge_real32quant_out.root");
         {
            RNTupleMergeOptions mergeOpts;
            mergeOpts.fMergingMode = mode;
            RNTupleMerger merger{
               std::make_unique<RPageSinkFile>("ntuple", outputFile.GetPath(), RNTupleWriteOptions())};
            auto res = merger.Merge(rawSources, mergeOpts);
            // Same column type with different metadata is not mergeable yet.
            // TODO: support this.
            ASSERT_FALSE(bool(res));
            EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata"));
         }
      }
   }
}
4344+
4345+
TEST(RNTupleMerger, MergeReal32TruncQuantMixed)
{
   // The two inputs share the field "flt", but the first stores it as Real32Trunc (24 bits) and the
   // second as Real32Quant (20 bits, range [-1, 100]). The merge is expected to succeed and the
   // merged ntuple must give back all 20 values.
   FileRaii truncInput("test_ntuple_merge_real32truncquant_in_1.root");
   {
      // Truncated input: values 0..9
      auto truncField = std::make_unique<RField<float>>("flt");
      truncField->SetTruncated(24);
      auto inputModel = RNTupleModel::Create();
      inputModel->AddField(std::move(truncField));
      auto writer = RNTupleWriter::Recreate(std::move(inputModel), "ntuple", truncInput.GetPath());
      auto fltPtr = writer->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int value = 0; value != 10; ++value) {
         *fltPtr = value;
         writer->Fill();
      }
   }
   FileRaii quantInput("test_ntuple_merge_real32truncquant_in_2.root");
   {
      // Quantized input: values 10..19
      auto quantField = std::make_unique<RField<float>>("flt");
      quantField->SetQuantized(20, {-1., 100.});
      auto inputModel = RNTupleModel::Create();
      inputModel->AddField(std::move(quantField));
      auto writer = RNTupleWriter::Recreate(std::move(inputModel), "ntuple", quantInput.GetPath());
      auto fltPtr = writer->GetModel().GetDefaultEntry().GetPtr<float>("flt");
      for (int value = 10; value != 20; ++value) {
         *fltPtr = value;
         writer->Fill();
      }
   }
   {
      // Open both inputs; the merger works on non-owning pointers to the page sources.
      std::vector<std::unique_ptr<RPageSource>> ownedSources;
      for (auto *input : {&truncInput, &quantInput}) {
         ownedSources.push_back(RPageSource::Create("ntuple", input->GetPath(), RNTupleReadOptions()));
      }
      std::vector<RPageSource *> rawSources;
      rawSources.reserve(ownedSources.size());
      for (const auto &owned : ownedSources) {
         rawSources.push_back(owned.get());
      }

      // Run the merge once per merging mode, each time into a fresh output file.
      const ENTupleMergingMode allModes[] = {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict,
                                             ENTupleMergingMode::kUnion};
      for (const auto mode : allModes) {
         SCOPED_TRACE(std::string("with merging mode = ") + ToString(mode));
         FileRaii outputFile("test_ntuple_merge_real32truncquant_out.root");
         {
            RNTupleMergeOptions mergeOpts;
            mergeOpts.fMergingMode = mode;
            RNTupleMerger merger{
               std::make_unique<RPageSinkFile>("ntuple", outputFile.GetPath(), RNTupleWriteOptions())};
            auto res = merger.Merge(rawSources, mergeOpts);
            EXPECT_TRUE(bool(res));
         }
         {
            // Read the merged file back: 20 entries spread over 2 physical columns,
            // with entry i holding (approximately) the value i.
            auto reader = ROOT::RNTupleReader::Open("ntuple", outputFile.GetPath());
            EXPECT_EQ(reader->GetNEntries(), 20);
            EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2);
            auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr<float>("flt");
            for (auto i : reader->GetEntryRange()) {
               reader->LoadEntry(i);
               EXPECT_NEAR(*pFlt, i, 0.01f);
            }
         }
      }
   }
}

0 commit comments

Comments
 (0)