From 92ef8ffbe5d17b538bacc4db181e1181df119dbc Mon Sep 17 00:00:00 2001 From: silverweed Date: Tue, 28 Apr 2026 11:44:23 +0200 Subject: [PATCH 01/17] [ntuple] Fix RFieldFundamental::GenerateColumns using the wrong repr idx --- .../inc/ROOT/RField/RFieldFundamental.hxx | 4 +-- tree/ntuple/test/ntuple_multi_column.cxx | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx b/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx index 9caf202bd0171..cc0033ea523ee 100644 --- a/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx +++ b/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx @@ -399,11 +399,11 @@ protected: fAvailableColumns.emplace_back(ROOT::Internal::RColumn::Create(onDiskTypes[0], 0, representationIndex)); if (onDiskTypes[0] == ROOT::ENTupleColumnType::kReal32Trunc) { const auto &fdesc = desc.GetFieldDescriptor(Base::GetOnDiskId()); - const auto &coldesc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[0]); + const auto &coldesc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[representationIndex]); column->SetBitsOnStorage(coldesc.GetBitsOnStorage()); } else if (onDiskTypes[0] == ROOT::ENTupleColumnType::kReal32Quant) { const auto &fdesc = desc.GetFieldDescriptor(Base::GetOnDiskId()); - const auto &coldesc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[0]); + const auto &coldesc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[representationIndex]); assert(coldesc.GetValueRange().has_value()); const auto [valMin, valMax] = *coldesc.GetValueRange(); column->SetBitsOnStorage(coldesc.GetBitsOnStorage()); diff --git a/tree/ntuple/test/ntuple_multi_column.cxx b/tree/ntuple/test/ntuple_multi_column.cxx index 7a4e6bb6eb4a4..4cc69ec5cfeff 100644 --- a/tree/ntuple/test/ntuple_multi_column.cxx +++ b/tree/ntuple/test/ntuple_multi_column.cxx @@ -377,6 +377,36 @@ TEST(RNTuple, MultiColumnRepresentationBulk) EXPECT_FLOAT_EQ(2.0, arr[0]); } +TEST(RNTuple, MultiColumnRepresentationVariableBitWidth) +{ + FileRaii fileGuard("test_ntuple_multi_column_representation_varbitwidth.root"); + + { + auto model = RNTupleModel::Create(); + auto fldPx = std::make_unique>("px"); + fldPx->SetTruncated(26); + fldPx->SetColumnRepresentatives({{ROOT::ENTupleColumnType::kReal32}, {ROOT::ENTupleColumnType::kReal32Trunc}}); + model->AddField(std::move(fldPx)); + auto ptrPx = model->GetDefaultEntry().GetPtr("px"); + auto writer = RNTupleWriter::Recreate(std::move(model), "ntpl", fileGuard.GetPath()); + *ptrPx = 1.0; + writer->Fill(); + writer->CommitCluster(); + ROOT::Internal::RFieldRepresentationModifier::SetPrimaryColumnRepresentation( + const_cast(writer->GetModel().GetConstField("px")), 1); + *ptrPx = 2.0; + writer->Fill(); + } + + auto reader = RNTupleReader::Open("ntpl", fileGuard.GetPath()); + auto fldPx = reader->GetModel().GetDefaultEntry().GetPtr("px"); + + reader->LoadEntry(0); + EXPECT_FLOAT_EQ(1.0, *fldPx); + reader->LoadEntry(1); + EXPECT_FLOAT_EQ(2.0, *fldPx); +} + TEST(RNTuple, MultiColumnRepresentationDedup) { FileRaii fileGuard("test_ntuple_multi_column_representation_dedup.root"); From 979768c1a5524df03d97548cdfdb689327f29be2 Mon Sep 17 00:00:00 2001 From: silverweed Date: Tue, 31 Mar 2026 08:39:02 +0200 Subject: [PATCH 02/17] [ntuple] Small improvement in RNTupleMerger Instead of calling continue multiple times in the AddColumnFromField loop, just early return in case of projected fields. --- tree/ntuple/src/RNTupleMerger.cxx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tree/ntuple/src/RNTupleMerger.cxx b/tree/ntuple/src/RNTupleMerger.cxx index 6ff1ad0ac8679..1dc902476a82e 100644 --- a/tree/ntuple/src/RNTupleMerger.cxx +++ b/tree/ntuple/src/RNTupleMerger.cxx @@ -1018,7 +1018,7 @@ RNTupleMerger::MergeSourceClusters(RPageSource &source, std::span &columns, const RO { std::string name = prefix + '.' + srcFieldDesc.GetFieldName(); + // We don't want to try and merge alias columns. Note that subfields of projected fields + // must also be projected, so we don't need to check them. + if (srcFieldDesc.IsProjectedField()) + return; + const auto &columnIds = srcFieldDesc.GetLogicalColumnIds(); columns.reserve(columns.size() + columnIds.size()); // NOTE: here we can match the src and dst columns by column index because we forbid merging fields with // different column representations. for (auto i = 0u; i < srcFieldDesc.GetLogicalColumnIds().size(); ++i) { - // We don't want to try and merge alias columns - if (srcFieldDesc.IsProjectedField()) - continue; - auto srcColumnId = srcFieldDesc.GetLogicalColumnIds()[i]; const auto &srcColumn = srcDesc.GetColumnDescriptor(srcColumnId); From b52908817913f51f0849b5de74d764d9c4feb212 Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 15 Apr 2026 16:08:30 +0200 Subject: [PATCH 03/17] [ntuple] Serializer: sort columns by ID before serializing them We are currently serializing columns per-field, but in case of late column extension this might result in inconsistent sorting of the columns in the serialized footer. e.g. assume you have fields "A" and "B", both late model extended, both with a single column: - col 0 -> field A, repr 0 - col 1 -> field B, repr 0 Now you add a new column representation to field "A"; this new column has id 2: - col 2 -> field A, repr 1 When serializing this RNTuple, all columns are written in the footer by RNTupleSerialize::SerializeColumnsForFields(). Before this change, they would end up on disk in order: [0, 2, 1]. This would corrupt the data by swapping the pages for columns 2 and 1. After this change, they get written as [0, 1, 2] which is the correct order. Note that this exact case is tested in ntuple_merger in the unit test MergeDeferredAdvanced. --- tree/ntuple/src/RNTupleSerialize.cxx | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tree/ntuple/src/RNTupleSerialize.cxx b/tree/ntuple/src/RNTupleSerialize.cxx index 77a63f863c3a0..a3fbeb10402bd 100644 --- a/tree/ntuple/src/RNTupleSerialize.cxx +++ b/tree/ntuple/src/RNTupleSerialize.cxx @@ -291,6 +291,7 @@ ROOT::RResult SerializeColumnsOfFields(const ROOT::RNTupleDescrip const auto *xHeader = !forHeaderExtension ? desc.GetHeaderExtension() : nullptr; + std::vector columnsToSerialize; for (auto parentId : fieldList) { // If we're serializing the non-extended header and we already have a header extension (which may happen if // we load an RNTuple for incremental merging), we need to skip all the extended fields, as they need to be @@ -301,12 +302,22 @@ ROOT::RResult SerializeColumnsOfFields(const ROOT::RNTupleDescrip for (const auto &c : desc.GetColumnIterable(parentId)) { if (c.IsAliasColumn() || (xHeader && xHeader->ContainsExtendedColumnRepresentation(c.GetLogicalId()))) continue; + columnsToSerialize.push_back(&c); + } + } - if (auto res = SerializePhysicalColumn(c, context, *where)) { - pos += res.Unwrap(); - } else { - return R__FORWARD_ERROR(res); - } + // Make sure the columns are sorted by physical ID. + // This is usually the case already, but it may not be true if we have a late-model-extended column in one + // of the fields. + std::sort(columnsToSerialize.begin(), columnsToSerialize.end(), [&context](const auto *a, const auto *b) { + return context.GetOnDiskColumnId(a->GetPhysicalId()) < context.GetOnDiskColumnId(b->GetPhysicalId()); + }); + + for (const auto *c : columnsToSerialize) { + if (auto res = SerializePhysicalColumn(*c, context, *where)) { + pos += res.Unwrap(); + } else { + return R__FORWARD_ERROR(res); } } From 93ada517fee9c96d5f465eee4d803cec5084f379 Mon Sep 17 00:00:00 2001 From: silverweed Date: Fri, 17 Apr 2026 11:08:30 +0200 Subject: [PATCH 04/17] [ntuple] update merger test to make sure we test nRepetitions --- tree/ntuple/test/ntuple_merger.cxx | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tree/ntuple/test/ntuple_merger.cxx b/tree/ntuple/test/ntuple_merger.cxx index de27ead34d3f9..8e1279e3ed143 100644 --- a/tree/ntuple/test/ntuple_merger.cxx +++ b/tree/ntuple/test/ntuple_merger.cxx @@ -1015,12 +1015,13 @@ TEST(RNTupleMerger, MergeLateModelExtension) { auto model = RNTupleModel::Create(); auto fieldFoo = model->MakeField>("foo"); - auto fieldVfoo = model->MakeField>("vfoo"); + auto fieldVfoo = model->MakeField[3]>("vfoo"); auto fieldBar = model->MakeField("bar"); auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath(), RNTupleWriteOptions()); for (size_t i = 0; i < 10; ++i) { fieldFoo->insert(std::make_pair(std::to_string(i), i * 123)); - *fieldVfoo = {(int)i * 123}; + fieldVfoo[0] = {(int)i * 123}; + fieldVfoo[2] = {(int)i * 345}; *fieldBar = i * 321; ntuple->Fill(); } @@ -1031,14 +1032,15 @@ TEST(RNTupleMerger, MergeLateModelExtension) auto model = RNTupleModel::Create(); auto fieldBaz = model->MakeField("baz"); auto fieldFoo = model->MakeField>("foo"); - auto fieldVfoo = model->MakeField>("vfoo"); + auto fieldVfoo = model->MakeField[3]>("vfoo"); auto wopts = RNTupleWriteOptions(); wopts.SetCompression(0); auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath(), wopts); for (size_t i = 0; i < 10; ++i) { *fieldBaz = i * 567; fieldFoo->insert(std::make_pair(std::to_string(i), i * 765)); - *fieldVfoo = {(int)i * 765}; + fieldVfoo[0] = {(int)i * 765}; + fieldVfoo[2] = {(int)i * 987}; ntuple->Fill(); } } @@ -1072,21 +1074,25 @@ TEST(RNTupleMerger, MergeLateModelExtension) auto ntuple = RNTupleReader::Open("ntuple", fileGuard3.GetPath()); EXPECT_EQ(ntuple->GetNEntries(), 20); auto foo = ntuple->GetModel().GetDefaultEntry().GetPtr>("foo"); - auto vfoo = ntuple->GetModel().GetDefaultEntry().GetPtr>("vfoo"); + auto vfoo = ntuple->GetModel().GetDefaultEntry().GetPtr[3]>("vfoo"); auto bar = ntuple->GetModel().GetDefaultEntry().GetPtr("bar"); auto baz = ntuple->GetModel().GetDefaultEntry().GetPtr("baz"); for (int i = 0; i < 10; ++i) { ntuple->LoadEntry(i); ASSERT_EQ((*foo)[std::to_string(i)], i * 123); - ASSERT_EQ((*vfoo)[0], i * 123); + ASSERT_EQ(vfoo[0][0], i * 123); + ASSERT_EQ(vfoo[2][0], i * 345); + ASSERT_TRUE(vfoo[1].empty()); ASSERT_EQ(*bar, i * 321); ASSERT_EQ(*baz, 0); } for (int i = 10; i < 20; ++i) { ntuple->LoadEntry(i); ASSERT_EQ((*foo)[std::to_string(i - 10)], (i - 10) * 765); - ASSERT_EQ((*vfoo)[0], (i - 10) * 765); + ASSERT_EQ(vfoo[0][0], (i - 10) * 765); + ASSERT_EQ(vfoo[2][0], (i - 10) * 987); + ASSERT_TRUE(vfoo[1].empty()); ASSERT_EQ(*bar, 0); ASSERT_EQ(*baz, (i - 10) * 567); } From e1599a0c4db83de3aab13a61185bcadc88496874 Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 22 Apr 2026 14:47:11 +0200 Subject: [PATCH 05/17] [ntuple] Clarify a bit RFieldBase::EntryToColumnElementIndex Also fix the type of result --- tree/ntuple/inc/ROOT/RFieldBase.hxx | 7 ++++--- tree/ntuple/src/RFieldBase.cxx | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tree/ntuple/inc/ROOT/RFieldBase.hxx b/tree/ntuple/inc/ROOT/RFieldBase.hxx index 6e691a9f98d95..c5ba2a98dba08 100644 --- a/tree/ntuple/inc/ROOT/RFieldBase.hxx +++ b/tree/ntuple/inc/ROOT/RFieldBase.hxx @@ -260,14 +260,15 @@ private: func(target); } - /// Translate an entry index to a column element index of the principal column and vice versa. These functions - /// take into account the role and number of repetitions on each level of the field hierarchy as follows: + /// Translate an entry index to a column element index of the principal column. This function + /// takes into account the role and number of repetitions on each level of the field hierarchy as follows: /// - Top level fields: element index == entry index /// - Record fields propagate their principal column index to the principal columns of direct descendant fields /// - Collection and variant fields set the principal column index of their children to 0 /// /// The column element index also depends on the number of repetitions of each field in the hierarchy, e.g., given a - /// field with type `std::array, 2>`, this function returns 8 for the innermost field. + /// field with type `std::array, 2>`, this function called with `globalIndex == 1` + /// returns 8 for the innermost field. ROOT::NTupleSize_t EntryToColumnElementIndex(ROOT::NTupleSize_t globalIndex) const; /// Flushes data from active columns diff --git a/tree/ntuple/src/RFieldBase.cxx b/tree/ntuple/src/RFieldBase.cxx index e185389f1190a..0e833729882ab 100644 --- a/tree/ntuple/src/RFieldBase.cxx +++ b/tree/ntuple/src/RFieldBase.cxx @@ -683,14 +683,14 @@ void ROOT::RFieldBase::Attach(std::unique_ptr child, std::stri ROOT::NTupleSize_t ROOT::RFieldBase::EntryToColumnElementIndex(ROOT::NTupleSize_t globalIndex) const { - std::size_t result = globalIndex; + ROOT::NTupleSize_t result = globalIndex; for (auto f = this; f != nullptr; f = f->GetParent()) { auto parent = f->GetParent(); if (parent && (parent->GetStructure() == ROOT::ENTupleStructure::kCollection || parent->GetStructure() == ROOT::ENTupleStructure::kVariant)) { return 0U; } - result *= std::max(f->GetNRepetitions(), std::size_t{1U}); + result *= std::max(f->GetNRepetitions(), ROOT::NTupleSize_t{1U}); } return result; } From 1d1d5a351f8f50bc9f1b2f2ca4ea9a2f218efb97 Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 17 Jun 2026 17:02:01 +0200 Subject: [PATCH 06/17] [ntuple] Allow adding duplicate column representations This is necessary to support multiple representations that use the same column type but different metadata (e.g. different bit width on Real32Trunc columns) --- tree/ntuple/src/RFieldBase.cxx | 5 +---- tree/ntuple/test/ntuple_multi_column.cxx | 9 --------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/tree/ntuple/src/RFieldBase.cxx b/tree/ntuple/src/RFieldBase.cxx index 0e833729882ab..b7efc3a527b01 100644 --- a/tree/ntuple/src/RFieldBase.cxx +++ b/tree/ntuple/src/RFieldBase.cxx @@ -850,10 +850,7 @@ void ROOT::RFieldBase::SetColumnRepresentatives(const RColumnRepresentations::Se if (itRepresentative == std::end(validTypes)) throw RException(R__FAIL("invalid column representative")); - // don't add a duplicate representation - if (std::find_if(fColumnRepresentatives.begin(), fColumnRepresentatives.end(), - [&r](const auto &rep) { return r == rep.get(); }) == fColumnRepresentatives.end()) - fColumnRepresentatives.emplace_back(*itRepresentative); + fColumnRepresentatives.emplace_back(*itRepresentative); } } diff --git a/tree/ntuple/test/ntuple_multi_column.cxx b/tree/ntuple/test/ntuple_multi_column.cxx index 4cc69ec5cfeff..963c5fabd9277 100644 --- a/tree/ntuple/test/ntuple_multi_column.cxx +++ b/tree/ntuple/test/ntuple_multi_column.cxx @@ -406,12 +406,3 @@ TEST(RNTuple, MultiColumnRepresentationVariableBitWidth) reader->LoadEntry(1); EXPECT_FLOAT_EQ(2.0, *fldPx); } - -TEST(RNTuple, MultiColumnRepresentationDedup) -{ - FileRaii fileGuard("test_ntuple_multi_column_representation_dedup.root"); - - auto fldPx = RFieldBase::Create("px", "float").Unwrap(); - fldPx->SetColumnRepresentatives({{ROOT::ENTupleColumnType::kReal16}, {ROOT::ENTupleColumnType::kReal16}}); - EXPECT_EQ(fldPx->GetColumnRepresentatives().size(), 1); -} From a88fefe48d1c595e22a4a51228abe77618adf45e Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 22 Apr 2026 14:48:46 +0200 Subject: [PATCH 07/17] [ntuple] Add RClusterDescriptor::TryGetColumnRange --- tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx b/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx index 6a1c35ea264ac..99b09a0cd9998 100644 --- a/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx @@ -514,6 +514,12 @@ public: ROOT::NTupleSize_t GetFirstEntryIndex() const { return fFirstEntryIndex; } ROOT::NTupleSize_t GetNEntries() const { return fNEntries; } const RColumnRange &GetColumnRange(ROOT::DescriptorId_t physicalId) const { return fColumnRanges.at(physicalId); } + const RColumnRange *TryGetColumnRange(ROOT::DescriptorId_t physicalId) const + { + if (auto it = fColumnRanges.find(physicalId); it != fColumnRanges.end()) + return &it->second; + return nullptr; + } const RPageRange &GetPageRange(ROOT::DescriptorId_t physicalId) const { return fPageRanges.at(physicalId); } /// Returns an iterator over pairs { columnId, columnRange }. The iteration order is unspecified. RColumnRangeIterable GetColumnRangeIterable() const; From 669ab512835725f648cdd4e459752386960eb27e Mon Sep 17 00:00:00 2001 From: silverweed Date: Mon, 4 May 2026 14:45:20 +0200 Subject: [PATCH 08/17] [ntuple] Introduce feature flag 0 (nested deferred columns) In code, this feature is represented by an update in the logic of RClusterDescriptorBuilder::AddExtendedColumnRanges(), which now handles properly the case where a column is added in a later cluster and therefore cannot rely on the (NEntries * NRepetitions) calculation, so it instead copies its FirstElementIndex and NElements from the 0th representation (which is guaranteed to have valid numbers for them). --- tree/ntuple/doc/BinaryFormatSpecification.md | 9 +++++++-- tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx | 6 ++++++ tree/ntuple/src/RNTupleDescriptor.cxx | 17 +++++++++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tree/ntuple/doc/BinaryFormatSpecification.md b/tree/ntuple/doc/BinaryFormatSpecification.md index 2a87e514a8752..a04f1520606f4 100644 --- a/tree/ntuple/doc/BinaryFormatSpecification.md +++ b/tree/ntuple/doc/BinaryFormatSpecification.md @@ -1,4 +1,4 @@ -# RNTuple Binary Format Specification 1.0.2.0 +# RNTuple Binary Format Specification 1.1.0.0 ## Versioning Notes @@ -167,7 +167,12 @@ That means that readers need to continue reading feature flags as long as their Readers should gracefully abort reading when they encounter unknown bits set. -At the moment, there are no feature flag bits defined. +Here is the list of all currently-defined feature flags. Note that the flag name is only for informational purposes +and is not normative. + +| Flag Bit | Introduced in | Name | Meaning | +|----------|---------------|-------------------------|----------------------------------------------| +| 0 | 1.1.0.0 | Nested Deferred Columns | Signals that the RNTuple contains at least one deferred column that is part of a collection and was extended
(i.e. it appears in the footer). This can happen when merging two RNTuples that have the same collection field
backed by columns with different encoding, e.g. a `vector` whose elements are represented by SplitReal32
in the first ntuple and by Real32 in the second. | ## Frames diff --git a/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx b/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx index 99b09a0cd9998..7d447e1caf26e 100644 --- a/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleDescriptor.hxx @@ -774,6 +774,12 @@ public: /// All known feature flags. /// Note that the flag values represent the bit _index_, not the already-bitshifted integer. enum EFeatureFlags { + /// Signals that the RNTuple contains at least one deferred column that is part of a collection and was extended + /// (i.e. it appears in the footer). This can happen when merging two RNTuples that have the same collection field + /// backed by columns with different encoding, e.g. a vector whose elements are represented by SplitReal32 + /// in the first ntuple and by Real32 in the second. + /// Added in version 1.1.0.0 of the binary format. + kFeatureFlag_NestedDeferredColumns = 0, // Insert new feature flags here, with contiguous values. If at any point a "hole" appears in the valid feature // flags values, the check in RNTupleSerialize must be updated. diff --git a/tree/ntuple/src/RNTupleDescriptor.cxx b/tree/ntuple/src/RNTupleDescriptor.cxx index b1e8407b86b68..2653460526ff6 100644 --- a/tree/ntuple/src/RNTupleDescriptor.cxx +++ b/tree/ntuple/src/RNTupleDescriptor.cxx @@ -931,8 +931,21 @@ ROOT::Internal::RClusterDescriptorBuilder::AddExtendedColumnRanges(const RNTuple // `ROOT::RFieldBase::EntryToColumnElementIndex()`, i.e. it is a principal column reachable from the // field zero excluding subfields of collection and variant fields. if (c.IsDeferredColumn()) { - columnRange.SetFirstElementIndex(fCluster.GetFirstEntryIndex() * nRepetitions); - columnRange.SetNElements(fCluster.GetNEntries() * nRepetitions); + if (c.GetRepresentationIndex() == 0) { + columnRange.SetFirstElementIndex(fCluster.GetFirstEntryIndex() * nRepetitions); + columnRange.SetNElements(fCluster.GetNEntries() * nRepetitions); + } else { + // Deferred representations which are not the first cannot count on the number of elements being + // equal to Entries * nRepetitions because they might have been added in a later cluster. But they + // can rely on the first representation having the correct FirstElement/NElements (by definition + // the first representation cannot be an "extended" one), therefore they can just copy the value + // from it. + const auto &field = desc.GetFieldDescriptor(fieldId); + const auto firstReprColumnId = field.GetLogicalColumnIds()[c.GetIndex()]; + const auto &firstReprColumnRange = fCluster.fColumnRanges[firstReprColumnId]; + columnRange.SetFirstElementIndex(firstReprColumnRange.GetFirstElementIndex()); + columnRange.SetNElements(firstReprColumnRange.GetNElements()); + } if (!columnRange.IsSuppressed()) { auto &pageRange = fCluster.fPageRanges[physicalId]; pageRange.fPhysicalColumnId = physicalId; From f692f1821960c0a4c0f7c1f9a20a9068f0d1fbbb Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 22 Apr 2026 14:51:04 +0200 Subject: [PATCH 09/17] [ntuple] Add functionality to RPagePersistentSink for column extension - RPagePersistentSink::AddColumnRepresentation - RPagePersistentSink::AddAliasColumn Internal functionality to be used by the Merger. This entails 2 additional changes: - AddExtendedColumnRanges needs to be updated to handle the case where a column representation is added to a field during writing after some clusters have already been written; - ShiftAliasColumns needs to properly shift the ids of extended alias columns when called, otherwise a mismatch may happen when serializing the footer --- tree/ntuple/inc/ROOT/RNTuple.hxx | 4 +- tree/ntuple/inc/ROOT/RPageStorage.hxx | 7 +++ tree/ntuple/src/RNTupleDescriptor.cxx | 8 +++ tree/ntuple/src/RPageStorage.cxx | 87 +++++++++++++++++++++++++++ tree/ntuple/test/ntuple_minifile.cxx | 4 +- 5 files changed, 106 insertions(+), 4 deletions(-) diff --git a/tree/ntuple/inc/ROOT/RNTuple.hxx b/tree/ntuple/inc/ROOT/RNTuple.hxx index ae5a071030394..a127fe9ab241f 100644 --- a/tree/ntuple/inc/ROOT/RNTuple.hxx +++ b/tree/ntuple/inc/ROOT/RNTuple.hxx @@ -76,8 +76,8 @@ class RNTuple final { public: static constexpr std::uint16_t kVersionEpoch = 1; - static constexpr std::uint16_t kVersionMajor = 0; - static constexpr std::uint16_t kVersionMinor = 2; + static constexpr std::uint16_t kVersionMajor = 1; + static constexpr std::uint16_t kVersionMinor = 0; static constexpr std::uint16_t kVersionPatch = 0; /// Returns the RNTuple version in the following form: diff --git a/tree/ntuple/inc/ROOT/RPageStorage.hxx b/tree/ntuple/inc/ROOT/RPageStorage.hxx index 5a0723fd0248d..54fca0e23e50c 100644 --- a/tree/ntuple/inc/ROOT/RPageStorage.hxx +++ b/tree/ntuple/inc/ROOT/RPageStorage.hxx @@ -543,6 +543,13 @@ public: [[nodiscard]] std::unique_ptr InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters); + void + AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span newRepresentation); + + /// Adds a new alias column pointing to an existing column with the given physical id to the given field. + void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field, + ROOT::DescriptorId_t physicalId); + void CommitSuppressedColumn(ColumnHandle_t columnHandle) final; void CommitPage(ColumnHandle_t columnHandle, const ROOT::Internal::RPage &page) final; void CommitSealedPage(ROOT::DescriptorId_t physicalColumnId, const RPageStorage::RSealedPage &sealedPage) final; diff --git a/tree/ntuple/src/RNTupleDescriptor.cxx b/tree/ntuple/src/RNTupleDescriptor.cxx index 2653460526ff6..145dee51c22b3 100644 --- a/tree/ntuple/src/RNTupleDescriptor.cxx +++ b/tree/ntuple/src/RNTupleDescriptor.cxx @@ -1363,6 +1363,14 @@ void ROOT::Internal::RNTupleDescriptorBuilder::ShiftAliasColumns(std::uint32_t o R__ASSERT(fDescriptor.fColumnDescriptors.count(c.fLogicalColumnId) == 0); fDescriptor.fColumnDescriptors.emplace(c.fLogicalColumnId, std::move(c)); } + + // Patch up column ids in the header extension + if (auto &xHeader = fDescriptor.fHeaderExtension) { + for (auto &columnId : xHeader->fExtendedColumnRepresentations) { + if (columnId >= fDescriptor.GetNPhysicalColumns()) + columnId += offset; + } + } } ROOT::RResult ROOT::Internal::RNTupleDescriptorBuilder::AddCluster(RClusterDescriptor &&clusterDesc) diff --git a/tree/ntuple/src/RPageStorage.cxx b/tree/ntuple/src/RPageStorage.cxx index ee3269b2be9a6..03b7767c8b3a3 100644 --- a/tree/ntuple/src/RPageStorage.cxx +++ b/tree/ntuple/src/RPageStorage.cxx @@ -1144,6 +1144,93 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr return model; } +void ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field, + std::span newRepresentation) +{ + const auto &descriptor = fDescriptorBuilder.GetDescriptor(); + + assert(&descriptor.GetFieldDescriptor(field.GetId()) == &field); + assert(!field.IsProjectedField()); + assert(field.GetColumnCardinality() > 0); + assert(!field.GetLogicalColumnIds().empty()); + assert(newRepresentation.size() == field.GetColumnCardinality()); + + const std::size_t firstPhysicalIndex = fDescriptorBuilder.GetDescriptor().GetNPhysicalColumns(); + const std::uint16_t reprIndex = field.GetLogicalColumnIds().size() / field.GetColumnCardinality(); + + fDescriptorBuilder.ShiftAliasColumns(newRepresentation.size()); + + std::uint16_t columnIndex = 0; // index into the representation + for (auto columnType : newRepresentation) { + // Extending columns with variable bit width is currently unsupported. + const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnType); + R__ASSERT(rangeMin == rangeMax); + + const ROOT::DescriptorId_t firstReprColumnId = field.GetLogicalColumnIds()[columnIndex]; + const auto &firstReprColumnRange = fOpenColumnRanges.at(firstReprColumnId); + const ROOT::DescriptorId_t columnId = firstPhysicalIndex + columnIndex; + + RColumnDescriptorBuilder columnBuilder; + columnBuilder.LogicalColumnId(columnId) + .PhysicalColumnId(columnId) + .FieldId(field.GetId()) + .BitsOnStorage(rangeMax) + .Type(columnType) + .Index(columnIndex) + // NOTE: marking this column as suppressed with the minus sign + .FirstElementIndex(-firstReprColumnRange.GetFirstElementIndex()) + .RepresentationIndex(reprIndex); + fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap()); + + for (auto parentId = field.GetParentId(); parentId != ROOT::kInvalidDescriptorId;) { + const ROOT::RFieldDescriptor &parent = descriptor.GetFieldDescriptor(parentId); + if (parent.GetStructure() == ROOT::ENTupleStructure::kCollection || + parent.GetStructure() == ROOT::ENTupleStructure::kVariant) { + fDescriptorBuilder.SetFeature(RNTupleDescriptor::kFeatureFlag_NestedDeferredColumns); + break; + } + parentId = parent.GetParentId(); + } + + ROOT::RClusterDescriptor::RColumnRange columnRange; + columnRange.SetPhysicalColumnId(columnId); + columnRange.SetFirstElementIndex(firstReprColumnRange.GetFirstElementIndex()); + columnRange.SetNElements(0); + columnRange.SetCompressionSettings(GetWriteOptions().GetCompression()); + fOpenColumnRanges.emplace_back(columnRange); + + ROOT::RClusterDescriptor::RPageRange pageRange; + pageRange.SetPhysicalColumnId(columnId); + fOpenPageRanges.emplace_back(std::move(pageRange)); + + fSerializationContext.MapPhysicalColumnId(columnId); + + ++columnIndex; + } +} + +void ROOT::Internal::RPagePersistentSink::AddAliasColumn(const ROOT::RNTupleDescriptor &desc, + const ROOT::RFieldDescriptor &field, + ROOT::DescriptorId_t physicalId) +{ + const auto &pointedColumn = desc.GetColumnDescriptor(physicalId); + assert(!pointedColumn.IsAliasColumn()); + assert(field.IsProjectedField()); + + const auto columnId = fDescriptorBuilder.GetDescriptor().GetNLogicalColumns(); + RColumnDescriptorBuilder columnBuilder; + columnBuilder.LogicalColumnId(columnId) + .PhysicalColumnId(physicalId) + .FieldId(field.GetId()) + .Type(pointedColumn.GetType()) + .Index(pointedColumn.GetIndex()) + .BitsOnStorage(pointedColumn.GetBitsOnStorage()) + .ValueRange(pointedColumn.GetValueRange()) + .FirstElementIndex(pointedColumn.GetFirstElementIndex()) + .RepresentationIndex(pointedColumn.GetRepresentationIndex()); + fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap()); +} + void ROOT::Internal::RPagePersistentSink::CommitSuppressedColumn(ColumnHandle_t columnHandle) { fOpenColumnRanges.at(columnHandle.fPhysicalId).SetIsSuppressed(true); diff --git a/tree/ntuple/test/ntuple_minifile.cxx b/tree/ntuple/test/ntuple_minifile.cxx index 6dd2ab7ef4c70..8c448248435d4 100644 --- a/tree/ntuple/test/ntuple_minifile.cxx +++ b/tree/ntuple/test/ntuple_minifile.cxx @@ -16,8 +16,8 @@ namespace { // NOTE: these should be in sync with the version in RNTuple.hxx and are duplicated here // as a double check that we increment the version correctly. constexpr auto kVersionEpoch = 1; -constexpr auto kVersionMajor = 0; -constexpr auto kVersionMinor = 2; +constexpr auto kVersionMajor = 1; +constexpr auto kVersionMinor = 0; constexpr auto kVersionPatch = 0; bool IsEqual(const ROOT::RNTuple &a, const ROOT::RNTuple &b) From 9b7db69bdc1546c732bd938c29328e0f6ef5a643 Mon Sep 17 00:00:00 2001 From: silverweed Date: Tue, 28 Apr 2026 10:43:25 +0200 Subject: [PATCH 10/17] [ntuple] Add multiple column representation support in the Merger --- tree/ntuple/inc/ROOT/RNTupleMerger.hxx | 14 +- tree/ntuple/inc/ROOT/RPageStorage.hxx | 4 +- tree/ntuple/src/RNTupleMerger.cxx | 635 +++++++++++++++---------- tree/ntuple/src/RPageStorage.cxx | 7 +- tree/ntuple/test/ntuple_merger.cxx | 171 +++++-- 5 files changed, 547 insertions(+), 284 deletions(-) diff --git a/tree/ntuple/inc/ROOT/RNTupleMerger.hxx b/tree/ntuple/inc/ROOT/RNTupleMerger.hxx index 50b38a8fe3700..deaa22bcca330 100644 --- a/tree/ntuple/inc/ROOT/RNTupleMerger.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleMerger.hxx @@ -117,16 +117,16 @@ class RNTupleMerger final { std::unique_ptr fModel; [[nodiscard]] - ROOT::RResult MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, - const ROOT::RClusterDescriptor &clusterDesc, - std::span commonColumns, - const ROOT::Internal::RCluster::ColumnSet_t &commonColumnSet, - std::size_t nCommonColumnsInCluster, RSealedPageMergeData &sealedPageData, - const RNTupleMergeData &mergeData, ROOT::Internal::RPageAllocator &pageAlloc); + ROOT::RResult + MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, const ROOT::RClusterDescriptor &clusterDesc, + std::span commonColumns, + const ROOT::Internal::RCluster::ColumnSet_t &commonColumnSet, std::size_t nCommonColumnsInCluster, + RSealedPageMergeData &sealedPageData, const RNTupleMergeData &mergeData, + ROOT::Internal::RPageAllocator &pageAlloc); [[nodiscard]] ROOT::RResult - MergeSourceClusters(ROOT::Internal::RPageSource &source, std::span commonColumns, + MergeSourceClusters(ROOT::Internal::RPageSource &source, std::span commonColumns, std::span extraDstColumns, RNTupleMergeData &mergeData); /// Creates a RNTupleMerger with the given destination. diff --git a/tree/ntuple/inc/ROOT/RPageStorage.hxx b/tree/ntuple/inc/ROOT/RPageStorage.hxx index 54fca0e23e50c..9fbc3a10400fd 100644 --- a/tree/ntuple/inc/ROOT/RPageStorage.hxx +++ b/tree/ntuple/inc/ROOT/RPageStorage.hxx @@ -543,7 +543,9 @@ public: [[nodiscard]] std::unique_ptr InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters); - void + /// Adds a new column representation to the given field. + /// \return The physical id of the first newly added column. + ROOT::DescriptorId_t AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span newRepresentation); /// Adds a new alias column pointing to an existing column with the given physical id to the given field. diff --git a/tree/ntuple/src/RNTupleMerger.cxx b/tree/ntuple/src/RNTupleMerger.cxx index 1dc902476a82e..b50edf9382f1e 100644 --- a/tree/ntuple/src/RNTupleMerger.cxx +++ b/tree/ntuple/src/RNTupleMerger.cxx @@ -269,12 +269,10 @@ try { } namespace { -// Functor used to change the compression of a page to `options.fCompressionSettings`. +// Functor used to change the compression of a page to `fCompressionSettings`. struct RChangeCompressionFunc { const RColumnElementBase &fSrcColElement; - const RColumnElementBase &fDstColElement; - const RNTupleMergeOptions &fMergeOptions; - + std::uint32_t fCompressionSettings; RPageStorage::RSealedPage &fSealedPage; ROOT::Internal::RPageAllocator &fPageAlloc; std::byte *fBuffer; @@ -283,16 +281,13 @@ struct RChangeCompressionFunc { void operator()() const { - assert(fSrcColElement.GetIdentifier() == fDstColElement.GetIdentifier()); - fSealedPage.VerifyChecksumIfEnabled().ThrowOnError(); const auto bytesPacked = fSrcColElement.GetPackedSize(fSealedPage.GetNElements()); - const auto compression = fMergeOptions.fCompressionSettings.value_or(0); // TODO: this buffer could be kept and reused across pages std::unique_ptr unzipBufOwned; std::byte *unzipBuf; - if (compression != 0) { + if (fCompressionSettings != 0) { unzipBufOwned = MakeUninitArray(bytesPacked); unzipBuf = unzipBufOwned.get(); } else { @@ -303,45 +298,18 @@ struct RChangeCompressionFunc { const auto checksumSize = fWriteOpts.GetEnablePageChecksums() * sizeof(std::uint64_t); std::size_t nBytesZipped; - if (compression != 0) { + if (fCompressionSettings != 0) { assert(fBuffer != unzipBuf); assert(fBufSize >= bytesPacked + checksumSize); - nBytesZipped = ROOT::Internal::RNTupleCompressor::Zip(unzipBuf, bytesPacked, compression, fBuffer); + nBytesZipped = ROOT::Internal::RNTupleCompressor::Zip(unzipBuf, bytesPacked, fCompressionSettings, fBuffer); } else { nBytesZipped = bytesPacked; } - fSealedPage = {fBuffer, nBytesZipped + checksumSize, fSealedPage.GetNElements(), fSealedPage.GetHasChecksum()}; fSealedPage.ChecksumIfEnabled(); } }; -struct RResealFunc { - const RColumnElementBase &fSrcColElement; - const RColumnElementBase &fDstColElement; - const RNTupleMergeOptions &fMergeOptions; - - RPageStorage::RSealedPage &fSealedPage; - ROOT::Internal::RPageAllocator &fPageAlloc; - std::byte *fBuffer; - std::size_t fBufSize; - const ROOT::RNTupleWriteOptions &fWriteOpts; - - void operator()() const - { - auto page = RPageSource::UnsealPage(fSealedPage, fSrcColElement, fPageAlloc).Unwrap(); - RPageSink::RSealPageConfig sealConf; - sealConf.fElement = &fDstColElement; - sealConf.fPage = &page; - sealConf.fBuffer = fBuffer; - sealConf.fCompressionSettings = *fMergeOptions.fCompressionSettings; - sealConf.fWriteChecksum = fWriteOpts.GetEnablePageChecksums(); - assert(fBufSize >= fSealedPage.GetDataSize() + fSealedPage.GetHasChecksum() * sizeof(std::uint64_t)); - auto refSealedPage = RPageSink::SealPage(sealConf); - fSealedPage = refSealedPage; - } -}; - struct RTaskVisitor { std::optional &fGroup; @@ -362,23 +330,56 @@ struct RCommonField { RCommonField(const ROOT::RFieldDescriptor &src, const ROOT::RFieldDescriptor &dst) : fSrc(&src), fDst(&dst) {} }; +/// Maps a column representation from a source to a destination RNTuple. +/// fSource and fDest are the representation indices of a specific column. +/// +/// When we merge fields from different RNTuples, two compatible fields may use different column +/// representations. When merging their columns we need to make sure that we keep the output +/// representation coherent, which is what this mapping is here for. +struct RColReprMapping { + std::uint32_t fSource; + std::uint32_t fDest; +}; + +/// A column extension that needs to be added to an output field. +/// Note that this also adds a mapping for the new representation, which is why this inherits RColReprMapping. +struct RColReprExtension : RColReprMapping { + /// The new representation to be added + ROOT::RFieldBase::ColumnRepresentation_t fSourceRepr; +}; + +static std::optional +FindColumnReprMapping(const std::vector &mappings, std::uint32_t sourceReprIndex) +{ + for (const auto [src, dst] : mappings) + if (src == sourceReprIndex) + return dst; + return std::nullopt; +} + +template +using FieldCollectionMap_t = std::unordered_map>; + struct RDescriptorsComparison { std::vector fExtraDstFields; std::vector fExtraSrcFields; std::vector fCommonFields; + // For each field that has more than 1 column representation in the output model, + // maps the column representatives of the source field with those of the destination. + // The key is the destination field. + FieldCollectionMap_t fColReprMappings; + FieldCollectionMap_t fColReprExtensions; }; struct RColumnOutInfo { - ROOT::DescriptorId_t fColumnId; - ENTupleColumnType fColumnType; + ROOT::DescriptorId_t fColumnId = ROOT::kInvalidDescriptorId; }; -// { fully.qualified.fieldName.colInputId => colOutputInfo } +// { ".fully.qualified.fieldName.colInputIndex.colOutputReprIndex" => colOutputInfo } using ColumnIdMap_t = std::unordered_map; struct RColumnInfoGroup { std::vector fExtraDstColumns; - // These are sorted by InputId std::vector fCommonColumns; }; @@ -392,14 +393,14 @@ struct RColumnMergeInfo { // e.g. "Muon.pt.x._0" std::string fColumnName; // The column id in the source RNTuple - ROOT::DescriptorId_t fInputId; + ROOT::DescriptorId_t fInputId = kInvalidDescriptorId; // The corresponding column id in the destination RNTuple (the mapping happens in AddColumnsFromField()) - ROOT::DescriptorId_t fOutputId; - ENTupleColumnType fColumnType; + ROOT::DescriptorId_t fOutputId = kInvalidDescriptorId; + std::uint16_t fOutputReprIndex = 0; // If nullopt, use the default in-memory type std::optional fInMemoryType; - const ROOT::RFieldDescriptor *fParentFieldDescriptor; - const ROOT::RNTupleDescriptor *fParentNTupleDescriptor; + const ROOT::RFieldDescriptor *fParentFieldDescriptor = nullptr; + const ROOT::RNTupleDescriptor *fParentNTupleDescriptor = nullptr; }; // Data related to a single call of RNTupleMerger::Merge() @@ -411,6 +412,7 @@ struct RNTupleMergeData { const ROOT::RNTupleDescriptor *fSrcDescriptor = nullptr; std::vector fColumns; + // Maps input column IDs to output IDs ColumnIdMap_t fColumnIdMap; ROOT::NTupleSize_t fNumDstEntries = 0; @@ -441,33 +443,6 @@ std::ostream &operator<<(std::ostream &os, const std::optional @@ -495,8 +470,9 @@ CompareDescriptorStructure(const ROOT::RNTupleDescriptor &dst, const ROOT::RNTup } for (const auto &srcField : src.GetTopLevelFields()) { const auto dstFieldId = dst.FindFieldId(srcField.GetFieldName()); - if (dstFieldId == ROOT::kInvalidDescriptorId) + if (dstFieldId == ROOT::kInvalidDescriptorId) { res.fExtraSrcFields.push_back(&srcField); + } } // Check compatibility of common fields @@ -583,55 +559,103 @@ CompareDescriptorStructure(const ROOT::RNTupleDescriptor &dst, const ROOT::RNTup } // Require that column representations match - const auto srcNCols = field.fSrc->GetLogicalColumnIds().size(); - const auto dstNCols = field.fDst->GetLogicalColumnIds().size(); - if (srcNCols != dstNCols) { - std::stringstream ss; - ss << "Field `" << field.fSrc->GetFieldName() - << "` has a different number of columns than previously-seen field with the same name (old: " << dstNCols - << ", new: " << srcNCols << ")"; - errors.push_back(ss.str()); - } else { - for (auto i = 0u; i < srcNCols; ++i) { - const auto srcColId = field.fSrc->GetLogicalColumnIds()[i]; - const auto dstColId = field.fDst->GetLogicalColumnIds()[i]; - const auto &srcCol = src.GetColumnDescriptor(srcColId); - const auto &dstCol = dst.GetColumnDescriptor(dstColId); - // TODO(gparolini): currently we refuse to merge columns of different types unless they are Split/non-Split - // version of the same type, because we know how to treat that specific case. We should also properly handle - // different but compatible types. - if (srcCol.GetType() != dstCol.GetType() && - !IsSplitOrUnsplitVersionOf(srcCol.GetType(), dstCol.GetType())) { - std::stringstream ss; - ss << i << "-th column of field `" << field.fSrc->GetFieldName() - << "` has a different column type of the same column on the previously-seen field with the same name " - "(old: " - << RColumnElementBase::GetColumnTypeName(srcCol.GetType()) - << ", new: " << RColumnElementBase::GetColumnTypeName(dstCol.GetType()) << ")"; - errors.push_back(ss.str()); - } - if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage()) { - std::stringstream ss; - ss << i << "-th column of field `" << field.fSrc->GetFieldName() - << "` has a different number of bits of the same column on the previously-seen field with the same " - "name " - "(old: " - << srcCol.GetBitsOnStorage() << ", new: " << dstCol.GetBitsOnStorage() << ")"; - errors.push_back(ss.str()); - } - if (srcCol.GetValueRange() != dstCol.GetValueRange()) { - std::stringstream ss; - ss << i << "-th column of field `" << field.fSrc->GetFieldName() - << "` has a different value range of the same column on the previously-seen field with the same name " - "(old: " - << srcCol.GetValueRange() << ", new: " << dstCol.GetValueRange() << ")"; - errors.push_back(ss.str()); - } - if (srcCol.GetRepresentationIndex() > 0) { + if (!field.fSrc->IsProjectedField()) { + const auto &srcColumns = field.fSrc->GetLogicalColumnIds(); + const auto &dstColumns = field.fDst->GetLogicalColumnIds(); + const auto srcNCols = srcColumns.size(); + const auto dstNCols = dstColumns.size(); + if (srcNCols != dstNCols) { + std::stringstream ss; + ss << "Field `" << field.fSrc->GetFieldName() + << "` has a different number of columns than previously-seen field with the same name (old: " << dstNCols + << ", new: " << srcNCols << ")"; + errors.push_back(ss.str()); + } else { + const std::uint32_t srcColCardinality = field.fSrc->GetColumnCardinality(); + const std::uint32_t dstColCardinality = field.fDst->GetColumnCardinality(); + if (srcColCardinality != dstColCardinality) { std::stringstream ss; - ss << i << "-th column of field `" << field.fSrc->GetFieldName() - << "` has a representation index higher than 0. This is not supported yet by the merger."; + ss << "Field `" << field.fSrc->GetFieldName() + << "` has a different column cardinality than previously-seen field with the same name (old: " + << dstColCardinality << ", new: " << srcColCardinality << ")"; errors.push_back(ss.str()); + } else if (srcColCardinality > 0) { + const auto srcNColReprs = srcNCols / srcColCardinality; + const auto dstNColReprs = dstNCols / dstColCardinality; + + // For each column representation of the source, check if it matches one in the descriptor. + // If so, and if it doesn't match the destination's repr index, add a mapping for it. + // If nothing matches, schedule the column representation to be added later. + // NOTE: this has quadratic complexity but the numbers involved are small so it's fine. + for (auto srcReprIdx = 0u; srcReprIdx < srcNColReprs; ++srcReprIdx) { + std::int64_t matchingRepr = -1; + for (auto dstReprIdx = 0u; dstReprIdx < dstNColReprs; ++dstReprIdx) { + bool matches = true; + for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { + const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; + const auto &srcCol = src.GetColumnDescriptor(srcColId); + const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; + const auto &dstCol = dst.GetColumnDescriptor(dstColId); + if (srcCol.GetType() != dstCol.GetType()) { + matches = false; + break; + } + } + + if (matches) { + // If this column representation matches by column type, we need to make sure that it also has + // matching column metadata. Since we currently do not support multiple column representations + // that only differ by such metadata, we forbid merging such columns (e.g. we cannot merge two + // Real32Trunc columns with different bit widths). This could technically be supported, but it + // would require significant effort, so we currently don't. + for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { + const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; + const auto &srcCol = src.GetColumnDescriptor(srcColId); + const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; + const auto &dstCol = dst.GetColumnDescriptor(dstColId); + if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() || + srcCol.GetValueRange() != dstCol.GetValueRange()) { + std::stringstream ss; + ss << "Source field `" << field.fSrc->GetFieldName() + << "` has a matching column representation as its destination field, however one or " + "more " + "of its columns have different column metadata (bit width and/or value range). " + "Merging variable-sized columns is currently only supported if all metadata is " + "identical between source and destination columns." + << "\n bit width src: " << srcCol.GetBitsOnStorage() + << ", dst: " << dstCol.GetBitsOnStorage() << "" + << "\n value range src: " << srcCol.GetValueRange() + << ", dst: " << dstCol.GetValueRange(); + errors.push_back(ss.str()); + break; + } + } + matchingRepr = dstReprIdx; + break; + } + } + + if (errors.empty()) { + if (matchingRepr >= 0 && matchingRepr != srcReprIdx) { + // a different matching representation was found + assert(matchingRepr < std::numeric_limits::max()); + res.fColReprMappings[field.fDst].push_back( + RColReprMapping{srcReprIdx, static_cast(matchingRepr)}); + } else if (matchingRepr < 0) { + // this representation was not found in the destination + assert(dstNColReprs < std::numeric_limits::max()); + ROOT::RFieldBase::ColumnRepresentation_t newRepr; + for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { + const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; + const auto &srcCol = src.GetColumnDescriptor(srcColId); + newRepr.push_back(srcCol.GetType()); + } + RColReprExtension extension{{srcReprIdx, static_cast(dstNColReprs)}, newRepr}; + res.fColReprExtensions[field.fDst].push_back(extension); + res.fColReprMappings[field.fDst].push_back(extension); + } + } + } } } } @@ -669,13 +693,13 @@ CompareDescriptorStructure(const ROOT::RNTupleDescriptor &dst, const ROOT::RNTup return ROOT::RResult(res); } -// Applies late model extension to `destination`, adding all `newFields` to it. +// Applies late model extension to `mergeData.fDestination`, adding all `descCmp.fExtraSrcFields` to it. [[nodiscard]] static ROOT::RResult -ExtendDestinationModel(std::span newFields, ROOT::RNTupleModel &dstModel, - RNTupleMergeData &mergeData, std::vector &commonFields) +ExtendDestinationModel(RDescriptorsComparison &descCmp, ROOT::RNTupleModel &dstModel, RNTupleMergeData &mergeData) { - assert(newFields.size() > 0); // no point in calling this with 0 new cols + const auto &newFields = descCmp.fExtraSrcFields; + auto &commonFields = descCmp.fCommonFields; dstModel.Unfreeze(); ROOT::Internal::RNTupleModelChangeset changeset{dstModel}; @@ -695,10 +719,19 @@ ExtendDestinationModel(std::span newFields, ROOT changeset.fAddedFields.reserve(newFields.size()); // First add all non-projected fields... for (const auto *fieldDesc : newFields) { - if (!fieldDesc->IsProjectedField()) { - auto field = fieldDesc->CreateField(*mergeData.fSrcDescriptor); - changeset.AddField(std::move(field)); + if (fieldDesc->IsProjectedField()) + continue; + + auto field = fieldDesc->CreateField(*mergeData.fSrcDescriptor); + // Explicitly set the field representatives. This prevents UpdateSchema() from changing our column + // representations via AutoAdjustColumnTypes. + ROOT::RFieldBase::ColumnRepresentation_t representatives; + for (const auto &colId : fieldDesc->GetLogicalColumnIds()) { + const auto &column = mergeData.fSrcDescriptor->GetColumnDescriptor(colId); + representatives.push_back(column.GetType()); } + field->SetColumnRepresentatives({representatives}); + changeset.AddField(std::move(field)); } // ...then add all projected fields. for (const auto *fieldDesc : newFields) { @@ -723,16 +756,40 @@ ExtendDestinationModel(std::span newFields, ROOT } dstModel.Freeze(); try { + // FIXME: here we are connecting the new fields/columns to the sink! + // We should avoid doing that, as all other non-extended fields never get connected (and we don't + // need to connect these either in principle). + // NOTE: this calls AutoAdjustColumnTypes, but we have set the column representations of all fields + // explicitly, so it will not change it under the hood. mergeData.fDestination.UpdateSchema(changeset, mergeData.fNumDstEntries); } catch (const ROOT::RException &ex) { return R__FAIL(ex.GetError().GetReport()); } commonFields.reserve(commonFields.size() + newFields.size()); - for (const auto *field : newFields) { + // NOTE(gparolini): Insert the new fields at the beginning of `commonFields`. + // We need to make sure the extended fields appear before all other common fields for the following reason: + // in general, when we GatherColumnInfos we (potentially) assign new column output ids in field order; this + // assignment happens whenever we find new columns, which happens in 3 cases: + // 1. we are in the first source and we're adding the first set of (common) fields; + // 2. we are adding a new set of extended common fields (which come from this function); + // 3. we are adding new column representations for fields that we already had before processing this source. + // + // It's important that the output id assigned to the new columns is coherent with the order of the column descriptors + // as they appear in the header and footer. This is in turn determined by the order by which we append new columns to + // the dst descriptor during the merging process. + // Since we call ExtendDestinationModel (this function) *before* adding the new column representations, it is always + // the case that the dst descriptor gets updated with the new column descriptors coming from the extended fields + // (since they are added in UpdateSchema a few lines above) before it gets updated with the extended column + // representations (which happens later in sink->AddColumnRepresentation). + // However, the new column output ids are added sequentially in *field* order in GatherColumnInfos and the fields + // containing the new column representations are already in that list from earlier! So, to make sure the new output + // ids are assigned to our extended fields first, we push them in from on the list so that they are visited first. + for (auto it = newFields.rbegin(); it != newFields.rend(); ++it) { + const auto *field = *it; const auto newFieldInDstId = mergeData.fDstDescriptor.FindFieldId(field->GetFieldName()); const auto &newFieldInDst = mergeData.fDstDescriptor.GetFieldDescriptor(newFieldInDstId); - commonFields.emplace_back(*field, newFieldInDst); + commonFields.insert(commonFields.begin(), RCommonField{*field, newFieldInDst}); } return ROOT::RResult::Success(); @@ -776,10 +833,10 @@ GenerateZeroPagesForColumns(size_t nEntriesToGenerate, std::spanGetStructure(); if (structure == ROOT::ENTupleStructure::kStreamer) { - return R__FAIL( - "Destination RNTuple contains a streamer field (" + field->GetFieldName() + - ") that is not present in one of the sources. " - "Creating a default value for a streamer field is ill-defined, therefore the merging process will abort."); + return R__FAIL("Destination RNTuple contains a streamer field (" + field->GetFieldName() + + ") that is not present in one of the sources. " + "Creating a default value for a streamer field is ill-defined, therefore the merging " + "process will abort."); } // NOTE: we cannot have a Record here because it has no associated columns. @@ -826,7 +883,7 @@ GenerateZeroPagesForColumns(size_t nEntriesToGenerate, std::span RNTupleMerger::MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, const ROOT::RClusterDescriptor &clusterDesc, - std::span commonColumns, + std::span commonColumns, const RCluster::ColumnSet_t &commonColumnSet, std::size_t nCommonColumnsInCluster, RSealedPageMergeData &sealedPageData, const RNTupleMergeData &mergeData, ROOT::Internal::RPageAllocator &pageAlloc) @@ -838,9 +895,11 @@ RNTupleMerger::MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, const RCluster *cluster = clusterPool.GetCluster(clusterDesc.GetId(), commonColumnSet); // we expect the cluster pool to contain the requested set of columns, since they were - // validated by CompareDescriptorStructure(). + // validated by CompareDescriptorStructure() and MergeSourceClusters(). assert(cluster); + const std::uint32_t outCompression = mergeData.fMergeOpts.fCompressionSettings.value(); + for (size_t colIdx = 0; colIdx < nCommonColumnsInCluster; ++colIdx) { const auto &column = commonColumns[colIdx]; const auto &columnId = column.fInputId; @@ -850,9 +909,6 @@ RNTupleMerger::MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, const auto srcColElement = column.fInMemoryType ? ROOT::Internal::GenerateColumnElement(*column.fInMemoryType, columnDesc.GetType()) : RColumnElementBase::Generate(columnDesc.GetType()); - const auto dstColElement = column.fInMemoryType - ? ROOT::Internal::GenerateColumnElement(*column.fInMemoryType, column.fColumnType) - : RColumnElementBase::Generate(column.fColumnType); // Now get the pages for this column in this cluster const auto &pages = clusterDesc.GetPageRange(columnId); @@ -863,28 +919,23 @@ RNTupleMerger::MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, // Each column range potentially has a distinct compression settings const auto colRangeCompressionSettings = clusterDesc.GetColumnRange(columnId).GetCompressionSettings().value(); - // Select "merging level". There are 3 levels, from fastest to slowest, depending on the case: + // Select "merging level". There are 2 levels, from fastest to slowest, depending on the case: // L1: compression and encoding of src and dest both match: we can simply copy the page - // L2: compression of dest doesn't match the src but encoding does: we must recompress the page but can avoid - // resealing it. - // L3: on-disk encoding doesn't match: we need to reseal the page, which implies decompressing and recompressing - // it. - const bool compressionIsDifferent = - colRangeCompressionSettings != mergeData.fMergeOpts.fCompressionSettings.value(); - const bool needsResealing = - srcColElement->GetIdentifier().fOnDiskType != dstColElement->GetIdentifier().fOnDiskType; - const bool needsRecompressing = compressionIsDifferent || needsResealing; + // L2: compression of dest doesn't match the src we must recompress the page. + // Note that in no case do we need to re-encode the page, as if the encoding differs we simply + // append a new column representation to the field. + const bool needsRecompressing = colRangeCompressionSettings != outCompression; if (needsRecompressing && mergeData.fMergeOpts.fExtraVerbose) { - R__LOG_INFO(NTupleMergeLog()) - << (needsResealing ? "Resealing" : "Recompressing") << " column " << column.fColumnName - << ": { compression: " << colRangeCompressionSettings << " => " - << mergeData.fMergeOpts.fCompressionSettings.value() - << ", onDiskType: " << RColumnElementBase::GetColumnTypeName(srcColElement->GetIdentifier().fOnDiskType) - << " => " << RColumnElementBase::GetColumnTypeName(dstColElement->GetIdentifier().fOnDiskType); + R__LOG_INFO(NTupleMergeLog()) << "Recompressing column " << column.fColumnName + << ": { compression: " << colRangeCompressionSettings << " => " + << mergeData.fMergeOpts.fCompressionSettings.value() << ", onDiskType: " + << RColumnElementBase::GetColumnTypeName( + srcColElement->GetIdentifier().fOnDiskType) + << "}"; } - size_t pageBufferBaseIdx = sealedPageData.fBuffers.size(); + const size_t pageBufferBaseIdx = sealedPageData.fBuffers.size(); // If the column range already has the right compression we don't need to allocate any new buffer, so we don't // bother reserving memory for them. if (needsRecompressing) @@ -933,29 +984,15 @@ RNTupleMerger::MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, buffer = MakeUninitArray(bufSize); // clang-format off - if (needsResealing) { - RTaskVisitor{fTaskGroup}(RResealFunc{ - *srcColElement, - *dstColElement, - mergeData.fMergeOpts, - sealedPage, - *fPageAlloc, - buffer.get(), - bufSize, - mergeData.fDestination.GetWriteOptions() - }); - } else { - RTaskVisitor{fTaskGroup}(RChangeCompressionFunc{ - *srcColElement, - *dstColElement, - mergeData.fMergeOpts, - sealedPage, - *fPageAlloc, - buffer.get(), - bufSize, - mergeData.fDestination.GetWriteOptions() - }); - } + RTaskVisitor{fTaskGroup}(RChangeCompressionFunc{ + *srcColElement, + outCompression, + sealedPage, + *fPageAlloc, + buffer.get(), + bufSize, + mergeData.fDestination.GetWriteOptions() + }); // clang-format on } @@ -979,9 +1016,9 @@ RNTupleMerger::MergeCommonColumns(ROOT::Internal::RClusterPool &clusterPool, // the destination's schemas. // The pages may be "fast-merged" (i.e. simply copied with no decompression/recompression) if the target // compression is unspecified or matches the original compression settings. -ROOT::RResult -RNTupleMerger::MergeSourceClusters(RPageSource &source, std::span commonColumns, - std::span extraDstColumns, RNTupleMergeData &mergeData) +ROOT::RResult RNTupleMerger::MergeSourceClusters(RPageSource &source, std::span commonColumns, + std::span extraDstColumns, + RNTupleMergeData &mergeData) { ROOT::Internal::RClusterPool clusterPool{source}; @@ -996,31 +1033,60 @@ RNTupleMerger::MergeSourceClusters(RPageSource &source, std::span 0); - // NOTE: just because a column is in `commonColumns` it doesn't mean that each cluster in the source contains it, - // as it may be a deferred column that only has real data in a future cluster. - // We need to figure out which columns are actually present in this cluster so we only merge their pages (the - // missing columns are handled by synthesizing zero pages - see below). - size_t nCommonColumnsInCluster = commonColumns.size(); - while (nCommonColumnsInCluster > 0) { - // Since `commonColumns` is sorted by column input id, we can simply traverse it from the back and stop as - // soon as we find a common column that appears in this cluster: we know that in that case all previous - // columns must appear as well. - if (clusterDesc.ContainsColumn(commonColumns[nCommonColumnsInCluster - 1].fInputId)) - break; - --nCommonColumnsInCluster; - } - + // Deduce which columns are suppressed (cluster by cluster) by exclusion, as: + // (columns in the columnIdMap) - (columns in commonColumns which are not suppressed). + // Note that some suppressed columns may not be in commonColumns because they might not appear at all in the + // current source. + FieldCollectionMap_t activeColumns; + + // NOTE: just because a column is in `commonColumns` it doesn't mean that each cluster in the source contains + // it, as it may be a deferred column that only has real data in a future cluster. We need to figure out which + // columns are actually present in this cluster so we only merge their pages (the missing columns are handled + // by synthesizing zero pages - see below). + size_t nCommonColumnsInCluster = 0; // Convert columns to a ColumnSet for the ClusterPool query RCluster::ColumnSet_t commonColumnSet; commonColumnSet.reserve(nCommonColumnsInCluster); - for (size_t i = 0; i < nCommonColumnsInCluster; ++i) - commonColumnSet.emplace(commonColumns[i].fInputId); + // Collect all common columns appearing in this cluster into commonColumnSet and reorganize commonColumns so + // that those columns are at the start of it (whereas missing columns are at its end). + // NOTE: it's fine if this scrambles the order of columns: the RNTupleSerializer will sort them by physical ID. + std::partition(commonColumns.begin(), commonColumns.end(), [&](const auto &column) { + if (const auto *colRange = clusterDesc.TryGetColumnRange(column.fInputId)) { + if (!colRange->IsSuppressed()) { + ++nCommonColumnsInCluster; + commonColumnSet.emplace(column.fInputId); + activeColumns[column.fParentFieldDescriptor].push_back(column.fOutputId); + return true; + } + } + return false; + }); - // For each cluster, the "missing columns" are the union of the extraDstColumns and the common columns - // that are not present in the cluster. We generate zero pages for all of them. - missingColumns.resize(extraDstColumns.size()); // NOTE: this clears all common columns of the previous cluster - for (size_t i = nCommonColumnsInCluster; i < commonColumns.size(); ++i) - missingColumns.push_back(commonColumns[i]); + // Commit all suppressed columns. + // This is a fairly involved operation, as we need to commit all known columns that: + // a) do not appear in extraDstColumns (those are "missing", not suppressed), and + // b) do not appear in commonColumnSet (those are the active columns). + // Not that these may or may not appear in commonColumns as suppressed columns, since they may or may not be + // present in the current source. + // The only way to find all the columns is to go and get them from fColumnIdMap, which keeps track of every + // column we added to the destination so far. However, since it also contains the extraDstColumns, we need to + // specifically only query those columns that belong to a field that has at least 1 column in commonColumns + // (remember that commonColumns contains all columns associated to the common fields for this source). + for (const auto &[fieldDesc, activeIds] : activeColumns) { + const auto &fieldFQName = mergeData.fSrcDescriptor->GetQualifiedFieldName(fieldDesc->GetId()); + const auto cardinality = fieldDesc->GetColumnCardinality(); + for (auto i = 0u; i < fieldDesc->GetLogicalColumnIds().size(); ++i) { + const auto colIndex = i % cardinality; + const auto reprIndex = i / cardinality; + const auto colName = "." + fieldFQName + '.' + std::to_string(colIndex) + '.' + std::to_string(reprIndex); + const auto colIt = mergeData.fColumnIdMap.find(colName); + assert(colIt != mergeData.fColumnIdMap.end()); + const auto colOutId = colIt->second.fColumnId; + if (std::find(activeIds.begin(), activeIds.end(), colOutId) == activeIds.end()) { + mergeData.fDestination.CommitSuppressedColumn(ROOT::Internal::RPageStorage::ColumnHandle_t{colOutId}); + } + } + } RSealedPageMergeData sealedPageData; auto res = MergeCommonColumns(clusterPool, clusterDesc, commonColumns, commonColumnSet, nCommonColumnsInCluster, @@ -1028,6 +1094,14 @@ RNTupleMerger::MergeSourceClusters(RPageSource &source, std::span ColumnInMemoryType(std::string_view fieldT return std::nullopt; } -// Given a field, fill `columns` and `mergeData.fColumnIdMap` with information about all columns belonging to it and its -// subfields. `mergeData.fColumnIdMap` is used to map matching columns from different sources to the same output column -// in the destination. We match columns by their "fully qualified name", which is the concatenation of their ancestor -// fields' names and the column index. By this point, since we called `CompareDescriptorStructure()` earlier, we should -// be guaranteed that two matching columns will have at least compatible representations. NOTE: srcFieldDesc and -// dstFieldDesc may alias. +// Given a field, fill `columns` and `mergeData.fColumnIdMap` with information about all columns belonging to it and +// its subfields. `mergeData.fColumnIdMap` is used to map matching columns from different sources to the same output +// column in the destination. We match columns by their "fully qualified name", which is the concatenation of their +// ancestor fields' names and the column index. By this point, since we called `CompareDescriptorStructure()` +// earlier, we should be guaranteed that two matching columns will have at least compatible representations. +// This function is recursive as it needs to call itself on the entire subfield hierarchy of the source field. +// NOTE: srcFieldDesc and dstFieldDesc may alias. static void AddColumnsFromField(std::vector &columns, const ROOT::RNTupleDescriptor &srcDesc, + const FieldCollectionMap_t &colReprMappings, RNTupleMergeData &mergeData, const ROOT::RFieldDescriptor &srcFieldDesc, const ROOT::RFieldDescriptor &dstFieldDesc, const std::string &prefix = "") { @@ -1099,21 +1175,19 @@ static void AddColumnsFromField(std::vector &columns, const RO const auto &columnIds = srcFieldDesc.GetLogicalColumnIds(); columns.reserve(columns.size() + columnIds.size()); - // NOTE: here we can match the src and dst columns by column index because we forbid merging fields with - // different column representations. + for (auto i = 0u; i < srcFieldDesc.GetLogicalColumnIds().size(); ++i) { auto srcColumnId = srcFieldDesc.GetLogicalColumnIds()[i]; const auto &srcColumn = srcDesc.GetColumnDescriptor(srcColumnId); RColumnMergeInfo info{}; - info.fColumnName = name + '.' + std::to_string(srcColumn.GetIndex()); info.fInputId = srcColumn.GetPhysicalId(); // NOTE(gparolini): the parent field is used when synthesizing zero pages, which happens in 2 situations: // 1. when adding extra dst columns (in which case we need to synthesize zero pages for the incoming src), and // 2. when merging a deferred column into an existing column (in which case we need to fill the "hole" with - // zeroes). For the first case srcFieldDesc and dstFieldDesc are the same (see the calling site of this function), - // but for the second case they're not, and we need to pick the source field because we will then check the - // column's *input* id inside fParentFieldDescriptor to see if it's a suppressed column (see + // zeroes). For the first case srcFieldDesc and dstFieldDesc are the same (see the calling site of this + // function), but for the second case they're not, and we need to pick the source field because we will then + // check the column's *input* id inside fParentFieldDescriptor to see if it's a suppressed column (see // GenerateZeroPagesForColumns()). info.fParentFieldDescriptor = &srcFieldDesc; // Save the parent field descriptor since this may be either the source or destination descriptor depending on @@ -1121,22 +1195,34 @@ static void AddColumnsFromField(std::vector &columns, const RO // properly walk up the field hierarchy. info.fParentNTupleDescriptor = &srcDesc; + const auto mappingsIt = colReprMappings.find(&dstFieldDesc); + std::uint16_t reprIndex = srcColumn.GetRepresentationIndex(); + if (mappingsIt != colReprMappings.end()) { + if (auto outReprIdx = FindColumnReprMapping(mappingsIt->second, reprIndex); outReprIdx) + reprIndex = *outReprIdx; + } + + info.fColumnName = name + '.' + std::to_string(srcColumn.GetIndex()) + '.' + std::to_string(reprIndex); + + ENTupleColumnType columnType = ENTupleColumnType::kUnknown; + if (auto it = mergeData.fColumnIdMap.find(info.fColumnName); it != mergeData.fColumnIdMap.end()) { + // We had already added this column to the column id map: just copy its data. info.fOutputId = it->second.fColumnId; - info.fColumnType = it->second.fColumnType; + info.fOutputReprIndex = reprIndex; } else { + // New column: assign it the next ouput id. info.fOutputId = mergeData.fColumnIdMap.size(); - // NOTE(gparolini): map the type of src column to the type of dst column. - // This mapping is only relevant for common columns and it's done to ensure we keep a consistent - // on-disk representation of the same column. - // This is also important to do for first source when it is used to generate the destination sink, - // because even in that case their column representations may differ. - // e.g. if the destination has a different compression than the source, an integer column might be - // zigzag-encoded in the source but not in the destination. - auto dstColumnId = dstFieldDesc.GetLogicalColumnIds()[i]; + // NOTE(gparolini): map the representation index of src column to that of dst column. + // This mapping is only relevant for common columns and it's done to ensure we have the correct representation + // index in the output column metadata. + assert(dstFieldDesc.GetColumnCardinality() == srcFieldDesc.GetColumnCardinality()); + const auto dstColumnIndex = reprIndex * dstFieldDesc.GetColumnCardinality() + srcColumn.GetIndex(); + const auto dstColumnId = dstFieldDesc.GetLogicalColumnIds()[dstColumnIndex]; const auto &dstColumn = mergeData.fDstDescriptor.GetColumnDescriptor(dstColumnId); - info.fColumnType = dstColumn.GetType(); - mergeData.fColumnIdMap[info.fColumnName] = {info.fOutputId, info.fColumnType}; + columnType = dstColumn.GetType(); + info.fOutputReprIndex = reprIndex; + mergeData.fColumnIdMap[info.fColumnName] = RColumnOutInfo{info.fOutputId}; } if (mergeData.fMergeOpts.fExtraVerbose) { @@ -1144,12 +1230,12 @@ static void AddColumnsFromField(std::vector &columns, const RO << ", phys.id " << srcColumn.GetPhysicalId() << ", type " << RColumnElementBase::GetColumnTypeName(srcColumn.GetType()) << " -> log.id " << info.fOutputId << ", type " - << RColumnElementBase::GetColumnTypeName(info.fColumnType); + << RColumnElementBase::GetColumnTypeName(columnType); } // Since we disallow merging fields of different types, src and dstFieldDesc must have the same type name. assert(srcFieldDesc.GetTypeName() == dstFieldDesc.GetTypeName()); - info.fInMemoryType = ColumnInMemoryType(srcFieldDesc.GetTypeName(), info.fColumnType); + info.fInMemoryType = ColumnInMemoryType(srcFieldDesc.GetTypeName(), columnType); columns.emplace_back(info); } @@ -1159,7 +1245,7 @@ static void AddColumnsFromField(std::vector &columns, const RO for (auto i = 0u; i < srcChildrenIds.size(); ++i) { const auto &srcChild = srcDesc.GetFieldDescriptor(srcChildrenIds[i]); const auto &dstChild = mergeData.fDstDescriptor.GetFieldDescriptor(dstChildrenIds[i]); - AddColumnsFromField(columns, srcDesc, mergeData, srcChild, dstChild, name); + AddColumnsFromField(columns, srcDesc, colReprMappings, mergeData, srcChild, dstChild, name); } } @@ -1171,17 +1257,12 @@ static RColumnInfoGroup GatherColumnInfos(const RDescriptorsComparison &descCmp, { RColumnInfoGroup res; for (const ROOT::RFieldDescriptor *field : descCmp.fExtraDstFields) { - AddColumnsFromField(res.fExtraDstColumns, mergeData.fDstDescriptor, mergeData, *field, *field); + AddColumnsFromField(res.fExtraDstColumns, mergeData.fDstDescriptor, descCmp.fColReprMappings, mergeData, *field, + *field); } for (const auto &[srcField, dstField] : descCmp.fCommonFields) { - AddColumnsFromField(res.fCommonColumns, srcDesc, mergeData, *srcField, *dstField); + AddColumnsFromField(res.fCommonColumns, srcDesc, descCmp.fColReprMappings, mergeData, *srcField, *dstField); } - - // Sort the commonColumns by ID so we can more easily tell how many common columns each cluster has - // (since each cluster must contain all columns of the previous cluster plus potentially some new ones) - std::sort(res.fCommonColumns.begin(), res.fCommonColumns.end(), - [](const auto &a, const auto &b) { return a.fInputId < b.fInputId; }); - return res; } @@ -1192,9 +1273,9 @@ static void PrefillColumnMap(const ROOT::RNTupleDescriptor &desc, const ROOT::RF for (const auto &colId : fieldDesc.GetLogicalColumnIds()) { const auto &colDesc = desc.GetColumnDescriptor(colId); RColumnOutInfo info{}; - const auto colName = name + '.' + std::to_string(colDesc.GetIndex()); info.fColumnId = colDesc.GetLogicalId(); - info.fColumnType = colDesc.GetType(); + const auto colName = + name + '.' + std::to_string(colDesc.GetIndex()) + '.' + std::to_string(colDesc.GetRepresentationIndex()); colIdMap[colName] = info; } @@ -1204,6 +1285,25 @@ static void PrefillColumnMap(const ROOT::RNTupleDescriptor &desc, const ROOT::RF } } +static void AddColumnExtensionsInFieldOrder( + const ROOT::RFieldDescriptor &field, const ROOT::RNTupleDescriptor &desc, + const FieldCollectionMap_t &extensions, + std::vector>> &outExtensions, + std::unordered_map> &outProjectionPointees) +{ + const auto it = extensions.find(&field); + if (it != extensions.end()) + outExtensions.emplace_back(it->first, it->second); + + if (field.IsProjectedField()) + outProjectionPointees[field.GetProjectionSourceId()].push_back(&field); + + for (auto childId : field.GetLinkIds()) { + const auto &child = desc.GetFieldDescriptor(childId); + AddColumnExtensionsInFieldOrder(child, desc, extensions, outExtensions, outProjectionPointees); + } +} + RNTupleMerger::RNTupleMerger(std::unique_ptr destination, std::unique_ptr model) // TODO(gparolini): consider using an arena allocator instead, since we know the precise lifetime @@ -1244,6 +1344,10 @@ ROOT::RResult RNTupleMerger::Merge(std::span sources, const } } + // Maps projection source fields to all their projections. + std::unordered_map> projectionPointees; + bool projectionPointeesInitialized = false; + // we should have a model if and only if the destination is initialized. if (!!fModel != fDestination->IsInitialized()) { return R__FAIL( @@ -1295,7 +1399,7 @@ ROOT::RResult RNTupleMerger::Merge(std::span sources, const } } - // Create sink from the input model if not initialized + // Create sink and model from the input descriptor if not initialized if (!fModel) { fModel = fDestination->InitFromDescriptor(srcDescriptor.GetRef(), false /* copyClusters */); } @@ -1321,10 +1425,10 @@ ROOT::RResult RNTupleMerger::Merge(std::span sources, const } // handle extra src fields - if (descCmp.fExtraSrcFields.size()) { + if (!descCmp.fExtraSrcFields.empty()) { if (mergeOpts.fMergingMode == ENTupleMergingMode::kUnion) { // late model extension for all fExtraSrcFields in Union mode - auto res = ExtendDestinationModel(descCmp.fExtraSrcFields, *fModel, mergeData, descCmp.fCommonFields); + auto res = ExtendDestinationModel(descCmp, *fModel, mergeData); if (!res) return R__FORWARD_ERROR(res); } else if (mergeOpts.fMergingMode == ENTupleMergingMode::kStrict) { @@ -1337,6 +1441,53 @@ ROOT::RResult RNTupleMerger::Merge(std::span sources, const } } + //// Extend columns if needed + if (!descCmp.fColReprExtensions.empty()) { + if (!projectionPointeesInitialized) { + for (const auto &field : descCmp.fExtraDstFields) { + if (field->IsProjectedField()) + projectionPointees[field->GetProjectionSourceId()].push_back(field); + } + projectionPointeesInitialized = true; + } + + // We need to extend the columns in the proper order, i.e. so that they appear in the same order as + // their first representation. This is to ensure that the pages we write to the cluster are in a consistent + // order as their column descriptors. The page creation order is determined by the order of + // columnInfos.fCommonColumns, which in turn depends on the common fields order (see GatherColumnInfos). + // XXX: do we need this separate sort step? Why not just create this vector directly in + // CompareDescriptorStructure? + std::vector>> colExtensions; + colExtensions.reserve(descCmp.fColReprExtensions.size()); + for (const auto &commonField : descCmp.fCommonFields) { + const auto *field = commonField.fDst; + AddColumnExtensionsInFieldOrder(*field, mergeData.fDstDescriptor, descCmp.fColReprExtensions, colExtensions, + projectionPointees); + } + for (const auto &field : descCmp.fExtraSrcFields) { + if (field->IsProjectedField()) + projectionPointees[field->GetProjectionSourceId()].push_back(field); + } + + for (const auto &[fieldDesc, extensions] : colExtensions) { + auto &mappings = descCmp.fColReprMappings[fieldDesc]; + for (const auto &extension : extensions) { + const auto firstColumnId = fDestination->AddColumnRepresentation(*fieldDesc, extension.fSourceRepr); + + // When adding new column representations to an existing field which is the source of some projected + // fields, we need to also add new alias columns to those fields so that they can point to the proper + // representation. + if (auto it = projectionPointees.find(fieldDesc->GetId()); it != projectionPointees.end()) { + for (const auto &projection : it->second) { + for (auto colIdx = 0u; colIdx < extension.fSourceRepr.size(); ++colIdx) + fDestination->AddAliasColumn(mergeData.fDstDescriptor, *projection, firstColumnId + colIdx); + } + } + mappings.push_back(extension); + } + } + } + // handle extra dst fields & common fields auto columnInfos = GatherColumnInfos(descCmp, srcDescriptor.GetRef(), mergeData); auto res = MergeSourceClusters(*source, columnInfos.fCommonColumns, columnInfos.fExtraDstColumns, mergeData); diff --git a/tree/ntuple/src/RPageStorage.cxx b/tree/ntuple/src/RPageStorage.cxx index 03b7767c8b3a3..282dfc58ea320 100644 --- a/tree/ntuple/src/RPageStorage.cxx +++ b/tree/ntuple/src/RPageStorage.cxx @@ -1144,8 +1144,9 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr return model; } -void ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field, - std::span newRepresentation) +ROOT::DescriptorId_t +ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field, + std::span newRepresentation) { const auto &descriptor = fDescriptorBuilder.GetDescriptor(); @@ -1207,6 +1208,8 @@ void ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RF ++columnIndex; } + + return firstPhysicalIndex; } void ROOT::Internal::RPagePersistentSink::AddAliasColumn(const ROOT::RNTupleDescriptor &desc, diff --git a/tree/ntuple/test/ntuple_merger.cxx b/tree/ntuple/test/ntuple_merger.cxx index 8e1279e3ed143..56467baf013b2 100644 --- a/tree/ntuple/test/ntuple_merger.cxx +++ b/tree/ntuple/test/ntuple_merger.cxx @@ -1011,7 +1011,7 @@ TEST(RNTupleMerger, MergeLateModelExtension) { // Write two test ntuples to be merged, with different models. // Use EMergingMode::kUnion so the output ntuple has all the fields of its inputs. - FileRaii fileGuard1("test_ntuple_merge_in_1.root"); + FileRaii fileGuard1("test_ntuple_merge_lmext_in_1.root"); { auto model = RNTupleModel::Create(); auto fieldFoo = model->MakeField>("foo"); @@ -1027,11 +1027,12 @@ TEST(RNTupleMerger, MergeLateModelExtension) } } - FileRaii fileGuard2("test_ntuple_merge_in_2.root"); + FileRaii fileGuard2("test_ntuple_merge_lmext_in_2.root"); { auto model = RNTupleModel::Create(); auto fieldBaz = model->MakeField("baz"); auto fieldFoo = model->MakeField>("foo"); + auto fieldQux = model->MakeField("qux"); auto fieldVfoo = model->MakeField[3]>("vfoo"); auto wopts = RNTupleWriteOptions(); wopts.SetCompression(0); @@ -1041,12 +1042,13 @@ TEST(RNTupleMerger, MergeLateModelExtension) fieldFoo->insert(std::make_pair(std::to_string(i), i * 765)); fieldVfoo[0] = {(int)i * 765}; fieldVfoo[2] = {(int)i * 987}; + *fieldQux = i * 777; ntuple->Fill(); } } // Now merge the inputs - FileRaii fileGuard3("test_ntuple_merge_out.root"); + FileRaii fileGuard3("test_ntuple_merge_lmext_out.root"); { // Gather the input sources std::vector> sources; @@ -1077,6 +1079,7 @@ TEST(RNTupleMerger, MergeLateModelExtension) auto vfoo = ntuple->GetModel().GetDefaultEntry().GetPtr[3]>("vfoo"); auto bar = ntuple->GetModel().GetDefaultEntry().GetPtr("bar"); auto baz = ntuple->GetModel().GetDefaultEntry().GetPtr("baz"); + auto qux = ntuple->GetModel().GetDefaultEntry().GetPtr("qux"); for (int i = 0; i < 10; ++i) { ntuple->LoadEntry(i); @@ -1086,6 +1089,7 @@ TEST(RNTupleMerger, MergeLateModelExtension) ASSERT_TRUE(vfoo[1].empty()); ASSERT_EQ(*bar, i * 321); ASSERT_EQ(*baz, 0); + ASSERT_EQ(*qux, 0); } for (int i = 10; i < 20; ++i) { ntuple->LoadEntry(i); @@ -1095,6 +1099,7 @@ TEST(RNTupleMerger, MergeLateModelExtension) ASSERT_TRUE(vfoo[1].empty()); ASSERT_EQ(*bar, 0); ASSERT_EQ(*baz, (i - 10) * 567); + ASSERT_EQ(*qux, (i - 10) * 777); } } } @@ -1182,8 +1187,10 @@ TEST(RNTupleMerger, DifferentCompatibleRepresentations) auto model = RNTupleModel::Create(); auto pFoo = model->MakeField("foo"); auto clonedModel = model->Clone(); + auto wopts = RNTupleWriteOptions(); + wopts.SetCompression(0); { - auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath()); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath(), wopts); for (size_t i = 0; i < 10; ++i) { *pFoo = i * 123; ntuple->Fill(); @@ -1195,12 +1202,12 @@ TEST(RNTupleMerger, DifferentCompatibleRepresentations) { auto &fieldFooDbl = clonedModel->GetMutableField("foo"); fieldFooDbl.SetColumnRepresentatives({{ROOT::ENTupleColumnType::kReal32}}); - auto ntuple = RNTupleWriter::Recreate(std::move(clonedModel), "ntuple", fileGuard2.GetPath()); + auto ntuple = RNTupleWriter::Recreate(std::move(clonedModel), "ntuple", fileGuard2.GetPath(), wopts); auto e = ntuple->CreateEntry(); auto pFoo2 = e->GetPtr("foo"); for (size_t i = 0; i < 10; ++i) { *pFoo2 = i * 567; - ntuple->Fill(); + ntuple->Fill(*e); } } @@ -1220,30 +1227,18 @@ TEST(RNTupleMerger, DifferentCompatibleRepresentations) auto sourcePtrs2 = sourcePtrs; { - auto wopts = RNTupleWriteOptions(); - wopts.SetCompression(0); auto destination = std::make_unique("ntuple", fileGuard3.GetPath(), wopts); auto opts = RNTupleMergeOptions(); opts.fCompressionSettings = 0; RNTupleMerger merger{std::move(destination)}; auto res = merger.Merge(sourcePtrs, opts); - // TODO(gparolini): we want to support this in the future - EXPECT_FALSE(bool(res)); - if (res.GetError()) { - EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("different column type")); - } - // EXPECT_TRUE(bool(res)); + EXPECT_TRUE(bool(res)); } { auto destination = std::make_unique("ntuple", fileGuard4.GetPath(), RNTupleWriteOptions()); RNTupleMerger merger{std::move(destination)}; auto res = merger.Merge(sourcePtrs); - // TODO(gparolini): we want to support this in the future - EXPECT_FALSE(bool(res)); - if (res.GetError()) { - EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("different column type")); - } - // EXPECT_TRUE(bool(res)); + EXPECT_TRUE(bool(res)); } } } @@ -1513,6 +1508,113 @@ TEST(RNTupleMerger, MergeProjectedFieldsMultiple) } } +TEST(RNTupleMerger, MergeProjectedFieldsDifferentCompression) +{ + // Verify that we correctly handle projected fields with different compressions + FileRaii fileGuard1("test_ntuple_merge_proj_diff_comp_in_1.root"); + { + auto model = RNTupleModel::Create(); + auto fieldInt = model->MakeField("int"); + auto fieldFlt = model->MakeField("flt"); + auto projIntProj = std::make_unique>("intProj"); + model->AddProjectedField(std::move(projIntProj), [](const std::string &) { return "int"; }); + auto projFltProj = std::make_unique>("fltProj"); + model->AddProjectedField(std::move(projFltProj), [](const std::string &) { return "flt"; }); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath()); + for (size_t i = 0; i < 10; ++i) { + *fieldInt = i * 123; + *fieldFlt = i * 456; + ntuple->Fill(); + } + } + FileRaii fileGuard2("test_ntuple_merge_proj_diff_comp_in_2.root"); + { + auto model = RNTupleModel::Create(); + auto fieldInt = model->MakeField("int"); + auto fieldFlt = model->MakeField("flt"); + auto projIntProj = std::make_unique>("intProj"); + model->AddProjectedField(std::move(projIntProj), [](const std::string &) { return "int"; }); + auto projFltProj = std::make_unique>("fltProj"); + model->AddProjectedField(std::move(projFltProj), [](const std::string &) { return "flt"; }); + auto wopts = RNTupleWriteOptions(); + wopts.SetCompression(0); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath(), wopts); + for (size_t i = 0; i < 10; ++i) { + *fieldInt = (i + 10) * 123; + *fieldFlt = (i + 10) * 456; + ntuple->Fill(); + } + } + + FileRaii fileGuard3("test_ntuple_merge_proj_diff_comp_out.root"); + { + // Gather the input sources + std::vector> sources; + sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions())); + sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions())); + std::vector sourcePtrs; + for (const auto &s : sources) { + sourcePtrs.push_back(s.get()); + } + + // Now merge the inputs + auto destination = std::make_unique("ntuple", fileGuard3.GetPath(), RNTupleWriteOptions()); + RNTupleMerger merger{std::move(destination)}; + auto res = merger.Merge(sourcePtrs); + EXPECT_TRUE(bool(res)); + } + { + auto ntuple1 = RNTupleReader::Open("ntuple", fileGuard1.GetPath()); + auto ntuple2 = RNTupleReader::Open("ntuple", fileGuard2.GetPath()); + auto ntuple3 = RNTupleReader::Open("ntuple", fileGuard3.GetPath()); + EXPECT_EQ(ntuple1->GetNEntries() + ntuple2->GetNEntries(), ntuple3->GetNEntries()); + const auto &desc1 = ntuple1->GetDescriptor(); + const auto nAliasColumns1 = desc1.GetNLogicalColumns() - desc1.GetNPhysicalColumns(); + EXPECT_EQ(nAliasColumns1, 2); + const auto &desc2 = ntuple2->GetDescriptor(); + const auto nAliasColumns2 = desc2.GetNLogicalColumns() - desc2.GetNPhysicalColumns(); + EXPECT_EQ(nAliasColumns2, 2); + const auto &desc3 = ntuple3->GetDescriptor(); + const auto nAliasColumns3 = desc3.GetNLogicalColumns() - desc3.GetNPhysicalColumns(); + EXPECT_EQ(nAliasColumns3, 4); + + auto int1 = ntuple1->GetModel().GetDefaultEntry().GetPtr("int"); + auto int2 = ntuple2->GetModel().GetDefaultEntry().GetPtr("int"); + auto int3 = ntuple3->GetModel().GetDefaultEntry().GetPtr("int"); + auto intProj1 = ntuple1->GetModel().GetDefaultEntry().GetPtr("intProj"); + auto intProj2 = ntuple2->GetModel().GetDefaultEntry().GetPtr("intProj"); + auto intProj3 = ntuple3->GetModel().GetDefaultEntry().GetPtr("intProj"); + + auto flt1 = ntuple1->GetModel().GetDefaultEntry().GetPtr("flt"); + auto flt2 = ntuple2->GetModel().GetDefaultEntry().GetPtr("flt"); + auto flt3 = ntuple3->GetModel().GetDefaultEntry().GetPtr("flt"); + auto fltProj1 = ntuple1->GetModel().GetDefaultEntry().GetPtr("fltProj"); + auto fltProj2 = ntuple2->GetModel().GetDefaultEntry().GetPtr("fltProj"); + auto fltProj3 = ntuple3->GetModel().GetDefaultEntry().GetPtr("fltProj"); + + for (auto i = 0u; i < ntuple1->GetNEntries(); ++i) { + ntuple1->LoadEntry(i); + ntuple3->LoadEntry(i); + EXPECT_EQ(*int1, *int3); + EXPECT_EQ(*intProj1, *intProj3); + EXPECT_FLOAT_EQ(*flt1, *flt3); + EXPECT_FLOAT_EQ(*fltProj1, *fltProj3); + EXPECT_FLOAT_EQ(*fltProj1, *flt1); + EXPECT_FLOAT_EQ(*fltProj3, *flt3); + } + for (auto i = 0u; i < ntuple2->GetNEntries(); ++i) { + ntuple2->LoadEntry(i); + ntuple3->LoadEntry(ntuple1->GetNEntries() + i); + EXPECT_EQ(*int2, *int3); + EXPECT_EQ(*intProj2, *intProj3); + EXPECT_FLOAT_EQ(*flt2, *flt3); + EXPECT_FLOAT_EQ(*fltProj2, *fltProj3); + EXPECT_FLOAT_EQ(*fltProj2, *flt2); + EXPECT_FLOAT_EQ(*fltProj3, *flt3); + } + } +} + TEST(RNTupleMerger, MergeProjectedFieldsOnlyFirst) { // Merge two files where the first has a projection and the second doesn't, and verify that we can @@ -1533,7 +1635,9 @@ TEST(RNTupleMerger, MergeProjectedFieldsOnlyFirst) { auto model = RNTupleModel::Create(); auto fieldFoo = model->MakeField("foo"); - auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath()); + auto wopts = RNTupleWriteOptions(); + wopts.SetCompression(0); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath(), wopts); for (size_t i = 0; i < 10; ++i) { *fieldFoo = i * 123; ntuple->Fill(); @@ -1567,16 +1671,18 @@ TEST(RNTupleMerger, MergeProjectedFieldsOnlyFirst) auto ntuple1 = RNTupleReader::Open("ntuple", fileGuard1.GetPath()); auto ntuple2 = RNTupleReader::Open("ntuple", fileGuard2.GetPath()); auto ntuple3 = RNTupleReader::Open("ntuple", fileGuardOut.GetPath()); - ASSERT_EQ(ntuple1->GetNEntries() + ntuple2->GetNEntries(), ntuple3->GetNEntries()); + EXPECT_EQ(ntuple1->GetNEntries() + ntuple2->GetNEntries(), ntuple3->GetNEntries()); const auto &desc1 = ntuple1->GetDescriptor(); const auto &desc2 = ntuple2->GetDescriptor(); const auto &desc3 = ntuple3->GetDescriptor(); const auto nAliasColumns1 = desc1.GetNLogicalColumns() - desc1.GetNPhysicalColumns(); const auto nAliasColumns2 = desc2.GetNLogicalColumns() - desc2.GetNPhysicalColumns(); const auto nAliasColumns3 = desc3.GetNLogicalColumns() - desc3.GetNPhysicalColumns(); - ASSERT_EQ(nAliasColumns1, 1); - ASSERT_EQ(nAliasColumns2, 0); - ASSERT_EQ(nAliasColumns3, 1); + EXPECT_EQ(nAliasColumns1, 1); + EXPECT_EQ(nAliasColumns2, 0); + // The output RNTuple has 2 alias columns because one was created by the merger to point to the extended + // column that was added to field "foo" (since source 2 had a different encoding than source 1). + EXPECT_EQ(nAliasColumns3, 2); auto foo1 = ntuple1->GetModel().GetDefaultEntry().GetPtr("foo"); auto foo2 = ntuple2->GetModel().GetDefaultEntry().GetPtr("foo"); @@ -1588,16 +1694,16 @@ TEST(RNTupleMerger, MergeProjectedFieldsOnlyFirst) for (auto i = 0u; i < ntuple1->GetNEntries(); ++i) { ntuple1->LoadEntry(i); ntuple3->LoadEntry(i); - ASSERT_EQ(*foo1, *foo3); - ASSERT_EQ(*bar1, *foo3); - ASSERT_EQ(*bar1, *bar3); + EXPECT_EQ(*foo1, *foo3); + EXPECT_EQ(*bar1, *foo3); + EXPECT_EQ(*bar1, *bar3); } for (auto i = 0u; i < ntuple2->GetNEntries(); ++i) { ntuple2->LoadEntry(i); ntuple3->LoadEntry(ntuple1->GetNEntries() + i); - ASSERT_EQ(*foo2, *foo3); + EXPECT_EQ(*foo2, *foo3); // we should be able to read the data from the second ntuple using the projection defined in the first. - ASSERT_EQ(*foo2, *bar3); + EXPECT_EQ(*foo2, *bar3); } } } @@ -2540,7 +2646,8 @@ TEST(RNTupleMerger, MergeDeferredAdvanced) auto model1 = RNTupleModel::Create(); auto wopts = RNTupleWriteOptions(); wopts.SetCompression(0); - auto writer1 = RNTupleWriter::Recreate(std::move(model1), "ntuple", fileGuard1.GetPath(), wopts); + auto tfile = TFile::Open(fileGuard1.GetPath().c_str(), "RECREATE"); + auto writer1 = RNTupleWriter::Append(std::move(model1), "ntuple", *tfile, wopts); auto updater = writer1->CreateModelUpdater(); updater->BeginUpdate(); updater->AddField(RFieldBase::Create("flt", "float").Unwrap()); @@ -2613,7 +2720,7 @@ TEST(RNTupleMerger, MergeDeferredAdvanced) auto pInt = reader->GetModel().GetDefaultEntry().GetPtr("int"); auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr("flt"); - for (auto i = 0u; i < reader->GetNEntries(); ++i) { + for (auto i : reader->GetEntryRange()) { reader->LoadEntry(i); float expectedFlt = (i >= 10 && i < 15) ? 0 : i; EXPECT_FLOAT_EQ(*pFlt, expectedFlt); From cd91ac297bff57e5307e2709c94cad3e47ba83f2 Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 17 Jun 2026 16:43:26 +0200 Subject: [PATCH 11/17] [ntuple] Merger: extract column repr matching into a separate function --- tree/ntuple/src/RNTupleMerger.cxx | 214 ++++++++++++++++-------------- 1 file changed, 116 insertions(+), 98 deletions(-) diff --git a/tree/ntuple/src/RNTupleMerger.cxx b/tree/ntuple/src/RNTupleMerger.cxx index b50edf9382f1e..01239ff10a46f 100644 --- a/tree/ntuple/src/RNTupleMerger.cxx +++ b/tree/ntuple/src/RNTupleMerger.cxx @@ -443,6 +443,121 @@ std::ostream &operator<<(std::ostream &os, const std::optional &errors) +{ + const auto &srcColumns = srcField.GetLogicalColumnIds(); + const auto &dstColumns = dstField.GetLogicalColumnIds(); + + // Fields must have the same number of columns + const auto srcNCols = srcColumns.size(); + const auto dstNCols = dstColumns.size(); + if (srcNCols != dstNCols) { + std::stringstream ss; + ss << "Field `" << srcField.GetFieldName() + << "` has a different number of columns than previously-seen field with the same name (old: " << dstNCols + << ", new: " << srcNCols << ")"; + errors.push_back(ss.str()); + return; + } + + // Fields must have the same cardinality + const std::uint32_t srcColCardinality = srcField.GetColumnCardinality(); + const std::uint32_t dstColCardinality = dstField.GetColumnCardinality(); + if (srcColCardinality != dstColCardinality) { + std::stringstream ss; + ss << "Field `" << srcField.GetFieldName() + << "` has a different column cardinality than previously-seen field with the same name (old: " + << dstColCardinality << ", new: " << srcColCardinality << ")"; + errors.push_back(ss.str()); + return; + } + + if (srcColCardinality == 0) + return; // no columns to match + + const auto srcNColReprs = srcNCols / srcColCardinality; + const auto dstNColReprs = dstNCols / dstColCardinality; + + // For each column representation of the source, check if it matches one in the descriptor. + // If so, and if it doesn't match the destination's repr index, add a mapping for it. + // If nothing matches, schedule the column representation to be added later. + // NOTE: this has quadratic complexity but the numbers involved are small so it's fine. + for (auto srcReprIdx = 0u; srcReprIdx < srcNColReprs; ++srcReprIdx) { + std::int64_t matchingRepr = -1; + for (auto dstReprIdx = 0u; dstReprIdx < dstNColReprs; ++dstReprIdx) { + bool matches = true; + for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { + const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; + const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId); + const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; + const auto &dstCol = dstDesc.GetColumnDescriptor(dstColId); + if (srcCol.GetType() != dstCol.GetType()) { + matches = false; + break; + } + } + + if (matches) { + // If this column representation matches by column type, we need to make sure that it also has + // matching column metadata. Since we currently do not support multiple column representations + // that only differ by such metadata, we forbid merging such columns (e.g. we cannot merge two + // Real32Trunc columns with different bit widths). This could technically be supported, but it + // would require significant effort, so we currently don't. + for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { + const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; + const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId); + const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; + const auto &dstCol = dstDesc.GetColumnDescriptor(dstColId); + if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() || + srcCol.GetValueRange() != dstCol.GetValueRange()) { + std::stringstream ss; + ss << "Source field `" << srcField.GetFieldName() + << "` has a matching column representation as its destination field, however one or " + "more " + "of its columns have different column metadata (bit width and/or value range). " + "Merging variable-sized columns is currently only supported if all metadata is " + "identical between source and destination columns." + << "\n bit width src: " << srcCol.GetBitsOnStorage() << ", dst: " << dstCol.GetBitsOnStorage() + << "" + << "\n value range src: " << srcCol.GetValueRange() << ", dst: " << dstCol.GetValueRange(); + errors.push_back(ss.str()); + break; + } + } + + // We found a valid matching representation: break the loop on the dst column representations. + matchingRepr = dstReprIdx; + break; + } + } + + if (errors.empty()) { + if (matchingRepr >= 0 && matchingRepr != srcReprIdx) { + // a different matching representation was found + assert(matchingRepr < std::numeric_limits::max()); + result.fColReprMappings[&dstField].push_back( + RColReprMapping{srcReprIdx, static_cast(matchingRepr)}); + } else if (matchingRepr < 0) { + // this representation was not found in the destination + assert(dstNColReprs < std::numeric_limits::max()); + ROOT::RFieldBase::ColumnRepresentation_t newRepr; + for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { + const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; + const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId); + newRepr.push_back(srcCol.GetType()); + } + RColReprExtension extension{{srcReprIdx, static_cast(dstNColReprs)}, newRepr}; + result.fColReprExtensions[&dstField].push_back(extension); + result.fColReprMappings[&dstField].push_back(extension); + } + } + } +} + /// Compares the top level fields of `dst` and `src` and determines whether they can be merged or not. /// In addition, returns the differences between `dst` and `src`'s structures static ROOT::RResult @@ -560,104 +675,7 @@ CompareDescriptorStructure(const ROOT::RNTupleDescriptor &dst, const ROOT::RNTup // Require that column representations match if (!field.fSrc->IsProjectedField()) { - const auto &srcColumns = field.fSrc->GetLogicalColumnIds(); - const auto &dstColumns = field.fDst->GetLogicalColumnIds(); - const auto srcNCols = srcColumns.size(); - const auto dstNCols = dstColumns.size(); - if (srcNCols != dstNCols) { - std::stringstream ss; - ss << "Field `" << field.fSrc->GetFieldName() - << "` has a different number of columns than previously-seen field with the same name (old: " << dstNCols - << ", new: " << srcNCols << ")"; - errors.push_back(ss.str()); - } else { - const std::uint32_t srcColCardinality = field.fSrc->GetColumnCardinality(); - const std::uint32_t dstColCardinality = field.fDst->GetColumnCardinality(); - if (srcColCardinality != dstColCardinality) { - std::stringstream ss; - ss << "Field `" << field.fSrc->GetFieldName() - << "` has a different column cardinality than previously-seen field with the same name (old: " - << dstColCardinality << ", new: " << srcColCardinality << ")"; - errors.push_back(ss.str()); - } else if (srcColCardinality > 0) { - const auto srcNColReprs = srcNCols / srcColCardinality; - const auto dstNColReprs = dstNCols / dstColCardinality; - - // For each column representation of the source, check if it matches one in the descriptor. - // If so, and if it doesn't match the destination's repr index, add a mapping for it. - // If nothing matches, schedule the column representation to be added later. - // NOTE: this has quadratic complexity but the numbers involved are small so it's fine. - for (auto srcReprIdx = 0u; srcReprIdx < srcNColReprs; ++srcReprIdx) { - std::int64_t matchingRepr = -1; - for (auto dstReprIdx = 0u; dstReprIdx < dstNColReprs; ++dstReprIdx) { - bool matches = true; - for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { - const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; - const auto &srcCol = src.GetColumnDescriptor(srcColId); - const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; - const auto &dstCol = dst.GetColumnDescriptor(dstColId); - if (srcCol.GetType() != dstCol.GetType()) { - matches = false; - break; - } - } - - if (matches) { - // If this column representation matches by column type, we need to make sure that it also has - // matching column metadata. Since we currently do not support multiple column representations - // that only differ by such metadata, we forbid merging such columns (e.g. we cannot merge two - // Real32Trunc columns with different bit widths). This could technically be supported, but it - // would require significant effort, so we currently don't. - for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { - const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; - const auto &srcCol = src.GetColumnDescriptor(srcColId); - const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; - const auto &dstCol = dst.GetColumnDescriptor(dstColId); - if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() || - srcCol.GetValueRange() != dstCol.GetValueRange()) { - std::stringstream ss; - ss << "Source field `" << field.fSrc->GetFieldName() - << "` has a matching column representation as its destination field, however one or " - "more " - "of its columns have different column metadata (bit width and/or value range). " - "Merging variable-sized columns is currently only supported if all metadata is " - "identical between source and destination columns." - << "\n bit width src: " << srcCol.GetBitsOnStorage() - << ", dst: " << dstCol.GetBitsOnStorage() << "" - << "\n value range src: " << srcCol.GetValueRange() - << ", dst: " << dstCol.GetValueRange(); - errors.push_back(ss.str()); - break; - } - } - matchingRepr = dstReprIdx; - break; - } - } - - if (errors.empty()) { - if (matchingRepr >= 0 && matchingRepr != srcReprIdx) { - // a different matching representation was found - assert(matchingRepr < std::numeric_limits::max()); - res.fColReprMappings[field.fDst].push_back( - RColReprMapping{srcReprIdx, static_cast(matchingRepr)}); - } else if (matchingRepr < 0) { - // this representation was not found in the destination - assert(dstNColReprs < std::numeric_limits::max()); - ROOT::RFieldBase::ColumnRepresentation_t newRepr; - for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { - const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; - const auto &srcCol = src.GetColumnDescriptor(srcColId); - newRepr.push_back(srcCol.GetType()); - } - RColReprExtension extension{{srcReprIdx, static_cast(dstNColReprs)}, newRepr}; - res.fColReprExtensions[field.fDst].push_back(extension); - res.fColReprMappings[field.fDst].push_back(extension); - } - } - } - } - } + MatchColumnRepresentations(src, dst, *field.fSrc, *field.fDst, res, errors); } // Require that subfields are compatible From ed6f08eb29d9f78b42dcc20310e311abbde0d39e Mon Sep 17 00:00:00 2001 From: silverweed Date: Mon, 8 Jun 2026 15:39:53 +0200 Subject: [PATCH 12/17] [ntuple] Merger: accept different sizes for fields' logical column ids Two field descriptors might legitimately have different numbers of elements in their LogicalColumnIds vectors due to having a different number of representations. The proper check to do is the one on the column cardinality, which makes sure that the same number of columns is actually used at a time. --- tree/ntuple/src/RNTupleMerger.cxx | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/tree/ntuple/src/RNTupleMerger.cxx b/tree/ntuple/src/RNTupleMerger.cxx index 01239ff10a46f..16be7a1ca3f01 100644 --- a/tree/ntuple/src/RNTupleMerger.cxx +++ b/tree/ntuple/src/RNTupleMerger.cxx @@ -452,18 +452,6 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c const auto &srcColumns = srcField.GetLogicalColumnIds(); const auto &dstColumns = dstField.GetLogicalColumnIds(); - // Fields must have the same number of columns - const auto srcNCols = srcColumns.size(); - const auto dstNCols = dstColumns.size(); - if (srcNCols != dstNCols) { - std::stringstream ss; - ss << "Field `" << srcField.GetFieldName() - << "` has a different number of columns than previously-seen field with the same name (old: " << dstNCols - << ", new: " << srcNCols << ")"; - errors.push_back(ss.str()); - return; - } - // Fields must have the same cardinality const std::uint32_t srcColCardinality = srcField.GetColumnCardinality(); const std::uint32_t dstColCardinality = dstField.GetColumnCardinality(); @@ -479,8 +467,8 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c if (srcColCardinality == 0) return; // no columns to match - const auto srcNColReprs = srcNCols / srcColCardinality; - const auto dstNColReprs = dstNCols / dstColCardinality; + const auto srcNColReprs = srcColumns.size() / srcColCardinality; + const auto dstNColReprs = dstColumns.size() / dstColCardinality; // For each column representation of the source, check if it matches one in the descriptor. // If so, and if it doesn't match the destination's repr index, add a mapping for it. From 353fe6f3bb0bd4a7e7c5476ffaa755546810fb75 Mon Sep 17 00:00:00 2001 From: silverweed Date: Mon, 8 Jun 2026 15:36:55 +0200 Subject: [PATCH 13/17] [ntuple] Add new merger test This test checks two things: 1. that the fix applied by 14ade04188a29b2821e7126c11d5ab9769129840 works properly (this is already checked by MergeDeferredAdvanced but that might not be the case anymore if we update the merger to change the column encodings depending on the output compression - in which case that test would not be adding a late extended column anymore); 2. that the merger properly handles fields with the same cardinality but different number of column representations --- tree/ntuple/test/ntuple_merger.cxx | 87 ++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tree/ntuple/test/ntuple_merger.cxx b/tree/ntuple/test/ntuple_merger.cxx index 56467baf013b2..d5f4cafd865cf 100644 --- a/tree/ntuple/test/ntuple_merger.cxx +++ b/tree/ntuple/test/ntuple_merger.cxx @@ -2729,6 +2729,93 @@ TEST(RNTupleMerger, MergeDeferredAdvanced) } } +TEST(RNTupleMerger, MergeDeferredAdvanced2) +{ + // Like MergeDeferredAdvanced, but make sure the fields have different column types so that the merging produces + // a late-extended column. + FileRaii fileGuard1("test_ntuple_merge_deferred_adv2_in_1.root"); + FileRaii fileGuard2("test_ntuple_merge_deferred_adv2_in_2.root"); + + // First RNTuple with late model extended field "flt" (column is not deferred because it's still entry 0) + { + auto model1 = RNTupleModel::Create(); + auto wopts = RNTupleWriteOptions(); + wopts.SetCompression(0); + auto tfile = TFile::Open(fileGuard1.GetPath().c_str(), "RECREATE"); + auto writer1 = RNTupleWriter::Append(std::move(model1), "ntuple", *tfile, wopts); + auto updater = writer1->CreateModelUpdater(); + auto field = std::make_unique>("flt"); + field->SetHalfPrecision(); + updater->BeginUpdate(); + updater->AddField(std::move(field)); + updater->CommitUpdate(); + auto pFlt1 = writer1->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *pFlt1 = i; + writer1->Fill(); + } + } + + // Second RNTuple with late model extended field "flt" + { + auto model2 = RNTupleModel::Create(); + // Add a non-late model extended field so we can write some entries before we extend the model and obtain + // actual deferred columns in the extension header. + auto pInt = model2->MakeField("int"); + auto wopts = RNTupleWriteOptions(); + wopts.SetCompression(0); + auto writer2 = RNTupleWriter::Recreate(std::move(model2), "ntuple", fileGuard2.GetPath(), wopts); + for (int i = 0; i < 5; ++i) { + *pInt = 10 + i; + writer2->Fill(); + } + auto updater = writer2->CreateModelUpdater(); + auto field = std::make_unique>("flt"); + field->SetColumnRepresentatives({{ROOT::ENTupleColumnType::kReal32}}); + updater->BeginUpdate(); + updater->AddField(std::move(field)); + updater->CommitUpdate(); + auto pFlt2 = writer2->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 5; i < 10; ++i) { + *pInt = 10 + i; + *pFlt2 = 10 + i; + writer2->Fill(); + } + } + + // Now merge them + std::vector> sources; + sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions())); + sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions())); + std::vector sourcePtrs; + for (const auto &s : sources) { + sourcePtrs.push_back(s.get()); + } + + FileRaii fileGuardOut("test_ntuple_merge_deferred_adv2_out.root"); + auto wopts = RNTupleWriteOptions(); + wopts.SetCompression(0); + auto destination = std::make_unique("ntuple", fileGuardOut.GetPath(), wopts); + RNTupleMerger merger{std::move(destination)}; + auto opts = RNTupleMergeOptions(); + opts.fMergingMode = ENTupleMergingMode::kUnion; + auto res = merger.Merge(sourcePtrs, opts); + ASSERT_TRUE(bool(res)); + + auto reader = RNTupleReader::Open("ntuple", fileGuardOut.GetPath()); + EXPECT_EQ(reader->GetNEntries(), 20); + + auto pInt = reader->GetModel().GetDefaultEntry().GetPtr("int"); + auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr("flt"); + for (auto i : reader->GetEntryRange()) { + reader->LoadEntry(i); + float expectedFlt = (i >= 10 && i < 15) ? 0 : i; + EXPECT_FLOAT_EQ(*pFlt, expectedFlt); + int expectedInt = (i >= 10 && i < 20) * i; + EXPECT_EQ(*pInt, expectedInt); + } +} + TEST(RNTupleMerger, MergeIncrementalLMExt) { // Create the input files: From 5ab7637240d8a0de3d35ec9d1aa2694dd5af59e0 Mon Sep 17 00:00:00 2001 From: silverweed Date: Tue, 28 Apr 2026 11:01:45 +0200 Subject: [PATCH 14/17] [ntuple] support merging columns with metadata (with different types) --- tree/ntuple/inc/ROOT/RPageStorage.hxx | 8 +- tree/ntuple/src/RNTupleMerger.cxx | 14 +- tree/ntuple/src/RPageStorage.cxx | 22 ++-- tree/ntuple/test/ntuple_merger.cxx | 181 ++++++++++++++++++++++++++ 4 files changed, 211 insertions(+), 14 deletions(-) diff --git a/tree/ntuple/inc/ROOT/RPageStorage.hxx b/tree/ntuple/inc/ROOT/RPageStorage.hxx index 9fbc3a10400fd..643c5c592a697 100644 --- a/tree/ntuple/inc/ROOT/RPageStorage.hxx +++ b/tree/ntuple/inc/ROOT/RPageStorage.hxx @@ -543,10 +543,16 @@ public: [[nodiscard]] std::unique_ptr InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters); + struct RColumnReprElement { + ENTupleColumnType fType = ENTupleColumnType::kUnknown; + // 0 means "use default". Only valid for fixed-bitwidth column types. + std::uint16_t fBitWidth = 0; + std::optional fValueRange; + }; /// Adds a new column representation to the given field. /// \return The physical id of the first newly added column. ROOT::DescriptorId_t - AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span newRepresentation); + AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span newRepresentation); /// Adds a new alias column pointing to an existing column with the given physical id to the given field. void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field, diff --git a/tree/ntuple/src/RNTupleMerger.cxx b/tree/ntuple/src/RNTupleMerger.cxx index 16be7a1ca3f01..bbf08a2cda3b7 100644 --- a/tree/ntuple/src/RNTupleMerger.cxx +++ b/tree/ntuple/src/RNTupleMerger.cxx @@ -342,10 +342,10 @@ struct RColReprMapping { }; /// A column extension that needs to be added to an output field. -/// Note that this also adds a mapping for the new representation, which is why this inherits RColReprMapping. +/// Note that this also adds a mapping for each new representation, which is why it inherits RColReprMapping. struct RColReprExtension : RColReprMapping { - /// The new representation to be added - ROOT::RFieldBase::ColumnRepresentation_t fSourceRepr; + /// The new representations to be added + std::vector fSourceRepr; }; static std::optional @@ -532,11 +532,15 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c } else if (matchingRepr < 0) { // this representation was not found in the destination assert(dstNColReprs < std::numeric_limits::max()); - ROOT::RFieldBase::ColumnRepresentation_t newRepr; + std::vector newRepr; + newRepr.reserve(srcColCardinality); for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId); - newRepr.push_back(srcCol.GetType()); + auto &reprElement = newRepr.emplace_back(); + reprElement.fType = srcCol.GetType(); + reprElement.fBitWidth = srcCol.GetBitsOnStorage(); + reprElement.fValueRange = srcCol.GetValueRange(); } RColReprExtension extension{{srcReprIdx, static_cast(dstNColReprs)}, newRepr}; result.fColReprExtensions[&dstField].push_back(extension); diff --git a/tree/ntuple/src/RPageStorage.cxx b/tree/ntuple/src/RPageStorage.cxx index 282dfc58ea320..cb516d7306df1 100644 --- a/tree/ntuple/src/RPageStorage.cxx +++ b/tree/ntuple/src/RPageStorage.cxx @@ -1146,7 +1146,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr ROOT::DescriptorId_t ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field, - std::span newRepresentation) + std::span newRepresentation) { const auto &descriptor = fDescriptorBuilder.GetDescriptor(); @@ -1162,10 +1162,15 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD fDescriptorBuilder.ShiftAliasColumns(newRepresentation.size()); std::uint16_t columnIndex = 0; // index into the representation - for (auto columnType : newRepresentation) { - // Extending columns with variable bit width is currently unsupported. - const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnType); - R__ASSERT(rangeMin == rangeMax); + for (auto columnRepr : newRepresentation) { + std::size_t bitsOnStorage = columnRepr.fBitWidth; + if (!bitsOnStorage) { + const auto [rangeMin, rangeMax] = ROOT::Internal::RColumnElementBase::GetValidBitRange(columnRepr.fType); + if (rangeMin != rangeMax) { + throw ROOT::RException(R__FAIL("bit width must be given for columns of variable bit width")); + } + bitsOnStorage = rangeMin; + } const ROOT::DescriptorId_t firstReprColumnId = field.GetLogicalColumnIds()[columnIndex]; const auto &firstReprColumnRange = fOpenColumnRanges.at(firstReprColumnId); @@ -1175,12 +1180,13 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD columnBuilder.LogicalColumnId(columnId) .PhysicalColumnId(columnId) .FieldId(field.GetId()) - .BitsOnStorage(rangeMax) - .Type(columnType) + .BitsOnStorage(bitsOnStorage) + .Type(columnRepr.fType) .Index(columnIndex) // NOTE: marking this column as suppressed with the minus sign .FirstElementIndex(-firstReprColumnRange.GetFirstElementIndex()) - .RepresentationIndex(reprIndex); + .RepresentationIndex(reprIndex) + .ValueRange(columnRepr.fValueRange); fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap()); for (auto parentId = field.GetParentId(); parentId != ROOT::kInvalidDescriptorId;) { diff --git a/tree/ntuple/test/ntuple_merger.cxx b/tree/ntuple/test/ntuple_merger.cxx index d5f4cafd865cf..ab2eccada9ba6 100644 --- a/tree/ntuple/test/ntuple_merger.cxx +++ b/tree/ntuple/test/ntuple_merger.cxx @@ -4225,3 +4225,184 @@ TEST(RNTupleMerger, MergeNewerVersion) } } } + +TEST(RNTupleMerger, MergeReal32Trunc) +{ + // Merge two files, both containing the same Real32Trunc-encoded field, but with different bit widths. + FileRaii fileGuard1("test_ntuple_merge_real32trunc_in_1.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("flt"); + field->SetTruncated(14); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath()); + auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *fieldFlt = i; + ntuple->Fill(); + } + } + FileRaii fileGuard2("test_ntuple_merge_real32trunc_in_2.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("flt"); + field->SetTruncated(24); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath()); + auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *fieldFlt = 10 + i; + ntuple->Fill(); + } + } + { + // Gather the input sources + std::vector> sources; + sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions())); + sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions())); + std::vector sourcePtrs; + for (const auto &s : sources) { + sourcePtrs.push_back(s.get()); + } + + // Now merge the inputs + for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) { + SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode)); + FileRaii fileGuardOut("test_ntuple_merge_real32trunc_out.root"); + { + auto destination = std::make_unique("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions()); + RNTupleMerger merger{std::move(destination)}; + RNTupleMergeOptions opts; + opts.fMergingMode = mmode; + auto res = merger.Merge(sourcePtrs, opts); + // Currently we're not supporting merging columns with the same type but different metadata. + // TODO: support this. + EXPECT_FALSE(bool(res)); + EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata")); + } + } + } +} + +TEST(RNTupleMerger, MergeReal32Quant) +{ + // Merge two files, both containing the same Real32Quant-encoded field, but with different value ranges. + FileRaii fileGuard1("test_ntuple_merge_real32quant_in_1.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("flt"); + field->SetQuantized(20, {0., 100.}); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath()); + auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *fieldFlt = i; + ntuple->Fill(); + } + } + FileRaii fileGuard2("test_ntuple_merge_real32quant_in_2.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("flt"); + field->SetQuantized(20, {-100., 100.}); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath()); + auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *fieldFlt = 10 + i; + ntuple->Fill(); + } + } + { + // Gather the input sources + std::vector> sources; + sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions())); + sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions())); + std::vector sourcePtrs; + for (const auto &s : sources) { + sourcePtrs.push_back(s.get()); + } + + // Now merge the inputs + for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) { + SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode)); + FileRaii fileGuardOut("test_ntuple_merge_real32quant_out.root"); + { + auto destination = std::make_unique("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions()); + RNTupleMerger merger{std::move(destination)}; + RNTupleMergeOptions opts; + opts.fMergingMode = mmode; + auto res = merger.Merge(sourcePtrs, opts); + // Currently we're not supporting merging columns with the same type but different metadata. + // TODO: support this. + ASSERT_FALSE(bool(res)); + EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata")); + } + } + } +} + +TEST(RNTupleMerger, MergeReal32TruncQuantMixed) +{ + // Merge two files, both containing the same field, but with the first being Real32Trunc and the second Real32Quant + FileRaii fileGuard1("test_ntuple_merge_real32truncquant_in_1.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("flt"); + field->SetTruncated(24); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath()); + auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *fieldFlt = i; + ntuple->Fill(); + } + } + FileRaii fileGuard2("test_ntuple_merge_real32truncquant_in_2.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("flt"); + field->SetQuantized(20, {-1., 100.}); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath()); + auto fieldFlt = ntuple->GetModel().GetDefaultEntry().GetPtr("flt"); + for (int i = 0; i < 10; ++i) { + *fieldFlt = 10 + i; + ntuple->Fill(); + } + } + { + // Gather the input sources + std::vector> sources; + sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions())); + sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions())); + std::vector sourcePtrs; + for (const auto &s : sources) { + sourcePtrs.push_back(s.get()); + } + + // Now merge the inputs + for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) { + SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode)); + FileRaii fileGuardOut("test_ntuple_merge_real32truncquant_out.root"); + { + auto destination = std::make_unique("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions()); + RNTupleMerger merger{std::move(destination)}; + RNTupleMergeOptions opts; + opts.fMergingMode = mmode; + auto res = merger.Merge(sourcePtrs, opts); + EXPECT_TRUE(bool(res)); + } + { + auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath()); + EXPECT_EQ(reader->GetNEntries(), 20); + EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2); + auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr("flt"); + for (auto i : reader->GetEntryRange()) { + reader->LoadEntry(i); + EXPECT_NEAR(*pFlt, i, 0.01f); + } + } + } + } +} From b3e6c54a8c3dc6711764029fb5b2970c74a79aa1 Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 17 Jun 2026 17:01:54 +0200 Subject: [PATCH 15/17] [ntuple] Support merging columns with same type and different metadata --- tree/ntuple/inc/ROOT/RColumnElementBase.hxx | 8 ++ tree/ntuple/inc/ROOT/RPageStorage.hxx | 10 +-- tree/ntuple/src/RNTupleMerger.cxx | 27 +++--- tree/ntuple/src/RPageStorage.cxx | 2 +- tree/ntuple/test/ntuple_merger.cxx | 95 +++++++++++++++++++-- 5 files changed, 108 insertions(+), 34 deletions(-) diff --git a/tree/ntuple/inc/ROOT/RColumnElementBase.hxx b/tree/ntuple/inc/ROOT/RColumnElementBase.hxx index 1807c31128baf..d6f0d889996a4 100644 --- a/tree/ntuple/inc/ROOT/RColumnElementBase.hxx +++ b/tree/ntuple/inc/ROOT/RColumnElementBase.hxx @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -147,6 +148,13 @@ std::unique_ptr RColumnElementBase::Generate(ROOT::ENTupleCo template <> std::unique_ptr RColumnElementBase::Generate(ROOT::ENTupleColumnType onDiskType); +struct RColumnFormat { + ENTupleColumnType fType = ENTupleColumnType::kUnknown; + // 0 means "use default". Only valid for fixed-bitwidth column types. + std::uint16_t fBitWidth = 0; + std::optional fValueRange; +}; + } // namespace ROOT::Internal #endif diff --git a/tree/ntuple/inc/ROOT/RPageStorage.hxx b/tree/ntuple/inc/ROOT/RPageStorage.hxx index 643c5c592a697..0c20957524b28 100644 --- a/tree/ntuple/inc/ROOT/RPageStorage.hxx +++ b/tree/ntuple/inc/ROOT/RPageStorage.hxx @@ -543,16 +543,10 @@ public: [[nodiscard]] std::unique_ptr InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters); - struct RColumnReprElement { - ENTupleColumnType fType = ENTupleColumnType::kUnknown; - // 0 means "use default". Only valid for fixed-bitwidth column types. - std::uint16_t fBitWidth = 0; - std::optional fValueRange; - }; /// Adds a new column representation to the given field. /// \return The physical id of the first newly added column. - ROOT::DescriptorId_t - AddColumnRepresentation(const ROOT::RFieldDescriptor &field, std::span newRepresentation); + ROOT::DescriptorId_t AddColumnRepresentation(const ROOT::RFieldDescriptor &field, + std::span newRepresentation); /// Adds a new alias column pointing to an existing column with the given physical id to the given field. void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field, diff --git a/tree/ntuple/src/RNTupleMerger.cxx b/tree/ntuple/src/RNTupleMerger.cxx index bbf08a2cda3b7..3502e95970ad6 100644 --- a/tree/ntuple/src/RNTupleMerger.cxx +++ b/tree/ntuple/src/RNTupleMerger.cxx @@ -345,7 +345,7 @@ struct RColReprMapping { /// Note that this also adds a mapping for each new representation, which is why it inherits RColReprMapping. struct RColReprExtension : RColReprMapping { /// The new representations to be added - std::vector fSourceRepr; + std::vector fSourceRepr; }; static std::optional @@ -500,26 +500,19 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c const auto &srcCol = srcDesc.GetColumnDescriptor(srcColId); const auto dstColId = dstColumns[dstReprIdx * dstColCardinality + reprColIdx]; const auto &dstCol = dstDesc.GetColumnDescriptor(dstColId); - if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() || + if (srcCol.GetType() != dstCol.GetType() || + srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage() || srcCol.GetValueRange() != dstCol.GetValueRange()) { - std::stringstream ss; - ss << "Source field `" << srcField.GetFieldName() - << "` has a matching column representation as its destination field, however one or " - "more " - "of its columns have different column metadata (bit width and/or value range). " - "Merging variable-sized columns is currently only supported if all metadata is " - "identical between source and destination columns." - << "\n bit width src: " << srcCol.GetBitsOnStorage() << ", dst: " << dstCol.GetBitsOnStorage() - << "" - << "\n value range src: " << srcCol.GetValueRange() << ", dst: " << dstCol.GetValueRange(); - errors.push_back(ss.str()); + matches = false; break; } } - // We found a valid matching representation: break the loop on the dst column representations. - matchingRepr = dstReprIdx; - break; + if (matches) { + // We found a valid matching representation. + matchingRepr = dstReprIdx; + break; + } } } @@ -532,7 +525,7 @@ static void MatchColumnRepresentations(const ROOT::RNTupleDescriptor &srcDesc, c } else if (matchingRepr < 0) { // this representation was not found in the destination assert(dstNColReprs < std::numeric_limits::max()); - std::vector newRepr; + std::vector newRepr; newRepr.reserve(srcColCardinality); for (auto reprColIdx = 0u; reprColIdx < srcColCardinality; ++reprColIdx) { const auto srcColId = srcColumns[srcReprIdx * srcColCardinality + reprColIdx]; diff --git a/tree/ntuple/src/RPageStorage.cxx b/tree/ntuple/src/RPageStorage.cxx index cb516d7306df1..f3a4733204417 100644 --- a/tree/ntuple/src/RPageStorage.cxx +++ b/tree/ntuple/src/RPageStorage.cxx @@ -1146,7 +1146,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr ROOT::DescriptorId_t ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldDescriptor &field, - std::span newRepresentation) + std::span newRepresentation) { const auto &descriptor = fDescriptorBuilder.GetDescriptor(); diff --git a/tree/ntuple/test/ntuple_merger.cxx b/tree/ntuple/test/ntuple_merger.cxx index ab2eccada9ba6..fde82b7fb4d47 100644 --- a/tree/ntuple/test/ntuple_merger.cxx +++ b/tree/ntuple/test/ntuple_merger.cxx @@ -4275,10 +4275,17 @@ TEST(RNTupleMerger, MergeReal32Trunc) RNTupleMergeOptions opts; opts.fMergingMode = mmode; auto res = merger.Merge(sourcePtrs, opts); - // Currently we're not supporting merging columns with the same type but different metadata. - // TODO: support this. - EXPECT_FALSE(bool(res)); - EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata")); + EXPECT_TRUE(bool(res)); + } + { + auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath()); + EXPECT_EQ(reader->GetNEntries(), 20); + EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2); + auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr("flt"); + for (auto i : reader->GetEntryRange()) { + reader->LoadEntry(i); + EXPECT_NEAR(*pFlt, i, 0.01f); + } } } } @@ -4333,10 +4340,17 @@ TEST(RNTupleMerger, MergeReal32Quant) RNTupleMergeOptions opts; opts.fMergingMode = mmode; auto res = merger.Merge(sourcePtrs, opts); - // Currently we're not supporting merging columns with the same type but different metadata. - // TODO: support this. - ASSERT_FALSE(bool(res)); - EXPECT_THAT(res.GetError()->GetReport(), testing::HasSubstr("have different column metadata")); + EXPECT_TRUE(bool(res)); + } + { + auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath()); + EXPECT_EQ(reader->GetNEntries(), 20); + EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2); + auto pFlt = reader->GetModel().GetDefaultEntry().GetPtr("flt"); + for (auto i : reader->GetEntryRange()) { + reader->LoadEntry(i); + EXPECT_NEAR(*pFlt, i, 0.01f); + } } } } @@ -4406,3 +4420,68 @@ TEST(RNTupleMerger, MergeReal32TruncQuantMixed) } } } + +TEST(RNTupleMerger, MergeRealRegularQuantMixed) +{ + // Merge two files, both containing the same field, but with the first being a SplitReal64 and the second Real32Quant + FileRaii fileGuard1("test_ntuple_merge_realregquant_in_1.root"); + { + auto model = RNTupleModel::Create(); + auto fieldDbl = model->MakeField("dbl"); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard1.GetPath()); + for (int i = 0; i < 10; ++i) { + *fieldDbl = i; + ntuple->Fill(); + } + } + FileRaii fileGuard2("test_ntuple_merge_realregquant_in_2.root"); + { + auto model = RNTupleModel::Create(); + auto field = std::make_unique>("dbl"); + field->SetQuantized(29, {0., 20.}); + model->AddField(std::move(field)); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard2.GetPath()); + auto fieldDbl = ntuple->GetModel().GetDefaultEntry().GetPtr("dbl"); + for (int i = 0; i < 10; ++i) { + *fieldDbl = 10 + i; + ntuple->Fill(); + } + } + { + // Gather the input sources + std::vector> sources; + sources.push_back(RPageSource::Create("ntuple", fileGuard1.GetPath(), RNTupleReadOptions())); + sources.push_back(RPageSource::Create("ntuple", fileGuard2.GetPath(), RNTupleReadOptions())); + std::vector sourcePtrs; + for (const auto &s : sources) { + sourcePtrs.push_back(s.get()); + } + + // Now merge the inputs + for (const auto mmode : {ENTupleMergingMode::kFilter, ENTupleMergingMode::kStrict, ENTupleMergingMode::kUnion}) { + SCOPED_TRACE(std::string("with merging mode = ") + ToString(mmode)); + FileRaii fileGuardOut("test_ntuple_merge_realregquant_out.root"); + { + auto destination = std::make_unique("ntuple", fileGuardOut.GetPath(), RNTupleWriteOptions()); + RNTupleMerger merger{std::move(destination)}; + RNTupleMergeOptions opts; + opts.fMergingMode = mmode; + auto res = merger.Merge(sourcePtrs, opts); + EXPECT_TRUE(bool(res)); + } + { + auto reader = ROOT::RNTupleReader::Open("ntuple", fileGuardOut.GetPath()); + EXPECT_EQ(reader->GetNEntries(), 20); + EXPECT_EQ(reader->GetDescriptor().GetNPhysicalColumns(), 2); + auto pDbl = reader->GetModel().GetDefaultEntry().GetPtr("dbl"); + for (auto i : reader->GetEntryRange()) { + reader->LoadEntry(i); + if (i < 10) + EXPECT_DOUBLE_EQ(*pDbl, i); + else + EXPECT_NEAR(*pDbl, i, 0.01f); + } + } + } + } +} From 1feb2aa44d93409009b6d54e545e125ea1f23149 Mon Sep 17 00:00:00 2001 From: silverweed Date: Tue, 28 Apr 2026 13:00:18 +0200 Subject: [PATCH 16/17] [ntuple][NFC] update Merging.md --- tree/ntuple/doc/Merging.md | 60 ++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/tree/ntuple/doc/Merging.md b/tree/ntuple/doc/Merging.md index 5cebd6483613c..e69a924e47012 100644 --- a/tree/ntuple/doc/Merging.md +++ b/tree/ntuple/doc/Merging.md @@ -15,11 +15,13 @@ Please note that the RNTupleMerger is currently experimental and the content of Currently there is no guarantee for the user about which mode will be used to generate the merged RNTuple. At the moment, this is how it works: -- if both compression and encoding of the target column match those of the source column, L1 is used; -- otherwise, if compression matches but encoding doesn't, L2 is used; -- otherwise L3 is used. +- if the compression of the target column match that of the source column, L1 is used; +- otherwise, L2 is used. -Note that L0 and L4 are currently never used. +L0, L3 and L4 are currently never used. + +**NOTE**: prior to ROOT 6.42, if two columns had the same compression but different encoding they would undergo L3 merging (implying a recompression and resealing); +from 6.42 onwards the RNTupleMerger will instead attach a new column to the parent field as a new representation and L1-merge them. ## Goal The goal of the RNTuple merging process is producing one output RNTuple from *N* input RNTuples that can be used as if it were produced directly in the merged state. This means that: @@ -44,15 +46,16 @@ Consequences of R3 and R4: The following properties are currently true but they are subject to change: * P1: all output pages have the **same compression** (which may be different from the input pages' compression); -* P2: all pages in the same output column have the **same encoding** (which may be different from the inputs' encoding); -* P3: the output clusters are **the same as the input clusters**; -* P4: the output RNTuple **always has 1 cluster group** +* P2: the output clusters are **the same as the input clusters**; +* P3: the output RNTuple **always has 1 cluster group** + +Note that these properties influence and are influenced by the level of merging used. +E.g. P1 is currently true because we only support L1 merging of pages with identical compressions. This is a limitation that we intend to lift at some point (both for L1 and L0 if we ever support it). +P2 and P3 would not necessarily be true with L4 support (which might be desirable in some cases, e.g. to group pages into smaller/larger clusters). -Note that these properties influence and are influenced by the level of merging used. -E.g. P1 and P2 are currently true because we only support L1 merging of pages with identical compressions. This is a limitation that we intend to lift at some point (both for L1 and L0 if we ever support it). -P3 and P4 would not necessarily be true with L4 support (which might be desirable in some cases, e.g. to group pages into smaller/larger clusters). +Also note that the output pages coming from matching columns of a field may use mixed encodings. -Therefore we *will* want to drop these properties at some point, in order to improve the capabilities of the Merger. +Therefore we *will* want to drop at least some of these properties at some point, in order to improve the capabilities of the Merger. ## High-level description The merging process requires at least 1 input, in the form of an `RPageSource`. @@ -64,14 +67,15 @@ In `Union` mode only, we allow any subsequent input RNTuple to define new fields ## Descriptor compatibility and validation Whenever a new input is processed, we compare its descriptor with the output descriptor to verify that merging is possible. -The comparison function does 3 main things: +The comparison function does 4 main things: - collect all "extra destination fields" (i.e. fields that exist in the output but not in this input RNTuple) - collect all "extra source fields" from the input RNTuple -- collect and validate all common fields. +- collect and validate all common fields +- collect all columns that need to be extended with additional representations. -If the Merging Mode is set to **Filter** we require the "extra destination fields" list to be empty. -If the Merging Mode is set to **Strict** we require both the "extra destination fields" and "extra source fields" lists to be empty. -If the Merging Mode is set to **Union**, the "extra source fields" list is used to late model extend the destination model. +If the merging mode is set to **Filter** we require the "extra destination fields" list to be empty. +If the merging mode is set to **Strict** we require both the "extra destination fields" and "extra source fields" lists to be empty. +If the merging mode is set to **Union**, the "extra source fields" list is used to late model extend the destination model. As for common fields, they are matched by name and validated as follows: - any field that is projected in the destination must be also projected in the source and must be projected to the same field; @@ -90,3 +94,27 @@ As for common fields, they are matched by name and validated as follows: 1: these restrictions will likely not be required for L4 merging. + +## Column representation extension +In all merging modes, we allow new column representations to be attached to the source fields. This is done to allow for L1 merging of columns with different encodings, which would otherwise require recompressing. +These new column representations are added to the output RNTuple's footer and become part of its Schema Extension section. Note that in general these columns will be added as deferred *and* suppressed. + +**Technical note**: this is *not* done via the regular late model extension API, but uses internal functionality. + +We add new (physical) column representations in the following cases: + +- when one or more columns of a field has a different type than its matching counterpart in the destination RNTuple; +- when one or more columns of a field has the same type but different metadata than its matching counterpart in the destination RNTuple (e.g. in case of a Real32Quant column, different bit width or value range). + +Whenever we extend a physical column that is referred to by one or more alias columns in some projected fields, we also add a corresponding new alias column in those fields. + +#### Example +Suppose we merge source RNTuples **S1** and **S2**, each with the following fields: + +1. `foo` of type `int` +1. `fooProj` projecting onto field `foo` + +Suppose that S1 is compressed and thus its `foo` field is represented by a column of type `kSplitInt32`, whereas S2 is uncompressed and its `foo` field is represented by a column `kInt32`. +When merging S1 and S2 we collate those two representations under the same field `foo`, so that it will now have representatives: `{kSplitInt32, kInt32}`. +At the same time, we add a second alias column to the field `fooProj`, which will now have its first column aliasing the `kSplitInt32` column (column 0 of field `foo`) and its second one aliasing the `kInt32` one (column 1 of field `foo`). + From 9eaef1b8f1bb99a15815389fca97ec8562c86ee3 Mon Sep 17 00:00:00 2001 From: silverweed Date: Wed, 17 Jun 2026 16:12:28 +0200 Subject: [PATCH 17/17] [ntuple] ensure valid descriptor in AddAliasColumn/AddColumnRepresentation Also set the anchor version in InitFromDescriptor(). --- tree/ntuple/src/RPageStorage.cxx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tree/ntuple/src/RPageStorage.cxx b/tree/ntuple/src/RPageStorage.cxx index f3a4733204417..c4a02934ad6dc 100644 --- a/tree/ntuple/src/RPageStorage.cxx +++ b/tree/ntuple/src/RPageStorage.cxx @@ -1077,6 +1077,7 @@ ROOT::Internal::RPagePersistentSink::InitFromDescriptor(const ROOT::RNTupleDescr { // Create new descriptor fDescriptorBuilder.SetSchemaFromExisting(srcDescriptor); + fDescriptorBuilder.SetVersionForWriting(); const auto &descriptor = fDescriptorBuilder.GetDescriptor(); // Create column/page ranges @@ -1215,6 +1216,8 @@ ROOT::Internal::RPagePersistentSink::AddColumnRepresentation(const ROOT::RFieldD ++columnIndex; } + fDescriptorBuilder.EnsureValidDescriptor().ThrowOnError(); + return firstPhysicalIndex; } @@ -1238,6 +1241,8 @@ void ROOT::Internal::RPagePersistentSink::AddAliasColumn(const ROOT::RNTupleDesc .FirstElementIndex(pointedColumn.GetFirstElementIndex()) .RepresentationIndex(pointedColumn.GetRepresentationIndex()); fDescriptorBuilder.AddColumn(columnBuilder.MakeDescriptor().Unwrap()); + + fDescriptorBuilder.EnsureValidDescriptor().ThrowOnError(); } void ROOT::Internal::RPagePersistentSink::CommitSuppressedColumn(ColumnHandle_t columnHandle)