-
Notifications
You must be signed in to change notification settings - Fork 1.5k
[ntuple] Support multiple column representations in the merger #22017
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
92ef8ff
979768c
b529088
93ada51
e1599a0
1d1d5a3
a88fefe
669ab51
f692f18
9b7db69
cd91ac2
ed6f08e
353fe6f
5ab7637
b3e6c54
1feb2aa
9eaef1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -514,6 +514,12 @@ public: | |
| ROOT::NTupleSize_t GetFirstEntryIndex() const { return fFirstEntryIndex; } | ||
| ROOT::NTupleSize_t GetNEntries() const { return fNEntries; } | ||
| const RColumnRange &GetColumnRange(ROOT::DescriptorId_t physicalId) const { return fColumnRanges.at(physicalId); } | ||
| const RColumnRange *TryGetColumnRange(ROOT::DescriptorId_t physicalId) const | ||
| { | ||
| if (auto it = fColumnRanges.find(physicalId); it != fColumnRanges.end()) | ||
| return &it->second; | ||
| return nullptr; | ||
| } | ||
|
Comment on lines
+517
to
+522
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is only used once and can be replaed by
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the other hand, is it really a problem to have the method? I imagine this could be useful for users in general and since we're hiding the underlying maps we should not force a suboptimal access pattern via the API imho. |
||
| const RPageRange &GetPageRange(ROOT::DescriptorId_t physicalId) const { return fPageRanges.at(physicalId); } | ||
| /// Returns an iterator over pairs { columnId, columnRange }. The iteration order is unspecified. | ||
| RColumnRangeIterable GetColumnRangeIterable() const; | ||
|
|
@@ -768,6 +774,12 @@ public: | |
| /// All known feature flags. | ||
| /// Note that the flag values represent the bit _index_, not the already-bitshifted integer. | ||
| enum EFeatureFlags { | ||
| /// Signals that the RNTuple contains at least one deferred column that is part of a collection and was extended | ||
| /// (i.e. it appears in the footer). This can happen when merging two RNTuples that have the same collection field | ||
| /// backed by columns with different encoding, e.g. a vector<float> whose elements are represented by SplitReal32 | ||
| /// in the first ntuple and by Real32 in the second. | ||
| /// Added in version 1.1.0.0 of the binary format. | ||
| kFeatureFlag_NestedDeferredColumns = 0, | ||
| // Insert new feature flags here, with contiguous values. If at any point a "hole" appears in the valid feature | ||
| // flags values, the check in RNTupleSerialize must be updated. | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -543,6 +543,15 @@ public: | |
| [[nodiscard]] std::unique_ptr<RNTupleModel> | ||
| InitFromDescriptor(const ROOT::RNTupleDescriptor &descriptor, bool copyClusters); | ||
|
|
||
| /// Adds a new column representation to the given field. | ||
| /// \return The physical id of the first newly added column. | ||
| ROOT::DescriptorId_t AddColumnRepresentation(const ROOT::RFieldDescriptor &field, | ||
| std::span<const ROOT::Internal::RColumnFormat> newRepresentation); | ||
|
|
||
| /// Adds a new alias column pointing to an existing column with the given physical id to the given field. | ||
| void AddAliasColumn(const ROOT::RNTupleDescriptor &desc, const ROOT::RFieldDescriptor &field, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pass field id instead of descriptor
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See comment above about this being an internal function (maybe an assert is good enough?)
silverweed marked this conversation as resolved.
|
||
| ROOT::DescriptorId_t physicalId); | ||
|
|
||
| void CommitSuppressedColumn(ColumnHandle_t columnHandle) final; | ||
| void CommitPage(ColumnHandle_t columnHandle, const ROOT::Internal::RPage &page) final; | ||
| void CommitSealedPage(ROOT::DescriptorId_t physicalColumnId, const RPageStorage::RSealedPage &sealedPage) final; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -683,14 +683,14 @@ void ROOT::RFieldBase::Attach(std::unique_ptr<ROOT::RFieldBase> child, std::stri | |
|
|
||
| ROOT::NTupleSize_t ROOT::RFieldBase::EntryToColumnElementIndex(ROOT::NTupleSize_t globalIndex) const | ||
| { | ||
| std::size_t result = globalIndex; | ||
| ROOT::NTupleSize_t result = globalIndex; | ||
| for (auto f = this; f != nullptr; f = f->GetParent()) { | ||
| auto parent = f->GetParent(); | ||
| if (parent && (parent->GetStructure() == ROOT::ENTupleStructure::kCollection || | ||
| parent->GetStructure() == ROOT::ENTupleStructure::kVariant)) { | ||
| return 0U; | ||
| } | ||
| result *= std::max(f->GetNRepetitions(), std::size_t{1U}); | ||
| result *= std::max<ROOT::NTupleSize_t>(f->GetNRepetitions(), ROOT::NTupleSize_t{1U}); | ||
| } | ||
| return result; | ||
| } | ||
|
|
@@ -850,10 +850,7 @@ void ROOT::RFieldBase::SetColumnRepresentatives(const RColumnRepresentations::Se | |
| if (itRepresentative == std::end(validTypes)) | ||
| throw RException(R__FAIL("invalid column representative")); | ||
|
|
||
| // don't add a duplicate representation | ||
| if (std::find_if(fColumnRepresentatives.begin(), fColumnRepresentatives.end(), | ||
| [&r](const auto &rep) { return r == rep.get(); }) == fColumnRepresentatives.end()) | ||
| fColumnRepresentatives.emplace_back(*itRepresentative); | ||
| fColumnRepresentatives.emplace_back(*itRepresentative); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps also for backport. |
||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -931,8 +931,21 @@ ROOT::Internal::RClusterDescriptorBuilder::AddExtendedColumnRanges(const RNTuple | |
| // `ROOT::RFieldBase::EntryToColumnElementIndex()`, i.e. it is a principal column reachable from the | ||
| // field zero excluding subfields of collection and variant fields. | ||
| if (c.IsDeferredColumn()) { | ||
| columnRange.SetFirstElementIndex(fCluster.GetFirstEntryIndex() * nRepetitions); | ||
| columnRange.SetNElements(fCluster.GetNEntries() * nRepetitions); | ||
| if (c.GetRepresentationIndex() == 0) { | ||
| columnRange.SetFirstElementIndex(fCluster.GetFirstEntryIndex() * nRepetitions); | ||
| columnRange.SetNElements(fCluster.GetNEntries() * nRepetitions); | ||
| } else { | ||
| // Deferred representations which are not the first cannot count on the number of elements being | ||
| // equal to Entries * nRepetitions because they might have been added in a later cluster. But they | ||
| // can rely on the first representation having the correct FirstElement/NElements (by definition | ||
| // the first representation cannot be an "extended" one), therefore they can just copy the value | ||
| // from it. | ||
| const auto &field = desc.GetFieldDescriptor(fieldId); | ||
| const auto firstReprColumnId = field.GetLogicalColumnIds()[c.GetIndex()]; | ||
| const auto &firstReprColumnRange = fCluster.fColumnRanges[firstReprColumnId]; | ||
| columnRange.SetFirstElementIndex(firstReprColumnRange.GetFirstElementIndex()); | ||
| columnRange.SetNElements(firstReprColumnRange.GetNElements()); | ||
| } | ||
|
silverweed marked this conversation as resolved.
|
||
| if (!columnRange.IsSuppressed()) { | ||
| auto &pageRange = fCluster.fPageRanges[physicalId]; | ||
| pageRange.fPhysicalColumnId = physicalId; | ||
|
|
@@ -1350,6 +1363,14 @@ void ROOT::Internal::RNTupleDescriptorBuilder::ShiftAliasColumns(std::uint32_t o | |
| R__ASSERT(fDescriptor.fColumnDescriptors.count(c.fLogicalColumnId) == 0); | ||
| fDescriptor.fColumnDescriptors.emplace(c.fLogicalColumnId, std::move(c)); | ||
| } | ||
|
|
||
| // Patch up column ids in the header extension | ||
| if (auto &xHeader = fDescriptor.fHeaderExtension) { | ||
| for (auto &columnId : xHeader->fExtendedColumnRepresentations) { | ||
| if (columnId >= fDescriptor.GetNPhysicalColumns()) | ||
| columnId += offset; | ||
| } | ||
| } | ||
|
Comment on lines
+1367
to
+1373
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should be able to add a test for this oversight independently in |
||
| } | ||
|
|
||
| ROOT::RResult<void> ROOT::Internal::RNTupleDescriptorBuilder::AddCluster(RClusterDescriptor &&clusterDesc) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This mentions explicitly collections. Is the feature (merging RNTuple with 'same' column with different representation) not supported for simple type (i.e. just a
floatinstead of avector<float>)? If not, why not?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is supported but it requires no feature flag because it produces files that are still readable by previous ROOT versions.
The feature flag is only added if the feature produces files that are forward-incompatible for older versions, which is only the case for collections.