doc/v634/RNTupleMerger_8cxx_source.html

/// \file RNTupleMerger.cxx

/// \ingroup NTuple ROOT7

/// \author Jakob Blomer <jblomer@cern.ch>, Max Orok <maxwellorok@gmail.com>, Alaettin Serhan Mete <amete@anl.gov>,

/// Giacomo Parolini <giacomo.parolini@cern.ch>

/// \date 2020-07-08

/// \warning This is part of the ROOT 7 prototype! It will

/// change without notice. It might trigger earthquakes. Feedback is welcome!


/*************************************************************************

 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers.               *

 * All rights reserved.                                                  *

 *                                                                       *

 * For the licensing terms see $ROOTSYS/LICENSE.                         *

 * For the list of contributors see $ROOTSYS/README/CREDITS.             *

 *************************************************************************/


#include <ROOT/RError.hxx>
#include <ROOT/RNTuple.hxx>
#include <ROOT/RNTupleDescriptor.hxx>
#include <ROOT/RNTupleMerger.hxx>
#include <ROOT/RNTupleModel.hxx>
#include <ROOT/RNTupleUtil.hxx>
#include <ROOT/RPageStorageFile.hxx>
#include <ROOT/RPageStorage.hxx>
#include <ROOT/RClusterPool.hxx>
#include <ROOT/RNTupleSerialize.hxx>
#include <ROOT/RNTupleZip.hxx>
#include <ROOT/RColumnElementBase.hxx>
#include <TROOT.h>
#include <TFileMergeInfo.h>
#include <TError.h>
#include <TFile.h>
#include <TKey.h>

#include <algorithm>
#include <deque>
#include <exception>
#include <inttypes.h> // for PRIu64
#include <unordered_map>
#include <vector>


using namespace ROOT::Experimental;

using namespace ROOT::Experimental::Internal;


/// Entry point for TFileMerger. Internally calls RNTupleMerger::Merge().
///
/// \param inputs Collection laid out as follows: the first entry is an object whose name is the name of the
///               RNTuple to merge, the second entry is the output TFile and all remaining entries are the input
///               TFiles. At least three entries are required.
/// \param mergeInfo Merge settings. Its `fOptions` string is searched for "default_compression" and
///                  "first_source_compression" to decide the output compression.
/// \return 0 on success, -1 on failure (every failure is reported through Error()).
Long64_t ROOT::RNTuple::Merge(TCollection *inputs, TFileMergeInfo *mergeInfo)
// IMPORTANT: this function must not throw, as it is used in exception-unsafe code (TFileMerger).
try {
   // Check the inputs
   if (!inputs || inputs->GetEntries() < 3 || !mergeInfo) {
      Error("RNTuple::Merge", "Invalid inputs.");
      return -1;
   }

   // Parse the input parameters
   TIter itr(inputs);

   // First entry is the RNTuple name
   std::string ntupleName = std::string(itr()->GetName());

   // Second entry is the output file
   TObject *secondArg = itr();
   TFile *outFile = dynamic_cast<TFile *>(secondArg);
   if (!outFile) {
      Error("RNTuple::Merge", "Second input parameter should be a TFile, but it's a %s.", secondArg->ClassName());
      return -1;
   }

   // Check if the output file already has a key with that name
   TKey *outKey = outFile->FindKey(ntupleName.c_str());
   ROOT::RNTuple *outNTuple = nullptr;
   if (outKey) {
      outNTuple = outKey->ReadObject<ROOT::RNTuple>();
      if (!outNTuple) {
         Error("RNTuple::Merge", "Output file already has key, but not of type RNTuple!");
         return -1;
      }
      // In principle, we should already be working on the RNTuple object from the output file, but just continue with
      // pointer we just got.
   }

   const bool defaultComp = mergeInfo->fOptions.Contains("default_compression");
   const bool firstSrcComp = mergeInfo->fOptions.Contains("first_source_compression");
   if (defaultComp && firstSrcComp) {
      // this should never happen through hadd, but a user may call RNTuple::Merge() from custom code...
      Warning(
         "RNTuple::Merge",
         "Passed both options \"default_compression\" and \"first_source_compression\": only the latter will apply.");
   }
   int compression = kUnknownCompressionSettings;
   if (firstSrcComp) {
      // user passed -ff or -fk: use the same compression as the first RNTuple we find in the sources.
      // (do nothing here, the compression will be fetched below)
   } else if (!defaultComp) {
      // compression was explicitly passed by the user: use it.
      compression = outFile->GetCompressionSettings();
   } else {
      // user passed no compression-related options: use default
      compression = RCompressionSetting::EDefaults::kUseGeneralPurpose;
      Info("RNTuple::Merge", "Using the default compression: %d", compression);
   }

   // The remaining entries are the input files
   std::vector<std::unique_ptr<RPageSourceFile>> sources;
   std::vector<RPageSource *> sourcePtrs;

   while (const auto &pitr = itr()) {
      TFile *inFile = dynamic_cast<TFile *>(pitr);
      if (!inFile) {
         // Bail out before touching `inFile`: the error path below needs its name.
         Error("RNTuple::Merge", "Input parameter should be a TFile, but it's a %s.", pitr->ClassName());
         return -1;
      }
      ROOT::RNTuple *anchor = inFile->Get<ROOT::RNTuple>(ntupleName.c_str());
      if (!anchor) {
         Error("RNTuple::Merge", "Failed to retrieve RNTuple anchor named '%s' from file '%s'", ntupleName.c_str(),
               inFile->GetName());
         return -1;
      }

      auto source = RPageSourceFile::CreateFromAnchor(*anchor);
      if (compression == kUnknownCompressionSettings) {
         // Get the compression of this RNTuple and use it as the output compression.
         // We currently assume all column ranges have the same compression, so we just peek at the first one.
         source->Attach();
         auto descriptor = source->GetSharedDescriptorGuard();
         auto clusterIter = descriptor->GetClusterIterable();
         auto firstCluster = clusterIter.begin();
         if (firstCluster == clusterIter.end()) {
            Error("RNTuple::Merge",
                  "Asked to use the first source's compression as the output compression, but the "
                  "first source (file '%s') has an empty RNTuple, therefore the output compression could not be "
                  "determined.",
                  inFile->GetName());
            return -1;
         }
         auto colRangeIter = (*firstCluster).GetColumnRangeIterable();
         auto firstColRange = colRangeIter.begin();
         if (firstColRange == colRangeIter.end()) {
            Error("RNTuple::Merge",
                  "Asked to use the first source's compression as the output compression, but the "
                  "first source (file '%s') has an empty RNTuple, therefore the output compression could not be "
                  "determined.",
                  inFile->GetName());
            return -1;
         }
         compression = (*firstColRange).fCompressionSettings;
         Info("RNTuple::Merge", "Using the first RNTuple's compression: %d", compression);
      }
      sources.push_back(std::move(source));
   }

   RNTupleWriteOptions writeOpts;
   assert(compression != kUnknownCompressionSettings);
   writeOpts.SetCompression(compression);
   auto destination = std::make_unique<RPageSinkFile>(ntupleName, *outFile, writeOpts);

   // If we already have an existing RNTuple, copy over its descriptor to support incremental merging
   if (outNTuple) {
      auto outSource = RPageSourceFile::CreateFromAnchor(*outNTuple);
      outSource->Attach();
      auto desc = outSource->GetSharedDescriptorGuard();
      destination->InitFromDescriptor(desc.GetRef());
   }

   // Interface conversion
   sourcePtrs.reserve(sources.size());
   for (const auto &s : sources) {
      sourcePtrs.push_back(s.get());
   }

   // Now merge
   RNTupleMerger merger;
   RNTupleMergeOptions mergerOpts;
   mergerOpts.fCompressionSettings = compression;
   merger.Merge(sourcePtrs, *destination, mergerOpts).ThrowOnError();

   // Provide the caller with a merged anchor object (even though we've already
   // written it).
   auto *mergedAnchor = outFile->Get<ROOT::RNTuple>(ntupleName.c_str());
   if (!mergedAnchor) {
      Error("RNTuple::Merge", "Failed to retrieve the merged RNTuple anchor named '%s' from the output file",
            ntupleName.c_str());
      return -1;
   }
   *this = *mergedAnchor;

   return 0;
} catch (const std::exception &ex) {
   // Catching std::exception (the base of RException) so that no standard exception escapes into TFileMerger.
   Error("RNTuple::Merge", "Exception thrown while merging: %s", ex.what());
   return -1;
}


namespace {

// Functor used to change the compression of a page to `options.fCompressionSettings`.

struct RChangeCompressionFunc {

   DescriptorId_t fOutputColumnId;


   const RColumnElementBase &fSrcColElement;

   const RColumnElementBase &fDstColElement;

   const RNTupleMergeOptions &fMergeOptions;


   RPageStorage::RSealedPage &fSealedPage;

   RPageAllocator &fPageAlloc;

   std::uint8_t *fBuffer;


   void operator()() const

   {

      auto page = RPageSource::UnsealPage(fSealedPage, fSrcColElement, fOutputColumnId, fPageAlloc).Unwrap();

      RPageSink::RSealPageConfig sealConf;

      sealConf.fElement = &fDstColElement;

      sealConf.fPage = &page;

      sealConf.fBuffer = fBuffer;

      sealConf.fCompressionSetting = fMergeOptions.fCompressionSettings;

      sealConf.fWriteChecksum = fSealedPage.GetHasChecksum();

      auto refSealedPage = RPageSink::SealPage(sealConf);

      fSealedPage = refSealedPage;

   }

};


// A pair of field descriptors sharing the same name: `fSrc` lives in the source descriptor,
// `fDst` in the destination descriptor.
struct RCommonField {
   const RFieldDescriptor *fSrc;
   const RFieldDescriptor *fDst;

   RCommonField(const RFieldDescriptor *src, const RFieldDescriptor *dst) : fSrc{src}, fDst{dst} {}
};


// Result of CompareDescriptorStructure(): the top-level fields of the destination and source
// descriptors, split by where they appear.
struct RDescriptorsComparison {

   // Top-level destination fields whose name was not found in the source.
   std::vector<const RFieldDescriptor *> fExtraDstFields;

   // Top-level source fields whose name was not found in the destination.
   std::vector<const RFieldDescriptor *> fExtraSrcFields;

   // Fields whose name was found in both descriptors.
   std::vector<RCommonField> fCommonFields;

};


// Value type of ColumnIdMap_t: an id/type pair describing a column.
// NOTE(review): presumably these refer to the column on the output (destination) side - confirm against
// the code that fills the map.
struct RColumnOutInfo {

   DescriptorId_t fColumnId;

   EColumnType fColumnType;

};


// Lookup table keyed by a column name string:
// { fully.qualified.fieldName.colInputId => colOutputInfo }

using ColumnIdMap_t = std::unordered_map<std::string, RColumnOutInfo>;


// Two lists of per-column merge information.
// NOTE(review): judging by the names, these are the columns only present in the destination and the
// columns present in both source and destination - confirm against the code that fills them.
struct RColumnInfoGroup {

   std::vector<RColumnMergeInfo> fExtraDstColumns;

   std::vector<RColumnMergeInfo> fCommonColumns;

};


} // namespace


// These structs cannot be in the anon namespace because they're used in RNTupleMerger's private interface.

namespace ROOT::Experimental::Internal {

// Per-column bookkeeping used while merging pages.
struct RColumnMergeInfo {

   // This column name is built as a dot-separated concatenation of the ancestry of

   // the columns' parent fields' names plus the index of the column itself.

   // e.g. "Muon.pt.x._0"

   std::string fColumnName;

   // Id used to look the column up in the descriptor the column was taken from.
   DescriptorId_t fInputId;

   // Id under which the column's pages are committed to the destination.
   DescriptorId_t fOutputId;

   // Column type used to generate the destination column element.
   EColumnType fColumnType;

   // If nullopt, use the default in-memory type

   std::optional<std::type_index> fInMemoryType;

   // Descriptor of the field this column belongs to.
   const RFieldDescriptor *fParentField;

};


// State shared by all the steps of one RNTupleMerger::Merge() invocation.
struct RNTupleMergeData {
   // Inputs of the merge and the sink receiving the merged pages.
   std::span<RPageSource *> fSources;
   RPageSink &fDestination;
   const RNTupleMergeOptions &fMergeOpts;
   // Descriptor of the destination, obtained from the sink at construction time.
   const RNTupleDescriptor &fDstDescriptor;
   // Descriptor of the source being processed; null until one is assigned.
   const RNTupleDescriptor *fSrcDescriptor = nullptr;

   std::vector<RColumnMergeInfo> fColumns;
   ColumnIdMap_t fColumnIdMap;

   // Running count of entries written to the destination so far.
   NTupleSize_t fNumDstEntries = 0;

   RNTupleMergeData(std::span<RPageSource *> sources, RPageSink &destination, const RNTupleMergeOptions &mergeOpts)
      : fSources(sources),
        fDestination(destination),
        fMergeOpts(mergeOpts),
        fDstDescriptor(destination.GetDescriptor())
   {
   }
};


// Storage for the sealed pages of one cluster while they are being prepared for commit.
struct RSealedPageMergeData {

   // We use a std::deque so that references to the contained SealedPageSequence_t, and its iterators, are

   // never invalidated.

   std::deque<RPageStorage::SealedPageSequence_t> fPagesV;

   // One group per output column, holding iterators into a sequence stored in fPagesV.
   std::vector<RPageStorage::RSealedPageGroup> fGroups;

   // Owns the buffers of pages that were created or re-sealed by the merger.
   std::vector<std::unique_ptr<std::uint8_t[]>> fBuffers;

};


/// Prints an optional value range as "(min, max)", or "(null)" when it holds no value.
std::ostream &operator<<(std::ostream &os, const std::optional<RColumnDescriptor::RValueRange> &x)
{
   if (!x.has_value())
      return os << "(null)";

   return os << '(' << x->fMin << ", " << x->fMax << ')';
}


} // namespace ROOT::Experimental::Internal


/// Returns true if one of `a` and `b` is the plain column type and the other is the split encoding of that
/// same type (in either order). Identical types yield false.
static bool IsSplitOrUnsplitVersionOf(EColumnType a, EColumnType b)
{
   // True if {a, b} is exactly the pair {plain, split}, regardless of order.
   const auto matches = [a, b](EColumnType plain, EColumnType split) {
      return (a == plain && b == split) || (a == split && b == plain);
   };

   // clang-format off
   return matches(EColumnType::kInt16,   EColumnType::kSplitInt16)   ||
          matches(EColumnType::kInt32,   EColumnType::kSplitInt32)   ||
          matches(EColumnType::kInt64,   EColumnType::kSplitInt64)   ||
          matches(EColumnType::kUInt16,  EColumnType::kSplitUInt16)  ||
          matches(EColumnType::kUInt32,  EColumnType::kSplitUInt32)  ||
          matches(EColumnType::kUInt64,  EColumnType::kSplitUInt64)  ||
          matches(EColumnType::kIndex32, EColumnType::kSplitIndex32) ||
          matches(EColumnType::kIndex64, EColumnType::kSplitIndex64) ||
          matches(EColumnType::kReal32,  EColumnType::kSplitReal32)  ||
          matches(EColumnType::kReal64,  EColumnType::kSplitReal64);
   // clang-format on
}


/// Compares the top level fields of `dst` and `src` and determines whether they can be merged or not.
/// In addition, returns the differences between `dst` and `src`'s structures
///
/// \param dst Descriptor of the destination (the "old", previously-seen side in the error messages).
/// \param src Descriptor of the source being merged in (the "new" side in the error messages).
/// \return On success, the fields split into destination-only, source-only and common ones. On failure, an error
///         whose message lists every incompatibility found among the common fields, one bullet per line.
static RResult<RDescriptorsComparison>
CompareDescriptorStructure(const RNTupleDescriptor &dst, const RNTupleDescriptor &src)
{
   // Cases:
   // 1. dst == src
   // 2. dst has fields that src hasn't
   // 3. src has fields that dst hasn't
   // 4. dst and src have fields that differ (compatible or incompatible)

   std::vector<std::string> errors;
   RDescriptorsComparison res;

   for (const auto &dstField : dst.GetTopLevelFields()) {
      const auto srcFieldId = src.FindFieldId(dstField.GetFieldName());
      if (srcFieldId != kInvalidDescriptorId) {
         const auto &srcField = src.GetFieldDescriptor(srcFieldId);
         res.fCommonFields.emplace_back(&srcField, &dstField);
      } else {
         res.fExtraDstFields.emplace_back(&dstField);
      }
   }
   for (const auto &srcField : src.GetTopLevelFields()) {
      const auto dstFieldId = dst.FindFieldId(srcField.GetFieldName());
      if (dstFieldId == kInvalidDescriptorId)
         res.fExtraSrcFields.push_back(&srcField);
   }

   // Check compatibility of common fields
   for (const auto &field : res.fCommonFields) {
      // NOTE: field.fSrc and field.fDst have the same name by construction
      const auto &fieldName = field.fSrc->GetFieldName();

      // Require that fields are both projected or both not projected
      bool projCompatible = field.fSrc->IsProjectedField() == field.fDst->IsProjectedField();
      if (!projCompatible) {
         std::stringstream ss;
         ss << "Field `" << fieldName << "` is incompatible with previously-seen field with that name because the "
            << (field.fSrc->IsProjectedField() ? "new" : "old") << " one is projected and the other isn't";
         errors.push_back(ss.str());
      } else if (field.fSrc->IsProjectedField()) {
         // if both fields are projected, verify that they point to the same real field
         const auto srcName = src.GetQualifiedFieldName(field.fSrc->GetProjectionSourceId());
         const auto dstName = dst.GetQualifiedFieldName(field.fDst->GetProjectionSourceId());
         if (srcName != dstName) {
            std::stringstream ss;
            ss << "Field `" << fieldName
               << "` is projected to a different field than a previously-seen field with the same name (old: "
               << dstName << ", new: " << srcName << ")";
            errors.push_back(ss.str());
         }
      }

      // Require that fields types match
      // TODO(gparolini): allow non-identical but compatible types
      const auto &srcTyName = field.fSrc->GetTypeName();
      const auto &dstTyName = field.fDst->GetTypeName();
      if (srcTyName != dstTyName) {
         std::stringstream ss;
         ss << "Field `" << fieldName
            << "` has a type incompatible with a previously-seen field with the same name: (old: " << dstTyName
            << ", new: " << srcTyName << ")";
         errors.push_back(ss.str());
      }

      // Require that type checksums match
      const auto srcTyChk = field.fSrc->GetTypeChecksum();
      const auto dstTyChk = field.fDst->GetTypeChecksum();
      if (srcTyChk && dstTyChk && *srcTyChk != *dstTyChk) {
         std::stringstream ss;
         ss << "Field `" << field.fSrc->GetFieldName()
            << "` has a different type checksum than previously-seen field with the same name";
         errors.push_back(ss.str());
      }

      // Require that type versions match
      const auto srcTyVer = field.fSrc->GetTypeVersion();
      const auto dstTyVer = field.fDst->GetTypeVersion();
      if (srcTyVer != dstTyVer) {
         std::stringstream ss;
         ss << "Field `" << field.fSrc->GetFieldName()
            << "` has a different type version than previously-seen field with the same name (old: " << dstTyVer
            << ", new: " << srcTyVer << ")";
         errors.push_back(ss.str());
      }

      // Require that column representations match
      const auto srcNCols = field.fSrc->GetLogicalColumnIds().size();
      const auto dstNCols = field.fDst->GetLogicalColumnIds().size();
      if (srcNCols != dstNCols) {
         std::stringstream ss;
         ss << "Field `" << field.fSrc->GetFieldName()
            << "` has a different number of columns than previously-seen field with the same name (old: " << dstNCols
            << ", new: " << srcNCols << ")";
         errors.push_back(ss.str());
      } else {
         for (auto i = 0u; i < srcNCols; ++i) {
            const auto srcColId = field.fSrc->GetLogicalColumnIds()[i];
            const auto dstColId = field.fDst->GetLogicalColumnIds()[i];
            const auto &srcCol = src.GetColumnDescriptor(srcColId);
            const auto &dstCol = dst.GetColumnDescriptor(dstColId);
            // TODO(gparolini): currently we refuse to merge columns of different types unless they are Split/non-Split
            // version of the same type, because we know how to treat that specific case. We should also properly handle
            // different but compatible types.
            //
            // In all messages below, "old" is the destination (previously-seen) column and "new" is the source
            // one, consistently with the field-level messages above.
            if (srcCol.GetType() != dstCol.GetType() &&
                !IsSplitOrUnsplitVersionOf(srcCol.GetType(), dstCol.GetType())) {
               std::stringstream ss;
               ss << i << "-th column of field `" << field.fSrc->GetFieldName()
                  << "` has a different column type of the same column on the previously-seen field with the same name "
                     "(old: "
                  << RColumnElementBase::GetColumnTypeName(dstCol.GetType())
                  << ", new: " << RColumnElementBase::GetColumnTypeName(srcCol.GetType()) << ")";
               errors.push_back(ss.str());
            }
            if (srcCol.GetBitsOnStorage() != dstCol.GetBitsOnStorage()) {
               std::stringstream ss;
               ss << i << "-th column of field `" << field.fSrc->GetFieldName()
                  << "` has a different number of bits of the same column on the previously-seen field with the same "
                     "name "
                     "(old: "
                  << dstCol.GetBitsOnStorage() << ", new: " << srcCol.GetBitsOnStorage() << ")";
               errors.push_back(ss.str());
            }
            if (srcCol.GetValueRange() != dstCol.GetValueRange()) {
               std::stringstream ss;
               ss << i << "-th column of field `" << field.fSrc->GetFieldName()
                  << "` has a different value range of the same column on the previously-seen field with the same name "
                     "(old: "
                  << dstCol.GetValueRange() << ", new: " << srcCol.GetValueRange() << ")";
               errors.push_back(ss.str());
            }
            if (srcCol.GetRepresentationIndex() > 0) {
               std::stringstream ss;
               ss << i << "-th column of field `" << field.fSrc->GetFieldName()
                  << "` has a representation index higher than 0. This is not supported yet by the merger.";
               errors.push_back(ss.str());
            }
         }
      }
   }

   if (!errors.empty()) {
      // Join all errors as a bullet list, one per line (no leading newline).
      std::string errMsg;
      for (const auto &err : errors) {
         if (!errMsg.empty())
            errMsg += '\n';
         errMsg += "  * " + err;
      }
      return R__FAIL(errMsg);
   }

   // TODO(gparolini): we should exhaustively check the field tree rather than just the top level fields,
   // in case the user forgets to change the version number on one field.

   return RResult(res);
}


// Late-extends the destination model with every field in `newFields`: the fields are created from the
// current source descriptor, added to `dstModel`, and the schema change is forwarded to the destination sink.
// Afterwards each new field is appended to `commonFields`, paired with its freshly created counterpart
// in the destination descriptor.
static void ExtendDestinationModel(std::span<const RFieldDescriptor *> newFields, RNTupleModel &dstModel,
                                   RNTupleMergeData &mergeData, std::vector<RCommonField> &commonFields)
{
   assert(newFields.size() > 0); // no point in calling this with 0 new cols
   const bool severalFields = newFields.size() > 1;

   dstModel.Unfreeze();
   RNTupleModelChangeset changeset{dstModel};

   // Build the comma-separated, backtick-quoted list of the new field names for the log message.
   std::string msg = "destination doesn't contain field";
   if (severalFields)
      msg += 's';
   msg += ' ';
   std::string quotedNames;
   for (const auto *fieldDesc : newFields) {
      if (quotedNames.length())
         quotedNames += ", ";
      quotedNames += '`' + fieldDesc->GetFieldName() + '`';
   }
   msg += quotedNames;
   Info("RNTuple::Merge", "%s: adding %s to the destination model (entry #%" PRIu64 ").", msg.c_str(),
        (severalFields ? "them" : "it"), mergeData.fNumDstEntries);

   changeset.fAddedFields.reserve(newFields.size());
   for (const auto *fieldDesc : newFields) {
      auto newField = fieldDesc->CreateField(*mergeData.fSrcDescriptor);
      // Record the raw pointer first: ownership moves into the model right below.
      auto *rawField = newField.get();
      if (fieldDesc->IsProjectedField()) {
         changeset.fAddedProjectedFields.emplace_back(rawField);
      } else {
         changeset.fAddedFields.emplace_back(rawField);
      }
      changeset.fModel.AddField(std::move(newField));
   }
   dstModel.Freeze();
   mergeData.fDestination.UpdateSchema(changeset, mergeData.fNumDstEntries);

   // The new fields now exist on both sides: register them as common fields.
   commonFields.reserve(commonFields.size() + newFields.size());
   for (const auto *srcField : newFields) {
      const auto dstFieldId = mergeData.fDstDescriptor.FindFieldId(srcField->GetFieldName());
      const auto &dstField = mergeData.fDstDescriptor.GetFieldDescriptor(dstFieldId);
      commonFields.emplace_back(srcField, &dstField);
   }
}


// Merges all columns appearing both in the source and destination RNTuples, just copying them if their
// compression matches ("fast merge") or by unsealing and resealing them with the proper compression.
//
// For each column in `commonColumns`, the sealed pages of cluster `clusterId` are collected into a new page
// sequence appended to `sealedPageData.fPagesV`, and a matching group is appended to `sealedPageData.fGroups`.
// Buffers of re-compressed pages are stored in `sealedPageData.fBuffers`.
// Throws (via ThrowOnError) if a page checksum does not verify.
void RNTupleMerger::MergeCommonColumns(RClusterPool &clusterPool, DescriptorId_t clusterId,
                                       std::span<RColumnMergeInfo> commonColumns,
                                       const RCluster::ColumnSet_t &commonColumnSet,
                                       RSealedPageMergeData &sealedPageData, const RNTupleMergeData &mergeData)
{
   assert(commonColumns.size() == commonColumnSet.size());
   if (commonColumns.empty())
      return;

   const RCluster *cluster = clusterPool.GetCluster(clusterId, commonColumnSet);
   // we expect the cluster pool to contain the requested set of columns, since they were
   // validated by CompareDescriptorStructure().
   assert(cluster);

   const auto &clusterDesc = mergeData.fSrcDescriptor->GetClusterDescriptor(clusterId);

   for (const auto &column : commonColumns) {
      const auto &columnId = column.fInputId;
      R__ASSERT(clusterDesc.ContainsColumn(columnId));

      const auto &columnDesc = mergeData.fSrcDescriptor->GetColumnDescriptor(columnId);
      const auto srcColElement = column.fInMemoryType
                                    ? GenerateColumnElement(*column.fInMemoryType, columnDesc.GetType())
                                    : RColumnElementBase::Generate(columnDesc.GetType());
      const auto dstColElement = column.fInMemoryType ? GenerateColumnElement(*column.fInMemoryType, column.fColumnType)
                                                      : RColumnElementBase::Generate(column.fColumnType);

      // Now get the pages for this column in this cluster
      const auto &pages = clusterDesc.GetPageRange(columnId);

      RPageStorage::SealedPageSequence_t sealedPages;
      sealedPages.resize(pages.fPageInfos.size());

      // Each column range potentially has a distinct compression settings
      const auto colRangeCompressionSettings = clusterDesc.GetColumnRange(columnId).fCompressionSettings;
      const bool needsCompressionChange = colRangeCompressionSettings != mergeData.fMergeOpts.fCompressionSettings;
      if (needsCompressionChange && mergeData.fMergeOpts.fExtraVerbose)
         Info("RNTuple::Merge", "Column %s: changing source compression from %d to %d", column.fColumnName.c_str(),
              colRangeCompressionSettings, mergeData.fMergeOpts.fCompressionSettings);

      // Index of the first buffer belonging to this column inside `fBuffers`.
      size_t pageBufferBaseIdx = sealedPageData.fBuffers.size();
      // If the column range already has the right compression we don't need to allocate any new buffer, so we don't
      // bother reserving memory for them.
      if (needsCompressionChange)
         sealedPageData.fBuffers.resize(sealedPageData.fBuffers.size() + pages.fPageInfos.size());

      // Loop over the pages.
      // NOTE: neither `sealedPages` nor `fBuffers` may be resized inside this loop, because the compression
      // tasks hold references to their elements until `fTaskGroup->Wait()` returns.
      std::uint64_t pageIdx = 0;
      for (const auto &pageInfo : pages.fPageInfos) {
         assert(pageIdx < sealedPages.size());
         assert(!needsCompressionChange || pageBufferBaseIdx + pageIdx < sealedPageData.fBuffers.size());

         ROnDiskPage::Key key{columnId, pageIdx};
         auto onDiskPage = cluster->GetOnDiskPage(key);
         // Validate the page before dereferencing it.
         R__ASSERT(onDiskPage);

         const auto checksumSize = pageInfo.fHasChecksum * RPageStorage::kNBytesPageChecksum;
         RPageStorage::RSealedPage &sealedPage = sealedPages[pageIdx];
         sealedPage.SetNElements(pageInfo.fNElements);
         sealedPage.SetHasChecksum(pageInfo.fHasChecksum);
         sealedPage.SetBufferSize(pageInfo.fLocator.fBytesOnStorage + checksumSize);
         // Make sure the on-disk buffer really has the expected size before anything reads from it.
         R__ASSERT(onDiskPage->GetSize() == sealedPage.GetBufferSize());
         sealedPage.SetBuffer(onDiskPage->GetAddress());
         // TODO(gparolini): more graceful error handling (skip the page?)
         sealedPage.VerifyChecksumIfEnabled().ThrowOnError();

         if (needsCompressionChange) {
            const auto uncompressedSize = srcColElement->GetSize() * sealedPage.GetNElements();
            auto &buffer = sealedPageData.fBuffers[pageBufferBaseIdx + pageIdx];
            buffer = std::make_unique<std::uint8_t[]>(uncompressedSize + checksumSize);
            RChangeCompressionFunc compressTask{
               column.fOutputId, *srcColElement, *dstColElement, mergeData.fMergeOpts,
               sealedPage,       *fPageAlloc,    buffer.get(),
            };

            if (fTaskGroup)
               fTaskGroup->Run(compressTask);
            else
               compressTask();
         }

         ++pageIdx;

      } // end of loop over pages

      // All tasks must be done before `sealedPages` is moved and the column elements go out of scope.
      if (fTaskGroup)
         fTaskGroup->Wait();

      sealedPageData.fPagesV.push_back(std::move(sealedPages));
      sealedPageData.fGroups.emplace_back(column.fOutputId, sealedPageData.fPagesV.back().cbegin(),
                                          sealedPageData.fPagesV.back().cend());
   } // end loop over common columns
}


// Generates default values for columns that are not present in the current source RNTuple

// but are present in the destination's schema.

static void GenerateExtraDstColumns(size_t nClusterEntries, std::span<RColumnMergeInfo> extraDstColumns,

                                    RSealedPageMergeData &sealedPageData, const RNTupleMergeData &mergeData)

{

   for (const auto &column : extraDstColumns) {

      const auto &columnId = column.fInputId;

      const auto &columnDesc = mergeData.fDstDescriptor.GetColumnDescriptor(columnId);

      const RFieldDescriptor *field = column.fParentField;


      // Skip all auxiliary columns

      if (field->GetLogicalColumnIds()[0] != columnId)

         continue;


      // Check if this column is a child of a Collection or a Variant. If so, it has no data

      // and can be skipped.

      bool skipColumn = false;

      auto nRepetitions = std::max<std::uint64_t>(field->GetNRepetitions(), 1);

      for (auto parentId = field->GetParentId(); parentId != kInvalidDescriptorId;) {

         const RFieldDescriptor &parent = mergeData.fSrcDescriptor->GetFieldDescriptor(parentId);

         if (parent.GetStructure() == ENTupleStructure::kCollection ||

             parent.GetStructure() == ENTupleStructure::kVariant) {

            skipColumn = true;

            break;

         }

         nRepetitions *= std::max<std::uint64_t>(parent.GetNRepetitions(), 1);

         parentId = parent.GetParentId();

      }

      if (skipColumn)

         continue;


      const auto structure = field->GetStructure();


      if (structure == ENTupleStructure::kStreamer) {

         Fatal(

            "RNTuple::Merge",

            "Destination RNTuple contains a streamer field (%s) that is not present in one of the sources. "

            "Creating a default value for a streamer field is ill-defined, therefore the merging process will abort.",

            field->GetFieldName().c_str());

         continue;

      }


      // NOTE: we cannot have a Record here because it has no associated columns.

      R__ASSERT(structure == ENTupleStructure::kCollection || structure == ENTupleStructure::kVariant ||

                structure == ENTupleStructure::kLeaf);


      const auto colElement = RColumnElementBase::Generate(columnDesc.GetType());

      const auto nElements = nClusterEntries * nRepetitions;

      const auto bytesOnStorage = colElement->GetPackedSize(nElements);

      constexpr auto kPageSizeLimit = 256 * 1024;

      // TODO(gparolini): consider coalescing the last page if its size is less than some threshold

      const size_t nPages = bytesOnStorage / kPageSizeLimit + !!(bytesOnStorage % kPageSizeLimit);

      for (size_t i = 0; i < nPages; ++i) {

         const auto pageSize = (i < nPages - 1) ? kPageSizeLimit : bytesOnStorage - kPageSizeLimit * (nPages - 1);

         const auto checksumSize = RPageStorage::kNBytesPageChecksum;

         const auto bufSize = pageSize + checksumSize;

         auto &buffer = sealedPageData.fBuffers.emplace_back(new unsigned char[bufSize]);


         RPageStorage::RSealedPage sealedPage{buffer.get(), bufSize, static_cast<std::uint32_t>(nElements), true};

         memset(buffer.get(), 0, pageSize);

         sealedPage.ChecksumIfEnabled();


         sealedPageData.fPagesV.push_back({sealedPage});

      }


      sealedPageData.fGroups.emplace_back(column.fOutputId, sealedPageData.fPagesV.back().cbegin(),

                                          sealedPageData.fPagesV.back().cend());

   }

}


// Iterates over all clusters of `source` and merges their pages into the destination sink referenced by
// `mergeData`. It is assumed that all columns in `commonColumns` are present (and compatible) in both the
// source and the destination's schemas.
// The pages may be "fast-merged" (i.e. simply copied with no decompression/recompression) if the target
// compression is unspecified or matches the original compression settings.
//
// \param source           page source whose clusters are read through a local RClusterPool
// \param commonColumns    columns that exist in both the source and the destination
// \param extraDstColumns  columns that exist only in the destination; pages are generated for them
// \param mergeData        shared merge state; `fSrcDescriptor` must point to the descriptor of `source`.
//                         `fNumDstEntries` is advanced by the number of entries of every committed cluster.
void RNTupleMerger::MergeSourceClusters(RPageSource &source, std::span<RColumnMergeInfo> commonColumns,
                                        std::span<RColumnMergeInfo> extraDstColumns, RNTupleMergeData &mergeData)
{
   RClusterPool clusterPool{source};

   // Convert the common columns to a ColumnSet for the ClusterPool query.
   // NOTE: no such set is needed for the extra dst columns: they are never looked up in the cluster pool, we only
   // need to know whether there are any (which the span itself can tell us).
   RCluster::ColumnSet_t commonColumnSet;
   commonColumnSet.reserve(commonColumns.size());
   for (const auto &column : commonColumns)
      commonColumnSet.emplace(column.fInputId);

   // Loop over all clusters in this file.
   // descriptor->GetClusterIterable() doesn't guarantee any specific order, so we explicitly
   // request the first cluster and then walk the chain of "next" clusters.
   for (DescriptorId_t clusterId = mergeData.fSrcDescriptor->FindClusterId(0, 0); clusterId != kInvalidDescriptorId;
        clusterId = mergeData.fSrcDescriptor->FindNextClusterId(clusterId)) {
      const auto &clusterDesc = mergeData.fSrcDescriptor->GetClusterDescriptor(clusterId);
      const auto nClusterEntries = clusterDesc.GetNEntries();
      R__ASSERT(nClusterEntries > 0);

      // Collects the sealed pages (and the buffers backing them) of the current cluster
      RSealedPageMergeData sealedPageData;

      if (!commonColumnSet.empty()) {
         MergeCommonColumns(clusterPool, clusterId, commonColumns, commonColumnSet, sealedPageData, mergeData);
      }

      if (!extraDstColumns.empty()) {
         GenerateExtraDstColumns(nClusterEntries, extraDstColumns, sealedPageData, mergeData);
      }

      // Commit the pages and the clusters
      mergeData.fDestination.CommitSealedPageV(sealedPageData.fGroups);
      mergeData.fDestination.CommitCluster(nClusterEntries);
      mergeData.fNumDstEntries += nClusterEntries;
   }

   // TODO(gparolini): when we get serious about huge file support (>~ 100GB) we might want to check here
   // the size of the running page list and commit a cluster group when it exceeds some threshold,
   // which would prevent the page list from getting too large.
   // However, as of today, we aren't really handling such huge files, and even relatively big ones
   // such as the CMS dataset have a page list size of about only 2 MB.
   // So currently we simply merge all cluster groups into one.
}


// Picks the in-memory C++ type to use for a column, given the type name of its field and its on-disk column type.
// Index and switch columns are resolved from the on-disk type alone; otherwise the field type name is looked up
// among a fixed list of fundamental types. Returns std::nullopt if no explicit in-memory type applies, in which
// case the caller is expected to fall back to the default in-memory type of the column.
static std::optional<std::type_index> ColumnInMemoryType(std::string_view fieldType, EColumnType onDiskType)
{
   // The on-disk type takes precedence over the field type name.
   switch (onDiskType) {
   case EColumnType::kIndex32:
   case EColumnType::kSplitIndex32:
   case EColumnType::kIndex64:
   case EColumnType::kSplitIndex64: return typeid(ClusterSize_t);
   case EColumnType::kSwitch: return typeid(ROOT::Experimental::RColumnSwitch);
   default: break;
   }

   struct RNamedType {
      std::string_view fName;
      std::type_index fType;
   };
   static const RNamedType kFundamentalTypes[] = {
      {"bool", typeid(bool)},
      {"std::byte", typeid(std::byte)},
      {"char", typeid(char)},
      {"std::int8_t", typeid(std::int8_t)},
      {"std::uint8_t", typeid(std::uint8_t)},
      {"std::int16_t", typeid(std::int16_t)},
      {"std::uint16_t", typeid(std::uint16_t)},
      {"std::int32_t", typeid(std::int32_t)},
      {"std::uint32_t", typeid(std::uint32_t)},
      {"std::int64_t", typeid(std::int64_t)},
      {"std::uint64_t", typeid(std::uint64_t)},
      {"float", typeid(float)},
      {"double", typeid(double)},
   };

   for (const auto &candidate : kFundamentalTypes) {
      if (candidate.fName == fieldType)
         return candidate.fType;
   }

   // if the type is not one of those above, we use the default in-memory type.
   return std::nullopt;
}


// Given a field, fill `columns` and `colIdMap` with information about all columns belonging to it and its subfields.

// `colIdMap` is used to map matching columns from different sources to the same output column in the destination.

// We match columns by their "fully qualified name", which is the concatenation of their ancestor fields' names

// and the column index.

// By this point, since we called `CompareDescriptorStructure()` earlier, we should be guaranteed that two matching

// columns will have at least compatible representations.

// NOTE: srcFieldDesc and dstFieldDesc may alias.

static void AddColumnsFromField(std::vector<RColumnMergeInfo> &columns, const RNTupleDescriptor &srcDesc,

                                RNTupleMergeData &mergeData, const RFieldDescriptor &srcFieldDesc,

                                const RFieldDescriptor &dstFieldDesc, const std::string &prefix = "")

{

   std::string name = prefix + '.' + srcFieldDesc.GetFieldName();


   const auto &columnIds = srcFieldDesc.GetLogicalColumnIds();

   columns.reserve(columns.size() + columnIds.size());

   // NOTE: here we can match the src and dst columns by column index because we forbid merging fields with

   // different column representations.

   for (auto i = 0u; i < srcFieldDesc.GetLogicalColumnIds().size(); ++i) {

      // We don't want to try and merge alias columns

      if (srcFieldDesc.IsProjectedField())

         continue;


      auto srcColumnId = srcFieldDesc.GetLogicalColumnIds()[i];

      const auto &srcColumn = srcDesc.GetColumnDescriptor(srcColumnId);

      RColumnMergeInfo info{};

      info.fColumnName = name + '.' + std::to_string(srcColumn.GetIndex());

      info.fInputId = srcColumn.GetPhysicalId();

      // Since the parent field is only relevant for extra dst columns, the choice of src or dstFieldDesc as a parent

      // is arbitrary (they're the same field).

      info.fParentField = &dstFieldDesc;


      if (auto it = mergeData.fColumnIdMap.find(info.fColumnName); it != mergeData.fColumnIdMap.end()) {

         info.fOutputId = it->second.fColumnId;

         info.fColumnType = it->second.fColumnType;

      } else {

         info.fOutputId = mergeData.fColumnIdMap.size();

         // NOTE(gparolini): map the type of src column to the type of dst column.

         // This mapping is only relevant for common columns and it's done to ensure we keep a consistent

         // on-disk representation of the same column.

         // This is also important to do for first source when it is used to generate the destination sink,

         // because even in that case their column representations may differ.

         // e.g. if the destination has a different compression than the source, an integer column might be

         // zigzag-encoded in the source but not in the destination.

         auto dstColumnId = dstFieldDesc.GetLogicalColumnIds()[i];

         const auto &dstColumn = mergeData.fDstDescriptor.GetColumnDescriptor(dstColumnId);

         info.fColumnType = dstColumn.GetType();

         mergeData.fColumnIdMap[info.fColumnName] = {info.fOutputId, info.fColumnType};

      }


      if (mergeData.fMergeOpts.fExtraVerbose) {

         Info("RNTuple::Merge",

              "Adding column %s with log.id %" PRIu64 ", phys.id %" PRIu64 ", type %s "

              " -> log.id %" PRIu64 ", type %s",

              info.fColumnName.c_str(), srcColumnId, srcColumn.GetPhysicalId(),

              RColumnElementBase::GetColumnTypeName(srcColumn.GetType()), info.fOutputId,

              RColumnElementBase::GetColumnTypeName(info.fColumnType));

      }


      // Since we disallow merging fields of different types, src and dstFieldDesc must have the same type name.

      assert(srcFieldDesc.GetTypeName() == dstFieldDesc.GetTypeName());

      info.fInMemoryType = ColumnInMemoryType(srcFieldDesc.GetTypeName(), info.fColumnType);

      columns.emplace_back(info);

   }


   const auto &srcChildrenIds = srcFieldDesc.GetLinkIds();

   const auto &dstChildrenIds = dstFieldDesc.GetLinkIds();

   assert(srcChildrenIds.size() == dstChildrenIds.size());

   for (auto i = 0u; i < srcChildrenIds.size(); ++i) {

      const auto &srcChild = srcDesc.GetFieldDescriptor(srcChildrenIds[i]);

      const auto &dstChild = mergeData.fDstDescriptor.GetFieldDescriptor(dstChildrenIds[i]);

      AddColumnsFromField(columns, srcDesc, mergeData, srcChild, dstChild, name);

   }

}


// Translates the result of a descriptor comparison into per-column merge information.
// The columns of the fields that only exist in the destination are gathered first (resolved against the
// destination descriptor on both sides), followed by the columns of the fields common to source and destination.
// Every call records the columns it encounters in `mergeData.fColumnIdMap`, so that later calls map
// already-seen column names to the output id and type chosen for them.
static RColumnInfoGroup
GatherColumnInfos(const RDescriptorsComparison &descCmp, const RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData)
{
   RColumnInfoGroup columnInfos;

   // Fields missing from the source: the same destination field plays the role of both "src" and "dst".
   for (const RFieldDescriptor *dstOnlyField : descCmp.fExtraDstFields)
      AddColumnsFromField(columnInfos.fExtraDstColumns, mergeData.fDstDescriptor, mergeData, *dstOnlyField,
                          *dstOnlyField);

   // Fields present on both sides.
   for (const auto &[srcField, dstField] : descCmp.fCommonFields)
      AddColumnsFromField(columnInfos.fCommonColumns, srcDesc, mergeData, *srcField, *dstField);

   return columnInfos;
}


// Sets up the page allocator and, when ROOT was built with implicit multi-threading support and it is
// enabled at construction time, the task group.
RNTupleMerger::RNTupleMerger()
   // TODO(gparolini): consider using an arena allocator instead, since we know the precise lifetime
   // of the RNTuples we are going to handle (e.g. we can reset the arena at every source)
   : fPageAlloc(std::make_unique<RPageAllocatorHeap>())
{
#ifdef R__USE_IMT
   // fTaskGroup stays disengaged unless implicit MT is on.
   const bool implicitMTEnabled = ROOT::IsImplicitMTEnabled();
   if (implicitMTEnabled) {
      fTaskGroup = TTaskGroup();
   }
#endif
}


// Merges all `sources` into `destination`, in the order in which they are given.
//
// If `destination` is not initialized yet, it is initialized from a model created from the descriptor of the
// first source that gets processed. For every source, its schema is compared against the destination's one and,
// depending on the merging mode in `mergeOptsIn`, mismatches are either reconciled (Union mode: the destination
// model is extended, missing columns are generated) or reported as errors.
// When an error concerning a single source is detected and the error behavior is kSkip, a warning is printed and
// that source is skipped; otherwise the merge is aborted and the error is returned.
//
// \param sources      the page sources to merge; each one is attached before being read
// \param destination  the sink receiving the merged data; the cluster group and the dataset are committed at the end
// \param mergeOptsIn  merge options. If the compression is left unspecified, the one of the sink is used;
//                     a compression different from the sink's one is rejected.
// \return success, or the error that caused the merge to be aborted
RResult<void>
RNTupleMerger::Merge(std::span<RPageSource *> sources, RPageSink &destination, const RNTupleMergeOptions &mergeOptsIn)
{
   RNTupleMergeOptions mergeOpts = mergeOptsIn;
   {
      const auto dstCompSettings = destination.GetWriteOptions().GetCompression();
      if (mergeOpts.fCompressionSettings == kUnknownCompressionSettings) {
         mergeOpts.fCompressionSettings = dstCompSettings;
      } else if (mergeOpts.fCompressionSettings != dstCompSettings) {
         return R__FAIL(std::string("The compression given to RNTupleMergeOptions is different from that of the "
                                    "sink! (opts: ") +
                        std::to_string(mergeOpts.fCompressionSettings) + ", sink: " + std::to_string(dstCompSettings) +
                        ") This is currently unsupported.");
      }
   }

   RNTupleMergeData mergeData{sources, destination, mergeOpts};

   // Used to initialize the schema of the output RNTuple.
   // NOTE: this stays null if `destination` was already initialized when Merge() got called.
   std::unique_ptr<RNTupleModel> model;

   // Returns true (after printing a warning) if the current source should be skipped because of `errMsg`,
   // false if the error must abort the merge.
   // NOTE: the caller must issue the `continue` / `return R__FAIL(...)` itself: this used to be done by a macro
   // wrapped in `do { ... } while (0)`, where `continue` only left the do-while and the source was never skipped.
   const auto shouldSkipSource = [&mergeOpts](const std::string &errMsg) {
      if (mergeOpts.fErrBehavior != ENTupleMergeErrBehavior::kSkip)
         return false;
      Warning("RNTuple::Merge", "Skipping RNTuple due to: %s", errMsg.c_str());
      return true;
   };

   // Merge main loop
   for (RPageSource *source : sources) {
      source->Attach();
      auto srcDescriptor = source->GetSharedDescriptorGuard();
      mergeData.fSrcDescriptor = &srcDescriptor.GetRef();

      // Create sink from the input model if not initialized
      if (!destination.IsInitialized()) {
         auto opts = RNTupleDescriptor::RCreateModelOptions();
         opts.fReconstructProjections = true;
         model = srcDescriptor->CreateModel(opts);
         destination.Init(*model);
      }

      // NOTE(review): this runs before the compatibility checks, so a source that ends up being skipped has
      // already contributed its extra type info to the destination -- confirm this is intended.
      for (const auto &extraTypeInfoDesc : srcDescriptor->GetExtraTypeInfoIterable())
         destination.UpdateExtraTypeInfo(extraTypeInfoDesc);

      auto descCmpRes = CompareDescriptorStructure(mergeData.fDstDescriptor, srcDescriptor.GetRef());
      if (!descCmpRes) {
         const std::string msg =
            std::string("Source RNTuple will be skipped due to incompatible schema with the destination:\n") +
            descCmpRes.GetError()->GetReport();
         if (shouldSkipSource(msg))
            continue;
         return R__FAIL(msg);
      }
      auto descCmp = descCmpRes.Unwrap();

      // If the current source is missing some fields and we're not in Union mode, error
      // (if we are in Union mode, MergeSourceClusters will fill the missing fields with default values).
      if (mergeOpts.fMergingMode != ENTupleMergingMode::kUnion && !descCmp.fExtraDstFields.empty()) {
         std::string msg = "Source RNTuple is missing the following fields:";
         for (const auto *field : descCmp.fExtraDstFields) {
            msg += "\n  " + field->GetFieldName() + " : " + field->GetTypeName();
         }
         if (shouldSkipSource(msg))
            continue;
         return R__FAIL(msg);
      }

      // handle extra src fields
      if (descCmp.fExtraSrcFields.size()) {
         if (mergeOpts.fMergingMode == ENTupleMergingMode::kUnion) {
            // We can only extend a model that we created ourselves: if the destination was handed to us
            // already initialized there is no model to extend, and dereferencing it would be undefined behavior.
            if (!model) {
               const std::string msg = "Source RNTuple has extra fields, but the destination was initialized before "
                                       "merging and its model cannot be extended. This is currently unsupported.";
               if (shouldSkipSource(msg))
                  continue;
               return R__FAIL(msg);
            }
            // late model extension for all fExtraSrcFields in Union mode
            ExtendDestinationModel(descCmp.fExtraSrcFields, *model, mergeData, descCmp.fCommonFields);
         } else if (mergeOpts.fMergingMode == ENTupleMergingMode::kStrict) {
            // If the current source has extra fields and we're in Strict mode, error
            std::string msg = "Source RNTuple has extra fields that the destination RNTuple doesn't have:";
            for (const auto *field : descCmp.fExtraSrcFields) {
               msg += "\n  " + field->GetFieldName() + " : " + field->GetTypeName();
            }
            if (shouldSkipSource(msg))
               continue;
            return R__FAIL(msg);
         }
      }

      // handle extra dst fields & common fields
      auto columnInfos = GatherColumnInfos(descCmp, srcDescriptor.GetRef(), mergeData);
      MergeSourceClusters(*source, columnInfos.fCommonColumns, columnInfos.fExtraDstColumns, mergeData);
   } // end loop over sources

   // Commit the output
   destination.CommitClusterGroup();
   destination.CommitDataset();

   return RResult<void>::Success();
}

fBuffer
fBuffer
Definition Converters.cxx:3010

RClusterPool.hxx

RColumnElementBase.hxx

RError.hxx

R__FAIL
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:290

RNTupleDescriptor.hxx

GenerateExtraDstColumns
static void GenerateExtraDstColumns(size_t nClusterEntries, std::span< RColumnMergeInfo > extraDstColumns, RSealedPageMergeData &sealedPageData, const RNTupleMergeData &mergeData)
Definition RNTupleMerger.cxx:619

ExtendDestinationModel
static void ExtendDestinationModel(std::span< const RFieldDescriptor * > newFields, RNTupleModel &dstModel, RNTupleMergeData &mergeData, std::vector< RCommonField > &commonFields)
Definition RNTupleMerger.cxx:484

CompareDescriptorStructure
static RResult< RDescriptorsComparison > CompareDescriptorStructure(const RNTupleDescriptor &dst, const RNTupleDescriptor &src)
Compares the top level fields of dst and src and determines whether they can be merged or not.
Definition RNTupleMerger.cxx:321

SKIP_OR_ABORT
#define SKIP_OR_ABORT(errMsg)

GatherColumnInfos
static RColumnInfoGroup GatherColumnInfos(const RDescriptorsComparison &descCmp, const RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData)
Definition RNTupleMerger.cxx:863

IsSplitOrUnsplitVersionOf
static bool IsSplitOrUnsplitVersionOf(EColumnType a, EColumnType b)
Definition RNTupleMerger.cxx:291

ColumnInMemoryType
static std::optional< std::type_index > ColumnInMemoryType(std::string_view fieldType, EColumnType onDiskType)
Definition RNTupleMerger.cxx:744

AddColumnsFromField
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const RFieldDescriptor &srcFieldDesc, const RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
Definition RNTupleMerger.cxx:792

RNTupleMerger.hxx

RNTupleModel.hxx

RNTupleSerialize.hxx

RNTupleUtil.hxx

RNTupleZip.hxx

RNTuple.hxx

RPageStorageFile.hxx

RPageStorage.hxx

b
#define b(i)
Definition RSha256.hxx:100

a
#define a(i)
Definition RSha256.hxx:99

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

Long64_t
long long Long64_t
Definition RtypesCore.h:69

operator<<
TBuffer & operator<<(TBuffer &buf, const Tmpl *obj)
Definition TBuffer.h:397

TError.h

R__ASSERT
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125

Info
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:218

Error
void Error(const char *location, const char *msgfmt,...)
Use this function in case an error occurred.
Definition TError.cxx:185

Warning
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:229

Fatal
void Fatal(const char *location, const char *msgfmt,...)
Use this function in case of a fatal error. It will abort the program.
Definition TError.cxx:244

TFileMergeInfo.h

TFile.h

src
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t src
Definition TGWin32VirtualXProxy.cxx:164

name
char name[80]
Definition TGX11.cxx:110

TKey.h

operator()
TRObject operator()(const T1 &t1) const
Definition TRFunctionImport__oprtr.h:14

TROOT.h

EColumnType
The available trivial, native content types of a column.

ROOT::Experimental::Internal::RClusterPool
Manages a set of clusters containing compressed and packed pages.
Definition RClusterPool.hxx:54

ROOT::Experimental::Internal::RClusterPool::GetCluster
RCluster * GetCluster(DescriptorId_t clusterId, const RCluster::ColumnSet_t &physicalColumns)
Returns the requested cluster either from the pool or, in case of a cache miss, lets the I/O thread l...
Definition RClusterPool.cxx:185

ROOT::Experimental::Internal::RCluster
An in-memory subset of the packed and compressed pages of a cluster.
Definition RCluster.hxx:152

ROOT::Experimental::Internal::RCluster::GetOnDiskPage
const ROnDiskPage * GetOnDiskPage(const ROnDiskPage::Key &key) const
Definition RCluster.cxx:32

ROOT::Experimental::Internal::RCluster::ColumnSet_t
std::unordered_set< DescriptorId_t > ColumnSet_t
Definition RCluster.hxx:154

ROOT::Experimental::Internal::RColumnElementBase
A column element encapsulates the translation between basic C++ types and their column representation...
Definition RColumnElementBase.hxx:54

ROOT::Experimental::Internal::RColumnElementBase::GetColumnTypeName
static const char * GetColumnTypeName(EColumnType type)
Definition RColumnElement.cxx:70

ROOT::Experimental::Internal::RColumnElementBase::Generate
static std::unique_ptr< RColumnElementBase > Generate(EColumnType type)
If CppT == void, use the default C++ type for the given column type.
Definition RColumnElementBase.hxx:144

ROOT::Experimental::Internal::RNTupleMerger
Given a set of RPageSources merge them into an RPageSink, optionally changing their compression.
Definition RNTupleMerger.hxx:79

ROOT::Experimental::Internal::RNTupleMerger::fTaskGroup
std::optional< TTaskGroup > fTaskGroup
Definition RNTupleMerger.hxx:81

ROOT::Experimental::Internal::RNTupleMerger::RNTupleMerger
RNTupleMerger()
Definition RNTupleMerger.cxx:875

ROOT::Experimental::Internal::RNTupleMerger::Merge
RResult< void > Merge(std::span< RPageSource * > sources, RPageSink &destination, const RNTupleMergeOptions &mergeOpts=RNTupleMergeOptions())
Merge a given set of sources into the destination.
Definition RNTupleMerger.cxx:887

ROOT::Experimental::Internal::RNTupleMerger::MergeSourceClusters
void MergeSourceClusters(RPageSource &source, std::span< RColumnMergeInfo > commonColumns, std::span< RColumnMergeInfo > extraDstColumns, RNTupleMergeData &mergeData)
Definition RNTupleMerger.cxx:692

ROOT::Experimental::Internal::RNTupleMerger::MergeCommonColumns
void MergeCommonColumns(RClusterPool &clusterPool, DescriptorId_t clusterId, std::span< RColumnMergeInfo > commonColumns, const RCluster::ColumnSet_t &commonColumnSet, RSealedPageMergeData &sealedPageData, const RNTupleMergeData &mergeData)
Definition RNTupleMerger.cxx:524

ROOT::Experimental::Internal::RPageAllocatorHeap
Uses standard C++ memory allocation for the column data pages.
Definition RPageAllocator.hxx:63

ROOT::Experimental::Internal::RPageAllocator
Abstract interface to allocate and release pages.
Definition RPageAllocator.hxx:40

ROOT::Experimental::Internal::RPageSink
Abstract interface to write data into an ntuple.
Definition RPageStorage.hxx:257

ROOT::Experimental::Internal::RPageSink::IsInitialized
bool IsInitialized() const
Definition RPageStorage.hxx:314

ROOT::Experimental::Internal::RPageSink::UpdateExtraTypeInfo
virtual void UpdateExtraTypeInfo(const RExtraTypeInfoDescriptor &extraTypeInfo)=0
Adds an extra type information record to schema.

ROOT::Experimental::Internal::RPageSink::CommitDataset
void CommitDataset()
Run the registered callbacks and finalize the current cluster and the entire data set.
Definition RPageStorage.cxx:690

ROOT::Experimental::Internal::RPageSink::Init
void Init(RNTupleModel &model)
Physically creates the storage container to hold the ntuple (e.g., a key in a TFile or an S3 bucket) In...
Definition RPageStorage.hxx:321

ROOT::Experimental::Internal::RPageSink::GetWriteOptions
const RNTupleWriteOptions & GetWriteOptions() const
Returns the sink's write options.
Definition RPageStorage.hxx:310

ROOT::Experimental::Internal::RPageSink::CommitClusterGroup
virtual void CommitClusterGroup()=0
Write out the page locations (page list envelope) for all the committed clusters since the last call ...

ROOT::Experimental::Internal::RPageSink::CommitCluster
virtual std::uint64_t CommitCluster(NTupleSize_t nNewEntries)
Finalize the current cluster and create a new one for the following data.
Definition RPageStorage.hxx:380

ROOT::Experimental::Internal::RPageSink::UpdateSchema
virtual void UpdateSchema(const RNTupleModelChangeset &changeset, NTupleSize_t firstEntry)=0
Incorporate incremental changes to the model into the ntuple descriptor.

ROOT::Experimental::Internal::RPageSink::CommitSealedPageV
virtual void CommitSealedPageV(std::span< RPageStorage::RSealedPageGroup > ranges)=0
Write a vector of preprocessed pages to storage. The corresponding columns must have been added befor...

ROOT::Experimental::Internal::RPageSink::SealPage
RSealedPage SealPage(const RPage &page, const RColumnElementBase &element)
Helper for streaming a page.
Definition RPageStorage.cxx:673

ROOT::Experimental::Internal::RPageSourceFile::CreateFromAnchor
static std::unique_ptr< RPageSourceFile > CreateFromAnchor(const RNTuple &anchor, const RNTupleReadOptions &options=RNTupleReadOptions())
Used from the RNTuple class to build a datasource if the anchor is already available.
Definition RPageStorageFile.cxx:278

ROOT::Experimental::Internal::RPageSource
Abstract interface to read data from an ntuple.
Definition RPageStorage.hxx:550

ROOT::Experimental::Internal::RPageSource::UnsealPage
static RResult< RPage > UnsealPage(const RSealedPage &sealedPage, const RColumnElementBase &element, DescriptorId_t physicalColumnId, RPageAllocator &pageAlloc)
Helper for unstreaming a page.
Definition RPageStorage.cxx:503

ROOT::Experimental::Internal::RPageStorage::SealedPageSequence_t
std::deque< RSealedPage > SealedPageSequence_t
Definition RPageStorage.hxx:132

ROOT::Experimental::Internal::RPageStorage::kNBytesPageChecksum
static constexpr std::size_t kNBytesPageChecksum
The page checksum is a 64bit xxhash3.
Definition RPageStorage.hxx:75

ROOT::Experimental::RClusterDescriptor::GetNEntries
ClusterSize_t GetNEntries() const
Definition RNTupleDescriptor.hxx:362

ROOT::Experimental::RColumnDescriptor::GetType
EColumnType GetType() const
Definition RNTupleDescriptor.hxx:210

ROOT::Experimental::RColumnDescriptor::GetBitsOnStorage
std::uint16_t GetBitsOnStorage() const
Definition RNTupleDescriptor.hxx:209

ROOT::Experimental::RColumnSwitch
Holds the index and the tag of a kSwitch column.
Definition RNTupleUtil.hxx:151

ROOT::Experimental::RException
Base class for all ROOT issued exceptions.
Definition RError.hxx:78

ROOT::Experimental::RFieldDescriptor
Meta-data stored for every field of an ntuple.
Definition RNTupleDescriptor.hxx:70

ROOT::Experimental::RFieldDescriptor::GetParentId
DescriptorId_t GetParentId() const
Definition RNTupleDescriptor.hxx:132

ROOT::Experimental::RFieldDescriptor::GetFieldName
const std::string & GetFieldName() const
Definition RNTupleDescriptor.hxx:126

ROOT::Experimental::RFieldDescriptor::GetTypeName
const std::string & GetTypeName() const
Definition RNTupleDescriptor.hxx:128

ROOT::Experimental::RFieldDescriptor::GetLogicalColumnIds
const std::vector< DescriptorId_t > & GetLogicalColumnIds() const
Definition RNTupleDescriptor.hxx:135

ROOT::Experimental::RFieldDescriptor::IsProjectedField
bool IsProjectedField() const
Definition RNTupleDescriptor.hxx:138

ROOT::Experimental::RFieldDescriptor::GetLinkIds
const std::vector< DescriptorId_t > & GetLinkIds() const
Definition RNTupleDescriptor.hxx:134

ROOT::Experimental::RFieldDescriptor::GetNRepetitions
std::uint64_t GetNRepetitions() const
Definition RNTupleDescriptor.hxx:130

ROOT::Experimental::RFieldDescriptor::GetStructure
ENTupleStructure GetStructure() const
Definition RNTupleDescriptor.hxx:131

ROOT::Experimental::RNTupleDescriptor
The on-storage meta-data of an ntuple.
Definition RNTupleDescriptor.hxx:529

ROOT::Experimental::RNTupleDescriptor::FindNextClusterId
DescriptorId_t FindNextClusterId(DescriptorId_t clusterId) const
Definition RNTupleDescriptor.cxx:388

ROOT::Experimental::RNTupleDescriptor::FindClusterId
DescriptorId_t FindClusterId(DescriptorId_t physicalColumnId, NTupleSize_t index) const
Definition RNTupleDescriptor.cxx:373

ROOT::Experimental::RNTupleDescriptor::GetClusterDescriptor
const RClusterDescriptor & GetClusterDescriptor(DescriptorId_t clusterId) const
Definition RNTupleDescriptor.hxx:616

ROOT::Experimental::RNTupleDescriptor::GetQualifiedFieldName
std::string GetQualifiedFieldName(DescriptorId_t fieldId) const
Walks up the parents of the field ID and returns a field name of the form a.b.c.d In case of invalid ...
Definition RNTupleDescriptor.cxx:330

ROOT::Experimental::RNTupleDescriptor::FindFieldId
DescriptorId_t FindFieldId(std::string_view fieldName, DescriptorId_t parentId) const
Definition RNTupleDescriptor.cxx:311

ROOT::Experimental::RNTupleDescriptor::GetColumnDescriptor
const RColumnDescriptor & GetColumnDescriptor(DescriptorId_t columnId) const
Definition RNTupleDescriptor.hxx:608

ROOT::Experimental::RNTupleDescriptor::GetFieldDescriptor
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
Definition RNTupleDescriptor.hxx:607

ROOT::Experimental::RNTupleDescriptor::GetTopLevelFields
RFieldDescriptorIterable GetTopLevelFields() const
Definition RNTupleDescriptor.cxx:1187

ROOT::Experimental::RNTupleModel
The RNTupleModel encapsulates the schema of an ntuple.
Definition RNTupleModel.hxx:136

ROOT::Experimental::RNTupleModel::Freeze
void Freeze()
Definition RNTupleModel.cxx:510

ROOT::Experimental::RNTupleModel::Unfreeze
void Unfreeze()
Definition RNTupleModel.cxx:496

ROOT::Experimental::RNTupleWriteOptions
Common user-tunable settings for storing ntuples.
Definition RNTupleWriteOptions.hxx:48

ROOT::Experimental::RNTupleWriteOptions::GetCompression
int GetCompression() const
Definition RNTupleWriteOptions.hxx:105

ROOT::Experimental::RNTupleWriteOptions::SetCompression
void SetCompression(int val)
Definition RNTupleWriteOptions.hxx:106

ROOT::Experimental::RResult< void >::ThrowOnError
void ThrowOnError()
Short-hand method to throw an exception in the case of errors.
Definition RError.hxx:281

ROOT::Experimental::RResult
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:194

ROOT::Experimental::TTaskGroup
A class to manage the asynchronous execution of work items.
Definition TTaskGroup.hxx:26

ROOT::RNTuple
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:69

ROOT::RNTuple::Merge
Long64_t Merge(TCollection *input, TFileMergeInfo *mergeInfo)
RNTuple implements the hadd MergeFile interface Merge this NTuple with the input list entries.
Definition RNTupleMerger.cxx:45

TCollection
Collection abstract base class.
Definition TCollection.h:65

TCollection::GetEntries
virtual Int_t GetEntries() const
Definition TCollection.h:179

TDirectoryFile::FindKey
TKey * FindKey(const char *keyname) const override
Find key with name keyname in the current directory.
Definition TDirectoryFile.cxx:779

TDirectoryFile::Get
TObject * Get(const char *namecycle) override
Return pointer to object identified by namecycle.
Definition TDirectoryFile.cxx:937

TFileMergeInfo
Definition TFileMergeInfo.h:42

TFileMergeInfo::fOptions
TString fOptions
Definition TFileMergeInfo.h:53

TFile
A ROOT file is an on-disk file, usually with extension .root, that stores objects in a file-system-li...
Definition TFile.h:53

TFile::GetCompressionSettings
Int_t GetCompressionSettings() const
Definition TFile.h:397

TIter
Definition TCollection.h:235

TKey
Book space in a file, create I/O buffers, to fill them, (un)compress them.
Definition TKey.h:28

TKey::ReadObject
T * ReadObject()
To read an object (non deriving from TObject) from the file.
Definition TKey.h:103

TNamed::GetName
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47

TObject
Mother of all ROOT objects.
Definition TObject.h:41

TObject::ClassName
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition TObject.cxx:213

TString::Contains
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition TString.h:632

bool

double

x
Double_t x[n]
Definition legend1.C:17

ex
Double_t ex[n]
Definition legend1.C:17

ROOT::Experimental::Internal
Definition RDirectoryEntry.hxx:27

ROOT::Experimental::Internal::ENTupleMergingMode::kStrict
@ kStrict
The merger will refuse to merge any 2 RNTuples whose schema doesn't match exactly.

ROOT::Experimental::Internal::ENTupleMergingMode::kUnion
@ kUnion
The merger will update the output model to include all columns from all sources.

ROOT::Experimental::Internal::GenerateColumnElement
std::unique_ptr< RColumnElementBase > GenerateColumnElement(std::type_index inMemoryType, EColumnType onDiskType)
Definition RColumnElement.cxx:156

ROOT::Experimental
Definition RDirectory.hxx:30

ROOT::Experimental::NTupleSize_t
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
Definition RNTupleUtil.hxx:117

ROOT::Experimental::EColumnType
EColumnType
Definition RNTupleUtil.hxx:74

ROOT::Experimental::ClusterSize_t
RClusterSize ClusterSize_t
Definition RNTupleUtil.hxx:145

ROOT::Experimental::kUnknownCompressionSettings
constexpr int kUnknownCompressionSettings
Definition RNTupleUtil.hxx:148

ROOT::Experimental::DescriptorId_t
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
Definition RNTupleUtil.hxx:168

ROOT::Experimental::kInvalidDescriptorId
constexpr DescriptorId_t kInvalidDescriptorId
Definition RNTupleUtil.hxx:169

ROOT::IsImplicitMTEnabled
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:570

ROOT::Experimental::Internal::RColumnMergeInfo
Definition RNTupleMerger.cxx:239

ROOT::Experimental::Internal::RColumnMergeInfo::fColumnName
std::string fColumnName
Definition RNTupleMerger.cxx:243

ROOT::Experimental::Internal::RColumnMergeInfo::fParentField
const RFieldDescriptor * fParentField
Definition RNTupleMerger.cxx:249

ROOT::Experimental::Internal::RColumnMergeInfo::fOutputId
DescriptorId_t fOutputId
Definition RNTupleMerger.cxx:245

ROOT::Experimental::Internal::RColumnMergeInfo::fInputId
DescriptorId_t fInputId
Definition RNTupleMerger.cxx:244

ROOT::Experimental::Internal::RColumnMergeInfo::fInMemoryType
std::optional< std::type_index > fInMemoryType
Definition RNTupleMerger.cxx:248

ROOT::Experimental::Internal::RColumnMergeInfo::fColumnType
EColumnType fColumnType
Definition RNTupleMerger.cxx:246

ROOT::Experimental::Internal::RNTupleMergeData
Definition RNTupleMerger.cxx:253

ROOT::Experimental::Internal::RNTupleMergeData::fColumnIdMap
ColumnIdMap_t fColumnIdMap
Definition RNTupleMerger.cxx:261

ROOT::Experimental::Internal::RNTupleMergeData::RNTupleMergeData
RNTupleMergeData(std::span< RPageSource * > sources, RPageSink &destination, const RNTupleMergeOptions &mergeOpts)
Definition RNTupleMerger.cxx:265

ROOT::Experimental::Internal::RNTupleMergeData::fSources
std::span< RPageSource * > fSources
Definition RNTupleMerger.cxx:254

ROOT::Experimental::Internal::RNTupleMergeData::fDestination
RPageSink & fDestination
Definition RNTupleMerger.cxx:255

ROOT::Experimental::Internal::RNTupleMergeData::fDstDescriptor
const RNTupleDescriptor & fDstDescriptor
Definition RNTupleMerger.cxx:257

ROOT::Experimental::Internal::RNTupleMergeData::fSrcDescriptor
const RNTupleDescriptor * fSrcDescriptor
Definition RNTupleMerger.cxx:258

ROOT::Experimental::Internal::RNTupleMergeData::fMergeOpts
const RNTupleMergeOptions & fMergeOpts
Definition RNTupleMerger.cxx:256

ROOT::Experimental::Internal::RNTupleMergeData::fColumns
std::vector< RColumnMergeInfo > fColumns
Definition RNTupleMerger.cxx:260

ROOT::Experimental::Internal::RNTupleMergeData::fNumDstEntries
NTupleSize_t fNumDstEntries
Definition RNTupleMerger.cxx:263

ROOT::Experimental::Internal::RNTupleMergeOptions
Definition RNTupleMerger.hxx:58

ROOT::Experimental::Internal::RNTupleMergeOptions::fCompressionSettings
int fCompressionSettings
If fCompressionSettings == kUnknownCompressionSettings (the default), the merger will not change the ...
Definition RNTupleMerger.hxx:62

ROOT::Experimental::Internal::RNTupleMergeOptions::fMergingMode
ENTupleMergingMode fMergingMode
Determines how the merging treats sources with different models (see ENTupleMergingMode).
Definition RNTupleMerger.hxx:64

ROOT::Experimental::Internal::RNTupleMergeOptions::fExtraVerbose
bool fExtraVerbose
If true, the merger will emit further diagnostics and information.
Definition RNTupleMerger.hxx:68

ROOT::Experimental::Internal::RNTupleModelChangeset
The incremental changes to an RNTupleModel
Definition RNTupleModel.hxx:108

ROOT::Experimental::Internal::ROnDiskPage::Key
On-disk pages within a page source are identified by the column and page number.
Definition RCluster.hxx:52

ROOT::Experimental::Internal::RPageSink::RSealPageConfig
Parameters for the SealPage() method.
Definition RPageStorage.hxx:336

ROOT::Experimental::Internal::RPageSink::RSealPageConfig::fElement
const RColumnElementBase * fElement
Corresponds to the page's elements, for size calculation etc.
Definition RPageStorage.hxx:338

ROOT::Experimental::Internal::RPageSink::RSealPageConfig::fBuffer
void * fBuffer
Location for sealed output. The memory buffer has to be large enough.
Definition RPageStorage.hxx:347

ROOT::Experimental::Internal::RPageSink::RSealPageConfig::fCompressionSetting
int fCompressionSetting
Compression algorithm and level to apply.
Definition RPageStorage.hxx:339

ROOT::Experimental::Internal::RPageSink::RSealPageConfig::fWriteChecksum
bool fWriteChecksum
Adds an 8-byte little-endian xxhash3 checksum to the page payload.
Definition RPageStorage.hxx:342

ROOT::Experimental::Internal::RPageSink::RSealPageConfig::fPage
const RPage * fPage
Input page to be sealed.
Definition RPageStorage.hxx:337

ROOT::Experimental::Internal::RPageStorage::RSealedPage
A sealed page contains the bytes of a page as written to storage (packed & compressed).
Definition RPageStorage.hxx:91

ROOT::Experimental::Internal::RPageStorage::RSealedPage::SetHasChecksum
void SetHasChecksum(bool hasChecksum)
Definition RPageStorage.hxx:124

ROOT::Experimental::Internal::RPageStorage::RSealedPage::SetNElements
void SetNElements(std::uint32_t nElements)
Definition RPageStorage.hxx:121

ROOT::Experimental::Internal::RPageStorage::RSealedPage::SetBuffer
void SetBuffer(const void *buffer)
Definition RPageStorage.hxx:110

ROOT::Experimental::Internal::RPageStorage::RSealedPage::SetBufferSize
void SetBufferSize(std::size_t bufferSize)
Definition RPageStorage.hxx:118

ROOT::Experimental::Internal::RPageStorage::RSealedPage::VerifyChecksumIfEnabled
RResult< void > VerifyChecksumIfEnabled() const
Definition RPageStorage.cxx:62

ROOT::Experimental::Internal::RPageStorage::RSealedPage::GetNElements
std::uint32_t GetNElements() const
Definition RPageStorage.hxx:120

ROOT::Experimental::Internal::RPageStorage::RSealedPage::GetBufferSize
std::size_t GetBufferSize() const
Definition RPageStorage.hxx:117

ROOT::Experimental::Internal::RPageStorage::RSealedPage::GetHasChecksum
bool GetHasChecksum() const
Definition RPageStorage.hxx:123

ROOT::Experimental::Internal::RSealedPageMergeData
Definition RNTupleMerger.cxx:271

ROOT::Experimental::Internal::RSealedPageMergeData::fGroups
std::vector< RPageStorage::RSealedPageGroup > fGroups
Definition RNTupleMerger.cxx:275

ROOT::Experimental::Internal::RSealedPageMergeData::fPagesV
std::deque< RPageStorage::SealedPageSequence_t > fPagesV
Definition RNTupleMerger.cxx:274

ROOT::Experimental::Internal::RSealedPageMergeData::fBuffers
std::vector< std::unique_ptr< std::uint8_t[]> > fBuffers
Definition RNTupleMerger.cxx:276

ROOT::Experimental::RNTupleDescriptor::RCreateModelOptions
Modifiers passed to CreateModel
Definition RNTupleDescriptor.hxx:580

ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose
@ kUseGeneralPurpose
Use the new recommended general-purpose setting; it is a best trade-off between compression ratio/dec...
Definition Compression.h:58