doc/master/RNTupleMerger_8cxx_source.html

/// \file RNTupleMerger.cxx

/// \ingroup NTuple ROOT7

/// \author Jakob Blomer <jblomer@cern.ch>, Max Orok <maxwellorok@gmail.com>, Alaettin Serhan Mete <amete@anl.gov>

/// \date 2020-07-08

/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback

/// is welcome!


/*************************************************************************

 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers.               *

 * All rights reserved.                                                  *

 *                                                                       *

 * For the licensing terms see $ROOTSYS/LICENSE.                         *

 * For the list of contributors see $ROOTSYS/README/CREDITS.             *

 *************************************************************************/


#include <ROOT/RError.hxx>

#include <ROOT/RNTuple.hxx>

#include <ROOT/RNTupleDescriptor.hxx>

#include <ROOT/RNTupleMerger.hxx>

#include <ROOT/RNTupleModel.hxx>

#include <ROOT/RNTupleUtil.hxx>

#include <ROOT/RPageStorageFile.hxx>

#include <TError.h>

#include <TFile.h>

#include <TKey.h>


#include <deque>


/// Merge the RNTuples found in a list of input files into an output file (hadd MergeFile interface).
///
/// \param inputs Collection with at least three entries, consumed in order:
///               the first entry's name is the RNTuple name, the second entry is the output TFile,
///               and every remaining entry is an input TFile that contains an RNTuple of that name.
/// \param mergeInfo Must be non-null; it is not otherwise used by this function.
/// \return 0 on success, -1 on failure (invalid arguments, an entry that is not a TFile, a missing or
///         wrongly-typed RNTuple key, an RException raised while merging, or a merged anchor that cannot
///         be read back from the output file).
///
/// On success, `*this` is overwritten with the merged anchor read back from the output file.
Long64_t ROOT::Experimental::RNTuple::Merge(TCollection *inputs, TFileMergeInfo *mergeInfo)
{
   // Check the inputs
   if (!inputs || inputs->GetEntries() < 3 || !mergeInfo)
      return -1;

   // Parse the input parameters
   TIter itr(inputs);

   // First entry is the RNTuple name
   const auto *nameObj = itr();
   if (!nameObj)
      return -1;
   const std::string ntupleName = nameObj->GetName();

   // Second entry is the output file
   TFile *outFile = dynamic_cast<TFile *>(itr());
   if (!outFile)
      return -1;

   try {
      // Check if the output file already has a key with that name.
      // The anchor objects read from a file are heap-allocated copies; hold them in unique_ptrs so that they
      // are released on every exit path.
      TKey *outKey = outFile->FindKey(ntupleName.c_str());
      std::unique_ptr<RNTuple> outNTuple;
      if (outKey) {
         outNTuple.reset(outKey->ReadObject<RNTuple>());
         if (!outNTuple) {
            Error("RNTuple::Merge", "Output file already has key, but not of type RNTuple!");
            return -1;
         }
         // In principle, we should already be working on the RNTuple object from the output file, but just
         // continue with the pointer we just got.
      }

      RNTupleWriteOptions writeOpts;
      writeOpts.SetUseBufferedWrite(false);
      auto destination = std::make_unique<Internal::RPageSinkFile>(ntupleName, *outFile, writeOpts);

      // If we already have an existing RNTuple, copy over its descriptor to support incremental merging
      if (outNTuple) {
         auto source = Internal::RPageSourceFile::CreateFromAnchor(*outNTuple);
         source->Attach();
         auto desc = source->GetSharedDescriptorGuard();
         destination->InitFromDescriptor(desc.GetRef());
      }

      // The remaining entries are the input files.
      // `anchors` is declared before `sources` so that the anchors outlive the page sources created from them;
      // whether the sources keep referring to their anchor is not visible here, so stay on the safe side.
      std::vector<std::unique_ptr<RNTuple>> anchors;
      std::vector<std::unique_ptr<Internal::RPageSourceFile>> sources;
      std::vector<Internal::RPageSource *> sourcePtrs;

      while (auto *obj = itr()) {
         TFile *inFile = dynamic_cast<TFile *>(obj);
         std::unique_ptr<RNTuple> anchor(inFile ? inFile->Get<RNTuple>(ntupleName.c_str()) : nullptr);
         if (!anchor)
            return -1;
         sources.push_back(Internal::RPageSourceFile::CreateFromAnchor(*anchor));
         anchors.push_back(std::move(anchor));
      }

      // Interface conversion: the merger works on non-owning pointers
      sourcePtrs.reserve(sources.size());
      for (const auto &s : sources) {
         sourcePtrs.push_back(s.get());
      }

      // Now merge
      Internal::RNTupleMerger merger;
      merger.Merge(sourcePtrs, *destination);

      // Provide the caller with a merged anchor object (even though we've already written it).
      std::unique_ptr<RNTuple> merged(outFile->Get<RNTuple>(ntupleName.c_str()));
      if (!merged) {
         Error("RNTuple::Merge", "Could not read back the merged RNTuple from the output file!");
         return -1;
      }
      *this = *merged;
   } catch (const RException &err) {
      // This function signals failure through its return value; do not let merge errors escape as exceptions.
      Error("RNTuple::Merge", "Exception thrown while merging: %s", err.what());
      return -1;
   }

   return 0;
}


////////////////////////////////////////////////////////////////////////////////

/// Populate fOutputIdMap from the given columns and hand out the output column ids.
/// Each column receives the current size of the map as its output id and is registered under the key
/// "<column name>.<type and version>".
void ROOT::Experimental::Internal::RNTupleMerger::BuildColumnIdMap(
   std::vector<ROOT::Experimental::Internal::RNTupleMerger::RColumnInfo> &columns)
{
   for (auto &info : columns) {
      // The id is taken before the insertion below, i.e. it equals the number of keys registered so far
      info.fColumnOutputId = fOutputIdMap.size();
      const std::string key = info.fColumnName + "." + info.fColumnTypeAndVersion;
      fOutputIdMap[key] = info.fColumnOutputId;
   }
}


////////////////////////////////////////////////////////////////////////////////

/// Check the given columns against fOutputIdMap and assign their output ids from it.
/// Throws an RException if the number of columns differs from the number of map entries, or if a column's
/// "<column name>.<type and version>" key is not present in the map.
void ROOT::Experimental::Internal::RNTupleMerger::ValidateColumns(
   std::vector<ROOT::Experimental::Internal::RNTupleMerger::RColumnInfo> &columns)
{
   // A different number of columns already rules out a match
   if (columns.size() != fOutputIdMap.size())
      throw RException(R__FAIL("Columns between sources do NOT match"));

   // Every column must be known by name, type and version; take over the registered output id
   for (auto &info : columns) {
      const auto known = fOutputIdMap.find(info.fColumnName + "." + info.fColumnTypeAndVersion);
      if (known == fOutputIdMap.end()) {
         throw RException(R__FAIL("Column NOT found in the first source w/ name " + info.fColumnName +
                                  " type and version " + info.fColumnTypeAndVersion));
      }
      info.fColumnOutputId = known->second;
   }
}


////////////////////////////////////////////////////////////////////////////////

/// Gather the columns of all fields below field zero of the given descriptor and assign their output ids.
/// If fOutputIdMap is still empty, it is built from these columns; otherwise the columns are validated
/// against it (which throws on a mismatch).
std::vector<ROOT::Experimental::Internal::RNTupleMerger::RColumnInfo>
ROOT::Experimental::Internal::RNTupleMerger::CollectColumns(const RNTupleDescriptor &descriptor)
{
   std::vector<RColumnInfo> collected;

   // Walk the field tree recursively, starting at the root field
   AddColumnsFromField(collected, descriptor, descriptor.GetFieldZero());

   // Subsequent descriptors are validated against the map; the very first one defines it.
   // Both paths fill in the output ids.
   if (!fOutputIdMap.empty()) {
      ValidateColumns(collected);
   } else {
      BuildColumnIdMap(collected);
   }

   return collected;
}


////////////////////////////////////////////////////////////////////////////////

/// Append an RColumnInfo for every column of every field below `fieldDesc`, descending depth-first.
/// Column names have the form "<prefix><field name>.<column index>", where the prefix accumulates the
/// dot-terminated names of the enclosing fields. The output id is left as kInvalidDescriptorId.
void ROOT::Experimental::Internal::RNTupleMerger::AddColumnsFromField(
   std::vector<ROOT::Experimental::Internal::RNTupleMerger::RColumnInfo> &columns, const RNTupleDescriptor &desc,
   const RFieldDescriptor &fieldDesc, const std::string &prefix)
{
   for (const auto &subField : desc.GetFieldIterable(fieldDesc)) {
      // Dot-terminated path of this field; doubles as the prefix for its children
      const std::string fieldPath = prefix + subField.GetFieldName() + ".";
      const std::string typeAndVersion = subField.GetTypeName() + "." + std::to_string(subField.GetTypeVersion());

      // Columns attached directly to this field
      for (const auto &col : desc.GetColumnIterable(subField)) {
         columns.emplace_back(fieldPath + std::to_string(col.GetIndex()), typeAndVersion, col.GetPhysicalId(),
                              kInvalidDescriptorId);
      }

      // Columns of the sub-fields
      AddColumnsFromField(columns, desc, subField, fieldPath);
   }
}


////////////////////////////////////////////////////////////////////////////////

/// Append the contents of all `sources` to `destination`, one source after the other.
/// Pages are copied in their sealed (on-storage) form. For every source cluster, a cluster is committed to
/// the destination; for every non-empty source, a cluster group is committed; the dataset is committed once
/// at the end. If the destination is not yet initialized, it is initialized from a model created from the
/// first source's descriptor. Column mismatches between sources surface as exceptions from CollectColumns.
void ROOT::Experimental::Internal::RNTupleMerger::Merge(std::span<RPageSource *> sources, RPageSink &destination)
{
   // A sink that is already initialized defines the column id map; the returned columns are not needed
   if (destination.IsInitialized())
      CollectColumns(destination.GetDescriptor());

   // Used to initialize the schema of the output RNTuple; kept alive until the end of the merge
   std::unique_ptr<RNTupleModel> model;

   for (auto *pageSource : sources) {
      pageSource->Attach();

      // Guarded access to the source's metadata
      auto descGuard = pageSource->GetSharedDescriptorGuard();

      // Columns of this source with their input and output ids.
      // The column name : output column id map is only built once.
      auto columns = CollectColumns(descGuard.GetRef());

      // First source and a fresh sink: derive the output schema from this source
      if (!destination.IsInitialized()) {
         model = descGuard->CreateModel();
         destination.Init(*model);
      }

      for (const auto &typeInfo : descGuard->GetExtraTypeInfoIterable())
         destination.UpdateExtraTypeInfo(typeInfo);

      // Nothing to copy from a source without entries
      if (pageSource->GetNEntries() == 0)
         continue;

      // descriptor->GetClusterIterable() doesn't guarantee any specific order, so start at the cluster that
      // holds the first entry and follow the chain of next cluster ids.
      for (auto clusterId = descGuard->FindClusterId(0, 0); clusterId != ROOT::Experimental::kInvalidDescriptorId;
           clusterId = descGuard->FindNextClusterId(clusterId)) {
         const auto &clusterDesc = descGuard->GetClusterDescriptor(clusterId);

         // Backing memory of all sealed pages of this cluster
         std::vector<std::unique_ptr<unsigned char[]>> buffers;
         // We use a std::deque so that references to the contained SealedPageSequence_t, and its iterators,
         // are never invalidated.
         std::deque<RPageStorage::SealedPageSequence_t> sealedPagesV;
         std::vector<RPageStorage::RSealedPageGroup> sealedPageGroups;

         for (const auto &colInfo : columns) {
            // Columns that are not part of this cluster have nothing to contribute
            const auto inputId = colInfo.fColumnInputId;
            if (!clusterDesc.ContainsColumn(inputId))
               continue;

            const auto &pageRange = clusterDesc.GetPageRange(inputId);

            // The page sequence of this column is filled in place
            sealedPagesV.emplace_back();
            auto &pageSeq = sealedPagesV.back();

            // Index of the first element of the current page within the cluster
            size_t firstElement{0};
            for (const auto &pageInfo : pageRange.fPageInfos) {
               // Each page contains N elements that we are going to read together.
               // LoadSealedPage reads packed/compressed bytes of a page into a memory buffer provided by a
               // sealed page.
               RClusterIndex clusterIndex(clusterId, firstElement);
               Internal::RPageStorage::RSealedPage sealedPage;
               pageSource->LoadSealedPage(inputId, clusterIndex, sealedPage);

               // The way LoadSealedPage works might require a double call: the first call above ran without
               // a buffer; now attach a buffer of the reported size and call again. We do this in any case.
               auto buffer = std::make_unique<unsigned char[]>(sealedPage.GetSize());
               sealedPage.SetBuffer(buffer.get());
               pageSource->LoadSealedPage(inputId, clusterIndex, sealedPage);

               buffers.push_back(std::move(buffer));
               pageSeq.push_back(std::move(sealedPage));

               // Advance to the first element of the next page
               firstElement += pageInfo.fNElements;
            }

            sealedPageGroups.emplace_back(colInfo.fColumnOutputId, pageSeq.cbegin(), pageSeq.cend());
         }

         // Write all pages of this cluster, then close the cluster
         destination.CommitSealedPageV(sealedPageGroups);
         destination.CommitCluster(clusterDesc.GetNEntries());
      }

      // Commit all clusters for this input
      destination.CommitClusterGroup();
   }

   // Commit the output
   destination.CommitDataset();
}

RError.hxx

R__FAIL
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:290

RNTupleDescriptor.hxx

RNTupleMerger.hxx

RNTupleModel.hxx

RNTupleUtil.hxx

RNTuple.hxx

RPageStorageFile.hxx

Long64_t
long long Long64_t
Definition RtypesCore.h:80

TError.h

Error
void Error(const char *location, const char *msgfmt,...)
Use this function in case an error occurred.
Definition TError.cxx:185

TFile.h

name
char name[80]
Definition TGX11.cxx:110

TKey.h

ROOT::Experimental::Internal::RNTupleMerger
Given a set of RPageSources merge them into an RPageSink.
Definition RNTupleMerger.hxx:40

ROOT::Experimental::Internal::RNTupleMerger::AddColumnsFromField
void AddColumnsFromField(std::vector< RColumnInfo > &columns, const RNTupleDescriptor &desc, const RFieldDescriptor &fieldDesc, const std::string &prefix="")
Recursively add columns from a given field.
Definition RNTupleMerger.cxx:146

ROOT::Experimental::Internal::RNTupleMerger::Merge
void Merge(std::span< RPageSource * > sources, RPageSink &destination)
Merge a given set of sources into the destination.
Definition RNTupleMerger.cxx:162

ROOT::Experimental::Internal::RNTupleMerger::CollectColumns
std::vector< RColumnInfo > CollectColumns(const RNTupleDescriptor &descriptor)
Recursively collect all the columns for all the fields rooted at field zero.
Definition RNTupleMerger.cxx:130

ROOT::Experimental::Internal::RNTupleMerger::BuildColumnIdMap
void BuildColumnIdMap(std::vector< RColumnInfo > &columns)
Build the internal column id map from the first source. This is where we assign the output ids for the...
Definition RNTupleMerger.cxx:100

ROOT::Experimental::Internal::RNTupleMerger::ValidateColumns
void ValidateColumns(std::vector< RColumnInfo > &columns)
Validate the columns against the internal map that is built from the first source. This is where we as...
Definition RNTupleMerger.cxx:110

ROOT::Experimental::Internal::RPageSink
Abstract interface to write data into an ntuple.
Definition RPageStorage.hxx:192

ROOT::Experimental::Internal::RPageSink::IsInitialized
bool IsInitialized() const
Definition RPageStorage.hxx:236

ROOT::Experimental::Internal::RPageSink::UpdateExtraTypeInfo
virtual void UpdateExtraTypeInfo(const RExtraTypeInfoDescriptor &extraTypeInfo)=0
Adds an extra type information record to schema.

ROOT::Experimental::Internal::RPageSink::CommitDataset
void CommitDataset()
Run the registered callbacks and finalize the current cluster and the entire data set.
Definition RPageStorage.cxx:410

ROOT::Experimental::Internal::RPageSink::GetDescriptor
virtual const RNTupleDescriptor & GetDescriptor() const =0
Return the RNTupleDescriptor being constructed.

ROOT::Experimental::Internal::RPageSink::Init
void Init(RNTupleModel &model)
Physically creates the storage container to hold the ntuple (e.g., a key in a TFile or an S3 bucket) In...
Definition RPageStorage.hxx:243

ROOT::Experimental::Internal::RPageSink::CommitClusterGroup
virtual void CommitClusterGroup()=0
Write out the page locations (page list envelope) for all the committed clusters since the last call ...

ROOT::Experimental::Internal::RPageSink::CommitCluster
virtual std::uint64_t CommitCluster(NTupleSize_t nNewEntries)=0
Finalize the current cluster and create a new one for the following data.

ROOT::Experimental::Internal::RPageSink::CommitSealedPageV
virtual void CommitSealedPageV(std::span< RPageStorage::RSealedPageGroup > ranges)=0
Write a vector of preprocessed pages to storage. The corresponding columns must have been added befor...

ROOT::Experimental::Internal::RPageSourceFile::CreateFromAnchor
static std::unique_ptr< RPageSourceFile > CreateFromAnchor(const RNTuple &anchor, const RNTupleReadOptions &options=RNTupleReadOptions())
Used from the RNTuple class to build a datasource if the anchor is already available.
Definition RPageStorageFile.cxx:294

ROOT::Experimental::Internal::RPageStorage::SealedPageSequence_t
std::deque< RSealedPage > SealedPageSequence_t
Definition RPageStorage.hxx:110

ROOT::Experimental::RClusterIndex
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Definition RNTupleUtil.hxx:111

ROOT::Experimental::RException
Base class for all ROOT issued exceptions.
Definition RError.hxx:78

ROOT::Experimental::RFieldDescriptor
Meta-data stored for every field of an ntuple.
Definition RNTupleDescriptor.hxx:68

ROOT::Experimental::RNTupleDescriptor
The on-storage meta-data of an ntuple.
Definition RNTupleDescriptor.hxx:464

ROOT::Experimental::RNTupleDescriptor::GetFieldIterable
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
Definition RNTupleDescriptor.hxx:818

ROOT::Experimental::RNTupleDescriptor::GetColumnIterable
RColumnDescriptorIterable GetColumnIterable() const
Definition RNTupleDescriptor.hxx:843

ROOT::Experimental::RNTupleDescriptor::GetFieldZero
const RFieldDescriptor & GetFieldZero() const
Definition RNTupleDescriptor.hxx:882

ROOT::Experimental::RNTupleWriteOptions
Common user-tunable settings for storing ntuples.
Definition RNTupleWriteOptions.hxx:37

ROOT::Experimental::RNTupleWriteOptions::SetUseBufferedWrite
void SetUseBufferedWrite(bool val)
Definition RNTupleWriteOptions.hxx:97

ROOT::Experimental::RNTuple
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:61

ROOT::Experimental::RNTuple::Merge
Long64_t Merge(TCollection *input, TFileMergeInfo *mergeInfo)
RNTuple implements the hadd MergeFile interface Merge this NTuple with the input list entries.
Definition RNTupleMerger.cxx:29

TCollection
Collection abstract base class.
Definition TCollection.h:65

TCollection::GetEntries
virtual Int_t GetEntries() const
Definition TCollection.h:179

TDirectoryFile::FindKey
TKey * FindKey(const char *keyname) const override
Find key with name keyname in the current directory.
Definition TDirectoryFile.cxx:779

TDirectoryFile::Get
TObject * Get(const char *namecycle) override
Return pointer to object identified by namecycle.
Definition TDirectoryFile.cxx:937

TFileMergeInfo
Definition TFileMergeInfo.h:42

TFile
A ROOT file is an on-disk file, usually with extension .root, that stores objects in a file-system-li...
Definition TFile.h:53

TIter
Definition TCollection.h:235

TKey
Book space in a file, create I/O buffers, to fill them, (un)compress them.
Definition TKey.h:28

TKey::ReadObject
T * ReadObject()
To read an object (non deriving from TObject) from the file.
Definition TKey.h:103

ROOT::Experimental::kInvalidDescriptorId
constexpr DescriptorId_t kInvalidDescriptorId
Definition RNTupleUtil.hxx:108

ROOT::Experimental::Internal::RPageStorage::RSealedPage
A sealed page contains the bytes of a page as written to storage (packed & compressed).
Definition RPageStorage.hxx:86

ROOT::Experimental::Internal::RPageStorage::RSealedPage::SetBuffer
void SetBuffer(const void *buffer)
Definition RPageStorage.hxx:101

ROOT::Experimental::Internal::RPageStorage::RSealedPage::GetSize
std::uint32_t GetSize() const
Definition RPageStorage.hxx:103