Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#include <ROOT/RError.hxx>
17#include <ROOT/RField.hxx>
19#include <ROOT/RNTupleUtil.hxx>
22#include <ROOT/RPageSinkBuf.hxx>
23#include <ROOT/RPageStorage.hxx>
25#include <string_view>
26
27#include <TBranch.h>
28#include <TChain.h>
29#include <TClass.h>
30#include <TDataType.h>
31#include <TLeaf.h>
32#include <TLeafC.h>
33#include <TLeafElement.h>
34#include <TLeafObject.h>
35
36#include <cassert>
37#include <cstdint>
38#include <cstring>
39#include <iostream>
40#include <utility>
41
42namespace {
43
/// Default progress reporter: prints a status line to std::cout whenever the number of written bytes reaches the
/// next reporting threshold (initially 100 MB, see gUpdateFrequencyBytes), and a final summary line in Finish().
44class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
45private:
46 static constexpr std::uint64_t gUpdateFrequencyBytes = 100 * 1000 * 1000; // report every 100 MB
// Value of nbytesWritten at which Call() prints the next status line
47 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
48
49public:
50 ~RDefaultProgressCallback() override {}
/// Prints "Wrote <N>MB, <M> entries" once nbytesWritten has reached fNbytesNext and then moves the threshold
/// forward; does nothing while nbytesWritten is still below the threshold.
/// \param nbytesWritten Number of bytes written so far; printed in MB using integer division by 1000 * 1000
/// \param neventsWritten Number of entries written so far; printed as-is
51 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
52 {
53 // Report if more than 100 MB (compressed) were written since the last status update
54 if (nbytesWritten < fNbytesNext)
55 return;
56 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
57 fNbytesNext += gUpdateFrequencyBytes;
58 if (nbytesWritten > fNbytesNext) {
59 // If we already passed the next threshold, increase by a sensible amount.
// The new threshold is measured from the current byte count, so the next call does not report again right away.
60 fNbytesNext = nbytesWritten + gUpdateFrequencyBytes;
61 }
62 }
63
/// Unconditionally prints the final summary line "Done, wrote <N>MB, <M> entries".
/// \param nbytesWritten Total number of bytes written; printed in MB using integer division by 1000 * 1000
/// \param neventsWritten Total number of entries written; printed as-is
64 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
65 {
66 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
67 }
68};
69
70} // anonymous namespace
71
74{
75 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
77}
78
79std::unique_ptr<ROOT::Experimental::RNTupleImporter>
81 std::string_view destFileName)
82{
83 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
84 importer->fNTupleName = treeName;
85 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
86 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
87 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
88 }
89
90 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
91 if (!importer->fSourceTree) {
92 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
93 }
94
95 // If we have IMT enabled, its best use is for parallel page compression
96 importer->fSourceTree->SetImplicitMT(false);
97 auto result = importer->InitDestination(destFileName);
98
99 if (!result)
101
102 return importer;
103}
104
105std::unique_ptr<ROOT::Experimental::RNTupleImporter>
107{
108 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
109
110 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
111 if (sourceTree->LoadTree(0) != 0)
112 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
113 importer->fNTupleName = sourceTree->GetTree()->GetName();
114 } else {
115 importer->fNTupleName = sourceTree->GetName();
116 }
117
118 importer->fSourceTree = sourceTree;
119
120 // If we have IMT enabled, its best use is for parallel page compression
121 importer->fSourceTree->SetImplicitMT(false);
122 auto result = importer->InitDestination(destFileName);
123
124 if (!result)
126
127 return importer;
128}
129
131{
133 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
134 if (!fDestFile || fDestFile->IsZombie()) {
135 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
136 }
137
138 return RResult<void>::Success();
139}
140
142{
143 for (const auto &f : fImportFields) {
144 std::cout << "Importing '" << f.fField->GetFieldName() << "' [" << f.fField->GetTypeName() << "]\n";
145 }
147 std::cout << "Importing (projected) '" << f->GetFieldName() << "' [" << f->GetTypeName() << "]\n";
148 }
149}
150
160
162{
163 ResetSchema();
164
165 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
166 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
167 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
169 assert(b);
170 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
172
173 const bool isLeafList = b->GetNleaves() > 1;
174 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
175 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
176 if (isLeafList && isClass)
177 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
178 if (isLeafList && isCountLeaf)
179 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
180
181 // Only plain leafs with type identifier 'C' are C strings. Otherwise, they are char arrays.
182 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
183 // char arrays and C strings.
185 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
186 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
187
188 if (isCountLeaf) {
189 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
190 // leaf count comes before any array leaves that use it.
191 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
192 // collection, together the collection mode.
194 c.fMaxLength = firstLeaf->GetMaximum();
195 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
196 // Casting to void * makes it work for both Int_t and UInt_t
197 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
198 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
199 continue;
200 }
201
202 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
203 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
204 std::vector<std::unique_ptr<ROOT::RFieldBase>> recordItems;
205 // For leaf count arrays, we expect to find a single leaf; we don't add a field right away but only
206 // later through a projection
207 bool isLeafCountArray = false;
208 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
209 if (l->IsA() == TLeafObject::Class()) {
210 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
211 }
212
213 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
214 // There are files in the public where the title is broken (empty).
215 Int_t countval = l->GetLenStatic();
216 auto *countleaf = l->GetLeafCount();
217 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
218 isLeafCountArray = (countleaf != nullptr);
219
220 // The base case for branches with fundamental, single numerical types.
221 // For other types of branches, different field names or types are necessary,
222 // which is determined below.
223 std::string fieldName = b->GetName();
224 std::string fieldType = l->GetTypeName();
225
226 if (isLeafList)
227 fieldName = l->GetName();
228
229 if (isCString)
230 fieldType = "std::string";
231
232 if (isClass)
233 fieldType = b->GetClassName();
234
236 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
237
239 // Replace any occurrence of a dot ('.') with an underscore.
240 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
241 }
242
245 if (!fieldOrError)
247 auto field = fieldOrError.Unwrap();
248 if (isCString) {
249 branchBufferSize = l->GetMaximum();
250 f.fValue = std::make_unique<ROOT::RFieldBase::RValue>(field->CreateValue());
251 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
252 fImportTransformations.emplace_back(
253 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
254 } else {
255 if (isClass) {
256 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
257 // calling SetBranchAddress()
258 branchBufferSize = sizeof(void *) * countval;
259 } else if (isLeafCountArray) {
260 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
261 } else {
262 branchBufferSize = l->GetOffset() + field->GetValueSize();
263 }
264 }
265 f.fField = field.get();
266
267 if (isLeafList) {
268 recordItems.emplace_back(std::move(field));
269 } else if (isLeafCountArray) {
270 const std::string countleafName = countleaf->GetName();
271 fLeafCountCollections[countleafName].fLeafFields.emplace_back(std::move(field));
272 fLeafCountCollections[countleafName].fLeafBranchIndexes.emplace_back(fImportBranches.size());
273 R__ASSERT(b->GetListOfLeaves()->GetEntries() == 1);
274 break;
275 } else {
276 fModel->AddField(std::move(field));
277 fImportFields.emplace_back(std::move(f));
278 }
279 }
280 if (!recordItems.empty()) {
281 auto recordField = std::make_unique<ROOT::RRecordField>(b->GetName(), std::move(recordItems));
283 f.fField = recordField.get();
284 fImportFields.emplace_back(std::move(f));
285 fModel->AddField(std::move(recordField));
286 }
287
289 ib.fBranchName = b->GetName();
290 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
291 if (isClass) {
292 auto klass = TClass::GetClass(b->GetClassName());
293 if (!klass) {
294 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
295 std::string(b->GetName()));
296 }
297 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
298 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
299 } else {
300 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
301 }
302
303 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
304 if (!isLeafCountArray && !fImportFields.back().fFieldBuffer) {
305 fImportFields.back().fFieldBuffer =
306 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
307 }
308
309 fImportBranches.emplace_back(std::move(ib));
310 }
311
312 int iLeafCountCollection = 0;
313 for (auto &p : fLeafCountCollections) {
314 // We want to capture this variable, which is not possible with a
315 // structured binding in C++17. Explicitly defining a variable works.
316 auto countLeafName = p.first;
317 auto &c = p.second;
318
319 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
320 auto recordField = std::make_unique<ROOT::RRecordField>("_0", std::move(c.fLeafFields));
321 c.fRecordField = recordField.get();
322 auto collectionField = ROOT::RVectorField::CreateUntyped(c.fFieldName, std::move(recordField));
323 fModel->AddField(std::move(collectionField));
324
325 // Add projected fields for all leaf count arrays
326 for (const auto leaf : c.fRecordField->GetConstSubfields()) {
327 const auto name = leaf->GetFieldName();
328 auto projectedField =
329 ROOT::RFieldBase::Create(name, "ROOT::VecOps::RVec<" + leaf->GetTypeName() + ">").Unwrap();
330 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
331 if (fieldName == name)
332 return c.fFieldName;
333 else
334 return c.fFieldName + "._0." + name;
335 });
336 }
337
339 // Replace any occurrence of a dot ('.') in the count leaf name with an underscore.
340 std::replace(countLeafName.begin(), countLeafName.end(), '.', '_');
341 }
342
343 // Add projected fields for count leaf
344 auto projectedField = ROOT::RFieldBase::Create(countLeafName, "ROOT::RNTupleCardinality<std::uint32_t>").Unwrap();
345 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
346
348 }
349
350 if (fFieldModifier) {
351 for (auto &field : fModel->GetMutableFieldZero()) {
353 }
354 }
355
356 fModel->Freeze();
357
358 fEntry = fModel->CreateBareEntry();
359 for (const auto &f : fImportFields) {
360 fEntry->BindRawPtr(f.fField->GetFieldName(), f.fFieldBuffer);
361 }
362 for (auto &[_, c] : fLeafCountCollections) {
363 fEntry->BindRawPtr<void>(c.fFieldName, &c.fFieldBuffer);
364 }
365
366 if (!fIsQuiet)
367 ReportSchema();
368
369 return RResult<void>::Success();
370}
371
373{
374 if (fDestFile->FindKey(fNTupleName.c_str()) != nullptr)
375 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
376
378
379 std::unique_ptr<ROOT::Internal::RPageSink> sink =
380 std::make_unique<ROOT::Internal::RPageSinkFile>(fNTupleName, *fDestFile, fWriteOptions);
381 sink->GetMetrics().Enable();
382 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
383
385 sink = std::make_unique<ROOT::Internal::RPageSinkBuf>(std::move(sink));
386 }
387
388 auto ntplWriter = ROOT::Internal::CreateRNTupleWriter(std::move(fModel), std::move(sink));
389 // The guard needs to be destructed before the writer goes out of scope
391
392 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
393
395
396 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
398 }
399
400 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
402
403 for (auto &[_, c] : fLeafCountCollections) {
404 const auto sizeOfRecord = c.fRecordField->GetValueSize();
405 c.fFieldBuffer.resize(sizeOfRecord * (*c.fCountVal));
406
407 const auto nLeafs = c.fRecordField->GetConstSubfields().size();
408 for (std::size_t l = 0; l < nLeafs; ++l) {
409 const auto offset = c.fRecordField->GetOffsets()[l];
410 const auto sizeOfLeaf = c.fRecordField->GetConstSubfields()[l]->GetValueSize();
411 const auto idxImportBranch = c.fLeafBranchIndexes[l];
412 for (Int_t j = 0; j < *c.fCountVal; ++j) {
413 memcpy(c.fFieldBuffer.data() + j * sizeOfRecord + offset,
414 fImportBranches[idxImportBranch].fBranchBuffer.get() + (j * sizeOfLeaf), sizeOfLeaf);
415 }
416 }
417 }
418
419 for (auto &t : fImportTransformations) {
420 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
421 if (!result)
423 }
424
425 ntplWriter->Fill(*fEntry);
426
428 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
429 }
431 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
432}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking)
Definition RError.hxx:303
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:299
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
@ kOther_t
Definition TDataType.h:32
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
char name[80]
Definition TGX11.cxx:110
#define _(A, B)
Definition cfortran.h:108
Used to report every ~100 MB (compressed), and at the end about the status of the import.
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::unique_ptr< ROOT::REntry > fEntry
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::unique_ptr< ROOT::RNTupleModel > fModel
std::vector< RImportBranch > fImportBranches
ROOT::RResult< void > InitDestination(std::string_view destFileName)
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
void Import()
Import works in two steps:
ROOT::RNTupleWriteOptions fWriteOptions
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
ROOT::RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
ROOT::RFieldZero & GetFieldZero()
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
std::vector< const RFieldBase * > GetConstSubfields() const
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
const_iterator begin() const
const_iterator end() const
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:197
static std::unique_ptr< RVectorField > CreateUntyped(std::string_view fieldName, std::unique_ptr< RFieldBase > itemField)
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:3073
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:4131
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
TClass * IsA() const override
Definition TLine.h:79
virtual const char * GetName() const
Returns name of object.
Definition TObject.cxx:457
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual Int_t GetEntry(Long64_t entry, Int_t getall=0)
Read all branches of entry and return total number of bytes read.
Definition TTree.cxx:5655
virtual Int_t SetBranchAddress(const char *bname, void *add, TBranch **ptr=nullptr)
Change branch address, dealing with clone trees properly.
Definition TTree.cxx:8493
virtual Long64_t GetEntries() const
Definition TTree.h:464
virtual TObjArray * GetListOfBranches()
Definition TTree.h:529
std::unique_ptr< RNTupleWriter > CreateRNTupleWriter(std::unique_ptr< ROOT::RNTupleModel > model, std::unique_ptr< Internal::RPageSink > sink)
RProjectedFields & GetProjectedFieldsOfModel(RNTupleModel &model)
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
ROOT::RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
Int_t fMaxLength
Stores count leaf GetMaximum() to create large enough buffers for the array leafs.
TLine l
Definition textangle.C:4