Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#include <ROOT/RError.hxx>
17#include <ROOT/RField.hxx>
19#include <ROOT/RNTupleUtil.hxx>
22#include <ROOT/RPageSinkBuf.hxx>
23#include <ROOT/RPageStorage.hxx>
25#include <string_view>
26
27#include <TBranch.h>
28#include <TChain.h>
29#include <TClass.h>
30#include <TDataType.h>
31#include <TLeaf.h>
32#include <TLeafC.h>
33#include <TLeafElement.h>
34#include <TLeafObject.h>
35
36#include <cassert>
37#include <cstdint>
38#include <cstring>
39#include <iostream>
40#include <utility>
41
42namespace {
43
44class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
45private:
46 static constexpr std::uint64_t gUpdateFrequencyBytes = 50 * 1000 * 1000; // report every 50MB
47 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
48
49public:
50 ~RDefaultProgressCallback() override {}
51 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
52 {
53 // Report if more than 50MB (compressed) where written since the last status update
54 if (nbytesWritten < fNbytesNext)
55 return;
56 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries" << std::endl;
57 fNbytesNext += gUpdateFrequencyBytes;
58 if (nbytesWritten > fNbytesNext) {
59 // If we already passed the next threshold, increase by a sensible amount.
60 fNbytesNext = nbytesWritten + gUpdateFrequencyBytes;
61 }
62 }
63
64 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
65 {
66 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries" << std::endl;
67 }
68};
69
70} // anonymous namespace
71
74{
75 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
77}
78
81 RImportField &field)
82{
83 auto valueSize = field.fField->GetValueSize();
84 memcpy(field.fFieldBuffer, branch.fBranchBuffer.get() + (fNum * valueSize), valueSize);
85 fNum++;
87}
88
89std::unique_ptr<ROOT::Experimental::RNTupleImporter>
90ROOT::Experimental::RNTupleImporter::Create(std::string_view sourceFileName, std::string_view treeName,
91 std::string_view destFileName)
92{
93 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
94 importer->fNTupleName = treeName;
95 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
96 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
97 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
98 }
99
100 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
101 if (!importer->fSourceTree) {
102 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
103 }
104
105 // If we have IMT enabled, its best use is for parallel page compression
106 importer->fSourceTree->SetImplicitMT(false);
107 auto result = importer->InitDestination(destFileName);
108
109 if (!result)
111
112 return importer;
113}
114
115std::unique_ptr<ROOT::Experimental::RNTupleImporter>
116ROOT::Experimental::RNTupleImporter::Create(TTree *sourceTree, std::string_view destFileName)
117{
118 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
119
120 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
121 if (sourceTree->LoadTree(0) != 0)
122 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
123 importer->fNTupleName = sourceTree->GetTree()->GetName();
124 } else {
125 importer->fNTupleName = sourceTree->GetName();
126 }
127
128 importer->fSourceTree = sourceTree;
129
130 // If we have IMT enabled, its best use is for parallel page compression
131 importer->fSourceTree->SetImplicitMT(false);
132 auto result = importer->InitDestination(destFileName);
133
134 if (!result)
136
137 return importer;
138}
139
141{
142 fDestFileName = destFileName;
143 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
144 if (!fDestFile || fDestFile->IsZombie()) {
145 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
146 }
147
148 return RResult<void>::Success();
149}
150
152{
153 for (const auto &f : fImportFields) {
154 std::cout << "Importing '" << f.fField->GetFieldName() << "' [" << f.fField->GetTypeName() << ']' << std::endl;
155 }
156}
157
167
169{
170 ResetSchema();
171
172 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
173 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
174 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
175 for (auto b : TRangeDynCast<TBranch>(*fSourceTree->GetListOfBranches())) {
176 assert(b);
177 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
178 assert(firstLeaf);
179
180 const bool isLeafList = b->GetNleaves() > 1;
181 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
182 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
183 if (isLeafList && isClass)
184 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
185 if (isLeafList && isCountLeaf)
186 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
187
188 // Only plain leafs with type identifies 'C' are C strings. Otherwise, they are char arrays.
189 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
190 // char arrays and C strings.
191 Int_t firstLeafCountval;
192 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
193 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
194
195 if (isCountLeaf) {
196 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
197 // leaf count comes before any array leaves that use it.
198 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
199 // collection, together the collection mode.
201 c.fCollectionModel = RNTupleModel::CreateBare();
202 c.fMaxLength = firstLeaf->GetMaximum();
203 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
204 // Casting to void * makes it work for both Int_t and UInt_t
205 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
206 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
207 continue;
208 }
209
210 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
211 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
212 std::vector<std::unique_ptr<RFieldBase>> recordItems;
213 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
214 if (l->IsA() == TLeafObject::Class()) {
215 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
216 }
217
218 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
219 // There are files in the public where the title is broken (empty).
220 Int_t countval = l->GetLenStatic();
221 auto *countleaf = l->GetLeafCount();
222 const bool isLeafCountArray = (countleaf != nullptr);
223 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
224
225 // The base case for branches with fundamental, single numerical types.
226 // For other types of branches, different field names or types are necessary,
227 // which is determined below.
228 std::string fieldName = b->GetName();
229 std::string fieldType = l->GetTypeName();
230
231 if (isLeafList)
232 fieldName = l->GetName();
233
234 if (isCString)
235 fieldType = "std::string";
236
237 if (isClass)
238 fieldType = b->GetClassName();
239
240 if (isFixedSizeArray)
241 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
242
244 // Replace any occurrence of a dot ('.') with an underscore.
245 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
246 }
247
249 f.fIsClass = isClass;
250 auto fieldOrError = RFieldBase::Create(fieldName, fieldType);
251 if (!fieldOrError)
252 return R__FORWARD_ERROR(fieldOrError);
253 auto field = fieldOrError.Unwrap();
254 if (isCString) {
255 branchBufferSize = l->GetMaximum();
256 f.fValue = std::make_unique<RFieldBase::RValue>(field->CreateValue());
257 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
258 fImportTransformations.emplace_back(
259 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
260 } else {
261 if (isClass) {
262 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
263 // calling SetBranchAddress()
264 branchBufferSize = sizeof(void *) * countval;
265 } else if (isLeafCountArray) {
266 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
267 } else {
268 branchBufferSize = l->GetOffset() + field->GetValueSize();
269 }
270 }
271 f.fField = field.get();
272
273 if (isLeafList) {
274 recordItems.emplace_back(std::move(field));
275 } else if (isLeafCountArray) {
276 f.fValue = std::make_unique<RFieldBase::RValue>(field->CreateValue());
277 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
278 f.fIsInUntypedCollection = true;
279 const std::string countleafName = countleaf->GetName();
280 fLeafCountCollections[countleafName].fCollectionModel->AddField(std::move(field));
281 fLeafCountCollections[countleafName].fImportFieldIndexes.emplace_back(fImportFields.size());
282 fLeafCountCollections[countleafName].fTransformations.emplace_back(
283 std::make_unique<RLeafArrayTransformation>(fImportBranches.size(), fImportFields.size()));
284 fImportFields.emplace_back(std::move(f));
285 } else {
286 fModel->AddField(std::move(field));
287 fImportFields.emplace_back(std::move(f));
288 }
289 }
290 if (!recordItems.empty()) {
291 auto recordField = std::make_unique<RRecordField>(b->GetName(), std::move(recordItems));
293 f.fField = recordField.get();
294 fImportFields.emplace_back(std::move(f));
295 fModel->AddField(std::move(recordField));
296 }
297
298 RImportBranch ib;
299 ib.fBranchName = b->GetName();
300 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
301 if (isClass) {
302 auto klass = TClass::GetClass(b->GetClassName());
303 if (!klass) {
304 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
305 std::string(b->GetName()));
306 }
307 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
308 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
309 } else {
310 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
311 }
312
313 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
314 if (!fImportFields.back().fFieldBuffer) {
315 fImportFields.back().fFieldBuffer =
316 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
317 }
318
319 fImportBranches.emplace_back(std::move(ib));
320 }
321
322 int iLeafCountCollection = 0;
323 for (auto &p : fLeafCountCollections) {
324 // We want to capture this variable, which is not possible with a
325 // structured binding in C++17. Explicitly defining a variable works.
326 auto &countLeafName = p.first;
327 auto &c = p.second;
328 c.fCollectionModel->Freeze();
329 c.fCollectionEntry = c.fCollectionModel->CreateBareEntry();
330 for (auto idx : c.fImportFieldIndexes) {
331 const auto name = fImportFields[idx].fField->GetFieldName();
332 const auto buffer = fImportFields[idx].fFieldBuffer;
333 c.fCollectionEntry->BindRawPtr(name, buffer);
334 }
335 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
336 c.fCollectionWriter = fModel->MakeCollection(c.fFieldName, std::move(c.fCollectionModel));
337 // Add projected fields for all leaf count arrays
338 for (auto idx : c.fImportFieldIndexes) {
339 const auto name = fImportFields[idx].fField->GetFieldName();
340 auto projectedField =
341 RFieldBase::Create(name, "ROOT::VecOps::RVec<" + fImportFields[idx].fField->GetTypeName() + ">").Unwrap();
342 R__ASSERT(dynamic_cast<RRVecField *>(projectedField.get()));
343 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
344 if (fieldName == name)
345 return c.fFieldName;
346 else
347 return c.fFieldName + "." + name;
348 });
349 }
350 // Add projected fields for count leaf
351 auto projectedField =
352 RFieldBase::Create(countLeafName, "ROOT::Experimental::RNTupleCardinality<std::uint32_t>").Unwrap();
353 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
354 iLeafCountCollection++;
355 }
356
357 fModel->Freeze();
358 fEntry = fModel->CreateBareEntry();
359 for (const auto &f : fImportFields) {
360 if (f.fIsInUntypedCollection)
361 continue;
362 fEntry->BindRawPtr(f.fField->GetFieldName(), f.fFieldBuffer);
363 }
364 for (const auto &[_, c] : fLeafCountCollections) {
365 fEntry->BindRawPtr<void>(c.fFieldName, c.fCollectionWriter->GetOffsetPtr());
366 }
367
368 if (!fIsQuiet)
369 ReportSchema();
370
371 return RResult<void>::Success();
372}
373
375{
376 if (fDestFile->FindKey(fNTupleName.c_str()) != nullptr)
377 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
378
380
381 std::unique_ptr<Internal::RPageSink> sink =
382 std::make_unique<Internal::RPageSinkFile>(fNTupleName, *fDestFile, fWriteOptions);
383 sink->GetMetrics().Enable();
384 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
385
386 if (fWriteOptions.GetUseBufferedWrite()) {
387 sink = std::make_unique<Internal::RPageSinkBuf>(std::move(sink));
388 }
389
390 auto ntplWriter = Internal::CreateRNTupleWriter(std::move(fModel), std::move(sink));
391 // The guard needs to be destructed before the writer goes out of scope
392 RImportGuard importGuard(*this);
393
394 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
395
396 auto nEntries = fSourceTree->GetEntries();
397
398 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
399 nEntries = fMaxEntries;
400 }
401
402 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
403 fSourceTree->GetEntry(i);
404
405 for (const auto &[_, c] : fLeafCountCollections) {
406 for (Int_t l = 0; l < *c.fCountVal; ++l) {
407 for (auto &t : c.fTransformations) {
408 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
409 if (!result)
411 }
412 c.fCollectionWriter->Fill(*c.fCollectionEntry);
413 }
414 for (auto &t : c.fTransformations)
415 t->ResetEntry();
416 }
417
418 for (auto &t : fImportTransformations) {
419 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
420 if (!result)
422 t->ResetEntry();
423 }
424
425 ntplWriter->Fill(*fEntry);
426
428 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
429 }
431 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
432}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking)
Definition RError.hxx:294
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:290
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
int Int_t
Definition RtypesCore.h:45
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
@ kOther_t
Definition TDataType.h:32
#define R__ASSERT(e)
Definition TError.h:118
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
char name[80]
Definition TGX11.cxx:110
Int_t i
#define _(A, B)
Definition cfortran.h:108
Base class for all ROOT issued exceptions.
Definition RError.hxx:78
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &canonicalType, const std::string &typeAlias, bool fContinueOnError=false)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:565
virtual size_t GetValueSize() const =0
The number of bytes taken by a value of the appropriate type.
Used to report every ~50MB (compressed), and at the end about the status of the import.
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::vector< RImportBranch > fImportBranches
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
ROOT::Experimental::RResult< void > InitDestination(std::string_view destFileName)
void Import()
Import works in two steps:
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::unique_ptr< RNTupleModel > fModel
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
static std::unique_ptr< RNTupleModel > CreateBare()
A bare model has no default entry.
The type-erased field for a RVec<Type>
Definition RField.hxx:1122
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:194
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2969
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:4110
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual void SetImplicitMT(bool enabled)
Definition TTree.h:624
virtual TTree * GetTree() const
Definition TTree.h:520
virtual Long64_t LoadTree(Long64_t entry)
Set current entry.
Definition TTree.cxx:6473
TClass * IsA() const override
Definition TTree.h:662
std::unique_ptr< RNTupleWriter > CreateRNTupleWriter(std::unique_ptr< RNTupleModel > model, std::unique_ptr< Internal::RPageSink > sink)
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
TLine l
Definition textangle.C:4