Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDFSnapshotHelpers.cxx
Go to the documentation of this file.
1/**
2 \file RDFSnapshotHelpers.cxx
3 \ingroup dataframe
4 \author Enrico Guiraud, CERN
5 \author Danilo Piparo, CERN
6 \date 2016-12
7 \author Vincenzo Eduardo Padulano
8 \author Stephan Hageboeck
9 \date 2025-06
10*/
11
12/*************************************************************************
13 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
14 * All rights reserved. *
15 * *
16 * For the licensing terms see $ROOTSYS/LICENSE. *
17 * For the list of contributors see $ROOTSYS/README/CREDITS. *
18 *************************************************************************/
19
21
22#include <ROOT/REntry.hxx>
23#include <ROOT/RFieldToken.hxx>
24#include <ROOT/RNTuple.hxx>
25#include <ROOT/RNTupleDS.hxx>
28#include <ROOT/RTTreeDS.hxx>
30
31#include <TBranchObject.h>
32#include <TClassEdit.h>
33#include <TDictionary.h>
34#include <TDataType.h>
35#include <TFile.h>
36#include <TLeaf.h>
37#include <TTreeReader.h>
38
39#include <algorithm>
40#include <type_traits>
41#include <utility>
42
44// Maintaining the following allows for faster vector resize:
45static_assert(std::is_nothrow_move_assignable_v<RBranchData>);
46static_assert(std::is_nothrow_move_constructible_v<RBranchData>);
47
48namespace {
49
50void AssertNoNullBranchAddresses(const std::vector<RBranchData> &branches)
51{
52 std::vector<TBranch *> branchesWithNullAddress;
53 for (const auto &branchData : branches) {
54 if (branchData.fOutputBranch->GetAddress() == nullptr)
55 branchesWithNullAddress.push_back(branchData.fOutputBranch);
56 }
57
58 if (branchesWithNullAddress.empty())
59 return;
60
61 // otherwise build error message and throw
62 std::vector<std::string> missingBranchNames;
63 std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(),
64 std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); });
65 std::string msg = "RDataFrame::Snapshot:";
66 if (missingBranchNames.size() == 1) {
67 msg += " branch " + missingBranchNames[0] +
68 " is needed as it provides the size for one or more branches containing dynamically sized arrays, but "
69 "it is";
70 } else {
71 msg += " branches ";
72 for (const auto &bName : missingBranchNames)
73 msg += bName + ", ";
74 msg.resize(msg.size() - 2); // remove last ", "
75 msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are";
76 }
77 msg += " not part of the set of branches that are being written out.";
78 throw std::runtime_error(msg);
79}
80
81TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName)
82{
83 if (inputTree) {
84 if (auto *getBranchRes = inputTree->GetBranch(branchName.c_str()))
85 return getBranchRes;
86
87 // try harder
88 if (auto *findBranchRes = inputTree->FindBranch(branchName.c_str()))
89 return findBranchRes;
90 }
91 return nullptr;
92}
93
94std::vector<RBranchData>::iterator CreateCStyleArrayBranch(TTree &outputTree, std::vector<RBranchData> &outputBranches,
95 std::vector<RBranchData>::iterator thisBranch,
96 TBranch *inputBranch, int basketSize, void *address)
97{
98 if (!inputBranch)
99 return thisBranch;
100 const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName());
101 if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec)
102 return thisBranch;
103 // must construct the leaflist for the output branch and create the branch in the output tree
104 const auto *leaf = static_cast<TLeaf *>(inputBranch->GetListOfLeaves()->UncheckedAt(0));
105 if (!leaf)
106 return thisBranch;
107 const auto bname = leaf->GetName();
108 auto *sizeLeaf = leaf->GetLeafCount();
109 const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic());
110
111 // We proceed only if branch is a fixed-or-variable-sized array
112 if (sizeLeaf || leaf->GetLenStatic() > 1) {
113 if (sizeLeaf) {
114 // The array branch `bname` has dynamic size stored in leaf `sizeLeafName`, so we need to ensure that it's
115 // in the output tree.
116 auto sizeLeafIt =
117 std::find_if(outputBranches.begin(), outputBranches.end(),
118 [&sizeLeafName](const RBranchData &bd) { return bd.fOutputBranchName == sizeLeafName; });
119 if (sizeLeafIt == outputBranches.end()) {
120 // The size leaf is not part of the output branches yet, so emplace an empty slot for it.
121 // This means that iterators need to be updated in case the container reallocates.
122 const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch);
123 outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr);
124 thisBranch = outputBranches.begin() + indexBeforeEmplace;
125 sizeLeafIt = outputBranches.end() - 1;
126 }
127 if (!sizeLeafIt->fOutputBranch) {
128 // The size leaf was emplaced, but not initialised yet
129 const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName());
130 // Use original basket size for existing branches otherwise use custom basket size.
131 const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize();
132 // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for
133 // `sizeLeafName`
134 auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast<void *>(nullptr),
135 (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize);
136 sizeLeafIt->fOutputBranch = outputBranch;
137 }
138 }
139
140 const auto btype = leaf->GetTypeName();
141 const auto rootbtype = ROOT::Internal::RDF::TypeName2ROOTTypeName(btype);
142 if (rootbtype == ' ') {
143 Warning("Snapshot",
144 "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The "
145 "leaf is of type '%s'. This column will not be written out.",
146 bname, btype);
147 return thisBranch;
148 }
149
150 const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype;
151 // Use original basket size for existing branches and new basket size for new branches
152 const auto bufSize = (basketSize > 0) ? basketSize : inputBranch->GetBasketSize();
153 void *addressForBranch = [address]() -> void * {
154 if (address) {
155 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we need
156 // its buffer, so we cast it and extract the address of the buffer
157 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(address);
158 return rawRVec->data();
159 }
160 return nullptr;
161 }();
162 thisBranch->fOutputBranch =
163 outputTree.Branch(thisBranch->fOutputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize);
164 thisBranch->fOutputBranch->SetTitle(inputBranch->GetTitle());
165 thisBranch->fIsCArray = true;
166 }
167
168 return thisBranch;
169}
170
171void SetBranchAddress(TBranch *inputBranch, RBranchData &branchData, void *valueAddress)
172{
173 const static TClassRef TBOClRef("TBranchObject");
174 if (inputBranch && inputBranch->IsA() == TBOClRef) {
175 branchData.fOutputBranch->SetAddress(reinterpret_cast<void **>(inputBranch->GetAddress()));
176 } else if (branchData.fOutputBranch->IsA() != TBranch::Class()) {
177 // This is a relatively rare case of a fixed-size array getting redefined
178 branchData.fBranchAddressForCArrays = valueAddress;
179 branchData.fOutputBranch->SetAddress(&branchData.fBranchAddressForCArrays);
180 } else {
181 void *correctAddress = [valueAddress, isCArray = branchData.fIsCArray]() -> void * {
182 if (isCArray) {
183 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
184 // need its buffer, so we cast it and extract the address of the buffer
185 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
186 return rawRVec->data();
187 }
188 return valueAddress;
189 }();
190 branchData.fOutputBranch->SetAddress(correctAddress);
191 branchData.fBranchAddressForCArrays = valueAddress;
192 }
193}
194
/// Create a leaflist-based branch for a column of fundamental type and store it in `bd.fOutputBranch`.
/// The leaflist has the form "<output branch name>/<type character>".
/// If the type character is ' ', a warning is printed and no branch is created, i.e. `bd.fOutputBranch`
/// is left as it was.
/// \param outputTree Tree in which the branch is created.
/// \param bd Bookkeeping of the branch; provides the output branch name and receives the created branch.
/// \param valueAddress Address handed to TTree::Branch for the new branch.
/// \param bufSize Buffer size handed to TTree::Branch for the new branch.
// NOTE(review): listing line 200, which defines `rootTypeChar`, is missing from this extract -- confirm its
// definition against the full file.
195void CreateFundamentalTypeBranch(TTree &outputTree, RBranchData &bd, void *valueAddress, int bufSize)
196{
197 // Logic taken from
198 // TTree::BranchImpRef(
199 // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel)
201 if (rootTypeChar == ' ') {
202 Warning("Snapshot",
203 "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This "
204 "column will not be written out.",
205 bd.fOutputBranchName.c_str());
206 return;
207 }
208 std::string leafList{bd.fOutputBranchName + '/' + rootTypeChar};
209 bd.fOutputBranch = outputTree.Branch(bd.fOutputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize);
210}
211
212/// Ensure that an object with the input name can be written to the target file. This means checking that the
213/// TFile can be opened in the mode specified in `opts`, deleting any pre-existing objects with the same name in case
214/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
215void EnsureValidSnapshotOutput(const ROOT::RDF::RSnapshotOptions &opts, const std::string &objName,
216 const std::string &fileName)
217{
218 TString fileMode = opts.fMode;
219 fileMode.ToLower();
220 if (fileMode != "update")
221 return;
222
223 // output file opened in "update" mode: must check whether target object name is already present in file
224 std::unique_ptr<TFile> outFile{TFile::Open(fileName.c_str(), "UPDATE_WITHOUT_GLOBALREGISTRATION")};
225 if (!outFile || outFile->IsZombie())
226 throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode");
227
228 // Object is not present in the file, we are good
229 if (!outFile->GetKey(objName.c_str()))
230 return;
231
232 // object called objName is already present in the file
233 if (opts.fOverwriteIfExists) {
234 if (auto existingTree = outFile->Get<TTree>(objName.c_str()); existingTree) {
235 // Special case for TTree. TTree::Delete invalidates the 'this' pointer, so we don't wrap it in a
236 // std::unique_ptr.
237 existingTree->Delete("all");
238 } else {
239 // Ensure deletion of object and all its cycles.
240 outFile->Delete((objName + ";*").c_str());
241 }
242 } else {
243 const std::string msg = "Snapshot: object \"" + objName + "\" already present in file \"" + fileName +
244 "\". If you want to delete the original object and write another, please set the "
245 "'fOverwriteIfExists' option to true in RSnapshotOptions.";
246 throw std::invalid_argument(msg);
247 }
248}
249
/// Create the output branch for `allBranchData[currentIndex]` in `outputTree` or, if that branch already
/// exists and a value address is available, only (re)set its address.
///
/// Branch creation is attempted in this order: fundamental type (leaflist), C-style array (leaflist, only for
/// columns that do not come from a Define), class type. If none applies, a std::logic_error is thrown.
///
/// \param inputTree Input tree used to look up the corresponding input branch; may be null.
/// \param outputTree Tree in which the branch is created.
/// \param allBranchData Bookkeeping of all output branches; may grow inside CreateCStyleArrayBranch.
/// \param currentIndex Index into `allBranchData` of the branch to process.
/// \param basketSize User-provided basket size; takes precedence over the input branch's one if positive.
/// \param valueAddress Address of the column value; null when only the schema is being created.
/// \throws std::logic_error if the column type matches none of the handled cases.
250void SetBranchesHelper(TTree *inputTree, TTree &outputTree,
251 std::vector<ROOT::Internal::RDF::RBranchData> &allBranchData, std::size_t currentIndex,
252 int basketSize, void *valueAddress)
253{
254 auto branchData = allBranchData.begin() + currentIndex;
255 auto *inputBranch = branchData->fIsDefine ? nullptr : SearchForBranch(inputTree, branchData->fInputBranchName);
256
257 if (branchData->fOutputBranch && valueAddress) {
258 // The output branch was already created, we just need to (re)set its address
259 SetBranchAddress(inputBranch, *branchData, valueAddress);
260 return;
261 }
262
263 // Respect the original bufsize and splitlevel arguments
264 // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid
265 // writing garbage when unsplit objects cannot be written as split objects (e.g. in case of a polymorphic
266 // TObject branch, see https://bit.ly/2EjLMId ).
267 // A user-provided basket size value takes precedence.
268 const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000);
269 const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99;
270
// NOTE(review): entries emplaced by CreateCStyleArrayBranch are constructed with a null fInputTypeID;
// confirm that this dereference cannot be reached for them (e.g. when valueAddress is null).
271 auto *dictionary = TDictionary::GetDictionary(*branchData->fInputTypeID);
272 if (dynamic_cast<TDataType *>(dictionary)) {
273 // Branch of fundamental type
274 CreateFundamentalTypeBranch(outputTree, *branchData, valueAddress, bufSize);
275 return;
276 }
277
278 if (!branchData->fIsDefine) {
279 // Cases where we need a leaflist (e.g. C-style arrays)
280 // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is
281 // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is
282 // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree
283 // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec).
// The returned iterator replaces `branchData` because the call may append to `allBranchData`.
284 branchData = CreateCStyleArrayBranch(outputTree, allBranchData, branchData, inputBranch, bufSize, valueAddress);
285 }
286 if (branchData->fOutputBranch) {
287 // A branch was created in the previous function call
288 if (valueAddress) {
289 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
290 // need its buffer, so we cast it and extract the address of the buffer
291 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
292 branchData->fBranchAddressForCArrays = rawRVec->data();
293 }
294 return;
295 }
296
297 if (auto *classPtr = dynamic_cast<TClass *>(dictionary)) {
298 // Case of unsplit object with polymorphic type
299 if (inputBranch && dynamic_cast<TBranchObject *>(inputBranch) && valueAddress)
300 branchData->fOutputBranch =
301 ROOT::Internal::TreeUtils::CallBranchImp(outputTree, branchData->fOutputBranchName.c_str(), classPtr,
302 inputBranch->GetAddress(), bufSize, splitLevel);
303 // General case, with valid address
304 else if (valueAddress)
// NOTE(review): listing line 305 is missing from this extract; it presumably assigns fOutputBranch from a
// branch-creating call whose arguments continue below -- confirm against the full file.
306 outputTree, branchData->fOutputBranchName.c_str(), classPtr, TDataType::GetType(*branchData->fInputTypeID),
307 valueAddress, bufSize, splitLevel);
308 // No value was passed, we're just creating a hollow branch to populate the dataset schema
309 else
310 branchData->fOutputBranch =
311 outputTree.Branch(branchData->fOutputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize);
312 return;
313 }
314
315 // We are not aware of other cases
316 throw std::logic_error(
317 "RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug.");
318}
319
/// Compute the compression settings to use for the snapshot output.
/// When no compression algorithm was requested (`CompAlgo::kUndefined`), a format-dependent default is
/// returned: ZLIB level 1 for TTree (and for the default output format), ZSTD level 5 for RNTuple.
/// \param options Snapshot options; `fOutputFormat` and `fCompressionAlgorithm` are read in the visible code.
/// \throws std::invalid_argument if the output format is none of kTTree, kDefault or kRNTuple.
// NOTE(review): listing lines 322, 330 and 336 are missing from this extract; they presumably declare the
// `CompAlgo` alias and return the settings for an explicitly requested algorithm -- confirm against the full file.
320auto GetSnapshotCompressionSettings(const ROOT::RDF::RSnapshotOptions &options)
321{
323 using OutputFormat = ROOT::RDF::ESnapshotOutputFormat;
324
325 if (options.fOutputFormat == OutputFormat::kTTree || options.fOutputFormat == OutputFormat::kDefault) {
326 // The default compression settings for TTree is 101
327 if (options.fCompressionAlgorithm == CompAlgo::kUndefined) {
328 return ROOT::CompressionSettings(CompAlgo::kZLIB, 1);
329 }
331 } else if (options.fOutputFormat == OutputFormat::kRNTuple) {
332 // The default compression settings for RNTuple is 505
333 if (options.fCompressionAlgorithm == CompAlgo::kUndefined) {
334 return ROOT::CompressionSettings(CompAlgo::kZSTD, 5);
335 }
337 } else {
338 throw std::invalid_argument("RDataFrame::Snapshot: unrecognized output format");
339 }
340}
341
/// Return `mode` with the "_WITHOUT_GLOBALREGISTRATION" marker appended, unless `mode` already contains it.
/// The search for the marker is case-sensitive.
/// \param mode File opening mode, e.g. "RECREATE" or "UPDATE".
/// \return The mode string, guaranteed to contain the marker.
std::string ModeWithoutGlobalRegistration(const std::string &mode)
{
   const char *const marker = "_WITHOUT_GLOBALREGISTRATION";
   const bool alreadyMarked = mode.find(marker) != std::string::npos;
   return alreadyMarked ? mode : mode + marker;
}
349
350} // namespace
351
352ROOT::Internal::RDF::RBranchData::RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine,
353 const std::type_info *typeID)
354 : fInputBranchName{std::move(inputBranchName)},
355 fOutputBranchName{std::move(outputBranchName)},
356 fInputTypeID{typeID},
357 fIsDefine{isDefine}
358{
359 auto *dictionary = TDictionary::GetDictionary(*fInputTypeID);
360 if (auto datatype = dynamic_cast<TDataType *>(dictionary); datatype) {
361 fTypeData = FundamentalType(datatype->Size());
362 } else if (auto tclass = dynamic_cast<TClass *>(dictionary); tclass) {
363 fTypeData = EmptyDynamicType{tclass};
364 }
365}
366
367/// @brief Return a pointer to an empty instance of the type represented by this branch.
368/// For fundamental types, this is simply an 8-byte region of zeroes. For classes, it is an instance created with
369/// TClass::New.
370/// @param pointerToPointer Return a pointer to a pointer, so it can be used directly in TTree::SetBranchAddress().
372{
373 if (auto fundamental = std::get_if<FundamentalType>(&fTypeData); fundamental) {
374 assert(!pointerToPointer); // Not used for fundamental types
375 return fundamental->fBytes.data();
376 }
377
378 auto &dynamic = std::get<EmptyDynamicType>(fTypeData);
379 if (!dynamic.fEmptyInstance) {
380 auto *dictionary = TDictionary::GetDictionary(*fInputTypeID);
381 assert(dynamic_cast<TDataType *>(dictionary) ==
382 nullptr); // TDataType should be handled by writing into the local buffer
383
384 auto tclass = dynamic_cast<TClass *>(dictionary);
385 assert(tclass);
386 dynamic.fEmptyInstance = std::shared_ptr<void>{tclass->New(), tclass->GetDestructor()};
387 }
388
389 if (pointerToPointer) {
390 // Make TTree happy (needs a pointer to pointer):
391 dynamic.fRawPtrToEmptyInstance = dynamic.fEmptyInstance.get();
392 return &dynamic.fRawPtrToEmptyInstance;
393 } else {
394 return dynamic.fEmptyInstance.get();
395 }
396}
397
398/// Point the branch address to an empty instance of the type represented by this branch
399/// or write null bytes into the space used by the fundamental type.
400/// This is used in case of variations, when certain defines/actions don't execute. We
401/// nevertheless need to write something, so we point the branch to an empty instance.
403{
404 if (!fOutputBranch)
405 return;
406
407 if (auto fundamental = std::get_if<FundamentalType>(&fTypeData); fundamental) {
408 fundamental->fBytes.fill(std::byte{0});
409 } else {
410 // TTree expects pointer to pointer, to figure out who allocates the object:
411 fOutputBranch->SetAddress(EmptyInstance(/*pointerToPointer=*/true));
412 }
413}
414
416 std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames,
417 const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector<bool> &&isDefine,
419 const std::vector<const std::type_info *> &colTypeIDs)
421 fDirName(dirname),
422 fTreeName(treename),
423 fOptions(options),
424 fOutputLoopManager(loopManager),
425 fInputLoopManager(inputLM)
426{
427 EnsureValidSnapshotOutput(fOptions, fTreeName, fFileName);
428
429 auto outputBranchNames = ReplaceDotWithUnderscore(bnames);
430 fBranchData.reserve(vbnames.size());
431 for (unsigned int i = 0; i < vbnames.size(); ++i) {
432 fBranchData.emplace_back(vbnames[i], std::move(outputBranchNames[i]), isDefine[i], colTypeIDs[i]);
433 }
434}
435
436// Define special member methods here where the definition of all the data member types is available
440 ROOT::Internal::RDF::UntypedSnapshotTTreeHelper &&) noexcept = default;
441
443{
444 if (!fTreeName.empty() /*not moved from*/ && !fOutputFile /* did not run */ && fOptions.fLazy) {
445 const auto fileOpenMode = [&]() {
446 TString checkupdate = fOptions.fMode;
447 checkupdate.ToLower();
448 return checkupdate == "update" ? "updated" : "created";
449 }();
450 Warning("Snapshot",
451 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
452 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
453 "its result in a variable and for example calling the GetValue() method on it.",
454 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
455 }
456}
457
459{
460 // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing
461 // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks.
462 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
463 fInputTree = treeDS->GetTree();
465}
466
/// Process one entry: bring the output branches up to date with `values`, then fill the output tree.
/// Depending on a condition, either only the pointers of C-style array branches are refreshed
/// (UpdateCArraysPtrs) or all branches are (re)set (SetBranches).
/// \param values Addresses of the column values for the current entry.
// NOTE(review): listing lines 469 and 473 are missing from this extract; 469 presumably holds the `if` that
// opens the first branch below -- confirm against the full file.
467void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Exec(unsigned int, const std::vector<void *> &values)
468{
470 UpdateCArraysPtrs(values);
471 } else {
472 SetBranches(values);
474 }
475
476 fOutputTree->Fill();
477}
478
480{
481 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
482 // associated to those is re-allocated. As a result the value of the pointer can change therewith
483 // leaving associated to the branch of the output tree an invalid pointer.
484 // With this code, we set the value of the pointer in the output branch anew when needed.
485 assert(values.size() == fBranchData.size());
486 auto nValues = values.size();
487 for (decltype(nValues) i{}; i < nValues; i++) {
488 if (fBranchData[i].fIsCArray) {
489 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
490 // need its buffer, so we cast it and extract the address of the buffer
491 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
492 if (auto *data = rawRVec->data(); fBranchData[i].fBranchAddressForCArrays != data) {
493 fBranchData[i].fOutputBranch->SetAddress(data);
494 fBranchData[i].fBranchAddressForCArrays = data;
495 }
496 }
497 }
498}
499
501{
502 // create branches in output tree
503 assert(fBranchData.size() == values.size());
504 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
505 SetBranchesHelper(fInputTree, *fOutputTree, fBranchData, i, fOptions.fBasketSize, values[i]);
506 }
507 AssertNoNullBranchAddresses(fBranchData);
508}
509
511{
512 void *dummyValueAddress{};
513 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
514 SetBranchesHelper(inputTree, outputTree, fBranchData, i, fOptions.fBasketSize, dummyValueAddress);
515 }
516}
517
519{
520 fOutputFile.reset(TFile::Open(fFileName.c_str(), ModeWithoutGlobalRegistration(fOptions.fMode).c_str(),
521 /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
522 if (!fOutputFile)
523 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
524
525 TDirectory *outputDir = fOutputFile.get();
526 if (!fDirName.empty()) {
527 TString checkupdate = fOptions.fMode;
528 checkupdate.ToLower();
529 if (checkupdate == "update")
530 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
531 else
532 outputDir = fOutputFile->mkdir(fDirName.c_str());
533 }
534
535 fOutputTree = std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/outputDir);
536
537 if (fOptions.fAutoFlush)
538 fOutputTree->SetAutoFlush(fOptions.fAutoFlush);
539}
540
542{
543 assert(fOutputTree != nullptr);
544 assert(fOutputFile != nullptr);
545
546 // There were no entries to fill the TTree with (either the input TTree was empty or no event passed after
547 // filtering). We have already created an empty TTree, now also create the branches to preserve the schema
548 if (fOutputTree->GetEntries() == 0) {
550 }
551 // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
552 fOutputTree->AutoSave("flushbaskets");
553 // must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
554 fOutputTree.reset();
555 fOutputFile->Close();
556
557 // Now connect the data source to the loop manager so it can be used for further processing
558 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
559 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
560}
561
562/**
563 * \brief Create a new UntypedSnapshotTTreeHelper with a different output file name
564 *
565 * \param newName A type-erased string with the output file name
566 * \return UntypedSnapshotTTreeHelper
567 *
568 * This MakeNew implementation is tied to the cloning feature of actions
569 * of the computation graph. In particular, cloning a Snapshot node usually
570 * also involves changing the name of the output file, otherwise the cloned
571 * Snapshot would overwrite the same file.
572 */
575{
576 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
577 std::vector<std::string> inputBranchNames;
578 std::vector<std::string> outputBranchNames;
579 std::vector<bool> isDefine;
580 std::vector<const std::type_info *> inputColumnTypeIDs;
581 for (const auto &bd : fBranchData) {
582 if (bd.fInputBranchName.empty())
583 break;
584 inputBranchNames.push_back(bd.fInputBranchName);
585 outputBranchNames.push_back(bd.fOutputBranchName);
586 isDefine.push_back(bd.fIsDefine);
587 inputColumnTypeIDs.push_back(bd.fInputTypeID);
588 }
589
591 fDirName,
592 fTreeName,
593 std::move(inputBranchNames),
594 std::move(outputBranchNames),
595 fOptions,
596 std::move(isDefine),
599 inputColumnTypeIDs};
600}
601
603 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename,
604 const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options,
605 std::vector<bool> &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM,
606 const std::vector<const std::type_info *> &colTypeIDs)
607 : fNSlots(nSlots),
613 fDirName(dirname),
614 fTreeName(treename),
615 fOptions(options),
616 fOutputLoopManager(loopManager),
617 fInputLoopManager(inputLM)
618{
619 EnsureValidSnapshotOutput(fOptions, fTreeName, fFileName);
620
621 auto outputBranchNames = ReplaceDotWithUnderscore(bnames);
622 fBranchData.reserve(fNSlots);
623 for (unsigned int slot = 0; slot < fNSlots; ++slot) {
624 fBranchData.emplace_back();
625 auto &thisSlot = fBranchData.back();
626 thisSlot.reserve(vbnames.size());
627 for (unsigned int i = 0; i < vbnames.size(); ++i) {
628 thisSlot.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]);
629 }
630 }
631}
632
633// Define special member methods here where the definition of all the data member types is available
637 ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT &&) noexcept = default;
638
640{
641 if (!fTreeName.empty() /*not moved from*/ && fOptions.fLazy && !fOutputFiles.empty() &&
642 std::all_of(fOutputFiles.begin(), fOutputFiles.end(), [](const auto &f) { return !f; }) /* never run */) {
643 const auto fileOpenMode = [&]() {
644 TString checkupdate = fOptions.fMode;
645 checkupdate.ToLower();
646 return checkupdate == "update" ? "updated" : "created";
647 }();
648 Warning("Snapshot",
649 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
650 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
651 "its result in a variable and for example calling the GetValue() method on it.",
652 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
653 }
654}
655
657{
658 ::TDirectory::TContext c; // do not let tasks change the thread-local gDirectory
659 if (!fOutputFiles[slot]) {
660 // first time this thread executes something, let's create a TBufferMerger output directory
661 fOutputFiles[slot] = fMerger->GetFile();
662 }
663 TDirectory *treeDirectory = fOutputFiles[slot].get();
664 if (!fDirName.empty()) {
665 // call returnExistingDirectory=true since MT can end up making this call multiple times
666 treeDirectory = fOutputFiles[slot]->mkdir(fDirName.c_str(), "", true);
667 }
668 // re-create output tree as we need to create its branches again, with new input variables
669 // TODO we could instead create the output tree and its branches, change addresses of input variables in each task
670 fOutputTrees[slot] =
671 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
673 // TODO can be removed when RDF supports interleaved TBB task execution properly, see ROOT-10269
674 fOutputTrees[slot]->SetImplicitMT(false);
675 if (fOptions.fAutoFlush)
676 fOutputTrees[slot]->SetAutoFlush(fOptions.fAutoFlush);
677 if (r) {
678 // We could be getting a task-local TTreeReader from the TTreeProcessorMT.
679 fInputTrees[slot] = r->GetTree();
680 } else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource())) {
681 fInputTrees[slot] = treeDS->GetTree();
682 }
683 fBranchAddressesNeedReset[slot] = 1; // reset first event flag for this slot
684}
685
687{
688 if (fOutputTrees[slot]->GetEntries() > 0)
689 fOutputFiles[slot]->Write();
690 for (auto &branchData : fBranchData[slot])
691 branchData.ClearBranchPointers(); // The branch pointers will go stale below
692 // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones)
693 fOutputTrees[slot].reset(nullptr);
694}
695
/// Process one entry for the given slot: bring the slot's output branches up to date with `values`, fill the
/// slot's output tree and, whenever the number of entries is a multiple of a positive auto-flush setting,
/// write the slot's output file.
/// \param slot Index of the processing slot.
/// \param values Addresses of the column values for the current entry.
// NOTE(review): listing line 702 is missing from this extract; it presumably clears the
// fBranchAddressesNeedReset flag for this slot after SetBranches -- confirm against the full file.
696void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector<void *> &values)
697{
// A zero flag means the branches of this slot are already set up: only C-style array pointers may have moved.
698 if (fBranchAddressesNeedReset[slot] == 0) {
699 UpdateCArraysPtrs(slot, values);
700 } else {
701 SetBranches(slot, values);
703 }
704 fOutputTrees[slot]->Fill();
705 auto entries = fOutputTrees[slot]->GetEntries();
706 auto autoFlush = fOutputTrees[slot]->GetAutoFlush();
707 if ((autoFlush > 0) && (entries % autoFlush == 0))
708 fOutputFiles[slot]->Write();
709}
710
712 const std::vector<void *> &values)
713{
714 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
715 // associated to those is re-allocated. As a result the value of the pointer can change therewith
716 // leaving associated to the branch of the output tree an invalid pointer.
717 // With this code, we set the value of the pointer in the output branch anew when needed.
718 assert(values.size() == fBranchData[slot].size());
719 auto nValues = values.size();
720 for (decltype(nValues) i{}; i < nValues; i++) {
721 auto &branchData = fBranchData[slot][i];
722 if (branchData.fIsCArray) {
723 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
724 // need its buffer, so we cast it and extract the address of the buffer
725 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
726 if (auto *data = rawRVec->data(); branchData.fBranchAddressForCArrays != data) {
727 // reset the branch address
728 branchData.fOutputBranch->SetAddress(data);
729 branchData.fBranchAddressForCArrays = data;
730 }
731 }
732 }
733}
734
736 const std::vector<void *> &values)
737{
738 // create branches in output tree
739 auto &branchData = fBranchData[slot];
740 assert(branchData.size() == values.size());
741 for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions
742 SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], branchData, i, fOptions.fBasketSize, values[i]);
743 }
744
745 AssertNoNullBranchAddresses(branchData);
746}
747
749{
750 void *dummyValueAddress{};
751 auto &branchData = fBranchData.front();
752 for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions
753 SetBranchesHelper(inputTree, outputTree, branchData, i, fOptions.fBasketSize, dummyValueAddress);
754 }
755}
756
758{
759 auto outFile =
760 std::unique_ptr<TFile>{TFile::Open(fFileName.c_str(), ModeWithoutGlobalRegistration(fOptions.fMode).c_str(),
761 /*ftitle=*/fFileName.c_str(), GetSnapshotCompressionSettings(fOptions))};
762 if (!outFile)
763 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
764 fOutputFile = outFile.get();
765 fMerger = std::make_unique<ROOT::TBufferMerger>(std::move(outFile));
766}
767
769{
770
771 for (auto &file : fOutputFiles) {
772 if (file) {
773 file->Write();
774 file->Close();
775 }
776 }
777
778 // If there were no entries to fill the TTree with (either the input TTree was empty or no event passed after
779 // filtering), create an empty TTree in the output file and create the branches to preserve the schema
780 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
781 assert(fOutputFile && "Missing output file in Snapshot finalization.");
782 // Use GetKey to avoid having to deal with memory management of the object in the file
783 if (!fOutputFile->GetKey(fullTreeName.c_str())) {
784
785 // First find in which directory we need to write the output TTree
786 TDirectory *treeDirectory = fOutputFile;
787 if (!fDirName.empty()) {
788 treeDirectory = fOutputFile->mkdir(fDirName.c_str(), "", true);
789 }
790 ::TDirectory::TContext c{treeDirectory};
791
792 // Create the output TTree and create the user-requested branches
793 auto outTree =
794 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
795 TTree *inputTree{};
796 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
797 inputTree = treeDS->GetTree();
798 SetEmptyBranches(inputTree, *outTree);
799
800 fOutputFile->Write();
801 }
802
803 // flush all buffers to disk by destroying the TBufferMerger
804 fOutputFiles.clear();
805 fMerger.reset();
806
807 // Now connect the data source to the loop manager so it can be used for further processing
808 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
809}
810
811/**
812 * \brief Create a new UntypedSnapshotTTreeHelperMT with a different output file name
813 *
814 * \param newName A type-erased string with the output file name
815 * \return UntypedSnapshotTTreeHelperMT
816 *
817 * This MakeNew implementation is tied to the cloning feature of actions
818 * of the computation graph. In particular, cloning a Snapshot node usually
819 * also involves changing the name of the output file, otherwise the cloned
820 * Snapshot would overwrite the same file.
821 */
824{
825 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
826 std::vector<std::string> inputBranchNames;
827 std::vector<std::string> outputBranchNames;
828 std::vector<bool> isDefine;
829 std::vector<const std::type_info *> inputColumnTypeIDs;
830 for (const auto &bd : fBranchData.front()) {
831 if (bd.fInputBranchName.empty())
832 break;
833 inputBranchNames.push_back(bd.fInputBranchName);
834 outputBranchNames.push_back(bd.fOutputBranchName);
835 isDefine.push_back(bd.fIsDefine);
836 inputColumnTypeIDs.push_back(bd.fInputTypeID);
837 }
838
840 finalName,
841 fDirName,
842 fTreeName,
843 std::move(inputBranchNames),
844 std::move(outputBranchNames),
845 fOptions,
846 std::move(isDefine),
849 std::move(inputColumnTypeIDs)};
850}
851
853 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename,
854 const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options,
856 const std::vector<const std::type_info *> &colTypeIDs)
858 fDirName(dirname),
859 fNTupleName(ntuplename),
860 fOptions(options),
861 fInputLoopManager(inputLM),
862 fOutputLoopManager(outputLM),
863 fInputFieldNames(vfnames),
865 fNSlots(nSlots),
866 fFillContexts(nSlots),
867 fEntries(nSlots),
868 fInputColumnTypeIDs(colTypeIDs)
869{
870 EnsureValidSnapshotOutput(fOptions, fNTupleName, fFileName);
871}
872
873// Define special member methods here where the definition of all the data member types is available
877 ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper &&) noexcept = default;
878
880{
881 if (!fNTupleName.empty() /* not moved from */ && !fOutputFile /* did not run */ && fOptions.fLazy)
882 Warning("Snapshot", "A lazy Snapshot action was booked but never triggered.");
883}
884
886{
887 auto model = ROOT::RNTupleModel::CreateBare();
888 auto nFields = fOutputFieldNames.size();
889 fFieldTokens.resize(nFields);
890 for (decltype(nFields) i = 0; i < nFields; i++) {
891 // Need to retrieve the type of every field to create as a string
892 // If the input type for a field does not have RTTI, internally we store it as the tag UseNativeDataType. When
893 // that is detected, we need to ask the data source which is the type name based on the on-disk information.
894 const auto typeName = *fInputColumnTypeIDs[i] == typeid(ROOT::Internal::RDF::UseNativeDataType)
896 fInputFieldNames[i], fOptions.fVector2RVec)
898
899 // Cardinality fields are read-only, so instead we snapshot them as their inner type.
900 if (typeName.substr(0, 25) == "ROOT::RNTupleCardinality<") {
901 // Get "T" from "ROOT::RNTupleCardinality<T>".
902 std::string cardinalityType = typeName.substr(25, typeName.size() - 26);
903 Warning("Snapshot",
904 "Column \"%s\" is a read-only \"%s\" column. It will be snapshot as its inner type \"%s\" instead.",
905 fInputFieldNames[i].c_str(), typeName.c_str(), cardinalityType.c_str());
906 model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], cardinalityType).Unwrap());
907 } else {
908 model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], typeName).Unwrap());
909 }
910 fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]);
911 }
912 model->Freeze();
913
914 ROOT::RNTupleWriteOptions writeOptions;
915 writeOptions.SetCompression(GetSnapshotCompressionSettings(fOptions));
916 writeOptions.SetInitialUnzippedPageSize(fOptions.fInitialUnzippedPageSize);
917 writeOptions.SetMaxUnzippedPageSize(fOptions.fMaxUnzippedPageSize);
918 writeOptions.SetApproxZippedClusterSize(fOptions.fApproxZippedClusterSize);
919 writeOptions.SetMaxUnzippedClusterSize(fOptions.fMaxUnzippedClusterSize);
920 writeOptions.SetEnablePageChecksums(fOptions.fEnablePageChecksums);
921 writeOptions.SetEnableSamePageMerging(fOptions.fEnableSamePageMerging);
922
923 fOutputFile.reset(TFile::Open(fFileName.c_str(), ModeWithoutGlobalRegistration(fOptions.fMode).c_str()));
924 if (!fOutputFile)
925 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
926
927 TDirectory *outputDir = fOutputFile.get();
928 if (!fDirName.empty()) {
929 TString checkupdate = fOptions.fMode;
930 checkupdate.ToLower();
931 if (checkupdate == "update")
932 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
933 else
934 outputDir = fOutputFile->mkdir(fDirName.c_str());
935 }
936
937 // The RNTupleParallelWriter has exclusive access to the underlying TFile, no further synchronization is needed for
938 // calls to Fill() (in Exec) and FlushCluster() (in FinalizeTask).
939 fWriter = ROOT::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
940}
941
943{
944 if (!fFillContexts[slot]) {
945 fFillContexts[slot] = fWriter->CreateFillContext();
946 fEntries[slot] = fFillContexts[slot]->GetModel().CreateBareEntry();
947 }
948}
949
950void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int slot, const std::vector<void *> &values)
951{
952 auto &fillContext = fFillContexts[slot];
953 auto &outputEntry = fEntries[slot];
954 assert(values.size() == fFieldTokens.size());
955 for (decltype(values.size()) i = 0; i < values.size(); i++) {
956 outputEntry->BindRawPtr(fFieldTokens[i], values[i]);
957 }
958 fillContext->Fill(*outputEntry);
959}
960
962{
963 // In principle we would not need to flush a cluster here, but we want to benefit from parallelism for compression.
964 // NB: RNTupleFillContext::FlushCluster() is a nop if there is no new entry since the last flush.
965 fFillContexts[slot]->FlushCluster();
966}
967
969{
970 // First clear and destroy all entries, which were created from the RNTupleFillContexts.
971 fEntries.clear();
972 fFillContexts.clear();
973 // Then destroy the RNTupleParallelWriter and write the metadata.
974 fWriter.reset();
975 // We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
976 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::RDF::RNTupleDS>(fDirName + "/" + fNTupleName, fFileName));
977}
978
979/**
980 * Create a new UntypedSnapshotRNTupleHelper with a different output file name.
981 *
982 * \param[in] newName A type-erased string with the output file name
983 * \return UntypedSnapshotRNTupleHelper
984 *
985 * This MakeNew implementation is tied to the cloning feature of actions
986 * of the computation graph. In particular, cloning a Snapshot node usually
987 * also involves changing the name of the output file, otherwise the cloned
988 * Snapshot would overwrite the same file.
989 */
992{
993 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
997}
998
999/*
1000 * ------------------------------------
1001 * Snapshot with systematic variations
1002 * ------------------------------------
1003 */
1004namespace ROOT::Internal::RDF {
1005/// An object to store an output file and a tree in one common place to share them between instances
1006/// of Snapshot with systematic uncertainties.
1008 std::unique_ptr<TFile> fFile;
1009 std::unique_ptr<TTree> fTree;
1010 std::string fDirectoryName;
1012
// Bitmasks that record which systematic variations have been computed. The address of each
// branchBuffer is handed to a TBranch in RegisterBranch(), so the buffer lives on the heap to
// keep that address stable when the vector of bitmasks grows.
struct Bitmask {
   std::string branchName;   // Name of the branch the mask is written to
   std::bitset<64> bitset{}; // Working copy of the bits for the current event
   std::unique_ptr<uint64_t> branchBuffer = std::make_unique<uint64_t>(); // Copy of the bits that the branch reads
};
1020 std::vector<Bitmask> fBitMasks;
1021
1022 std::unordered_map<std::string, unsigned int> fBranchToVariationMapping;
1023 // The corresponding ROOT dictionary is declared in core/clingutils/src
1024 std::unordered_map<std::string, std::pair<std::string, unsigned int>> fBranchToBitmaskMapping;
1025 unsigned int fNBits = 0;
1026
1027 SnapshotOutputWriter(TFile *file) : fFile{file} { assert(fFile); }
1029 {
1030 if (!fBranchToBitmaskMapping.empty()) {
1031 fFile->WriteObject(&fBranchToBitmaskMapping,
1032 (std::string{"R_rdf_column_to_bitmask_mapping_"} + fTree->GetName()).c_str());
1033 }
1034 if (fTree) {
1035 // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
1036 fTree->AutoSave("flushbaskets");
1037
1038 // Now connect the data source to the loop manager so it can be used for further processing
1039 std::string tree = fTree->GetName();
1040 if (!fDirectoryName.empty())
1041 tree = fDirectoryName + '/' + tree;
1042 std::string file = fFile->GetName();
1043
1044 fTree.reset();
1045 fFile.reset();
1046
1048 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(tree, file));
1049 }
1050 }
1051 SnapshotOutputWriter(SnapshotOutputWriter const &) = delete; // Anyway deleted because of the unique_ptrs
1054 delete; // Can be done, but need to make move-from object safe to destruct
1055 SnapshotOutputWriter &operator=(SnapshotOutputWriter &&) noexcept = delete;
1056
1057 /// Register a branch and corresponding systematic uncertainty.
1058 /// This will create an entry in the mapping from branch names to bitmasks, so the corresponding
1059 /// column can be masked if it doesn't contain valid entries. This mapping is written next to the
1060 /// tree into the output file.
1061 void RegisterBranch(std::string const &branchName, unsigned int variationIndex)
1062 {
1063 if (auto it = fBranchToVariationMapping.find(branchName); it != fBranchToVariationMapping.end()) {
1064 if (variationIndex != it->second) {
1065 throw std::logic_error("Branch " + branchName +
1066 " is being registered with different variation index than the expected one: " +
1067 std::to_string(variationIndex));
1068 }
1069 return;
1070 }
1071
1072 // Neither branch nor systematic are known, so a new entry needs to be created
1073 fNBits = std::max(fNBits, variationIndex);
1074 const auto vectorIndex = variationIndex / 64u;
1075 const auto bitIndex = variationIndex % 64u;
1076
1077 // Create bitmask branches as long as necessary to capture the bit
1078 while (vectorIndex >= fBitMasks.size()) {
1079 std::string bitmaskBranchName =
1080 std::string{"R_rdf_mask_"} + fTree->GetName() + '_' + std::to_string(fBitMasks.size());
1081 fBitMasks.push_back(Bitmask{bitmaskBranchName});
1082 fTree->Branch(bitmaskBranchName.c_str(), fBitMasks.back().branchBuffer.get());
1083 }
1084
1085 fBranchToVariationMapping[branchName] = variationIndex;
1086 fBranchToBitmaskMapping[branchName] = std::make_pair(fBitMasks[vectorIndex].branchName, bitIndex);
1087 }
1088
1089 /// Clear all bits, as if none of the variations passed its filter.
1091 {
1092 for (auto &mask : fBitMasks)
1093 mask.bitset.reset();
1094 }
1095
1096 /// Set a bit signalling that the variation at `index` passed its filter.
1097 void SetMaskBit(unsigned int index)
1098 {
1099 const auto vectorIndex = index / 64;
1100 const auto bitIndex = index % 64;
1101 fBitMasks[vectorIndex].bitset.set(bitIndex, true);
1102 }
1103
1104 /// Test if any of the mask bits are set.
1105 bool MaskEmpty() const
1106 {
1107 return std::none_of(fBitMasks.begin(), fBitMasks.end(), [](Bitmask const &mask) { return mask.bitset.any(); });
1108 }
1109
1110 /// Write the current event and the bitmask to the output dataset.
1111 void Write() const
1112 {
1113 if (!fTree)
1114 throw std::runtime_error("The TTree associated to the Snapshot action doesn't exist, any more.");
1115
1116 for (auto const &mask : fBitMasks) {
1117 *mask.branchBuffer = mask.bitset.to_ullong();
1118 }
1119
1120 fTree->Fill();
1121 }
1122};
1123
1124} // namespace ROOT::Internal::RDF
1125
1127 std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames,
1128 const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector<bool> &&isDefine,
1130 const std::vector<const std::type_info *> &colTypeIDs)
1131 : fOptions(options), fInputLoopManager{inputLoopMgr}, fOutputLoopManager{outputLoopMgr}
1132{
1133 EnsureValidSnapshotOutput(fOptions, std::string(treename), std::string(filename));
1134
1135 TDirectory::TContext fileCtxt;
1136 fOutputHandle = std::make_shared<SnapshotOutputWriter>(
1137 TFile::Open(filename.data(), ModeWithoutGlobalRegistration(fOptions.fMode).c_str(), /*ftitle=*/"",
1138 GetSnapshotCompressionSettings(fOptions)));
1139 if (!fOutputHandle->fFile)
1140 throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename});
1141
1142 TDirectory *outputDir = fOutputHandle->fFile.get();
1143 if (!dirname.empty()) {
1144 fOutputHandle->fDirectoryName = dirname;
1145 TString checkupdate = fOptions.fMode;
1146 checkupdate.ToLower();
1147 if (checkupdate == "update")
1148 outputDir =
1149 fOutputHandle->fFile->mkdir(std::string{dirname}.c_str(), "", true); // do not overwrite existing directory
1150 else
1151 outputDir = fOutputHandle->fFile->mkdir(std::string{dirname}.c_str());
1152 }
1153
1154 fOutputHandle->fTree = std::make_unique<TTree>(std::string{treename}.c_str(), std::string{treename}.c_str(),
1155 fOptions.fSplitLevel, /*dir=*/outputDir);
1156 fOutputHandle->fOutputLoopManager = fOutputLoopManager;
1157 if (fOptions.fAutoFlush)
1158 fOutputHandle->fTree->SetAutoFlush(fOptions.fAutoFlush);
1159
1160 auto outputBranchNames = ReplaceDotWithUnderscore(bnames);
1161
1162 fBranchData.reserve(vbnames.size());
1163 for (unsigned int i = 0; i < vbnames.size(); ++i) {
1164 fOutputHandle->RegisterBranch(outputBranchNames[i], 0);
1165 fBranchData.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]);
1166 }
1167}
1168
1169/// Register a new column as a variation of the column at `originalColumnIndex`, and clone its properties.
1170/// If a nominal column is registered here, it is written without changes, but it means that it will be masked
1171/// in case its selection cuts don't pass.
1172/// \param slot Task ID for MT runs.
1173/// \param columnIndex Index where the data of this column will be passed into the helper.
1174/// \param originalColumnIndex If the column being registered is a variation of a "nominal" column, this designates the
1175/// original.
1176/// Properties such as name and output type are cloned from the original.
1177/// \param variationName The variation that this column belongs to. If "nominal" is used, this column is considered as
1178/// the original.
1180 unsigned int columnIndex,
1181 unsigned int originalColumnIndex,
1182 unsigned int variationIndex,
1183 std::string const &variationName)
1184{
1185 if (columnIndex == originalColumnIndex) {
1186 // This is a nominal column, but it participates in variations.
1187 // It always needs to be written, but we still need to create a mask bit to mark when nominal is invalid.
1188 assert(variationIndex == 0);
1189 fBranchData[columnIndex].fVariationIndex = 0;
1190 fOutputHandle->RegisterBranch(fBranchData[columnIndex].fOutputBranchName, variationIndex);
1191 } else if (columnIndex >= fBranchData.size()) {
1192 // First task, need to create branches
1193 fBranchData.resize(columnIndex + 1);
1194 auto &bd = fBranchData[columnIndex];
1195 bd = fBranchData[originalColumnIndex];
1196 std::string newOutputName = bd.fOutputBranchName + "__" + variationName;
1197 std::replace(newOutputName.begin(), newOutputName.end(), ':', '_');
1198 bd.fOutputBranchName = std::move(newOutputName);
1199 bd.fVariationIndex = variationIndex;
1200
1201 fOutputHandle->RegisterBranch(bd.fOutputBranchName, variationIndex);
1202 } else {
1203 assert(static_cast<unsigned int>(fBranchData[columnIndex].fVariationIndex) == variationIndex);
1204 }
1205}
1206
1207/// Bind all output branches to RDF columns for the given slots.
1209{
1210 // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing
1211 // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks.
1212 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
1213 fInputTree = treeDS->GetTree();
1214
1215 // Create all output branches; and bind them to empty values
1216 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
1217 SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize,
1218 fBranchData[i].EmptyInstance(/*pointerToPointer=*/false));
1219 }
1220
1221 AssertNoNullBranchAddresses(fBranchData);
1222}
1223
1224/// Connect all output fields to the values pointed to by `values`, fill the output dataset,
1225/// call the Fill of the output tree, and clear the mask bits that show whether a variation was reached.
1226void ROOT::Internal::RDF::SnapshotHelperWithVariations::Exec(unsigned int /*slot*/, const std::vector<void *> &values,
1227 std::vector<bool> const &filterPassed)
1228{
1229 // Rebind branch pointers to RDF values
1230 assert(fBranchData.size() == values.size());
1231 for (std::size_t i = 0; i < values.size(); i++) {
1232 const auto variationIndex = fBranchData[i].fVariationIndex;
1233 if (variationIndex < 0) {
1234 // Branch without variations, it always needs to be written
1235 SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]);
1236 } else {
1237 // Nominal will always be written, systematics only if needed
1238 if (variationIndex == 0 || filterPassed[variationIndex]) {
1239 const bool fundamentalType = fBranchData[i].WriteValueIfFundamental(values[i]);
1240 if (!fundamentalType) {
1241 SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]);
1242 }
1243 }
1244
1245 if (filterPassed[variationIndex]) {
1246 fOutputHandle->SetMaskBit(variationIndex);
1247 }
1248 }
1249 }
1250
1251 assert(!fOutputHandle->MaskEmpty()); // Exec should not have been called if nothing passes
1252
1253 fOutputHandle->Write();
1254 fOutputHandle->ClearMaskBits();
1255 for (auto &branchData : fBranchData) {
1256 branchData.ClearBranchContents();
1257 }
1258}
1259
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
if(isa< VarDecl >(D)||isa< FieldDecl >(D)||isa< EnumConstantDecl >(D))
Definition TCling.cxx:7039
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t mask
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char filename
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char mode
static TBranch * SearchForBranch(TTree *tree, const char *name)
Definition TTreePyz.cxx:61
The head node of a RDF computation graph.
std::shared_ptr< SnapshotOutputWriter > fOutputHandle
SnapshotHelperWithVariations(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&, ROOT::Detail::RDF::RLoopManager *outputLoopMgr, ROOT::Detail::RDF::RLoopManager *inputLoopMgr, const std::vector< const std::type_info * > &colTypeIDs)
void InitTask(TTreeReader *, unsigned int slot)
Bind all output branches to RDF columns for the given slots.
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
void Exec(unsigned int, const std::vector< void * > &values, std::vector< bool > const &filterPassed)
Connect all output fields to the values pointed to by values, fill the output dataset,...
void RegisterVariedColumn(unsigned int slot, unsigned int columnIndex, unsigned int originalColumnIndex, unsigned int varationIndex, std::string const &variationName)
Register a new column as a variation of the column at originalColumnIndex, and clone its properties.
std::vector< std::shared_ptr< ROOT::RNTupleFillContext > > fFillContexts
std::unique_ptr< ROOT::RNTupleParallelWriter > fWriter
UntypedSnapshotRNTupleHelper(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector< const std::type_info * > &colTypeIDs)
std::vector< std::unique_ptr< ROOT::REntry > > fEntries
std::vector< const std::type_info * > fInputColumnTypeIDs
void Exec(unsigned int slot, const std::vector< void * > &values)
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
UntypedSnapshotRNTupleHelper MakeNew(void *newName)
Create a new UntypedSnapshotRNTupleHelper with a different output file name.
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
void InitTask(TTreeReader *, unsigned int slot)
UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(unsigned int slot, const std::vector< void * > &values)
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
std::vector< std::shared_ptr< ROOT::TBufferMergerFile > > fOutputFiles
std::vector< std::vector< RBranchData > > fBranchData
UntypedSnapshotTTreeHelperMT MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelperMT with a different output file name.
void InitTask(TTreeReader *r, unsigned int slot)
void Exec(unsigned int slot, const std::vector< void * > &values)
std::vector< std::unique_ptr< TTree > > fOutputTrees
std::unique_ptr< ROOT::TBufferMerger > fMerger
void SetBranches(unsigned int slot, const std::vector< void * > &values)
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
UntypedSnapshotTTreeHelper MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelper with a different output file name.
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
void SetBranches(const std::vector< void * > &values)
void Exec(unsigned int, const std::vector< void * > &values)
UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(const std::vector< void * > &values)
pointer data() noexcept
Return a pointer to the vector's buffer, even if empty().
Definition RVec.hxx:282
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
static std::unique_ptr< RNTupleParallelWriter > Append(std::unique_ptr< ROOT::RNTupleModel > model, std::string_view ntupleName, TDirectory &fileOrDirectory, const ROOT::RNTupleWriteOptions &options=ROOT::RNTupleWriteOptions())
Append an RNTuple to the existing file.
Common user-tunable settings for storing RNTuples.
void SetEnablePageChecksums(bool val)
Note that turning off page checksums will also turn off the same page merging optimization (see tunin...
void SetMaxUnzippedClusterSize(std::size_t val)
void SetMaxUnzippedPageSize(std::size_t val)
void SetInitialUnzippedPageSize(std::size_t val)
void SetApproxZippedClusterSize(std::size_t val)
void SetCompression(std::uint32_t val)
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1524
A Branch for the case of an object.
A TTree is a list of TBranches.
Definition TBranch.h:93
virtual const char * GetClassName() const
Return the name of the user class whose content is stored in this branch, if any.
Definition TBranch.cxx:1323
virtual char * GetAddress() const
Definition TBranch.h:221
static TClass * Class()
Int_t GetSplitLevel() const
Definition TBranch.h:259
TClass * IsA() const override
Definition TBranch.h:304
virtual void SetAddress(void *add)
Set address of this branch.
Definition TBranch.cxx:2694
TObjArray * GetListOfLeaves()
Definition TBranch.h:256
TClassRef is used to implement a permanent reference to a TClass object.
Definition TClassRef.h:29
TClass instances represent classes, structs and namespaces in the ROOT type system.
Definition TClass.h:84
Basic data type descriptor (datatype information is obtained from CINT).
Definition TDataType.h:44
Int_t GetType() const
Definition TDataType.h:71
static TDictionary * GetDictionary(const char *name)
Retrieve the type (class, fundamental type, typedef etc) named "name".
TDirectory::TContext keeps track and restore the current directory.
Definition TDirectory.h:89
Describe directory structure in memory.
Definition TDirectory.h:45
A file, usually with extension .root, that stores data and code in the form of serialized objects in ...
Definition TFile.h:130
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3787
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
const char * GetTitle() const override
Returns title of object.
Definition TNamed.h:50
Basic string class.
Definition TString.h:138
void ToLower()
Change string to lower-case.
Definition TString.cxx:1189
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition TTreeReader.h:46
A TTree represents a columnar dataset.
Definition TTree.h:89
virtual TBranch * FindBranch(const char *name)
Return the branch that correspond to the path 'branchname', which can include the name of the tree or...
Definition TTree.cxx:4890
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition TTree.cxx:5430
TBranch * Branch(const char *name, T *obj, Int_t bufsize=32000, Int_t splitlevel=99)
Add a new branch, and infer the data type from the type of obj being passed.
Definition TTree.h:397
virtual TTree * GetTree() const
Definition TTree.h:604
@ kEntriesReshuffled
If set, signals that this TTree is the output of the processing of another TTree, and the entries are...
Definition TTree.h:305
std::vector< std::string > ReplaceDotWithUnderscore(const std::vector< std::string > &columnNames)
Replace occurrences of '.
Definition RDFUtils.cxx:424
char TypeName2ROOTTypeName(const std::string &b)
Convert type name (e.g.
Definition RDFUtils.cxx:369
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:200
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec)
Definition RDFUtils.cxx:654
char TypeID2ROOTTypeName(const std::type_info &tid)
Definition RDFUtils.cxx:228
TBranch * CallBranchImp(TTree &tree, const char *branchname, TClass *ptrClass, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10133
TBranch * CallBranchImpRef(TTree &tree, const char *branchname, TClass *ptrClass, EDataType datatype, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10127
std::vector< std::string > ColumnNames_t
@ kROOTRVec
Definition ESTLType.h:46
@ kSTLvector
Definition ESTLType.h:30
int CompressionSettings(RCompressionSetting::EAlgorithm::EValues algorithm, int compressionLevel)
ROOT::ESTLType STLKind(std::string_view type)
Converts STL container name to number.
ROOT::ESTLType IsSTLCont(std::string_view type)
type : type name: vector<list<classA,allocator>,allocator> result: 0 : not stl container code of cont...
Stores empty instances of classes, so a dummy object can be written when a systematic variation doesn...
Stores variations of a fundamental type.
Stores properties of each output branch in a Snapshot.
void * EmptyInstance(bool pointerToPointer)
Return a pointer to an empty instance of the type represented by this branch.
void ClearBranchContents()
Point the branch address to an empty instance of the type represented by this branch or write null by...
std::variant< FundamentalType, EmptyDynamicType > fTypeData
const std::type_info * fInputTypeID
void Write() const
Write the current event and the bitmask to the output dataset.
void ClearMaskBits()
Clear all bits, as if none of the variations passed its filter.
SnapshotOutputWriter(SnapshotOutputWriter const &)=delete
std::unordered_map< std::string, std::pair< std::string, unsigned int > > fBranchToBitmaskMapping
void RegisterBranch(std::string const &branchName, unsigned int variationIndex)
Register a branch and corresponding systematic uncertainty.
void SetMaskBit(unsigned int index)
Set a bit signalling that the variation at index passed its filter.
bool MaskEmpty() const
Test if any of the mask bits are set.
SnapshotOutputWriter & operator=(SnapshotOutputWriter const &)=delete
std::unordered_map< std::string, unsigned int > fBranchToVariationMapping
SnapshotOutputWriter(SnapshotOutputWriter &&) noexcept=delete
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:347
EValues
Note: this is only temporarily a struct and will become a enum class hence the name convention used.
Definition Compression.h:88
A collection of options to steer the creation of the dataset on disk through Snapshot().
ESnapshotOutputFormat fOutputFormat
Which data format to write to.
std::string fMode
Mode of creation of output file.
ECAlgo fCompressionAlgorithm
Compression algorithm of output file.
int fCompressionLevel
Compression level of output file.
bool fOverwriteIfExists
If fMode is "UPDATE", overwrite object in output file if it already exists.