Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDFSnapshotHelpers.cxx
Go to the documentation of this file.
1/**
2 \file RDFSnapshotHelpers.cxx
3 \ingroup dataframe
4 \author Enrico Guiraud, CERN
5 \author Danilo Piparo, CERN
6 \date 2016-12
7 \author Vincenzo Eduardo Padulano
8 \author Stephan Hageboeck
9 \date 2025-06
10*/
11
12/*************************************************************************
13 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
14 * All rights reserved. *
15 * *
16 * For the licensing terms see $ROOTSYS/LICENSE. *
17 * For the list of contributors see $ROOTSYS/README/CREDITS. *
18 *************************************************************************/
19
21
22#include <ROOT/REntry.hxx>
23#include <ROOT/RFieldToken.hxx>
24#include <ROOT/RNTuple.hxx>
25#include <ROOT/RNTupleDS.hxx>
28#include <ROOT/RTTreeDS.hxx>
30
31#include <TBranchObject.h>
32#include <TClassEdit.h>
33#include <TDictionary.h>
34#include <TDataType.h>
35#include <TFile.h>
36#include <TLeaf.h>
37#include <TTreeReader.h>
38
39#include <algorithm>
40#include <type_traits>
41#include <utility>
42
44
45namespace {
46
47void AssertNoNullBranchAddresses(const std::vector<RBranchData> &branches)
48{
49 std::vector<TBranch *> branchesWithNullAddress;
50 for (const auto &branchData : branches) {
51 if (branchData.fOutputBranch->GetAddress() == nullptr)
52 branchesWithNullAddress.push_back(branchData.fOutputBranch);
53 }
54
55 if (branchesWithNullAddress.empty())
56 return;
57
58 // otherwise build error message and throw
59 std::vector<std::string> missingBranchNames;
61 std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); });
62 std::string msg = "RDataFrame::Snapshot:";
63 if (missingBranchNames.size() == 1) {
64 msg += " branch " + missingBranchNames[0] +
65 " is needed as it provides the size for one or more branches containing dynamically sized arrays, but "
66 "it is";
67 } else {
68 msg += " branches ";
69 for (const auto &bName : missingBranchNames)
70 msg += bName + ", ";
71 msg.resize(msg.size() - 2); // remove last ", "
72 msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are";
73 }
74 msg += " not part of the set of branches that are being written out.";
75 throw std::runtime_error(msg);
76}
77
79{
   // Look up branchName in inputTree, first with TTree::GetBranch and then with TTree::FindBranch.
   // The result is nullptr when inputTree is null or when neither lookup returns a branch.
80 if (inputTree) {
81 if (auto *getBranchRes = inputTree->GetBranch(branchName.c_str()))
82 return getBranchRes;
83
84 // try harder: fall back to TTree::FindBranch when GetBranch returned nothing
85 if (auto *findBranchRes = inputTree->FindBranch(branchName.c_str()))
86 return findBranchRes;
87 }
88 return nullptr;
89}
90
91std::vector<RBranchData>::iterator CreateCStyleArrayBranch(TTree &outputTree, std::vector<RBranchData> &outputBranches,
92 std::vector<RBranchData>::iterator thisBranch,
93 TBranch *inputBranch, int basketSize, void *address)
94{
95 if (!inputBranch)
96 return thisBranch;
97 const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName());
98 if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec)
99 return thisBranch;
100 // must construct the leaflist for the output branch and create the branch in the output tree
101 const auto *leaf = static_cast<TLeaf *>(inputBranch->GetListOfLeaves()->UncheckedAt(0));
102 if (!leaf)
103 return thisBranch;
104 const auto bname = leaf->GetName();
105 auto *sizeLeaf = leaf->GetLeafCount();
106 const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic());
107
108 // We proceed only if branch is a fixed-or-variable-sized array
109 if (sizeLeaf || leaf->GetLenStatic() > 1) {
110 if (sizeLeaf) {
111 // The array branch `bname` has dynamic size stored in leaf `sizeLeafName`, so we need to ensure that it's
112 // in the output tree.
113 auto sizeLeafIt =
114 std::find_if(outputBranches.begin(), outputBranches.end(),
115 [&sizeLeafName](const RBranchData &bd) { return bd.fOutputBranchName == sizeLeafName; });
116 if (sizeLeafIt == outputBranches.end()) {
117 // The size leaf is not part of the output branches yet, so emplace an empty slot for it.
118 // This means that iterators need to be updated in case the container reallocates.
119 const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch);
120 outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr,
121 /*outputBranch=*/nullptr);
124 }
125 if (!sizeLeafIt->fOutputBranch) {
126 // The size leaf was emplaced, but not initialised yet
128 // Use original basket size for existing branches otherwise use custom basket size.
129 const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize();
130 // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for
131 // `sizeLeafName`
132 auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast<void *>(nullptr),
133 (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize);
134 sizeLeafIt->fOutputBranch = outputBranch;
135 }
136 }
137
138 const auto btype = leaf->GetTypeName();
140 if (rootbtype == ' ') {
141 Warning("Snapshot",
142 "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The "
143 "leaf is of type '%s'. This column will not be written out.",
144 bname, btype);
145 return thisBranch;
146 }
147
148 const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype;
149 // Use original basket size for existing branches and new basket size for new branches
150 const auto bufSize = (basketSize > 0) ? basketSize : inputBranch->GetBasketSize();
151 void *addressForBranch = [address]() -> void * {
152 if (address) {
153 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we need
154 // its buffer, so we cast it and extract the address of the buffer
155 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(address);
156 return rawRVec->data();
157 }
158 return nullptr;
159 }();
160 thisBranch->fOutputBranch =
161 outputTree.Branch(thisBranch->fOutputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize);
162 thisBranch->fOutputBranch->SetTitle(inputBranch->GetTitle());
163 thisBranch->fIsCArray = true;
164 }
165
166 return thisBranch;
167}
168
169void SetBranchAddress(TBranch *inputBranch, RBranchData &branchData, void *valueAddress)
170{
171 const static TClassRef TBOClRef("TBranchObject");
172 if (inputBranch && inputBranch->IsA() == TBOClRef) {
173 branchData.fOutputBranch->SetAddress(reinterpret_cast<void **>(inputBranch->GetAddress()));
174 } else if (branchData.fOutputBranch->IsA() != TBranch::Class()) {
175 // This is a relatively rare case of a fixed-size array getting redefined
176 branchData.fBranchAddressForCArrays = valueAddress;
177 branchData.fOutputBranch->SetAddress(&branchData.fBranchAddressForCArrays);
178 } else {
179 void *correctAddress = [valueAddress, isCArray = branchData.fIsCArray]() -> void * {
180 if (isCArray) {
181 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
182 // need its buffer, so we cast it and extract the address of the buffer
183 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
184 return rawRVec->data();
185 }
186 return valueAddress;
187 }();
188 branchData.fOutputBranch->SetAddress(correctAddress);
189 branchData.fBranchAddressForCArrays = valueAddress;
190 }
191}
192
194{
195 // Logic taken from
196 // TTree::BranchImpRef(
197 // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel)
199 if (rootTypeChar == ' ') {
200 Warning("Snapshot",
201 "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This "
202 "column will not be written out.",
203 bd.fOutputBranchName.c_str());
204 return;
205 }
206 std::string leafList{bd.fOutputBranchName + '/' + rootTypeChar};
207 bd.fOutputBranch = outputTree.Branch(bd.fOutputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize);
208}
209
210/// Ensure that the TTree with the resulting snapshot can be written to the target TFile. This means checking that the
211/// TFile can be opened in the mode specified in `opts`, deleting any existing TTrees in case
212/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
214 const std::string &fileName)
215{
   // Only the update mode (compared after lower-casing opts.fMode) needs a check; any other mode returns here.
216 TString fileMode = opts.fMode;
217 fileMode.ToLower();
218 if (fileMode != "update")
219 return;
220
221 // output file opened in "update" mode: must check whether output TTree is already present in file
222 std::unique_ptr<TFile> outFile{TFile::Open(fileName.c_str(), "update")};
223 if (!outFile || outFile->IsZombie())
224 throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode");
225
   // Nothing called treeName in the file: there is nothing to overwrite.
226 TObject *outTree = outFile->Get(treeName.c_str());
227 if (outTree == nullptr)
228 return;
229
230 // object called treeName is already present in the file
231 if (opts.fOverwriteIfExists) {
   // An object inheriting from TTree is removed through its own Delete call with the all option;
   // anything else is removed through the Delete method of the file, by name.
232 if (outTree->InheritsFrom("TTree")) {
233 static_cast<TTree *>(outTree)->Delete("all");
234 } else {
235 outFile->Delete(treeName.c_str());
236 }
237 } else {
   // Overwriting was not requested: report the clash to the caller as std::invalid_argument.
238 const std::string msg = "Snapshot: tree \"" + treeName + "\" already present in file \"" + fileName +
239 "\". If you want to delete the original tree and write another, please set "
240 "RSnapshotOptions::fOverwriteIfExists to true.";
241 throw std::invalid_argument(msg);
242 }
243}
244
245/// Ensure that the RNTuple with the resulting snapshot can be written to the target TFile. This means checking that the
246/// TFile can be opened in the mode specified in `opts`, deleting any existing RNTuples in case
247/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
249 const std::string &fileName)
250{
   // Only the update mode (compared after lower-casing opts.fMode) needs a check; any other mode returns here.
251 TString fileMode = opts.fMode;
252 fileMode.ToLower();
253 if (fileMode != "update")
254 return;
255
256 // output file opened in "update" mode: must check whether output RNTuple is already present in file
257 std::unique_ptr<TFile> outFile{TFile::Open(fileName.c_str(), "update")};
258 if (!outFile || outFile->IsZombie())
259 throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode");
260
   // First pass: look specifically for an RNTuple stored under ntupleName.
261 auto *outNTuple = outFile->Get<ROOT::RNTuple>(ntupleName.c_str());
262
263 if (outNTuple) {
264 if (opts.fOverwriteIfExists) {
   // The name is suffixed with ;* before being handed to the Delete method of the file.
265 outFile->Delete((ntupleName + ";*").c_str());
266 return;
267 } else {
268 const std::string msg = "Snapshot: RNTuple \"" + ntupleName + "\" already present in file \"" + fileName +
269 "\". If you want to delete the original ntuple and write another, please set "
270 "the 'fOverwriteIfExists' option to true in RSnapshotOptions.";
271 throw std::invalid_argument(msg);
272 }
273 }
274
275 // Also check if there is any object other than an RNTuple with the provided ntupleName.
276 TObject *outObj = outFile->Get(ntupleName.c_str());
277
278 if (!outObj)
279 return;
280
281 // An object called ntupleName is already present in the file.
282 if (opts.fOverwriteIfExists) {
   // A TTree is removed through its own Delete call; any other object through the Delete method of the file.
283 if (auto tree = dynamic_cast<TTree *>(outObj)) {
284 tree->Delete("all");
285 } else {
286 outFile->Delete((ntupleName + ";*").c_str());
287 }
288 } else {
289 const std::string msg = "Snapshot: object \"" + ntupleName + "\" already present in file \"" + fileName +
290 "\". If you want to delete the original object and write a new RNTuple, please set "
291 "the 'fOverwriteIfExists' option to true in RSnapshotOptions.";
292 throw std::invalid_argument(msg);
293 }
294}
295
297 std::vector<ROOT::Internal::RDF::RBranchData> &allBranchData, std::size_t currentIndex,
298 int basketSize, void *valueAddress)
299{
301 auto *inputBranch = branchData->fIsDefine ? nullptr : SearchForBranch(inputTree, branchData->fInputBranchName);
302
303 if (branchData->fOutputBranch && valueAddress) {
304 // The output branch was already created, we just need to (re)set its address
305 SetBranchAddress(inputBranch, *branchData, valueAddress);
306 return;
307 }
308
309 // Respect the original bufsize and splitlevel arguments
310 // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid
311 // writing garbage when unsplit objects cannot be written as split objects (e.g. in case of a polymorphic
312 // TObject branch, see https://bit.ly/2EjLMId ).
313 // A user-provided basket size value takes precedence.
314 const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000);
315 const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99;
316
317 auto *dictionary = TDictionary::GetDictionary(*branchData->fInputTypeID);
318 if (dynamic_cast<TDataType *>(dictionary)) {
319 // Branch of fundamental type
321 return;
322 }
323
324 if (!branchData->fIsDefine) {
325 // Cases where we need a leaflist (e.g. C-style arrays)
326 // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is
327 // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is
328 // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree
329 // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec).
331 }
332 if (branchData->fOutputBranch) {
333 // A branch was created in the previous function call
334 if (valueAddress) {
335 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
336 // need its buffer, so we cast it and extract the address of the buffer
337 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
338 branchData->fBranchAddressForCArrays = rawRVec->data();
339 }
340 return;
341 }
342
343 if (auto *classPtr = dynamic_cast<TClass *>(dictionary)) {
344 // Case of unsplit object with polymorphic type
345 if (inputBranch && dynamic_cast<TBranchObject *>(inputBranch) && valueAddress)
346 branchData->fOutputBranch =
348 inputBranch->GetAddress(), bufSize, splitLevel);
349 // General case, with valid address
350 else if (valueAddress)
352 outputTree, branchData->fOutputBranchName.c_str(), classPtr, TDataType::GetType(*branchData->fInputTypeID),
354 // No value was passed, we're just creating a hollow branch to populate the dataset schema
355 else
356 branchData->fOutputBranch =
357 outputTree.Branch(branchData->fOutputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize);
358 return;
359 }
360
361 // We are not aware of other cases
362 throw std::logic_error(
363 "RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug.");
364}
365} // namespace
366
368 std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames,
369 const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector<bool> &&isDefine,
371 const std::vector<const std::type_info *> &colTypeIDs)
372 : fFileName(filename),
373 fDirName(dirname),
374 fTreeName(treename),
375 fOptions(options),
376 fOutputLoopManager(loopManager),
377 fInputLoopManager(inputLM)
378{
380
382 fBranchData.reserve(vbnames.size());
383 for (unsigned int i = 0; i < vbnames.size(); ++i) {
384 fBranchData.emplace_back(vbnames[i], std::move(outputBranchNames[i]), isDefine[i], colTypeIDs[i]);
385 }
386}
387
388// Define special member methods here where the definition of all the data member types is available
392 ROOT::Internal::RDF::UntypedSnapshotTTreeHelper &&) noexcept = default;
393
395{
   // Emit a warning when a lazy Snapshot was booked but never ran: the tree name is still set,
   // no output file was opened and fOptions.fLazy is true. Nothing else is done here.
396 if (!fTreeName.empty() /*not moved from*/ && !fOutputFile /* did not run */ && fOptions.fLazy) {
   // Word inserted in the warning text, chosen from the lower-cased file mode.
397 const auto fileOpenMode = [&]() {
398 TString checkupdate = fOptions.fMode;
399 checkupdate.ToLower();
400 return checkupdate == "update" ? "updated" : "created";
401 }();
402 Warning("Snapshot",
403 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
404 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
405 "its result in a variable and for example calling the GetValue() method on it.",
406 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
407 }
408}
409
411{
412 // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing
413 // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks.
   // If the data source is not an RTTreeDS, fInputTree is left as it was.
414 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
415 fInputTree = treeDS->GetTree();
   // Raise the flag that Exec reads to decide whether SetBranches must be called again.
416 fBranchAddressesNeedReset = true;
417}
418
419void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Exec(unsigned int, const std::vector<void *> &values)
420{
421 if (!fBranchAddressesNeedReset) {
422 UpdateCArraysPtrs(values);
423 } else {
424 SetBranches(values);
425 fBranchAddressesNeedReset = false;
426 }
427
428 fOutputTree->Fill();
429}
430
432{
433 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
434 // associated with those are re-allocated. As a result, the value of the pointer can change,
435 // leaving an invalid pointer associated with the branch of the output tree.
436 // With this code, we set the value of the pointer in the output branch anew when needed.
437 assert(values.size() == fBranchData.size());
438 auto nValues = values.size();
439 for (decltype(nValues) i{}; i < nValues; i++) {
   // Only entries flagged as C-style arrays are touched.
440 if (fBranchData[i].fIsCArray) {
441 // values[i] here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
442 // need its buffer, so we cast it and extract the address of the buffer
443 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
   // Reset the branch address only when the buffer differs from the address recorded last time.
444 if (auto *data = rawRVec->data(); fBranchData[i].fBranchAddressForCArrays != data) {
445 fBranchData[i].fOutputBranch->SetAddress(data);
446 fBranchData[i].fBranchAddressForCArrays = data;
447 }
448 }
449 }
450}
451
453{
454 // create branches in output tree
455 assert(fBranchData.size() == values.size());
   // NOTE(review): the loop bound is re-read from fBranchData on every iteration while values keeps its
   // size. If fBranchData does grow during the loop, values[i] would be indexed past its end - confirm
   // that this cannot happen or that the callee tolerates it.
456 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
457 SetBranchesHelper(fInputTree, *fOutputTree, fBranchData, i, fOptions.fBasketSize, values[i]);
458 }
   // Throws std::runtime_error if any output branch is left without an address.
459 AssertNoNullBranchAddresses(fBranchData);
460}
461
463{
   // Run SetBranchesHelper for every entry of fBranchData with a null value address, so that no
   // column value is attached to the branches handled here.
464 void *dummyValueAddress{};
465 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
466 SetBranchesHelper(inputTree, outputTree, fBranchData, i, fOptions.fBasketSize, dummyValueAddress);
467 }
468}
469
471{
   // Open the output file with the mode and compression settings taken from fOptions.
   // Throws std::runtime_error when TFile::Open returns a null pointer.
472 fOutputFile.reset(
473 TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"",
474 ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)));
475 if (!fOutputFile)
476 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
477
   // The tree is attached to the file itself unless a directory name was given.
478 TDirectory *outputDir = fOutputFile.get();
479 if (!fDirName.empty()) {
480 TString checkupdate = fOptions.fMode;
481 checkupdate.ToLower();
482 if (checkupdate == "update")
483 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
484 else
485 outputDir = fOutputFile->mkdir(fDirName.c_str());
486 }
487
   // Name and title of the output tree are both fTreeName.
488 fOutputTree = std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/outputDir);
489
   // A zero fAutoFlush leaves the auto-flush setting of the new tree untouched.
490 if (fOptions.fAutoFlush)
491 fOutputTree->SetAutoFlush(fOptions.fAutoFlush);
492}
493
495{
   // Both the output tree and the output file are expected to exist at this point.
496 assert(fOutputTree != nullptr);
497 assert(fOutputFile != nullptr);
498
499 // There were no entries to fill the TTree with (either the input TTree was empty or no event passed after
500 // filtering). We have already created an empty TTree, now also create the branches to preserve the schema
501 if (fOutputTree->GetEntries() == 0) {
502 SetEmptyBranches(fInputTree, *fOutputTree);
503 }
504 // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
505 fOutputTree->AutoSave("flushbaskets");
506 // must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
507 fOutputTree.reset();
508 fOutputFile->Close();
509
510 // Now connect the data source to the loop manager so it can be used for further processing
   // The tree name is prefixed with the directory name and a slash when a directory was requested.
511 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
512 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
513}
514
515/**
516 * \brief Create a new UntypedSnapshotTTreeHelper with a different output file name
517 *
518 * \param newName A type-erased string with the output file name
519 * \return UntypedSnapshotTTreeHelper
520 *
521 * This MakeNew implementation is tied to the cloning feature of actions
522 * of the computation graph. In particular, cloning a Snapshot node usually
523 * also involves changing the name of the output file, otherwise the cloned
524 * Snapshot would overwrite the same file.
525 */
528{
529 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
530 std::vector<std::string> inputBranchNames;
531 std::vector<std::string> outputBranchNames;
532 std::vector<bool> isDefine;
533 std::vector<const std::type_info *> inputColumnTypeIDs;
534 for (const auto &bd : fBranchData) {
535 if (bd.fInputBranchName.empty())
536 break;
537 inputBranchNames.push_back(bd.fInputBranchName);
538 outputBranchNames.push_back(bd.fOutputBranchName);
539 isDefine.push_back(bd.fIsDefine);
540 inputColumnTypeIDs.push_back(bd.fInputTypeID);
541 }
542
544 fDirName,
545 fTreeName,
546 std::move(inputBranchNames),
547 std::move(outputBranchNames),
548 fOptions,
549 std::move(isDefine),
550 fOutputLoopManager,
551 fInputLoopManager,
553}
554
556 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename,
557 const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options,
559 const std::vector<const std::type_info *> &colTypeIDs)
560 : fNSlots(nSlots),
561 fOutputFiles(fNSlots),
562 fOutputTrees(fNSlots),
563 fBranchAddressesNeedReset(fNSlots, 1),
564 fInputTrees(fNSlots),
565 fFileName(filename),
566 fDirName(dirname),
567 fTreeName(treename),
568 fOptions(options),
569 fOutputLoopManager(loopManager),
570 fInputLoopManager(inputLM)
571{
573
575 fBranchData.reserve(fNSlots);
576 for (unsigned int slot = 0; slot < fNSlots; ++slot) {
577 fBranchData.emplace_back();
578 auto &thisSlot = fBranchData.back();
579 thisSlot.reserve(vbnames.size());
580 for (unsigned int i = 0; i < vbnames.size(); ++i) {
581 thisSlot.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]);
582 }
583 }
584}
585
586// Define special member methods here where the definition of all the data member types is available
591
593{
   // Emit a warning when a lazy Snapshot was booked but never ran: the tree name is still set, fOptions.fLazy
   // is true, and fOutputFiles is non-empty with every entry still null. Nothing else is done here.
594 if (!fTreeName.empty() /*not moved from*/ && fOptions.fLazy && !fOutputFiles.empty() &&
595 std::all_of(fOutputFiles.begin(), fOutputFiles.end(), [](const auto &f) { return !f; }) /* never run */) {
   // Word inserted in the warning text, chosen from the lower-cased file mode.
596 const auto fileOpenMode = [&]() {
597 TString checkupdate = fOptions.fMode;
598 checkupdate.ToLower();
599 return checkupdate == "update" ? "updated" : "created";
600 }();
601 Warning("Snapshot",
602 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
603 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
604 "its result in a variable and for example calling the GetValue() method on it.",
605 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
606 }
607}
608
610{
   // Per-slot task setup: obtain the output file of the slot if needed, create a fresh output tree,
   // record the input tree and raise the reset flag of the slot.
611 ::TDirectory::TContext c; // do not let tasks change the thread-local gDirectory
612 if (!fOutputFiles[slot]) {
613 // first time this thread executes something, let's create a TBufferMerger output directory
614 fOutputFiles[slot] = fMerger->GetFile();
615 }
616 TDirectory *treeDirectory = fOutputFiles[slot].get();
617 if (!fDirName.empty()) {
618 // call returnExistingDirectory=true since MT can end up making this call multiple times
619 treeDirectory = fOutputFiles[slot]->mkdir(fDirName.c_str(), "", true);
620 }
621 // re-create output tree as we need to create its branches again, with new input variables
622 // TODO we could instead create the output tree and its branches, change addresses of input variables in each task
623 fOutputTrees[slot] =
624 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
625 fOutputTrees[slot]->SetBit(TTree::kEntriesReshuffled);
626 // TODO can be removed when RDF supports interleaved TBB task execution properly, see ROOT-10269
627 fOutputTrees[slot]->SetImplicitMT(false);
   // A zero fAutoFlush leaves the auto-flush setting of the new tree untouched.
628 if (fOptions.fAutoFlush)
629 fOutputTrees[slot]->SetAutoFlush(fOptions.fAutoFlush);
   // Prefer the tree of the reader r when r is non-null; otherwise ask the data source of the input
   // loop manager. If neither applies, fInputTrees[slot] is left as it was.
630 if (r) {
631 // We could be getting a task-local TTreeReader from the TTreeProcessorMT.
632 fInputTrees[slot] = r->GetTree();
633 } else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource())) {
634 fInputTrees[slot] = treeDS->GetTree();
635 }
636 fBranchAddressesNeedReset[slot] = 1; // reset first event flag for this slot
637}
638
640{
   // Write the output file of the slot only if its tree holds at least one entry.
641 if (fOutputTrees[slot]->GetEntries() > 0)
642 fOutputFiles[slot]->Write();
643 for (auto &branchData : fBranchData[slot])
644 branchData.ClearBranchPointers(); // Pointers might go to an old tree, so they are stale now
645 // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones)
646 fOutputTrees[slot].reset(nullptr);
647}
648
649void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector<void *> &values)
650{
651 if (fBranchAddressesNeedReset[slot] == 0) {
652 UpdateCArraysPtrs(slot, values);
653 } else {
654 SetBranches(slot, values);
655 fBranchAddressesNeedReset[slot] = 0;
656 }
657 fOutputTrees[slot]->Fill();
658 auto entries = fOutputTrees[slot]->GetEntries();
659 auto autoFlush = fOutputTrees[slot]->GetAutoFlush();
660 if ((autoFlush > 0) && (entries % autoFlush == 0))
661 fOutputFiles[slot]->Write();
662}
663
665 const std::vector<void *> &values)
666{
667 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
668 // associated with those are re-allocated. As a result, the value of the pointer can change,
669 // leaving an invalid pointer associated with the branch of the output tree.
670 // With this code, we set the value of the pointer in the output branch anew when needed.
671 assert(values.size() == fBranchData[slot].size());
672 auto nValues = values.size();
673 for (decltype(nValues) i{}; i < nValues; i++) {
674 auto &branchData = fBranchData[slot][i];
   // Only entries flagged as C-style arrays are touched.
675 if (branchData.fIsCArray) {
676 // values[i] here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
677 // need its buffer, so we cast it and extract the address of the buffer
678 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
679 if (auto *data = rawRVec->data(); branchData.fBranchAddressForCArrays != data) {
680 // reset the branch address
681 branchData.fOutputBranch->SetAddress(data);
682 branchData.fBranchAddressForCArrays = data;
683 }
684 }
685 }
686}
687
689 const std::vector<void *> &values)
690{
691 // create branches in output tree
692 auto &branchData = fBranchData[slot];
693 assert(branchData.size() == values.size());
694 for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions
695 SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], branchData, i, fOptions.fBasketSize, values[i]);
696 }
697
699}
700
702{
703 void *dummyValueAddress{};
704 auto &branchData = fBranchData.front();
705 for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions
707 }
708}
709
711{
   // Open the output file with the mode and compression settings taken from fOptions; the file name is
   // also passed as the file title. Throws std::runtime_error when TFile::Open returns a null pointer.
712 const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
713 auto outFile =
714 std::unique_ptr<TFile>{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)};
715 if (!outFile)
716 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
   // Keep a raw pointer to the file before the unique_ptr is moved into the TBufferMerger.
717 fOutputFile = outFile.get();
718 fMerger = std::make_unique<ROOT::TBufferMerger>(std::move(outFile));
719}
720
722{
723
724 for (auto &file : fOutputFiles) {
725 if (file) {
726 file->Write();
727 file->Close();
728 }
729 }
730
731 // If there were no entries to fill the TTree with (either the input TTree was empty or no event passed after
732 // filtering), create an empty TTree in the output file and create the branches to preserve the schema
733 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
734 assert(fOutputFile && "Missing output file in Snapshot finalization.");
735 if (!fOutputFile->Get(fullTreeName.c_str())) {
736
737 // First find in which directory we need to write the output TTree
738 TDirectory *treeDirectory = fOutputFile;
739 if (!fDirName.empty()) {
740 treeDirectory = fOutputFile->mkdir(fDirName.c_str(), "", true);
741 }
743
744 // Create the output TTree and create the user-requested branches
745 auto outTree =
746 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
747 TTree *inputTree{};
748 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
749 inputTree = treeDS->GetTree();
750 SetEmptyBranches(inputTree, *outTree);
751
752 fOutputFile->Write();
753 }
754
755 // flush all buffers to disk by destroying the TBufferMerger
756 fOutputFiles.clear();
757 fMerger.reset();
758
759 // Now connect the data source to the loop manager so it can be used for further processing
760 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
761}
762
763/**
764 * \brief Create a new UntypedSnapshotTTreeHelperMT with a different output file name
765 *
766 * \param newName A type-erased string with the output file name
767 * \return UntypedSnapshotTTreeHelperMT
768 *
769 * This MakeNew implementation is tied to the cloning feature of actions
770 * of the computation graph. In particular, cloning a Snapshot node usually
771 * also involves changing the name of the output file, otherwise the cloned
772 * Snapshot would overwrite the same file.
773 */
776{
777 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
778 std::vector<std::string> inputBranchNames;
779 std::vector<std::string> outputBranchNames;
780 std::vector<bool> isDefine;
781 std::vector<const std::type_info *> inputColumnTypeIDs;
782 for (const auto &bd : fBranchData.front()) {
783 if (bd.fInputBranchName.empty())
784 break;
785 inputBranchNames.push_back(bd.fInputBranchName);
786 outputBranchNames.push_back(bd.fOutputBranchName);
787 isDefine.push_back(bd.fIsDefine);
788 inputColumnTypeIDs.push_back(bd.fInputTypeID);
789 }
790
792 finalName,
793 fDirName,
794 fTreeName,
795 std::move(inputBranchNames),
796 std::move(outputBranchNames),
797 fOptions,
798 std::move(isDefine),
799 fOutputLoopManager,
800 fInputLoopManager,
801 std::move(inputColumnTypeIDs)};
802}
803
805 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename,
806 const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options,
808 const std::vector<const std::type_info *> &colTypeIDs)
809 : fFileName(filename),
810 fDirName(dirname),
811 fNTupleName(ntuplename),
812 fOptions(options),
813 fInputLoopManager(inputLM),
814 fOutputLoopManager(outputLM),
815 fInputFieldNames(vfnames),
816 fOutputFieldNames(ReplaceDotWithUnderscore(fnames)),
817 fNSlots(nSlots),
818 fFillContexts(nSlots),
819 fEntries(nSlots),
820 fInputColumnTypeIDs(colTypeIDs)
821{
823}
824
825// Define special member methods here where the definition of all the data member types is available
830
832{
   // Emit a warning when a lazy Snapshot was booked but never ran: the ntuple name is still set,
   // no output file was opened and fOptions.fLazy is true. Nothing else is done here.
833 if (!fNTupleName.empty() /* not moved from */ && !fOutputFile /* did not run */ && fOptions.fLazy)
834 Warning("Snapshot", "A lazy Snapshot action was booked but never triggered.");
835}
836
838{
   // Set up everything needed for writing: build the RNTuple model (one field per output column),
   // open the output file, optionally create the target directory, and create the parallel writer.
   // Throws std::runtime_error if the output file cannot be opened.
   // NOTE(review): presumably this is Initialize(); the signature is not visible here - confirm.

   // A bare model is used: entries are created per slot later, from each fill context's model.
839 auto model = ROOT::RNTupleModel::CreateBare();
840 auto nFields = fOutputFieldNames.size();
841 fFieldTokens.resize(nFields);
842 for (decltype(nFields) i = 0; i < nFields; i++) {
843 // Need to retrieve the type of every field to create as a string
844 // If the input type for a field does not have RTTI, internally we store it as the tag UseNativeDataType. When
845 // that is detected, we need to ask the data source which is the type name based on the on-disk information.
846 const auto typeName = *fInputColumnTypeIDs[i] == typeid(ROOT::Internal::RDF::UseNativeDataType)
847 ? ROOT::Internal::RDF::GetTypeNameWithOpts(*fInputLoopManager->GetDataSource(),
848 fInputFieldNames[i], fOptions.fVector2RVec)
849 : ROOT::Internal::RDF::TypeID2TypeName(*fInputColumnTypeIDs[i]);
   // The field is created under the output name; its token is stored at the same index i so that
   // the i-th column value can later be bound to it.
850 model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], typeName).Unwrap());
851 fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]);
852 }
853 model->Freeze();
854
   // Forward the compression settings requested in the snapshot options to the RNTuple write options.
856 writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
857
   // The file is opened with the mode string taken from the snapshot options.
858 fOutputFile.reset(TFile::Open(fFileName.c_str(), fOptions.fMode.c_str()));
859 if (!fOutputFile)
860 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
861
   // By default write into the top level of the file; if a directory name was given, create it.
   // The mode is compared case-insensitively (it is lower-cased first).
862 TDirectory *outputDir = fOutputFile.get();
863 if (!fDirName.empty()) {
864 TString checkupdate = fOptions.fMode;
865 checkupdate.ToLower();
866 if (checkupdate == "update")
867 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
868 else
869 outputDir = fOutputFile->mkdir(fDirName.c_str());
870 }
871
872 // The RNTupleParallelWriter has exclusive access to the underlying TFile, no further synchronization is needed for
873 // calls to Fill() (in Exec) and FlushCluster() (in FinalizeTask).
874 fWriter = ROOT::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
875}
876
878{
   // Lazily prepare the given slot: on the first call for a slot, create its fill context from the
   // writer and a bare entry from that context's model. Later calls for the same slot reuse both.
879 if (!fFillContexts[slot]) {
880 fFillContexts[slot] = fWriter->CreateFillContext();
881 fEntries[slot] = fFillContexts[slot]->GetModel().CreateBareEntry();
882 }
883}
884
/**
 * Write one entry for the given processing slot.
 *
 * \param[in] slot Index selecting the per-slot fill context and entry.
 * \param[in] values Type-erased pointers to the column values, one per field token and in the same order.
 *
 * Each pointer is bound to the field token at the same index of the slot's entry, then the entry is
 * filled through the slot's fill context.
 */
885void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int slot, const std::vector<void *> &values)
886{
887 auto &fillContext = fFillContexts[slot];
888 auto &outputEntry = fEntries[slot];
   // One value pointer is expected for every field token created when the model was built.
889 assert(values.size() == fFieldTokens.size());
890 for (decltype(values.size()) i = 0; i < values.size(); i++) {
   // Bind the raw pointer rather than copying: the entry refers to the caller-provided value.
891 outputEntry->BindRawPtr(fFieldTokens[i], values[i]);
892 }
893 fillContext->Fill(*outputEntry);
894}
895
897{
   // End-of-task hook for a slot: flush the cluster currently held by this slot's fill context.
898 // In principle we would not need to flush a cluster here, but we want to benefit from parallelism for compression.
899 // NB: RNTupleFillContext::FlushCluster() is a nop if there is no new entry since the last flush.
900 fFillContexts[slot]->FlushCluster();
901}
902
904{
   // Tear down the writing machinery in dependency order (entries and fill contexts first, then the
   // writer), then point the output loop manager at the dataset that was just written.
   // NOTE(review): presumably this is Finalize(); the signature is not visible here - confirm.
905 // First clear and destroy all entries, which were created from the RNTupleFillContexts.
906 fEntries.clear();
907 fFillContexts.clear();
908 // Then destroy the RNTupleParallelWriter and write the metadata.
909 fWriter.reset();
910 // We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
   // NOTE(review): the dataset name is always the directory name, a slash, then the RNTuple name, so an
   // empty directory name yields a name with a leading slash - confirm RNTupleDS accepts that.
911 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::RDF::RNTupleDS>(fDirName + "/" + fNTupleName, fFileName));
912}
913
914/**
915 * Create a new UntypedSnapshotRNTupleHelper with a different output file name.
916 *
917 * \param[in] newName Type-erased pointer to a std::string holding the output file name; it is cast
 *                    back to a pointer to const std::string and the string is copied.
918 * \return UntypedSnapshotRNTupleHelper built from this helper's slot count, directory name, RNTuple
 *         name, field names, options, loop managers and column type IDs, with only the file name replaced.
919 *
920 * This MakeNew implementation is tied to the cloning feature of actions
921 * of the computation graph. In particular, cloning a Snapshot node usually
922 * also involves changing the name of the output file, otherwise the cloned
923 * Snapshot would overwrite the same file.
924 */
927{
   // newName must point to a valid std::string: it is reinterpreted without any check.
928 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
930 fNSlots, finalName, fDirName, fNTupleName, fInputFieldNames,
931 fOutputFieldNames, fOptions, fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs};
932}
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char filename
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
static TBranch * SearchForBranch(TTree *tree, const char *name)
Definition TTreePyz.cxx:50
The head node of a RDF computation graph.
UntypedSnapshotRNTupleHelper(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector< const std::type_info * > &colTypeIDs)
void Exec(unsigned int slot, const std::vector< void * > &values)
UntypedSnapshotRNTupleHelper MakeNew(void *newName)
Create a new UntypedSnapshotRNTupleHelper with a different output file name.
void InitTask(TTreeReader *, unsigned int slot)
UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(unsigned int slot, const std::vector< void * > &values)
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
std::vector< std::vector< RBranchData > > fBranchData
UntypedSnapshotTTreeHelperMT MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelperMT with a different output file name.
void InitTask(TTreeReader *r, unsigned int slot)
void Exec(unsigned int slot, const std::vector< void * > &values)
void SetBranches(unsigned int slot, const std::vector< void * > &values)
UntypedSnapshotTTreeHelper MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelper with a different output file name.
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
void SetBranches(const std::vector< void * > &values)
void Exec(unsigned int, const std::vector< void * > &values)
UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(const std::vector< void * > &values)
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
static std::unique_ptr< RNTupleParallelWriter > Append(std::unique_ptr< ROOT::RNTupleModel > model, std::string_view ntupleName, TDirectory &fileOrDirectory, const ROOT::RNTupleWriteOptions &options=ROOT::RNTupleWriteOptions())
Append an RNTuple to the existing file.
Common user-tunable settings for storing RNTuples.
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:67
const_iterator begin() const
const_iterator end() const
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1526
A Branch for the case of an object.
A TTree is a list of TBranches.
Definition TBranch.h:93
static TClass * Class()
TClassRef is used to implement a permanent reference to a TClass object.
Definition TClassRef.h:29
TClass instances represent classes, structs and namespaces in the ROOT type system.
Definition TClass.h:84
Basic data type descriptor (datatype information is obtained from CINT).
Definition TDataType.h:44
Int_t GetType() const
Definition TDataType.h:68
static TDictionary * GetDictionary(const char *name)
Retrieve the type (class, fundamental type, typedef etc) named "name".
TDirectory::TContext keeps track and restore the current directory.
Definition TDirectory.h:89
Describe directory structure in memory.
Definition TDirectory.h:45
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3764
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
Mother of all ROOT objects.
Definition TObject.h:41
Basic string class.
Definition TString.h:138
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition TTreeReader.h:46
A TTree represents a columnar dataset.
Definition TTree.h:89
@ kEntriesReshuffled
If set, signals that this TTree is the output of the processing of another TTree, and the entries are...
Definition TTree.h:297
std::vector< std::string > ReplaceDotWithUnderscore(const std::vector< std::string > &columnNames)
Replace occurrences of '.' with '_' in each string passed as argument.
Definition RDFUtils.cxx:397
char TypeName2ROOTTypeName(const std::string &b)
Convert type name (e.g. "Float_t") to ROOT type code (e.g. 'F').
Definition RDFUtils.cxx:342
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:178
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec)
Definition RDFUtils.cxx:627
char TypeID2ROOTTypeName(const std::type_info &tid)
Definition RDFUtils.cxx:206
TBranch * CallBranchImp(TTree &tree, const char *branchname, TClass *ptrClass, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10094
TBranch * CallBranchImpRef(TTree &tree, const char *branchname, TClass *ptrClass, EDataType datatype, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10088
std::vector< std::string > ColumnNames_t
@ kROOTRVec
Definition ESTLType.h:46
@ kSTLvector
Definition ESTLType.h:30
int CompressionSettings(RCompressionSetting::EAlgorithm::EValues algorithm, int compressionLevel)
ROOT::ESTLType STLKind(std::string_view type)
Converts STL container name to number.
ROOT::ESTLType IsSTLCont(std::string_view type)
type : type name: vector<list<classA,allocator>,allocator> result: 0 : not stl container code of cont...
Stores properties of each output branch in a Snapshot.
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:344
A collection of options to steer the creation of the dataset on file.