Logo ROOT  
Reference Guide
TTreeProcessorMT.cxx
Go to the documentation of this file.
1// @(#)root/thread:$Id$
2// Authors: Enric Tejedor, Enrico Guiraud CERN 05/06/2018
3
4/*************************************************************************
5 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12/** \class ROOT::TTreeProcessorMT
13 \ingroup Parallelism
14 \brief A class to process the entries of a TTree in parallel.
15
16By means of its Process method, ROOT::TTreeProcessorMT provides a way to process the
17entries of a TTree in parallel. When invoking TTreeProcessorMT::Process, the user
18passes a function whose only parameter is a TTreeReader. The function iterates
19on a subrange of entries by using that TTreeReader.
20
21The implementation of ROOT::TTreeProcessorMT parallelizes the processing of the subranges,
22each corresponding to a cluster in the TTree. This is possible thanks to the use
23of a ROOT::TThreadedObject, so that each thread works with its own TFile and TTree
24objects.
25*/
26
27#include "TROOT.h"
29
30using namespace ROOT;
31
32namespace {
33
34/// A cluster of entries
35struct EntryCluster {
36 Long64_t start;
37 Long64_t end;
38};
39
40// note that this routine assumes global entry numbers
41static bool ClustersAreSortedAndContiguous(const std::vector<std::vector<EntryCluster>> &cls)
42{
43 Long64_t last_end = 0ll;
44 for (const auto &fcl : cls) {
45 for (const auto &c : fcl) {
46 if (last_end != c.start)
47 return false;
48 last_end = c.end;
49 }
50 }
51 return true;
52}
53
54/// Take a vector of vectors of EntryClusters (a vector per file), filter the entries according to entryList, and
55/// and return a new vector of vectors of EntryClusters where cluster start/end entry numbers have been converted to
56/// TEntryList-local entry numbers.
57///
58/// This routine assumes that entry numbers in the TEntryList (and, if present, in the sub-entrylists) are in
59/// ascending order, i.e., for n > m:
60/// elist.GetEntry(n) + tree_offset_for_entry_from_elist(n) > elist.GetEntry(m) + tree_offset_for_entry_from_elist(m)
61static std::vector<std::vector<EntryCluster>>
62ConvertToElistClusters(std::vector<std::vector<EntryCluster>> &&clusters, TEntryList &entryList,
63 const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames,
64 const std::vector<Long64_t> &entriesPerFile)
65{
66 R__ASSERT(entryList.GetN() > 0); // wasteful to call this function if it has nothing to do
67 R__ASSERT(ClustersAreSortedAndContiguous(clusters));
68
69 const bool listHasGlobalEntryNumbers = entryList.GetLists() == nullptr;
70 const auto nFiles = clusters.size();
71
72 std::unique_ptr<TChain> chain;
73 using NextFn_t = Long64_t (*)(Long64_t &, TEntryList &, TChain *);
74 // A function that advances TEntryList and returns global entry numbers or -1 if we reached the end
75 // (might or might not need a TChain depending on whether listHasGlobalEntryNumbers)
76 NextFn_t Next;
77 if (listHasGlobalEntryNumbers) {
78 Next = [](Long64_t &elEntry, TEntryList &elist, TChain *) {
79 ++elEntry;
80 return elist.Next();
81 };
82 } else {
83 // we need `chain` to be able to convert local entry numbers to global entry numbers in `Next`
84 chain.reset(new TChain());
85 for (auto i = 0u; i < nFiles; ++i)
86 chain->Add((fileNames[i] + "/" + treeNames[i]).c_str(), entriesPerFile[i]);
87 Next = [](Long64_t &elEntry, TEntryList &elist, TChain *ch) {
88 ++elEntry;
89 int treenum = -1;
90 Long64_t localEntry = elist.GetEntryAndTree(elEntry, treenum);
91 if (localEntry == -1ll)
92 return localEntry;
93 return localEntry + ch->GetTreeOffset()[treenum];
94 };
95 }
96
97 // the call to GetEntry also serves the purpose to reset TEntryList::fLastIndexQueried,
98 // so we can be sure TEntryList::Next will return the correct thing
99 Long64_t elistEntry = 0ll;
100 Long64_t entry = entryList.GetEntry(elistEntry);
101
102 std::vector<std::vector<EntryCluster>> elistClusters;
103
104 for (auto fileN = 0u; fileN < nFiles; ++fileN) {
105 std::vector<EntryCluster> elistClustersForFile;
106 for (const auto &c : clusters[fileN]) {
107 if (entry >= c.end || entry == -1ll) // no entrylist entries in this cluster
108 continue;
109 R__ASSERT(entry >= c.start); // current entry should never come before the cluster we are looking at
110 const Long64_t elistRangeStart = elistEntry;
111 // advance entry list until the entrylist entry goes beyond the end of the cluster
112 while (entry < c.end && entry != -1ll)
113 entry = Next(elistEntry, entryList, chain.get());
114 elistClustersForFile.emplace_back(EntryCluster{elistRangeStart, elistEntry});
115 }
116 elistClusters.emplace_back(std::move(elistClustersForFile));
117 }
118
119 R__ASSERT(elistClusters.size() == clusters.size()); // same number of files
120 R__ASSERT(ClustersAreSortedAndContiguous(elistClusters));
121
122 entryList.GetEntry(0ll); // reset TEntryList internal state, lest we incur in ROOT-10807
123 return elistClusters;
124}
125
126// EntryClusters and number of entries per file
127using ClustersAndEntries = std::pair<std::vector<std::vector<EntryCluster>>, std::vector<Long64_t>>;
128
129////////////////////////////////////////////////////////////////////////
130/// Return a vector of cluster boundaries for the given tree and files.
131static ClustersAndEntries
132MakeClusters(const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames)
133{
134 // Note that as a side-effect of opening all files that are going to be used in the
135 // analysis once, all necessary streamers will be loaded into memory.
137 const auto nFileNames = fileNames.size();
138 std::vector<std::vector<EntryCluster>> clustersPerFile;
139 std::vector<Long64_t> entriesPerFile;
140 entriesPerFile.reserve(nFileNames);
141 Long64_t offset = 0ll;
142 for (auto i = 0u; i < nFileNames; ++i) {
143 const auto &fileName = fileNames[i];
144 const auto &treeName = treeNames[i];
145
146 std::unique_ptr<TFile> f(TFile::Open(fileName.c_str())); // need TFile::Open to load plugins if need be
147 if (!f || f->IsZombie()) {
148 const auto msg = "TTreeProcessorMT::Process: an error occurred while opening file \"" + fileName + "\"";
149 throw std::runtime_error(msg);
150 }
151 auto *t = f->Get<TTree>(treeName.c_str()); // t will be deleted by f
152
153 if (!t) {
154 const auto msg = "TTreeProcessorMT::Process: an error occurred while getting tree \"" + treeName +
155 "\" from file \"" + fileName + "\"";
156 throw std::runtime_error(msg);
157 }
158
159 auto clusterIter = t->GetClusterIterator(0);
160 Long64_t start = 0ll, end = 0ll;
161 const Long64_t entries = t->GetEntries();
162 // Iterate over the clusters in the current file
163 std::vector<EntryCluster> clusters;
164 while ((start = clusterIter()) < entries) {
165 end = clusterIter.GetNextEntry();
166 // Add the current file's offset to start and end to make them (chain) global
167 clusters.emplace_back(EntryCluster{start + offset, end + offset});
168 }
169 offset += entries;
170 clustersPerFile.emplace_back(std::move(clusters));
171 entriesPerFile.emplace_back(entries);
172 }
173
174 // Here we "fuse" together clusters if the number of clusters is to big with respect to
175 // the number of slots, otherwise we can incurr in an overhead which is so big to make
176 // the parallelisation detrimental for performance.
177 // For example, this is the case when following a merging of many small files a file
178 // contains a tree with many entries and with clusters of just a few entries.
179 // The criterion according to which we fuse clusters together is to have at most
180 // TTreeProcessorMT::GetMaxTasksPerFilePerWorker() clusters per file per slot.
181 // For example: given 2 files and 16 workers, at most
182 // 16 * 2 * TTreeProcessorMT::GetMaxTasksPerFilePerWorker() clusters will be created, at most
183 // 16 * TTreeProcessorMT::GetMaxTasksPerFilePerWorker() per file.
184
185 const auto maxTasksPerFile = TTreeProcessorMT::GetMaxTasksPerFilePerWorker() * ROOT::GetThreadPoolSize();
186 std::vector<std::vector<EntryCluster>> eventRangesPerFile(clustersPerFile.size());
187 auto clustersPerFileIt = clustersPerFile.begin();
188 auto eventRangesPerFileIt = eventRangesPerFile.begin();
189 for (; clustersPerFileIt != clustersPerFile.end(); clustersPerFileIt++, eventRangesPerFileIt++) {
190 const auto clustersInThisFileSize = clustersPerFileIt->size();
191 const auto nFolds = clustersInThisFileSize / maxTasksPerFile;
192 // If the number of clusters is less than maxTasksPerFile
193 // we take the clusters as they are
194 if (nFolds == 0) {
195 std::for_each(
196 clustersPerFileIt->begin(), clustersPerFileIt->end(),
197 [&eventRangesPerFileIt](const EntryCluster &clust) { eventRangesPerFileIt->emplace_back(clust); });
198 continue;
199 }
200 // Otherwise, we have to merge clusters, distributing the reminder evenly
201 // onto the first clusters
202 auto nReminderClusters = clustersInThisFileSize % maxTasksPerFile;
203 const auto clustersInThisFile = *clustersPerFileIt;
204 for (auto i = 0ULL; i < clustersInThisFileSize; ++i) {
205 const auto start = clustersInThisFile[i].start;
206 // We lump together at least nFolds clusters, therefore
207 // we need to jump ahead of nFolds-1.
208 i += (nFolds - 1);
209 // We now add a cluster if we have some reminder left
210 if (nReminderClusters > 0) {
211 i += 1U;
212 nReminderClusters--;
213 }
214 const auto end = clustersInThisFile[i].end;
215 eventRangesPerFileIt->emplace_back(EntryCluster({start, end}));
216 }
217 }
218
219 return std::make_pair(std::move(eventRangesPerFile), std::move(entriesPerFile));
220}
221
222////////////////////////////////////////////////////////////////////////
223/// Return a vector containing the number of entries of each file of each friend TChain
224static std::vector<std::vector<Long64_t>>
225GetFriendEntries(const std::vector<std::pair<std::string, std::string>> &friendNames,
226 const std::vector<std::vector<std::string>> &friendFileNames)
227{
228 std::vector<std::vector<Long64_t>> friendEntries;
229 const auto nFriends = friendNames.size();
230 for (auto i = 0u; i < nFriends; ++i) {
231 std::vector<Long64_t> nEntries;
232 const auto &thisFriendName = friendNames[i].first;
233 const auto &thisFriendFiles = friendFileNames[i];
234 for (const auto &fname : thisFriendFiles) {
235 std::unique_ptr<TFile> f(TFile::Open(fname.c_str()));
236 TTree *t = nullptr; // owned by TFile
237 f->GetObject(thisFriendName.c_str(), t);
238 nEntries.emplace_back(t->GetEntries());
239 }
240 friendEntries.emplace_back(std::move(nEntries));
241 }
242
243 return friendEntries;
244}
245
246////////////////////////////////////////////////////////////////////////
247/// Return the full path of the TTree or the trees in the TChain
248static std::vector<std::string> GetTreeFullPaths(const TTree &tree)
249{
250 // Case 1: this is a TChain. For each file it contains, GetName returns the name of the tree in that file
251 if (tree.IsA() == TChain::Class()) {
252 auto &chain = static_cast<const TChain &>(tree);
253 auto files = chain.GetListOfFiles();
254 if (!files || files->GetEntries() == 0) {
255 throw std::runtime_error("TTreeProcessorMT: input TChain does not contain any file");
256 }
257 std::vector<std::string> treeNames;
258 for (TObject *f : *files)
259 treeNames.emplace_back(f->GetName());
260
261 return treeNames;
262 }
263
264 // Case 2: this is a TTree: we get the full path of it
265 if (auto motherDir = tree.GetDirectory()) {
266 // We have 2 subcases (ROOT-9948):
267 // - 1. motherDir is a TFile
268 // - 2. motherDir is a directory
269 // If 1. we just return the name of the tree, if 2. we reconstruct the path
270 // to the file.
271 if (motherDir->InheritsFrom("TFile")) {
272 return {tree.GetName()};
273 }
274 std::string fullPath = motherDir->GetPath(); // e.g. "file.root:/dir"
275 fullPath = fullPath.substr(fullPath.find(":/") + 1); // e.g. "/dir"
276 fullPath += "/";
277 fullPath += tree.GetName(); // e.g. "/dir/tree"
278 return {fullPath};
279 }
280
281 // We do our best and return the name of the tree
282 return {tree.GetName()};
283}
284
285} // anonymous namespace
286
287namespace ROOT {
288
290
291namespace Internal {
292
293////////////////////////////////////////////////////////////////////////////////
294/// Construct fChain, also adding friends if needed and injecting knowledge of offsets if available.
295/// \param[in] treeNames Name of the tree for each file in `fileNames`.
296/// \param[in] fileNames Files to be opened.
297/// \param[in] friendInfo Information about TTree friends, if any.
298/// \param[in] nEntries Number of entries to be processed.
299/// \param[in] friendEntries Number of entries in each friend. Expected to have same ordering as friendInfo.
300void TTreeView::MakeChain(const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames,
301 const FriendInfo &friendInfo, const std::vector<Long64_t> &nEntries,
302 const std::vector<std::vector<Long64_t>> &friendEntries)
303{
304 const std::vector<NameAlias> &friendNames = friendInfo.fFriendNames;
305 const std::vector<std::vector<std::string>> &friendFileNames = friendInfo.fFriendFileNames;
306
307 fChain.reset(new TChain());
308 const auto nFiles = fileNames.size();
309 for (auto i = 0u; i < nFiles; ++i) {
310 fChain->Add((fileNames[i] + "/" + treeNames[i]).c_str(), nEntries[i]);
311 }
312 fChain->ResetBit(TObject::kMustCleanup);
313
314 fFriends.clear();
315 const auto nFriends = friendNames.size();
316 for (auto i = 0u; i < nFriends; ++i) {
317 const auto &friendName = friendNames[i];
318 const auto &name = friendName.first;
319 const auto &alias = friendName.second;
320
321 // Build a friend chain
322 auto frChain = std::make_unique<TChain>(name.c_str());
323 const auto nFileNames = friendFileNames[i].size();
324 for (auto j = 0u; j < nFileNames; ++j)
325 frChain->Add(friendFileNames[i][j].c_str(), friendEntries[i][j]);
326
327 // Make it friends with the main chain
328 fChain->AddFriend(frChain.get(), alias.c_str());
329 fFriends.emplace_back(std::move(frChain));
330 }
331}
332
333//////////////////////////////////////////////////////////////////////////
334/// Get a TTreeReader for the current tree of this view.
335std::unique_ptr<TTreeReader>
336TTreeView::GetTreeReader(Long64_t start, Long64_t end, const std::vector<std::string> &treeNames,
337 const std::vector<std::string> &fileNames, const FriendInfo &friendInfo,
338 const TEntryList &entryList, const std::vector<Long64_t> &nEntries,
339 const std::vector<std::vector<Long64_t>> &friendEntries)
340{
341 const bool hasEntryList = entryList.GetN() > 0;
342 const bool usingLocalEntries = friendInfo.fFriendNames.empty() && !hasEntryList;
343 const bool needNewChain =
344 fChain == nullptr || (usingLocalEntries && (fileNames[0] != fChain->GetListOfFiles()->At(0)->GetTitle() ||
345 treeNames[0] != fChain->GetListOfFiles()->At(0)->GetName()));
346 if (needNewChain) {
347 MakeChain(treeNames, fileNames, friendInfo, nEntries, friendEntries);
348 if (hasEntryList) {
349 fEntryList.reset(new TEntryList(entryList));
350 if (fEntryList->GetLists() != nullptr) {
351 // need to associate the TEntryList to the TChain for the latter to set entry the fTreeNumbers of the
352 // sub-lists of the former...
353 fChain->SetEntryList(fEntryList.get());
354 fEntryList->ResetBit(TObject::kCanDelete); // ...but we want to retain ownership
355 }
356 }
357 }
358 auto reader = std::make_unique<TTreeReader>(fChain.get(), fEntryList.get());
359 reader->SetEntriesRange(start, end);
360 return reader;
361}
362
363} // namespace Internal
364} // namespace ROOT
365
366////////////////////////////////////////////////////////////////////////////////
367/// Get and store the names, aliases and file names of the friends of the tree.
368/// \param[in] tree The main tree whose friends to
369///
370/// Note that "friends of friends" and circular references in the lists of friends are not supported.
371Internal::FriendInfo TTreeProcessorMT::GetFriendInfo(TTree &tree)
372{
373 std::vector<Internal::NameAlias> friendNames;
374 std::vector<std::vector<std::string>> friendFileNames;
375
376 const auto friends = tree.GetListOfFriends();
377 if (!friends)
378 return Internal::FriendInfo();
379
380 for (auto fr : *friends) {
381 const auto frTree = static_cast<TFriendElement *>(fr)->GetTree();
382 const bool isChain = frTree->IsA() == TChain::Class();
383
384 friendFileNames.emplace_back();
385 auto &fileNames = friendFileNames.back();
386
387 // Check if friend tree/chain has an alias
388 const auto alias_c = tree.GetFriendAlias(frTree);
389 const std::string alias = alias_c != nullptr ? alias_c : "";
390
391 if (isChain) {
392 // Note that each TChainElement returned by chain.GetListOfFiles has a name
393 // equal to the tree name of this TChain and a title equal to the filename.
394 // Accessing the information like this ensures that we get the correct
395 // filenames and treenames if the treename is given as part of the filename
396 // via chain.AddFile(file.root/myTree) and as well if the tree name is given
397 // in the constructor via TChain(myTree) and a file is added later by chain.AddFile(file.root).
398
399 // Get name of the trees building the chain
400 const auto chainFiles = static_cast<TChain*>(frTree)->GetListOfFiles();
401 const auto realName = chainFiles->First()->GetName();
402 friendNames.emplace_back(std::make_pair(realName, alias));
403 // Get filenames stored in the title member
404 for (auto f : *chainFiles) {
405 fileNames.emplace_back(f->GetTitle());
406 }
407 } else {
408 // Get name of the tree
409 const auto realName = GetTreeFullPaths(*frTree)[0];
410 friendNames.emplace_back(std::make_pair(realName, alias));
411
412 // Get filename
413 const auto f = frTree->GetCurrentFile();
414 if (!f)
415 throw std::runtime_error("Friend trees with no associated file are not supported.");
416 fileNames.emplace_back(f->GetName());
417 }
418 }
419
420 return Internal::FriendInfo{std::move(friendNames), std::move(friendFileNames)};
421}
422
423/////////////////////////////////////////////////////////////////////////////////////////////////
424/// Retrieve the names of the TTrees in each of the input files, throw if a TTree cannot be found.
425std::vector<std::string> TTreeProcessorMT::FindTreeNames()
426{
427 std::vector<std::string> treeNames;
428
429 if (fFileNames.empty()) // This can never happen
430 throw std::runtime_error("Empty list of files and no tree name provided");
431
433 for (const auto &fname : fFileNames) {
434 std::string treeName;
435 std::unique_ptr<TFile> f(TFile::Open(fname.c_str()));
436 TIter next(f->GetListOfKeys());
437 while (auto *key = static_cast<TKey *>(next())) {
438 const char *className = key->GetClassName();
439 if (strcmp(className, "TTree") == 0) {
440 treeName = key->GetName();
441 break;
442 }
443 }
444 if (treeName.empty())
445 throw std::runtime_error("Cannot find any tree in file " + fname);
446 treeNames.emplace_back(std::move(treeName));
447 }
448
449 return treeNames;
450}
451
452////////////////////////////////////////////////////////////////////////
453/// Constructor based on a file name.
454/// \param[in] filename Name of the file containing the tree to process.
455/// \param[in] treename Name of the tree to process. If not provided, the implementation will search
456/// for a TTree key in the file and will use the first one it finds.
457/// \param[in] nThreads Number of threads to create in the underlying thread-pool. The semantics of this argument are
458/// the same as for TThreadExecutor.
460 : fFileNames({std::string(filename)}),
461 fTreeNames(treename.empty() ? FindTreeNames() : std::vector<std::string>{std::string(treename)}), fFriendInfo(),
462 fPool(nThreads)
463{
465}
466
/// Turn a list of file name views into a list of owning strings, in the same order.
/// \throws std::runtime_error if `views` is empty.
std::vector<std::string> CheckAndConvert(const std::vector<std::string_view> &views)
{
   if (views.empty())
      throw std::runtime_error("The provided list of file names is empty");

   // the range constructor builds each string in place from the corresponding view
   return std::vector<std::string>(views.begin(), views.end());
}
478
479////////////////////////////////////////////////////////////////////////
480/// Constructor based on a collection of file names.
481/// \param[in] filenames Collection of the names of the files containing the tree to process.
482/// \param[in] treename Name of the tree to process. If not provided, the implementation will
483/// search filenames for a TTree key and will use the first one it finds in each file.
484/// \param[in] nThreads Number of threads to create in the underlying thread-pool. The semantics of this argument are
485/// the same as for TThreadExecutor.
486///
487/// If different files contain TTrees with different names and automatic TTree name detection is not an option
488/// (for example, because some of the files contain multiple TTrees) please manually create a TChain and pass
489/// it to the appropriate TTreeProcessorMT constructor.
490TTreeProcessorMT::TTreeProcessorMT(const std::vector<std::string_view> &filenames, std::string_view treename,
491 UInt_t nThreads)
492 : fFileNames(CheckAndConvert(filenames)),
493 fTreeNames(treename.empty() ? FindTreeNames()
494 : std::vector<std::string>(fFileNames.size(), std::string(treename))),
495 fFriendInfo(), fPool(nThreads)
496{
498}
499
500std::vector<std::string> GetFilesFromTree(TTree &tree)
501{
502 std::vector<std::string> filenames;
503
504 const bool isChain = tree.IsA() == TChain::Class();
505 if (isChain) {
506 TObjArray *filelist = static_cast<TChain &>(tree).GetListOfFiles();
507 const auto nFiles = filelist->GetEntries();
508 if (nFiles == 0)
509 throw std::runtime_error("The provided chain of files is empty");
510 filenames.reserve(nFiles);
511 for (auto f : *filelist)
512 filenames.emplace_back(f->GetTitle());
513 } else {
514 TFile *f = tree.GetCurrentFile();
515 if (!f) {
516 const auto msg = "The specified TTree is not linked to any file, in-memory-only trees are not supported.";
517 throw std::runtime_error(msg);
518 }
519
520 filenames.emplace_back(f->GetName());
521 }
522
523 return filenames;
524}
525
526////////////////////////////////////////////////////////////////////////
527/// Constructor based on a TTree and a TEntryList.
528/// \param[in] tree Tree or chain of files containing the tree to process.
529/// \param[in] entries List of entry numbers to process.
530/// \param[in] nThreads Number of threads to create in the underlying thread-pool. The semantics of this argument are
531/// the same as for TThreadExecutor.
533 : fFileNames(GetFilesFromTree(tree)), fTreeNames(GetTreeFullPaths(tree)), fEntryList(entries),
534 fFriendInfo(GetFriendInfo(tree)), fPool(nThreads)
535{
537}
538
539////////////////////////////////////////////////////////////////////////
540/// Constructor based on a TTree.
541/// \param[in] tree Tree or chain of files containing the tree to process.
542/// \param[in] nThreads Number of threads to create in the underlying thread-pool. The semantics of this argument are
543/// the same as for TThreadExecutor.
545{
546}
547
548//////////////////////////////////////////////////////////////////////////////
549/// Process the entries of a TTree in parallel. The user-provided function
550/// receives a TTreeReader which can be used to iterate on a subrange of
551/// entries
552/// ~~~{.cpp}
553/// TTreeProcessorMT::Process([](TTreeReader& readerSubRange) {
554/// // Select branches to read
555/// while (readerSubRange.Next()) {
556/// // Use content of current entry
557/// }
558/// });
559/// ~~~
560/// The user needs to be aware that each of the subranges can potentially
561/// be processed in parallel. This means that the code of the user function
562/// should be thread safe.
563///
564/// \param[in] func User-defined function that processes a subrange of entries
566{
567 const std::vector<Internal::NameAlias> &friendNames = fFriendInfo.fFriendNames;
568 const std::vector<std::vector<std::string>> &friendFileNames = fFriendInfo.fFriendFileNames;
569
570 // If an entry list or friend trees are present, we need to generate clusters with global entry numbers,
571 // so we do it here for all files.
572 // Otherwise we can do it later, concurrently for each file, and clusters will contain local entry numbers.
573 // TODO: in practice we could also find clusters per-file in the case of no friends and a TEntryList with
574 // sub-entrylists.
575 const bool hasFriends = !friendNames.empty();
576 const bool hasEntryList = fEntryList.GetN() > 0;
577 const bool shouldRetrieveAllClusters = hasFriends || hasEntryList;
578 ClustersAndEntries clusterAndEntries{};
579 if (shouldRetrieveAllClusters) {
580 clusterAndEntries = MakeClusters(fTreeNames, fFileNames);
581 if (hasEntryList)
582 clusterAndEntries.first = ConvertToElistClusters(std::move(clusterAndEntries.first), fEntryList, fTreeNames,
583 fFileNames, clusterAndEntries.second);
584 }
585
586 const auto &clusters = clusterAndEntries.first;
587 const auto &entries = clusterAndEntries.second;
588
589 // Retrieve number of entries for each file for each friend tree
590 const auto friendEntries =
591 hasFriends ? GetFriendEntries(friendNames, friendFileNames) : std::vector<std::vector<Long64_t>>{};
592
593 // Parent task, spawns tasks that process each of the entry clusters for each input file
594 // TODO: for readability we should have two versions of this lambda, for shouldRetrieveAllClusters == true/false
595 auto processFile = [&](std::size_t fileIdx) {
596 // theseFiles contains either all files or just the single file to process
597 const auto &theseFiles = shouldRetrieveAllClusters ? fFileNames : std::vector<std::string>({fFileNames[fileIdx]});
598 // either all tree names or just the single tree to process
599 const auto &theseTrees = shouldRetrieveAllClusters ? fTreeNames : std::vector<std::string>({fTreeNames[fileIdx]});
600 // Evaluate clusters (with local entry numbers) and number of entries for this file, if needed
601 const auto theseClustersAndEntries =
602 shouldRetrieveAllClusters ? ClustersAndEntries{} : MakeClusters(theseTrees, theseFiles);
603
604 // All clusters for the file to process, either with global or local entry numbers
605 const auto &thisFileClusters = shouldRetrieveAllClusters ? clusters[fileIdx] : theseClustersAndEntries.first[0];
606
607 // Either all number of entries or just the ones for this file
608 const auto &theseEntries =
609 shouldRetrieveAllClusters ? entries : std::vector<Long64_t>({theseClustersAndEntries.second[0]});
610
611 auto processCluster = [&](const EntryCluster &c) {
612 auto r = fTreeView->GetTreeReader(c.start, c.end, theseTrees, theseFiles, fFriendInfo, fEntryList,
613 theseEntries, friendEntries);
614 func(*r);
615 };
616
617 fPool.Foreach(processCluster, thisFileClusters);
618 };
619
620 std::vector<std::size_t> fileIdxs(fFileNames.size());
621 std::iota(fileIdxs.begin(), fileIdxs.end(), 0u);
622
623 fPool.Foreach(processFile, fileIdxs);
624}
625
626////////////////////////////////////////////////////////////////////////
627/// \brief Gets the maximum number of tasks created per file, per worker.
628/// \return The maximum number of tasks created per file, per worker
630{
632}
633
634////////////////////////////////////////////////////////////////////////
635/// \brief Sets the maximum number of tasks created per file, per worker.
636/// \param[in] maxTasksPerFile Name of the file containing the tree to process.
637///
638/// This allows to create a reasonable number of tasks even if any of the
639/// processed files features a bad clustering, for example with a lot of
640/// entries and just a few entries per cluster.
641void TTreeProcessorMT::SetMaxTasksPerFilePerWorker(unsigned int maxTasksPerFile)
642{
643 fgMaxTasksPerFilePerWorker = maxTasksPerFile;
644}
void Class()
Definition: Class.C:29
ROOT::R::TRInterface & r
Definition: Object.C:4
#define f(i)
Definition: RSha256.hxx:104
#define c(i)
Definition: RSha256.hxx:101
long long Long64_t
Definition: RtypesCore.h:71
#define gDirectory
Definition: TDirectory.h:229
#define R__ASSERT(e)
Definition: TError.h:96
char name[80]
Definition: TGX11.cxx:109
std::vector< std::string > GetFilesFromTree(TTree &tree)
std::vector< std::string > CheckAndConvert(const std::vector< std::string_view > &views)
std::unique_ptr< TChain > fChain
Chain on which to operate.
std::unique_ptr< TTreeReader > GetTreeReader(Long64_t start, Long64_t end, const std::vector< std::string > &treeName, const std::vector< std::string > &fileNames, const FriendInfo &friendInfo, const TEntryList &entryList, const std::vector< Long64_t > &nEntries, const std::vector< std::vector< Long64_t > > &friendEntries)
Get a TTreeReader for the current tree of this view.
std::vector< std::unique_ptr< TChain > > fFriends
Friends of the tree/chain, if present.
std::unique_ptr< TEntryList > fEntryList
TEntryList for fChain, if present.
void MakeChain(const std::vector< std::string > &treeName, const std::vector< std::string > &fileNames, const FriendInfo &friendInfo, const std::vector< Long64_t > &nEntries, const std::vector< std::vector< Long64_t > > &friendEntries)
Construct fChain, also adding friends if needed and injecting knowledge of offsets if available.
void Foreach(F func, unsigned nTimes, unsigned nChunks=0)
Execute func (with no arguments) nTimes in parallel.
A class to process the entries of a TTree in parallel.
const std::vector< std::string > fTreeNames
TTree names (always same size and ordering as fFileNames)
static unsigned int GetMaxTasksPerFilePerWorker()
Sets the maximum number of tasks created per file, per worker.
std::vector< std::string > FindTreeNames()
Retrieve the names of the TTrees in each of the input files, throw if a TTree cannot be found.
const std::vector< std::string > fFileNames
Names of the files.
static void SetMaxTasksPerFilePerWorker(unsigned int m)
Sets the maximum number of tasks created per file, per worker.
static unsigned int fgMaxTasksPerFilePerWorker
ROOT::TThreadExecutor fPool
! Thread pool for processing.
TEntryList fEntryList
User-defined selection of entry numbers to be processed, empty if none was provided.
ROOT::TThreadedObject< ROOT::Internal::TTreeView > fTreeView
Thread-local TreeViews.
TTreeProcessorMT(std::string_view filename, std::string_view treename="", UInt_t nThreads=0u)
Constructor based on a file name.
void Process(std::function< void(TTreeReader &)> func)
Process the entries of a TTree in parallel.
const Internal::FriendInfo fFriendInfo
A chain is a collection of files containing TTree objects.
Definition: TChain.h:34
TObjArray * GetListOfFiles() const
Definition: TChain.h:108
Small helper to keep current directory context.
Definition: TDirectory.h:47
A List of entry numbers in a TTree or TChain.
Definition: TEntryList.h:26
virtual TList * GetLists() const
Definition: TEntryList.h:73
virtual Long64_t GetEntry(Int_t index)
Return the number of the entry #index of this TEntryList in the TTree or TChain See also Next().
Definition: TEntryList.cxx:657
virtual Long64_t GetN() const
Definition: TEntryList.h:75
A ROOT file is a suite of consecutive data records (TKey instances) with a well defined format.
Definition: TFile.h:53
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition: TFile.cxx:3942
A TFriendElement TF describes a TTree object TF in a file.
Book space in a file, create I/O buffers, to fill them, (un)compress them.
Definition: TKey.h:28
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
An array of TObjects.
Definition: TObjArray.h:37
Int_t GetEntries() const
Return the number of objects in array (i.e.
Definition: TObjArray.cxx:523
Mother of all ROOT objects.
Definition: TObject.h:37
@ kCanDelete
if object in a list can be deleted
Definition: TObject.h:58
@ kMustCleanup
if object destructor must call RecursiveRemove()
Definition: TObject.h:60
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition: TTreeReader.h:43
A TTree represents a columnar dataset.
Definition: TTree.h:78
virtual Long64_t GetEntries() const
Definition: TTree.h:457
basic_string_view< char > string_view
void function(const Char_t *name_, T fun, const Char_t *docstring=0)
Definition: RExports.h:151
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Definition: StringConv.hxx:21
UInt_t GetThreadPoolSize()
Returns the size of ROOT's thread pool.
Definition: TROOT.cxx:564
void EnableThreadSafety()
Enables the global mutex to make ROOT thread safe/aware.
Definition: TROOT.cxx:495
Definition: tree.py:1
std::vector< std::vector< std::string > > fFriendFileNames
Names of the files where each friend is stored.
std::vector< Internal::NameAlias > fFriendNames
Pairs of names and aliases of friend trees/chains.