Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterface.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDF_TINTERFACE
12#define ROOT_RDF_TINTERFACE
13
14#include "ROOT/RDataSource.hxx"
19#include "ROOT/RDF/RRange.hxx"
20#include "ROOT/RDF/Utils.hxx"
23#include "ROOT/RResultPtr.hxx"
25#include "ROOT/RStringView.hxx"
26#include "ROOT/TypeTraits.hxx"
27#include "RtypesCore.h" // for ULong64_t
28#include "TDirectory.h"
29#include "TH1.h" // For Histo actions
30#include "TH2.h" // For Histo actions
31#include "TH3.h" // For Histo actions
32#include "TProfile.h"
33#include "TProfile2D.h"
34#include "TStatistic.h"
35
36#include <algorithm>
37#include <cstddef>
38#include <initializer_list>
39#include <limits>
40#include <memory>
41#include <sstream>
42#include <stdexcept>
43#include <string>
44#include <type_traits> // is_same, enable_if
45#include <typeinfo>
46#include <vector>
47
48class TGraph;
49
50// Windows requires a forward decl of printValue to accept it as a valid friend function in RInterface
51namespace ROOT {
54void EnableImplicitMT(UInt_t numthreads);
55class RDataFrame;
56namespace Internal {
57namespace RDF {
59}
60} // namespace Internal
61} // namespace ROOT
62namespace cling {
63std::string printValue(ROOT::RDataFrame *tdf);
64}
65
66namespace ROOT {
67namespace RDF {
70namespace TTraits = ROOT::TypeTraits;
71
72template <typename Proxied, typename DataSource>
73class RInterface;
74
75using RNode = RInterface<::ROOT::Detail::RDF::RNodeBase, void>;
76
77// clang-format off
78/**
79 * \class ROOT::RDF::RInterface
80 * \ingroup dataframe
81 * \brief The public interface to the RDataFrame federation of classes
82 * \tparam Proxied One of the "node" base types (e.g. RLoopManager, RFilterBase). The user never specifies this type manually.
83 * \tparam DataSource The type of the RDataSource which is providing the data to the data frame. There is no source by default.
84 *
85 * The documentation of each method features a one liner illustrating how to use the method, for example showing how
86 * the majority of the template parameters are automatically deduced requiring no or very little effort by the user.
87 */
88// clang-format on
89template <typename Proxied, typename DataSource = void>
91 using DS_t = DataSource;
96 friend std::string cling::printValue(::ROOT::RDataFrame *tdf); // For a nice printing at the prompt
98
99 template <typename T, typename W>
100 friend class RInterface;
101
102 std::shared_ptr<Proxied> fProxiedPtr; ///< Smart pointer to the graph node encapsulated by this RInterface.
103 ///< The RLoopManager at the root of this computation graph. Never null.
105 /// Non-owning pointer to a data-source object. Null if no data-source. RLoopManager has ownership of the object.
107
108 /// Contains the custom columns defined up to this node.
110
111public:
112 ////////////////////////////////////////////////////////////////////////////
113 /// \brief Copy-assignment operator for RInterface.
114 RInterface &operator=(const RInterface &) = default; // defaulted: member-wise copy assignment of every data member
115
116 ////////////////////////////////////////////////////////////////////////////
117 /// \brief Copy-ctor for RInterface.
118 RInterface(const RInterface &) = default; // defaulted: member-wise copy of every data member
119
120 ////////////////////////////////////////////////////////////////////////////
121 /// \brief Move-ctor for RInterface.
122 RInterface(RInterface &&) = default; // defaulted: member-wise move of every data member
123
124 ////////////////////////////////////////////////////////////////////////////
125 /// \brief Only enabled when building a RInterface<RLoopManager>
126 template <typename T = Proxied, typename std::enable_if<std::is_same<T, RLoopManager>::value, int>::type = 0>
    // The dummy parameter T (defaulted to Proxied) makes the enable_if condition dependent, so this
    // constructor only takes part in overload resolution when Proxied is RLoopManager.
127 RInterface(const std::shared_ptr<Proxied> &proxied)
    // `proxied` is dereferenced unconditionally in the initializer list, so it must not be null.
    // The wrapped node doubles as the loop manager: fLoopManager is the raw pointer held by `proxied`.
128 : fProxiedPtr(proxied), fLoopManager(proxied.get()), fDataSource(proxied->GetDataSource())
129 {
    // NOTE(review): source line 130 is missing from this listing, so the constructor body may not be empty.
131 }
132
133 ////////////////////////////////////////////////////////////////////////////
134 /// \brief Cast any RDataFrame node to a common type ROOT::RDF::RNode.
135 /// Different RDataFrame methods return different C++ types. All nodes, however,
136 /// can be cast to this common type at the cost of a small performance penalty.
137 /// This allows, for example, storing RDataFrame nodes in a vector, or passing them
138 /// around via (non-template, C++11) helper functions.
139 /// Example usage:
140 /// ~~~{.cpp}
141 /// // a function that conditionally adds a Range to a RDataFrame node.
142 /// RNode MaybeAddRange(RNode df, bool mustAddRange)
143 /// {
144 /// return mustAddRange ? df.Range(1) : df;
145 /// }
146 /// // use as :
147 /// ROOT::RDataFrame df(10);
148 /// auto maybeRanged = MaybeAddRange(df, true);
149 /// ~~~
150 /// Note that it is not a problem to pass RNodes by value.
151 operator RNode() const
152 {
    // Upcast the shared node pointer to the common base RNodeBase and wrap it in a new RInterface
    // (RNode), passing along the loop manager and the columns defined so far.
    // NOTE(review): source line 154, which completes the argument list, is missing from this listing.
153 return RNode(std::static_pointer_cast<::ROOT::Detail::RDF::RNodeBase>(fProxiedPtr), *fLoopManager, fDefines,
155 }
156
157 ////////////////////////////////////////////////////////////////////////////
158 /// \brief Append a filter to the call graph.
159 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
160 /// signalling whether the event has passed the selection (true) or not (false).
161 /// \param[in] columns Names of the columns/branches in input to the filter function.
162 /// \param[in] name Optional name of this filter. See `Report`.
163 /// \return the filter node of the computation graph.
164 ///
165 /// Append a filter node at the point of the call graph corresponding to the
166 /// object this method is called on.
167 /// The callable `f` should not have side-effects (e.g. modification of an
168 /// external or static variable) to ensure correct results when implicit
169 /// multi-threading is active.
170 ///
171 /// RDataFrame only evaluates filters when necessary: if multiple filters
172 /// are chained one after another, they are executed in order and the first
173 /// one returning false causes the event to be discarded.
174 /// Even if multiple actions or transformations depend on the same filter,
175 /// it is executed once per entry. If its result is requested more than
176 /// once, the cached result is served.
177 ///
178 /// ### Example usage:
179 /// ~~~{.cpp}
180 /// // C++ callable (function, functor class, lambda...) that takes two parameters of the types of "x" and "y"
181 /// auto filtered = df.Filter(myCut, {"x", "y"});
182 ///
183 /// // String: it must contain valid C++ except that column names can be used instead of variable names
184 /// auto filtered = df.Filter("x*y > 0");
185 /// ~~~
186 template <typename F, typename std::enable_if<!std::is_convertible<F, std::string>::value, int>::type = 0>
    // The enable_if above removes this template when F converts to std::string, so string expressions
    // are handled by the non-template Filter(expression, name) overload.
    // NOTE(review): source line 187 (the return type) is missing from this listing.
188 Filter(F f, const ColumnNames_t &columns = {}, std::string_view name = "")
189 {
190 RDFInternal::CheckFilter(f);
    // The number of input columns is taken from the callable's argument list, not from `columns`.
191 using ColTypes_t = typename TTraits::CallableTraits<F>::arg_types;
192 constexpr auto nColumns = ColTypes_t::list_size;
193 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
194 CheckAndFillDSColumns(validColumnNames, ColTypes_t());
195
    // NOTE(review): source line 196 is missing from this listing; F_t, used below, is presumably
    // declared there — confirm against the original header.
197
    // Create the filter node on top of the current node, register it with the loop manager and
    // return a new interface that wraps it.
198 auto filterPtr = std::make_shared<F_t>(std::move(f), validColumnNames, fProxiedPtr, fDefines, name);
199 fLoopManager->Book(filterPtr.get());
200 return RInterface<F_t, DS_t>(std::move(filterPtr), *fLoopManager, fDefines, fDataSource);
201 }
202
203 ////////////////////////////////////////////////////////////////////////////
204 /// \brief Append a filter to the call graph.
205 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
206 /// signalling whether the event has passed the selection (true) or not (false).
207 /// \param[in] name Optional name of this filter. See `Report`.
208 /// \return the filter node of the computation graph.
209 ///
210 /// Refer to the first overload of this method for the full documentation.
211 template <typename F, typename std::enable_if<!std::is_convertible<F, std::string>::value, int>::type = 0>
    // NOTE(review): source line 212 (the signature of this overload) is missing from this listing.
213 {
214 // The SFINAE condition above removes this template when F converts to std::string, so that a call
215 // with two strings picks the overload that accepts two strings rather than this template method.
    // Forward to the overload taking the callable, the column names and the filter name, with an
    // empty column list.
216 return Filter(f, {}, name);
217 }
218
219 ////////////////////////////////////////////////////////////////////////////
220 /// \brief Append a filter to the call graph.
221 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
222 /// signalling whether the event has passed the selection (true) or not (false).
223 /// \param[in] columns Names of the columns/branches in input to the filter function.
224 /// \return the filter node of the computation graph.
225 ///
226 /// Refer to the first overload of this method for the full documentation.
227 template <typename F>
228 RInterface<RDFDetail::RFilter<F, Proxied>, DS_t> Filter(F f, const std::initializer_list<std::string> &columns)
229 {
    // Convenience overload for brace-enclosed column lists, e.g. Filter(f, {"x", "y"}): build a
    // ColumnNames_t from the list and forward to the overload taking the callable and the column names.
    // `f` is a by-value parameter that is not used again in this function, so it is moved into the
    // forwarded call instead of being copied a second time.
230 return Filter(std::move(f), ColumnNames_t{columns});
231 }
232
233 ////////////////////////////////////////////////////////////////////////////
234 /// \brief Append a filter to the call graph.
235 /// \param[in] expression The filter expression in C++
236 /// \param[in] name Optional name of this filter. See `Report`.
237 /// \return the filter node of the computation graph.
238 ///
239 /// The expression is just-in-time compiled and used to filter entries. It must
240 /// be valid C++ syntax in which variable names are substituted with the names
241 /// of branches/columns.
242 ///
243 /// ### Example usage:
244 /// ~~~{.cpp}
245 /// auto filtered_df = df.Filter("myCollection.size() > 3");
246 /// auto filtered_name_df = df.Filter("myCollection.size() > 3", "Minimum collection size");
247 /// ~~~
248 RInterface<RDFDetail::RJittedFilter, DS_t> Filter(std::string_view expression, std::string_view name = "")
249 {
250 // the heap-allocated shared_ptr created here is deleted by the jitted call to JitFilterHelper
251 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
252 using BaseNodeType_t = typename std::remove_pointer<decltype(upcastNodeOnHeap)>::type::element_type;
    // NOTE(review): upcastInterface is not referenced in the lines visible here; it may be used on the
    // elided source line 257 — confirm.
253 RInterface<BaseNodeType_t> upcastInterface(*upcastNodeOnHeap, *fLoopManager, fDefines, fDataSource);
254 const auto jittedFilter = std::make_shared<RDFDetail::RJittedFilter>(fLoopManager, name);
255
    // NOTE(review): source line 257, which completes this argument list, is missing from this listing.
256 RDFInternal::BookFilterJit(jittedFilter, upcastNodeOnHeap, name, expression, fLoopManager->GetAliasMap(),
258
259 fLoopManager->Book(jittedFilter.get());
    // NOTE(review): jittedFilter is declared const, so std::move below yields a const rvalue and the
    // shared_ptr is copied rather than moved. Source line 261, which completes the call, is elided.
260 return RInterface<RDFDetail::RJittedFilter, DS_t>(std::move(jittedFilter), *fLoopManager, fDefines,
262 }
263
264 // clang-format off
265 ////////////////////////////////////////////////////////////////////////////
266 /// \brief Creates a custom column
267 /// \param[in] name The name of the custom column.
268 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the temporary value. Returns the value that will be assigned to the custom column.
269 /// \param[in] columns Names of the columns/branches in input to the producer function.
270 /// \return the first node of the computation graph for which the new quantity is defined.
271 ///
272 /// Create a custom column that will be visible from all subsequent nodes
273 /// of the functional chain. The `expression` is only evaluated for entries that pass
274 /// all the preceding filters.
275 /// A new variable is created called `name`, accessible as if it was contained
276 /// in the dataset from subsequent transformations/actions.
277 ///
278 /// Use cases include:
279 /// * caching the results of complex calculations for easy and efficient multiple access
280 /// * extraction of quantities of interest from complex objects
281 ///
282 /// An exception is thrown if the name of the new column is already in use in this branch of the computation graph.
283 ///
284 /// ### Example usage:
285 /// ~~~{.cpp}
286 /// // assuming a function with signature:
287 /// double myComplexCalculation(const RVec<float> &muon_pts);
288 /// // we can pass it directly to Define
289 /// auto df_with_define = df.Define("newColumn", myComplexCalculation, {"muon_pts"});
290 /// // alternatively, we can pass the body of the function as a string, as in Filter:
291 /// auto df_with_define = df.Define("newColumn", "x*x + y*y");
292 /// ~~~
293 template <typename F, typename std::enable_if<!std::is_convertible<F, std::string>::value, int>::type = 0>
294 RInterface<Proxied, DS_t> Define(std::string_view name, F expression, const ColumnNames_t &columns = {})
295 {
    // The enable_if above removes this template when F converts to std::string, so string expressions
    // are handled by the non-template Define(name, expression) overload.
    // Forward to DefineImpl with the CustomColExtraArgs::None tag; the callable is moved, not copied.
296 return DefineImpl<F, RDFDetail::CustomColExtraArgs::None>(name, std::move(expression), columns);
297 }
298 // clang-format on
299
300 // clang-format off
301 ////////////////////////////////////////////////////////////////////////////
302 /// \brief Creates a custom column with a value dependent on the processing slot.
303 /// \param[in] name The name of the custom column.
304 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the temporary value. Returns the value that will be assigned to the custom column.
305 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding the slot number).
306 /// \return the first node of the computation graph for which the new quantity is defined.
307 ///
308 /// This alternative implementation of `Define` is meant as a helper in writing thread-safe custom columns.
309 /// The expression must be a callable of signature R(unsigned int, T1, T2, ...) where `T1, T2...` are the types
310 /// of the columns that the expression takes as input. The first parameter is reserved for an unsigned integer
311 /// representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
312 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
313 ///
314 /// The following two calls are equivalent, although `DefineSlot` is slightly more performant:
315 /// ~~~{.cpp}
316 /// int function(unsigned int, double, double);
317 /// df.Define("x", function, {"rdfslot_", "column1", "column2"})
318 /// df.DefineSlot("x", function, {"column1", "column2"})
319 /// ~~~
320 ///
321 /// See Define for more information.
322 template <typename F>
323 RInterface<Proxied, DS_t> DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
324 {
    // Same forwarding as Define, but with the CustomColExtraArgs::Slot tag; the callable is moved,
    // not copied.
325 return DefineImpl<F, RDFDetail::CustomColExtraArgs::Slot>(name, std::move(expression), columns);
326 }
327 // clang-format on
328
329 // clang-format off
330 ////////////////////////////////////////////////////////////////////////////
331 /// \brief Creates a custom column with a value dependent on the processing slot and the current entry.
332 /// \param[in] name The name of the custom column.
333 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the temporary value. Returns the value that will be assigned to the custom column.
334 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
335 /// \return the first node of the computation graph for which the new quantity is defined.
336 ///
337 /// This alternative implementation of `Define` is meant as a helper in writing entry-specific, thread-safe custom
338 /// columns. The expression must be a callable of signature R(unsigned int, ULong64_t, T1, T2, ...) where `T1, T2...`
339 /// are the types of the columns that the expression takes as input. The first parameter is reserved for an unsigned
340 /// integer representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
341 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1. The second parameter
342 /// is reserved for a `ULong64_t` representing the current entry being processed by the current thread.
343 ///
344 /// The following two `Define`s are equivalent, although `DefineSlotEntry` is slightly more performant:
345 /// ~~~{.cpp}
346 /// int function(unsigned int, ULong64_t, double, double);
347 /// Define("x", function, {"rdfslot_", "rdfentry_", "column1", "column2"})
348 /// DefineSlotEntry("x", function, {"column1", "column2"})
349 /// ~~~
350 ///
351 /// See Define for more information.
352 template <typename F>
353 RInterface<Proxied, DS_t> DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
354 {
    // Same forwarding as Define, but with the CustomColExtraArgs::SlotAndEntry tag; the callable is
    // moved, not copied.
355 return DefineImpl<F, RDFDetail::CustomColExtraArgs::SlotAndEntry>(name, std::move(expression), columns);
356 }
357 // clang-format on
358
359 ////////////////////////////////////////////////////////////////////////////
360 /// \brief Creates a custom column
361 /// \param[in] name The name of the custom column.
362 /// \param[in] expression An expression in C++ which represents the temporary value
363 /// \return the first node of the computation graph for which the new quantity is defined.
364 ///
365 /// The expression is just-in-time compiled and used to produce the column entries.
366 /// It must be valid C++ syntax in which variable names are substituted with the names
367 /// of branches/columns.
368 ///
369 /// Refer to the first overload of this method for the full documentation.
370 RInterface<Proxied, DS_t> Define(std::string_view name, std::string_view expression)
371 {
372 // this check must be done before jitting lest we throw exceptions in jitted code
    // NOTE(review): source lines 373-375, which hold the check announced above, are missing from
    // this listing.
376
377 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
378 auto jittedDefine = RDFInternal::BookDefineJit(name, expression, *fLoopManager, fDataSource, fDefines,
379 fLoopManager->GetBranchNames(), upcastNodeOnHeap);
380
    // NOTE(review): source line 381 is missing from this listing; newCols is presumably declared
    // there (likely as a copy of fDefines) — confirm against the original header.
382 newCols.AddColumn(jittedDefine, name);
383
    // The returned interface wraps the same node (fProxiedPtr); only its set of defined columns differs.
384 RInterface<Proxied, DS_t> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols), fDataSource);
385
386 return newInterface;
387 }
388
389 ////////////////////////////////////////////////////////////////////////////
390 /// \brief Allow referring to a column by a different name
391 /// \param[in] alias name of the column alias
392 /// \param[in] columnName name of the column to be aliased
393 /// \return the first node of the computation graph for which the alias is available.
394 ///
395 /// Aliasing an alias is supported.
396 ///
397 /// ### Example usage:
398 /// ~~~{.cpp}
399 /// auto df_with_alias = df.Alias("simple_name", "very_long&complex_name!!!");
400 /// ~~~
401 RInterface<Proxied, DS_t> Alias(std::string_view alias, std::string_view columnName)
402 {
403 // The symmetry with Define is clear. We want to:
404 // - Create globally the alias and return this very node, unchanged
405 // - Make aliases accessible based on chains and not globally
406
407 // Column names provided by the data source, if there is one; an empty list otherwise
    // NOTE(review): the second operand of the conditional is a temporary, so the expression presumably
    // yields a temporary copy whose lifetime is extended by the reference — confirm.
408 auto &dsColumnNames = fDataSource ? fDataSource->GetColumnNames() : ColumnNames_t{};
409
410 // If the alias name is a column name, there is a problem
    // NOTE(review): source line 411, which starts the call completed on the next line, is missing
    // from this listing.
412 fLoopManager->GetAliasMap(), dsColumnNames);
413
    // Validate the target column name before registering the alias.
414 const auto validColumnName = GetValidatedColumnNames(1, {std::string(columnName)})[0];
415
    // The alias is registered on the loop manager, which is shared by the whole computation graph.
416 fLoopManager->AddColumnAlias(std::string(alias), validColumnName);
417
    // NOTE(review): source line 418 is missing from this listing; newCols is presumably declared
    // there — confirm against the original header.
419
420 newCols.AddName(alias);
    // The returned interface wraps the same node (fProxiedPtr) with the extended list of column names.
421 RInterface<Proxied, DS_t> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols), fDataSource);
422
423 return newInterface;
424 }
425
426 ////////////////////////////////////////////////////////////////////////////
427 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
428 /// \tparam ColumnTypes variadic list of branch/column types.
429 /// \param[in] treename The name of the output TTree.
430 /// \param[in] filename The name of the output TFile.
431 /// \param[in] columnList The list of names of the columns/branches to be written.
432 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree.
433 /// \return a `RDataFrame` that wraps the snapshotted dataset.
434 ///
435 /// Support for writing of nested branches is limited (although RDataFrame is able to read them) and dot ('.')
436 /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot.
437 /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also
438 /// written out and it appears before the array in the columnList.
439 ///
440 /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of
441 /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled with
442 /// respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in wrong
443 /// associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
444 /// error out if such a "shuffled" TTree is used in a friendship.
445 ///
446 /// ### Example invocations:
447 ///
448 /// ~~~{.cpp}
449 /// // without specifying template parameters (column types automatically deduced)
450 /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"});
451 ///
452 /// // specifying template parameters ("x" is `int`, "y" is `float`)
453 /// df.Snapshot<int, float>("outputTree", "outputFile.root", {"x", "y"});
454 /// ~~~
455 ///
456 /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in
457 /// `RSnapshotOptions`:
458 /// ~~~{.cpp}
459 /// RSnapshotOptions opts;
460 /// opts.fLazy = true;
461 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
462 /// ~~~
463 template <typename... ColumnTypes>
    // NOTE(review): source line 464 (the return type) is missing from this listing.
465 Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList,
466 const RSnapshotOptions &options = RSnapshotOptions())
467 {
    // Thin wrapper: all the work is done by SnapshotImpl, instantiated with the explicit column types.
468 return SnapshotImpl<ColumnTypes...>(treename, filename, columnList, options);
469 }
470
471 ////////////////////////////////////////////////////////////////////////////
472 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
473 /// \param[in] treename The name of the output TTree.
474 /// \param[in] filename The name of the output TFile.
475 /// \param[in] columnList The list of names of the columns/branches to be written.
476 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree.
477 /// \return a `RDataFrame` that wraps the snapshotted dataset.
478 ///
479 /// This function returns a `RDataFrame` built with the output tree as a source.
480 /// The types of the columns are automatically inferred and do not need to be specified.
481 ///
482 /// See above for a more complete description and example usages.
483 RResultPtr<RInterface<RLoopManager>> Snapshot(std::string_view treename, std::string_view filename,
484 const ColumnNames_t &columnList,
485 const RSnapshotOptions &options = RSnapshotOptions())
486 {
    // Validate the requested column names first.
487 const auto validCols = GetValidatedColumnNames(columnList.size(), columnList);
488
    // Keep the name as passed in for the output data frame, then split it into a directory part and
    // a tree name for the snapshot helper.
    // NOTE(review): after the reassignment `treename` refers to parsedTreePath.fTreeName, so it
    // presumably must not outlive parsedTreePath — confirm the type of fTreeName.
489 const auto fullTreeName = treename;
490 const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName);
491 treename = parsedTreePath.fTreeName;
492 const auto &dirname = parsedTreePath.fDirName;
493
    // The helper arguments receive the column list as passed by the caller (columnList), while the
    // action and the output data frame below use the validated names (validCols).
494 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
495 std::string(filename), std::string(dirname), std::string(treename), columnList, options});
496
    // NOTE(review): source line 497 is missing from this listing.
498 auto newRDF = std::make_shared<ROOT::RDataFrame>(fullTreeName, filename, validCols);
499
500 auto resPtr = CreateAction<RDFInternal::ActionTags::Snapshot, RDFDetail::RInferredType>(
501 validCols, newRDF, snapHelperArgs, validCols.size());
502
    // Unless the snapshot was requested as lazy, dereference the result right away; the value itself
    // is discarded (presumably the dereference is what triggers the event loop — confirm).
503 if (!options.fLazy)
504 *resPtr;
505 return resPtr;
506 }
507
508 // clang-format off
509 ////////////////////////////////////////////////////////////////////////////
510 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
511 /// \param[in] treename The name of the output TTree.
512 /// \param[in] filename The name of the output TFile.
513 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
514 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree
515 /// \return a `RDataFrame` that wraps the snapshotted dataset.
516 ///
517 /// This function returns a `RDataFrame` built with the output tree as a source.
518 /// The types of the columns are automatically inferred and do not need to be specified.
519 ///
520 /// See above for a more complete description and example usages.
521 RResultPtr<RInterface<RLoopManager>> Snapshot(std::string_view treename, std::string_view filename,
522 std::string_view columnNameRegexp = "",
523 const RSnapshotOptions &options = RSnapshotOptions())
524 {
    // Collect the candidate column names from three sources, in this order: columns defined on this
    // node, top-level branches of the input tree (if any), columns of the data source (if any).
525 const auto definedColumns = fDefines.GetNames();
526 auto *tree = fLoopManager->GetTree();
527 const auto treeBranchNames = tree != nullptr ? RDFInternal::GetTopLevelBranchNames(*tree) : ColumnNames_t{};
528 const auto dsColumns = fDataSource ? fDataSource->GetColumnNames() : ColumnNames_t{};
529 ColumnNames_t columnNames;
    // Reserve once so that the three insertions below do not reallocate.
530 columnNames.reserve(definedColumns.size() + treeBranchNames.size() + dsColumns.size());
531 columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end());
532 columnNames.insert(columnNames.end(), treeBranchNames.begin(), treeBranchNames.end());
533 columnNames.insert(columnNames.end(), dsColumns.begin(), dsColumns.end());
    // Select the columns matching the regular expression and delegate to the overload taking a column list.
534 const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot");
535 return Snapshot(treename, filename, selectedColumns, options);
536 }
537 // clang-format on
538
539 // clang-format off
540 ////////////////////////////////////////////////////////////////////////////
541 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
542 /// \param[in] treename The name of the output TTree.
543 /// \param[in] filename The name of the output TFile.
544 /// \param[in] columnList The list of names of the columns/branches to be written.
545 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree.
546 /// \return a `RDataFrame` that wraps the snapshotted dataset.
547 ///
548 /// This function returns a `RDataFrame` built with the output tree as a source.
549 /// The types of the columns are automatically inferred and do not need to be specified.
550 ///
551 /// See above for a more complete description and example usages.
552 RResultPtr<RInterface<RLoopManager>> Snapshot(std::string_view treename, std::string_view filename,
553 std::initializer_list<std::string> columnList,
554 const RSnapshotOptions &options = RSnapshotOptions())
555 {
    // Convert the brace-enclosed list into a ColumnNames_t and delegate to the overload taking a
    // column list.
556 ColumnNames_t selectedColumns(columnList);
557 return Snapshot(treename, filename, selectedColumns, options);
558 }
559 // clang-format on
560
561 ////////////////////////////////////////////////////////////////////////////
562 /// \brief Save selected columns in memory
563 /// \tparam ColumnTypes variadic list of branch/column types.
564 /// \param[in] columnList columns to be cached in memory.
565 /// \return a `RDataFrame` that wraps the cached dataset.
566 ///
567 /// This action returns a new `RDataFrame` object, completely detached from
568 /// the originating `RDataFrame`. The new dataframe only contains the cached
569 /// columns and stores their content in memory for fast, zero-copy subsequent access.
570 ///
571 /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that
572 /// fits in memory and that will be accessed many times.
573 ///
574 /// ### Example usage:
575 ///
576 /// **Types and columns specified:**
577 /// ~~~{.cpp}
578 /// auto cache_some_cols_df = df.Cache<double, MyClass, int>({"col0", "col1", "col2"});
579 /// ~~~
580 ///
581 /// **Types inferred and columns specified (this invocation relies on jitting):**
582 /// ~~~{.cpp}
583 /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"});
584 /// ~~~
585 ///
586 /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):**
587 /// ~~~{.cpp}
588 /// auto cache_all_cols_df = df.Cache(myRegexp);
589 /// ~~~
590 template <typename... ColumnTypes>
    // NOTE(review): source line 591 (the signature of this overload) is missing from this listing.
592 {
    // One compile-time index per explicitly specified column type, handed to CacheImpl together
    // with the column names.
593 auto staticSeq = std::make_index_sequence<sizeof...(ColumnTypes)>();
594 return CacheImpl<ColumnTypes...>(columnList, staticSeq);
595 }
596
597 ////////////////////////////////////////////////////////////////////////////
598 /// \brief Save selected columns in memory
599 /// \param[in] columnList columns to be cached in memory
600 /// \return a `RDataFrame` that wraps the cached dataset.
601 ///
602 /// See the previous overloads for more information.
604 {
    // NOTE(review): source line 603 (the signature of this overload) is missing from this listing.
605 // Early return: if the list of columns is empty, just return an empty RDF
606 // If we proceed, the jitted call will not compile!
607 if (columnList.empty()) {
    // The empty data frame is created with the entry count obtained from Count() on this node.
608 auto nEntries = *this->Count();
609 RInterface<RLoopManager> emptyRDF(std::make_shared<RLoopManager>(nEntries));
610 return emptyRDF;
611 }
612
613 std::stringstream cacheCall;
614 auto upcastNode = RDFInternal::UpcastNode(fProxiedPtr);
    // NOTE(review): source line 616, which completes this declaration, is missing from this listing.
615 RInterface<TTraits::TakeFirstParameter_t<decltype(upcastNode)>> upcastInterface(fProxiedPtr, *fLoopManager,
617 // build a string equivalent to
618 // "(RInterface<nodetype*>*)(this)->Cache<Ts...>(*(ColumnNames_t*)(&columnList))"
    // The generated code embeds the addresses of objects local to this call (upcastInterface and
    // columnList are visible here), so it has to run before this function returns; Jit() is called
    // at the end of the function.
619 RInterface<RLoopManager> resRDF(std::make_shared<ROOT::Detail::RDF::RLoopManager>(0));
    // NOTE(review): source line 621 is missing from this listing; it presumably streams the address
    // of resRDF, the target of the jitted assignment — confirm.
620 cacheCall << "*reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RLoopManager>*>("
622 << ") = reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RNodeBase>*>("
623 << RDFInternal::PrettyPrintAddr(&upcastInterface) << ")->Cache<";
624
625 const auto validColumnNames = GetValidatedColumnNames(columnList.size(), columnList);
626 const auto colTypes = GetValidatedArgTypes(validColumnNames, fDefines, fLoopManager->GetTree(), fDataSource,
627 "Cache", /*vector2rvec=*/false);
    // Emit the column types as the template arguments of the jitted Cache call, separated by ", ".
628 for (const auto &colType : colTypes)
629 cacheCall << colType << ", ";
    // NOTE(review): this condition is always true here, since the empty case returned early above.
630 if (!columnList.empty())
631 cacheCall.seekp(-2, cacheCall.cur); // step back over the trailing ", " so that the next write overwrites it
632 cacheCall << ">(*reinterpret_cast<std::vector<std::string>*>(" // vector<string> should be ColumnNames_t
633 << RDFInternal::PrettyPrintAddr(&columnList) << "));";
634
635 // book the code to jit with the RLoopManager and trigger the event loop
636 fLoopManager->ToJitExec(cacheCall.str());
637 fLoopManager->Jit();
638
639 return resRDF;
640 }
641
642 ////////////////////////////////////////////////////////////////////////////
643 /// \brief Save selected columns in memory
644 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
645 /// \return a `RDataFrame` that wraps the cached dataset.
646 ///
647 /// The existing columns are matched against the regular expression. If the string provided
648 /// is empty, all columns are selected. See the previous overloads for more information.
649 RInterface<RLoopManager> Cache(std::string_view columnNameRegexp = "")
650 {
    // Collect the candidate column names from three sources, in this order: columns defined on this
    // node, top-level branches of the input tree (if any), columns of the data source (if any).
651 const auto definedColumns = fDefines.GetNames();
652 auto *tree = fLoopManager->GetTree();
653 const auto treeBranchNames = tree != nullptr ? RDFInternal::GetTopLevelBranchNames(*tree) : ColumnNames_t{};
654 const auto dsColumns = fDataSource ? fDataSource->GetColumnNames() : ColumnNames_t{};
655 ColumnNames_t columnNames;
    // Reserve once so that the three insertions below do not reallocate.
656 columnNames.reserve(definedColumns.size() + treeBranchNames.size() + dsColumns.size());
657 columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end());
658 columnNames.insert(columnNames.end(), treeBranchNames.begin(), treeBranchNames.end());
659 columnNames.insert(columnNames.end(), dsColumns.begin(), dsColumns.end());
    // Select the columns matching the regular expression and delegate to the overload taking a column list.
660 const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Cache");
661 return Cache(selectedColumns);
662 }
663
664 ////////////////////////////////////////////////////////////////////////////
665 /// \brief Save selected columns in memory
666 /// \param[in] columnList columns to be cached in memory.
667 /// \return a `RDataFrame` that wraps the cached dataset.
668 ///
669 /// See the previous overloads for more information.
670 RInterface<RLoopManager> Cache(std::initializer_list<std::string> columnList)
671 {
672 ColumnNames_t selectedColumns(columnList);
673 return Cache(selectedColumns);
674 }
675
676 // clang-format off
677 ////////////////////////////////////////////////////////////////////////////
678 /// \brief Creates a node that filters entries based on range: [begin, end)
679 /// \param[in] begin Initial entry number considered for this range.
680 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
681 /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0.
682 /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries.
683 ///
684 /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset.
685 /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported.
686 ///
687 /// ### Example usage:
688 /// ~~~{.cpp}
689 /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries
690 /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards
691 /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3
692 /// ~~~
693 // clang-format on
694 RInterface<RDFDetail::RRange<Proxied>, DS_t> Range(unsigned int begin, unsigned int end, unsigned int stride = 1)
695 {
696 // check invariants
697 if (stride == 0 || (end != 0 && end < begin))
698 throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin.");
699 CheckIMTDisabled("Range");
700
701 using Range_t = RDFDetail::RRange<Proxied>;
702 auto rangePtr = std::make_shared<Range_t>(begin, end, stride, fProxiedPtr);
703 fLoopManager->Book(rangePtr.get());
705 return tdf_r;
706 }
707
708 // clang-format off
709 ////////////////////////////////////////////////////////////////////////////
710 /// \brief Creates a node that filters entries based on range
711 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
712 /// \return a node of the computation graph for which the range is defined.
713 ///
714 /// See the other Range overload for a detailed description.
715 // clang-format on
716 RInterface<RDFDetail::RRange<Proxied>, DS_t> Range(unsigned int end) { return Range(0, end, 1); }
717
718 // clang-format off
719 ////////////////////////////////////////////////////////////////////////////
720 /// \brief Execute a user-defined function on each entry (*instant action*)
721 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
722 /// \param[in] columns Names of the columns/branches in input to the user function.
723 ///
724 /// The callable `f` is invoked once per entry. This is an *instant action*:
725 /// upon invocation, an event loop as well as execution of all scheduled actions
726 /// is triggered.
727 /// Users are responsible for the thread-safety of this callable when executing
728 /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT).
729 ///
730 /// ### Example usage:
731 /// ~~~{.cpp}
732 /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"});
733 /// ~~~
734 // clang-format on
735 template <typename F>
736 void Foreach(F f, const ColumnNames_t &columns = {})
737 {
738 using arg_types = typename TTraits::CallableTraits<decltype(f)>::arg_types_nodecay;
739 using ret_type = typename TTraits::CallableTraits<decltype(f)>::ret_type;
740 ForeachSlot(RDFInternal::AddSlotParameter<ret_type>(f, arg_types()), columns);
741 }
742
743 // clang-format off
744 ////////////////////////////////////////////////////////////////////////////
745 /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*)
746 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
747 /// \param[in] columns Names of the columns/branches in input to the user function.
748 ///
749 /// Same as `Foreach`, but the user-defined function takes an extra
750 /// `unsigned int` as its first parameter, the *processing slot index*.
751 /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`,
752 /// for each thread of execution.
753 /// This is meant as a helper in writing thread-safe `Foreach`
754 /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`.
755 /// The user-defined processing callable is able to follow different
756 /// *streams of processing* indexed by the first parameter.
757 /// `ForeachSlot` works just as well with single-thread execution: in that
758 /// case `slot` will always be `0`.
759 ///
760 /// ### Example usage:
761 /// ~~~{.cpp}
762 /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"});
763 /// ~~~
764 // clang-format on
765 template <typename F>
766 void ForeachSlot(F f, const ColumnNames_t &columns = {})
767 {
769 constexpr auto nColumns = ColTypes_t::list_size;
770
771 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
772 CheckAndFillDSColumns(validColumnNames, ColTypes_t());
773
774 using Helper_t = RDFInternal::ForeachSlotHelper<F>;
776
777 auto action = std::make_unique<Action_t>(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fDefines);
778 fLoopManager->Book(action.get());
779
780 fLoopManager->Run();
781 }
782
783 // clang-format off
784 ////////////////////////////////////////////////////////////////////////////
785 /// \brief Execute a user-defined reduce operation on the values of a column.
786 /// \tparam F The type of the reduce callable. Automatically deduced.
787 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
788 /// \param[in] f A callable with signature `T(T,T)`
789 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
790 /// \return the reduced quantity wrapped in a `RResultPtr`.
791 ///
792 /// A reduction takes two values of a column and merges them into one (e.g.
793 /// by summing them, taking the maximum, etc). This action performs the
794 /// specified reduction operation on all processed column values, returning
795 /// a single value of the same type. The callable f must satisfy the general
796 /// requirements of a *processing function* besides having signature `T(T,T)`
797 /// where `T` is the type of column columnName.
798 ///
799 /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a
800 /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific
801 /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this
802 /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce`
803 /// overload.
804 ///
805 /// ### Example usage:
806 /// ~~~{.cpp}
807 /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol");
808 /// ~~~
809 ///
810 /// This action is *lazy*: upon invocation of this method the calculation is
811 /// booked but not executed. See RResultPtr documentation.
812 // clang-format on
813 template <typename F, typename T = typename TTraits::CallableTraits<F>::ret_type>
814 RResultPtr<T> Reduce(F f, std::string_view columnName = "")
815 {
816 static_assert(
817 std::is_default_constructible<T>::value,
818 "reduce object cannot be default-constructed. Please provide an initialisation value (redIdentity)");
819 return Reduce(std::move(f), columnName, T());
820 }
821
822 ////////////////////////////////////////////////////////////////////////////
823 /// \brief Execute a user-defined reduce operation on the values of a column.
824 /// \tparam F The type of the reduce callable. Automatically deduced.
825 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
826 /// \param[in] f A callable with signature `T(T,T)`
827 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
828 /// \param[in] redIdentity The reduced object of each thread is initialised to this value.
829 /// \return the reduced quantity wrapped in a `RResultPtr`.
830 ///
831 /// ### Example usage:
832 /// ~~~{.cpp}
833 /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42);
834 /// ~~~
835 /// See the description of the first Reduce overload for more information.
836 template <typename F, typename T = typename TTraits::CallableTraits<F>::ret_type>
837 RResultPtr<T> Reduce(F f, std::string_view columnName, const T &redIdentity)
838 {
839 return Aggregate(f, f, columnName, redIdentity);
840 }
841
842 ////////////////////////////////////////////////////////////////////////////
843 /// \brief Return the number of entries processed (*lazy action*)
844 /// \return the number of entries wrapped in a `RResultPtr`.
845 ///
846 /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`).
847 /// This action is *lazy*: upon invocation of this method the calculation is
848 /// booked but not executed. See RResultPtr documentation.
849 ///
850 /// ### Example usage:
851 /// ~~~{.cpp}
852 /// auto nEntriesAfterCuts = myFilteredDf.Count();
853 /// ~~~
854 ///
856 {
857 const auto nSlots = fLoopManager->GetNSlots();
858 auto cSPtr = std::make_shared<ULong64_t>(0);
859 using Helper_t = RDFInternal::CountHelper;
861 auto action = std::make_unique<Action_t>(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr,
863 fLoopManager->Book(action.get());
864 return MakeResultPtr(cSPtr, *fLoopManager, std::move(action));
865 }
866
867 ////////////////////////////////////////////////////////////////////////////
868 /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default)
869 /// \tparam T The type of the column.
870 /// \tparam COLL The type of collection used to store the values.
871 /// \param[in] column The name of the column to collect the values of.
872 /// \return the content of the selected column wrapped in a `RResultPtr`.
873 ///
874 /// The collection type to be specified for C-style array columns is `RVec<T>`:
875 /// in this case the returned collection is a `std::vector<RVec<T>>`.
876 /// ### Example usage:
877 /// ~~~{.cpp}
878 /// // In this case intCol is a std::vector<int>
879 /// auto intCol = rdf.Take<int>("integerColumn");
880 /// // Same content as above but in this case taken as a RVec<int>
881 /// auto intColAsRVec = rdf.Take<int, RVec<int>>("integerColumn");
882 /// // In this case intCol is a std::vector<RVec<int>>, a collection of collections
883 /// auto cArrayIntCol = rdf.Take<RVec<int>>("cArrayInt");
884 /// ~~~
885 /// This action is *lazy*: upon invocation of this method the calculation is
886 /// booked but not executed. See RResultPtr documentation.
887 template <typename T, typename COLL = std::vector<T>>
888 RResultPtr<COLL> Take(std::string_view column = "")
889 {
890 const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)});
891
892 const auto validColumnNames = GetValidatedColumnNames(1, columns);
893 CheckAndFillDSColumns(validColumnNames, TTraits::TypeList<T>());
894
895 using Helper_t = RDFInternal::TakeHelper<T, T, COLL>;
897 auto valuesPtr = std::make_shared<COLL>();
898 const auto nSlots = fLoopManager->GetNSlots();
899
900 auto action =
901 std::make_unique<Action_t>(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fDefines);
902 fLoopManager->Book(action.get());
903 return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action));
904 }
905
906 ////////////////////////////////////////////////////////////////////////////
907 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*)
908 /// \tparam V The type of the column used to fill the histogram.
909 /// \param[in] model The returned histogram will be constructed using this as a model.
910 /// \param[in] vName The name of the column that will fill the histogram.
911 /// \return the monodimensional histogram wrapped in a `RResultPtr`.
912 ///
913 /// Columns can be of a container type (e.g. `std::vector<double>`), in which case the histogram
914 /// is filled with each one of the elements of the container. In case multiple columns of container type
915 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
916 /// possibly different lengths between events).
917 /// This action is *lazy*: upon invocation of this method the calculation is
918 /// booked but not executed. See RResultPtr documentation.
919 ///
920 /// ### Example usage:
921 /// ~~~{.cpp}
922 /// // Deduce column type (this invocation needs jitting internally)
923 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
924 /// // Explicit column type
925 /// auto myHist2 = myDf.Histo1D<float>({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
926 /// ~~~
927 ///
928 template <typename V = RDFDetail::RInferredType>
929 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "")
930 {
931 const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)});
932
933 const auto validatedColumns = GetValidatedColumnNames(1, userColumns);
934
935 std::shared_ptr<::TH1D> h(nullptr);
936 {
937 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
938 h = model.GetHistogram();
939 h->SetDirectory(nullptr);
940 }
941
942 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
943 RDFInternal::HistoUtils<::TH1D>::SetCanExtendAllAxes(*h);
944 return CreateAction<RDFInternal::ActionTags::Histo1D, V>(validatedColumns, h, h);
945 }
946
947 ////////////////////////////////////////////////////////////////////////////
948 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*)
949 /// \tparam V The type of the column used to fill the histogram.
950 /// \param[in] vName The name of the column that will fill the histogram.
951 /// \return the monodimensional histogram wrapped in a `RResultPtr`.
952 ///
953 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
954 /// The "name" and "title" strings are built starting from the input column name.
955 /// See the description of the first Histo1D overload for more details.
956 ///
957 /// ### Example usage:
958 /// ~~~{.cpp}
959 /// // Deduce column type (this invocation needs jitting internally)
960 /// auto myHist1 = myDf.Histo1D("myColumn");
961 /// // Explicit column type
962 /// auto myHist2 = myDf.Histo1D<float>("myColumn");
963 /// ~~~
964 ///
965 template <typename V = RDFDetail::RInferredType>
966 RResultPtr<::TH1D> Histo1D(std::string_view vName)
967 {
968 const auto h_name = std::string(vName);
969 const auto h_title = h_name + ";" + h_name + ";count";
970 return Histo1D<V>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName);
971 }
972
973 ////////////////////////////////////////////////////////////////////////////
974 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*)
975 /// \tparam V The type of the column used to fill the histogram.
976 /// \tparam W The type of the column used as weights.
977 /// \param[in] model The returned histogram will be constructed using this as a model.
978 /// \param[in] vName The name of the column that will fill the histogram.
979 /// \param[in] wName The name of the column that will provide the weights.
980 /// \return the monodimensional histogram wrapped in a `RResultPtr`.
981 ///
982 /// See the description of the first Histo1D overload for more details.
983 ///
984 /// ### Example usage:
985 /// ~~~{.cpp}
986 /// // Deduce column type (this invocation needs jitting internally)
987 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
988 /// // Explicit column type
989 /// auto myHist2 = myDf.Histo1D<float, int>({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
990 /// ~~~
991 ///
992 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
993 RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
994 {
995 const std::vector<std::string_view> columnViews = {vName, wName};
996 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
997 ? ColumnNames_t()
998 : ColumnNames_t(columnViews.begin(), columnViews.end());
999 std::shared_ptr<::TH1D> h(nullptr);
1000 {
1001 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1002 h = model.GetHistogram();
1003 }
1004 return CreateAction<RDFInternal::ActionTags::Histo1D, V, W>(userColumns, h, h);
1005 }
1006
1007 ////////////////////////////////////////////////////////////////////////////
1008 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*)
1009 /// \tparam V The type of the column used to fill the histogram.
1010 /// \tparam W The type of the column used as weights.
1011 /// \param[in] vName The name of the column that will fill the histogram.
1012 /// \param[in] wName The name of the column that will provide the weights.
1013 /// \return the monodimensional histogram wrapped in a `RResultPtr`.
1014 ///
1015 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1016 /// The "name" and "title" strings are built starting from the input column names.
1017 /// See the description of the first Histo1D overload for more details.
1018 ///
1019 /// ### Example usage:
1020 /// ~~~{.cpp}
1021 /// // Deduce column types (this invocation needs jitting internally)
1022 /// auto myHist1 = myDf.Histo1D("myValue", "myweight");
1023 /// // Explicit column types
1024 /// auto myHist2 = myDf.Histo1D<float, int>("myValue", "myweight");
1025 /// ~~~
1026 ///
1027 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1028 RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName)
1029 {
1030 // We build name and title based on the value and weight column names
1031 std::string str_vName{vName};
1032 std::string str_wName{wName};
1033 const auto h_name = str_vName + "_weighted_" + str_wName;
1034 const auto h_title = str_vName + ", weights: " + str_wName + ";" + str_vName + ";count * " + str_wName;
1035 return Histo1D<V, W>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName, wName);
1036 }
1037
1038 ////////////////////////////////////////////////////////////////////////////
1039 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*)
1040 /// \tparam V The type of the column used to fill the histogram.
1041 /// \tparam W The type of the column used as weights.
1042 /// \param[in] model The returned histogram will be constructed using this as a model.
1043 /// \return the monodimensional histogram wrapped in a `RResultPtr`.
1044 ///
1045 /// This overload will use the first two default columns as column names.
1046 /// See the description of the first Histo1D overload for more details.
1047 template <typename V, typename W>
1048 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.})
1049 {
1050 return Histo1D<V, W>(model, "", "");
1051 }
1052
1053 ////////////////////////////////////////////////////////////////////////////
1054 /// \brief Fill and return a two-dimensional histogram (*lazy action*)
1055 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
1056 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
1057 /// \param[in] model The returned histogram will be constructed using this as a model.
1058 /// \param[in] v1Name The name of the column that will fill the x axis.
1059 /// \param[in] v2Name The name of the column that will fill the y axis.
1060 /// \return the bidimensional histogram wrapped in a `RResultPtr`.
1061 ///
1062 /// Columns can be of a container type (e.g. std::vector<double>), in which case the histogram
1063 /// is filled with each one of the elements of the container. In case multiple columns of container type
1064 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1065 /// possibly different lengths between events).
1066 /// This action is *lazy*: upon invocation of this method the calculation is
1067 /// booked but not executed. See RResultPtr documentation.
1068 ///
1069 /// ### Example usage:
1070 /// ~~~{.cpp}
1071 /// // Deduce column types (this invocation needs jitting internally)
1072 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
1073 /// // Explicit column types
1074 /// auto myHist2 = myDf.Histo2D<float, float>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
1075 /// ~~~
1076 ///
1077 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
1078 RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
1079 {
1080 std::shared_ptr<::TH2D> h(nullptr);
1081 {
1082 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1083 h = model.GetHistogram();
1084 }
1085 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
1086 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
1087 }
1088 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
1089 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1090 ? ColumnNames_t()
1091 : ColumnNames_t(columnViews.begin(), columnViews.end());
1092 return CreateAction<RDFInternal::ActionTags::Histo2D, V1, V2>(userColumns, h, h);
1093 }
1094
1095 ////////////////////////////////////////////////////////////////////////////
1096 /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*)
1097 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
1098 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
1099 /// \tparam W The type of the column used for the weights of the histogram.
1100 /// \param[in] model The returned histogram will be constructed using this as a model.
1101 /// \param[in] v1Name The name of the column that will fill the x axis.
1102 /// \param[in] v2Name The name of the column that will fill the y axis.
1103 /// \param[in] wName The name of the column that will provide the weights.
1104 /// \return the bidimensional histogram wrapped in a `RResultPtr`.
1105 ///
1106 /// This action is *lazy*: upon invocation of this method the calculation is
1107 /// booked but not executed. See RResultPtr documentation.
1108 /// The user gives up ownership of the model histogram.
1109 ///
1110 /// ### Example usage:
1111 /// ~~~{.cpp}
1112 /// // Deduce column types (this invocation needs jitting internally)
1113 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
1114 /// // Explicit column types
1115 /// auto myHist2 = myDf.Histo2D<float, float, double>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
1116 /// ~~~
1117 ///
1118 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1119 typename W = RDFDetail::RInferredType>
1121 Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
1122 {
1123 std::shared_ptr<::TH2D> h(nullptr);
1124 {
1125 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1126 h = model.GetHistogram();
1127 }
1128 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
1129 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
1130 }
1131 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
1132 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1133 ? ColumnNames_t()
1134 : ColumnNames_t(columnViews.begin(), columnViews.end());
1135 return CreateAction<RDFInternal::ActionTags::Histo2D, V1, V2, W>(userColumns, h, h);
1136 }
1137
1138 template <typename V1, typename V2, typename W>
1140 {
1141 return Histo2D<V1, V2, W>(model, "", "", "");
1142 }
1143
1144 ////////////////////////////////////////////////////////////////////////////
1145 /// \brief Fill and return a three-dimensional histogram (*lazy action*)
1146 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
1147 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
1148 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
1149 /// \param[in] model The returned histogram will be constructed using this as a model.
1150 /// \param[in] v1Name The name of the column that will fill the x axis.
1151 /// \param[in] v2Name The name of the column that will fill the y axis.
1152 /// \param[in] v3Name The name of the column that will fill the z axis.
1153 /// \return the tridimensional histogram wrapped in a `RResultPtr`.
1154 ///
1155 /// This action is *lazy*: upon invocation of this method the calculation is
1156 /// booked but not executed. See RResultPtr documentation.
1157 ///
1158 /// ### Example usage:
1159 /// ~~~{.cpp}
1160 /// // Deduce column types (this invocation needs jitting internally)
1161 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1162 /// "myValueX", "myValueY", "myValueZ");
1163 /// // Explicit column types
1164 /// auto myHist2 = myDf.Histo3D<double, double, float>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1165 /// "myValueX", "myValueY", "myValueZ");
1166 /// ~~~
1167 ///
1168 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1169 typename V3 = RDFDetail::RInferredType>
1170 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "",
1171 std::string_view v3Name = "")
1172 {
1173 std::shared_ptr<::TH3D> h(nullptr);
1174 {
1175 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1176 h = model.GetHistogram();
1177 }
1178 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
1179 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
1180 }
1181 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
1182 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1183 ? ColumnNames_t()
1184 : ColumnNames_t(columnViews.begin(), columnViews.end());
1185 return CreateAction<RDFInternal::ActionTags::Histo3D, V1, V2, V3>(userColumns, h, h);
1186 }
1187
1188 ////////////////////////////////////////////////////////////////////////////
1189 /// \brief Fill and return a three-dimensional histogram (*lazy action*)
1190 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
1191 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
1192 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
1193 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
1194 /// \param[in] model The returned histogram will be constructed using this as a model.
1195 /// \param[in] v1Name The name of the column that will fill the x axis.
1196 /// \param[in] v2Name The name of the column that will fill the y axis.
1197 /// \param[in] v3Name The name of the column that will fill the z axis.
1198 /// \param[in] wName The name of the column that will provide the weights.
1199 /// \return the tridimensional histogram wrapped in a `RResultPtr`.
1200 ///
1201 /// This action is *lazy*: upon invocation of this method the calculation is
1202 /// booked but not executed. See RResultPtr documentation.
1203 ///
1204 /// ### Example usage:
1205 /// ~~~{.cpp}
1206 /// // Deduce column types (this invocation needs jitting internally)
1207 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1208 /// "myValueX", "myValueY", "myValueZ", "myWeight");
1209 /// // Explicit column types
1210 /// using d_t = double;
1211 /// auto myHist2 = myDf.Histo3D<d_t, d_t, float, d_t>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1212 /// "myValueX", "myValueY", "myValueZ", "myWeight");
1213 /// ~~~
1214 ///
1215 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1216 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1217 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name,
1218 std::string_view v3Name, std::string_view wName)
1219 {
1220 std::shared_ptr<::TH3D> h(nullptr);
1221 {
1222 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1223 h = model.GetHistogram();
1224 }
1225 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
1226 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
1227 }
1228 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
1229 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1230 ? ColumnNames_t()
1231 : ColumnNames_t(columnViews.begin(), columnViews.end());
1232 return CreateAction<RDFInternal::ActionTags::Histo3D, V1, V2, V3, W>(userColumns, h, h);
1233 }
1234
1235 template <typename V1, typename V2, typename V3, typename W>
1237 {
1238 return Histo3D<V1, V2, V3, W>(model, "", "", "", "");
1239 }
1240
1241 ////////////////////////////////////////////////////////////////////////////
1242 /// \brief Fill and return a graph (*lazy action*)
1243 /// \tparam V1 The type of the column used to fill the x axis of the graph.
1244 /// \tparam V2 The type of the column used to fill the y axis of the graph.
1245 /// \param[in] v1Name The name of the column that will fill the x axis.
1246 /// \param[in] v2Name The name of the column that will fill the y axis.
1247 /// \return the graph wrapped in a `RResultPtr`.
1248 ///
1249 /// Columns can be of a container type (e.g. std::vector<double>), in which case the graph
1250 /// is filled with each one of the elements of the container.
1251 /// If Multithreading is enabled, the order in which points are inserted is undefined.
1252 /// If the Graph has to be drawn, it is suggested to the user to sort it on the x before printing.
1253 /// A name and a title to the graph is given based on the input column names.
1254 ///
1255 /// This action is *lazy*: upon invocation of this method the calculation is
1256 /// booked but not executed. See RResultPtr documentation.
1257 ///
1258 /// ### Example usage:
1259 /// ~~~{.cpp}
1260 /// // Deduce column types (this invocation needs jitting internally)
1261 /// auto myGraph1 = myDf.Graph("xValues", "yValues");
1262 /// // Explicit column types
1263 /// auto myGraph2 = myDf.Graph<int, float>("xValues", "yValues");
1264 /// ~~~
1265 ///
1266 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
1267 RResultPtr<::TGraph> Graph(std::string_view v1Name = "", std::string_view v2Name = "")
1268 {
1269 auto graph = std::make_shared<::TGraph>();
1270 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
1271 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1272 ? ColumnNames_t()
1273 : ColumnNames_t(columnViews.begin(), columnViews.end());
1274
1275 const auto validatedColumns = GetValidatedColumnNames(2, userColumns);
1276
1277 // We build a default name and title based on the input columns
1278 if (!(validatedColumns[0].empty() && validatedColumns[1].empty())) {
1279 const auto g_name = std::string(v1Name) + "_vs_" + std::string(v2Name);
1280 const auto g_title = std::string(v1Name) + " vs " + std::string(v2Name);
1281 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
1282 graph->GetXaxis()->SetTitle(std::string(v1Name).c_str());
1283 graph->GetYaxis()->SetTitle(std::string(v2Name).c_str());
1284 }
1285
1286 return CreateAction<RDFInternal::ActionTags::Graph, V1, V2>(validatedColumns, graph, graph);
1287 }
1288
1289 ////////////////////////////////////////////////////////////////////////////
1290 /// \brief Fill and return a one-dimensional profile (*lazy action*)
1291 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
1292 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
1293 /// \param[in] model The model to be considered to build the new return value.
1294 /// \param[in] v1Name The name of the column that will fill the x axis.
1295 /// \param[in] v2Name The name of the column that will fill the y axis.
1296 /// \return the monodimensional profile wrapped in a `RResultPtr`.
1297 ///
1298 /// This action is *lazy*: upon invocation of this method the calculation is
1299 /// booked but not executed. See RResultPtr documentation.
1300 ///
1301 /// ### Example usage:
1302 /// ~~~{.cpp}
1303 /// // Deduce column types (this invocation needs jitting internally)
1304 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
1305 /// // Explicit column types
1306 /// auto myProf2 = myDf.Profile1D<int, float>({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
1307 /// ~~~
1308 ///
1309 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
1311 Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
1312 {
1313 std::shared_ptr<::TProfile> h(nullptr);
1314 {
1315 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1316 h = model.GetProfile();
1317 }
1318
1319 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
1320 throw std::runtime_error("Profiles with no axes limits are not supported yet.");
1321 }
1322 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
1323 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1324 ? ColumnNames_t()
1325 : ColumnNames_t(columnViews.begin(), columnViews.end());
1326 return CreateAction<RDFInternal::ActionTags::Profile1D, V1, V2>(userColumns, h, h);
1327 }
1328
1329 ////////////////////////////////////////////////////////////////////////////
1330 /// \brief Fill and return a one-dimensional profile (*lazy action*)
1331 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
1332 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
1333 /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present.
1334 /// \param[in] model The model to be considered to build the new return value.
1335 /// \param[in] v1Name The name of the column that will fill the x axis.
1336 /// \param[in] v2Name The name of the column that will fill the y axis.
1337 /// \param[in] wName The name of the column that will provide the weights.
1338 /// \return the monodimensional profile wrapped in a `RResultPtr`.
1339 ///
1340 /// This action is *lazy*: upon invocation of this method the calculation is
1341 /// booked but not executed. See RResultPtr documentation.
1342 ///
1343 /// ### Example usage:
1344 /// ~~~{.cpp}
1345 /// // Deduce column types (this invocation needs jitting internally)
1346 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight");
1347 /// // Explicit column types
1348 /// auto myProf2 = myDf.Profile1D<int, float, double>({"profName", "profTitle", 64u, -4., 4.},
1349 /// "xValues", "yValues", "weight");
1350 /// ~~~
1351 ///
1352 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1353 typename W = RDFDetail::RInferredType>
1355 Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
1356 {
1357 std::shared_ptr<::TProfile> h(nullptr);
1358 {
1359 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1360 h = model.GetProfile();
1361 }
1362
1363 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
1364 throw std::runtime_error("Profile histograms with no axes limits are not supported yet.");
1365 }
1366 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
1367 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1368 ? ColumnNames_t()
1369 : ColumnNames_t(columnViews.begin(), columnViews.end());
1370 return CreateAction<RDFInternal::ActionTags::Profile1D, V1, V2, W>(userColumns, h, h);
1371 }
1372
1373 template <typename V1, typename V2, typename W>
1375 {
1376 return Profile1D<V1, V2, W>(model, "", "", "");
1377 }
1378
1379 ////////////////////////////////////////////////////////////////////////////
1380 /// \brief Fill and return a two-dimensional profile (*lazy action*)
1381 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
1382 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
1383 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
1384 /// \param[in] model The returned profile will be constructed using this as a model.
1385 /// \param[in] v1Name The name of the column that will fill the x axis.
1386 /// \param[in] v2Name The name of the column that will fill the y axis.
1387 /// \param[in] v3Name The name of the column that will fill the z axis.
1388 /// \return the bidimensional profile wrapped in a `RResultPtr`.
1389 ///
1390 /// This action is *lazy*: upon invocation of this method the calculation is
1391 /// booked but not executed. See RResultPtr documentation.
1392 ///
1393 /// ### Example usage:
1394 /// ~~~{.cpp}
1395 /// // Deduce column types (this invocation needs jitting internally)
1396 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
1397 /// "xValues", "yValues", "zValues");
1398 /// // Explicit column types
1399 /// auto myProf2 = myDf.Profile2D<int, float, double>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
1400 /// "xValues", "yValues", "zValues");
1401 /// ~~~
1402 ///
1403 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1404 typename V3 = RDFDetail::RInferredType>
1405 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "",
1406 std::string_view v2Name = "", std::string_view v3Name = "")
1407 {
1408 std::shared_ptr<::TProfile2D> h(nullptr);
1409 {
1410 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1411 h = model.GetProfile();
1412 }
1413
1414 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
1415 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
1416 }
1417 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
1418 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1419 ? ColumnNames_t()
1420 : ColumnNames_t(columnViews.begin(), columnViews.end());
1421 return CreateAction<RDFInternal::ActionTags::Profile2D, V1, V2, V3>(userColumns, h, h);
1422 }
1423
1424 ////////////////////////////////////////////////////////////////////////////
1425 /// \brief Fill and return a two-dimensional profile (*lazy action*)
1426 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
1427 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
1428 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
1429 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
1430 /// \param[in] model The returned histogram will be constructed using this as a model.
1431 /// \param[in] v1Name The name of the column that will fill the x axis.
1432 /// \param[in] v2Name The name of the column that will fill the y axis.
1433 /// \param[in] v3Name The name of the column that will fill the z axis.
1434 /// \param[in] wName The name of the column that will provide the weights.
1435 /// \return the bidimensional profile wrapped in a `RResultPtr`.
1436 ///
1437 /// This action is *lazy*: upon invocation of this method the calculation is
1438 /// booked but not executed. See RResultPtr documentation.
1439 ///
1440 /// ### Example usage:
1441 /// ~~~{.cpp}
1442 /// // Deduce column types (this invocation needs jitting internally)
1443 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
1444 /// "xValues", "yValues", "zValues", "weight");
1445 /// // Explicit column types
1446 /// auto myProf2 = myDf.Profile2D<int, float, double, int>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
1447 /// "xValues", "yValues", "zValues", "weight");
1448 /// ~~~
1449 ///
1450 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1451 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1452 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name,
1453 std::string_view v3Name, std::string_view wName)
1454 {
1455 std::shared_ptr<::TProfile2D> h(nullptr);
1456 {
1457 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1458 h = model.GetProfile();
1459 }
1460
1461 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
1462 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
1463 }
1464 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
1465 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
1466 ? ColumnNames_t()
1467 : ColumnNames_t(columnViews.begin(), columnViews.end());
1468 return CreateAction<RDFInternal::ActionTags::Profile2D, V1, V2, V3, W>(userColumns, h, h);
1469 }
1470
1471 template <typename V1, typename V2, typename V3, typename W>
1473 {
1474 return Profile2D<V1, V2, V3, W>(model, "", "", "", "");
1475 }
1476
1477 ////////////////////////////////////////////////////////////////////////////
1478 /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*)
1479 ///
1480 /// T must be a type that provides a copy- or move-constructor and a `T::Fill` method that takes as many arguments
1481 /// as the column names passed in columnList. The arguments of `T::Fill` must have type equal to the one of the
1482 /// specified columns (these types are passed as template parameters to this method).
1483 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object.
1484 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the object.
1485 /// \tparam T The type of the object to fill. Automatically deduced.
1486 /// \param[in] model The model to be considered to build the new return value.
1487 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
1488 /// \return the filled object wrapped in a `RResultPtr`.
1489 ///
1490 /// The user gives up ownership of the model object.
1491 /// The list of column names to be used for filling must always be specified.
1492 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed.
1493 /// See RResultPtr documentation.
1494 ///
1495 /// ### Example usage:
1496 /// ~~~{.cpp}
1497 /// MyClass obj;
1498 /// auto myFilledObj = myDf.Fill<float, float>(obj, {"col0", "col1"});
1499 /// ~~~
1500 ///
1501 template <typename FirstColumn, typename... OtherColumns, typename T> // need FirstColumn to disambiguate overloads
1502 RResultPtr<T> Fill(T &&model, const ColumnNames_t &columnList)
1503 {
1504 auto h = std::make_shared<T>(std::forward<T>(model));
1505 if (!RDFInternal::HistoUtils<T>::HasAxisLimits(*h)) {
1506 throw std::runtime_error("The absence of axes limits is not supported yet.");
1507 }
1508 return CreateAction<RDFInternal::ActionTags::Fill, FirstColumn, OtherColumns...>(columnList, h, h);
1509 }
1510
1511 ////////////////////////////////////////////////////////////////////////////
1512 /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*)
1513 ///
1514 /// This overload infers the types of the columns specified in columnList at runtime and just-in-time compiles the
1515 /// method with these types. See previous overload for more information.
1516 /// \tparam T The type of the object to fill. Automatically deduced.
1517 /// \param[in] model The model to be considered to build the new return value.
1518 /// \param[in] columnList The name of the columns read to fill the object.
1519 /// \return the filled object wrapped in a `RResultPtr`.
1520 ///
1521 /// This overload of `Fill` infers the type of the specified columns at runtime and just-in-time compiles the
1522 /// previous overload. Check the previous overload for more details on `Fill`.
1523 ///
1524 /// ### Example usage:
1525 /// ~~~{.cpp}
1526 /// MyClass obj;
1527 /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"});
1528 /// ~~~
1529 ///
1530 template <typename T>
1531 RResultPtr<T> Fill(T &&model, const ColumnNames_t &columnList)
1532 {
1533 auto h = std::make_shared<T>(std::forward<T>(model));
1534 if (!RDFInternal::HistoUtils<T>::HasAxisLimits(*h)) {
1535 throw std::runtime_error("The absence of axes limits is not supported yet.");
1536 }
1537 return CreateAction<RDFInternal::ActionTags::Fill, RDFDetail::RInferredType>(columnList, h, h, columnList.size());
1538 }
1539
1540 ////////////////////////////////////////////////////////////////////////////
1541 /// \brief Return a TStatistic object, filled once per event (*lazy action*)
1542 ///
1543 /// \tparam V The type of the value column
1544 /// \param[in] value The name of the column with the values to fill the statistics with.
1545 /// \return the filled TStatistic object wrapped in a `RResultPtr`.
1546 ///
1547 /// ### Example usage:
1548 /// ~~~{.cpp}
1549 /// // Deduce column type (this invocation needs jitting internally)
1550 /// auto stats0 = myDf.Stats("values");
1551 /// // Explicit column type
1552 /// auto stats1 = myDf.Stats<float>("values");
1553 /// ~~~
1554 ///
1555 template <typename V = RDFDetail::RInferredType>
1556 RResultPtr<TStatistic> Stats(std::string_view value = "")
1557 {
1558 ColumnNames_t columns;
1559 if (!value.empty()) {
1560 columns.emplace_back(std::string(value));
1561 }
1562 const auto validColumnNames = GetValidatedColumnNames(1, columns);
1563 if (std::is_same<V, RDFDetail::RInferredType>::value) {
1564 return Fill(TStatistic(), validColumnNames);
1565 } else {
1566 return Fill<V>(TStatistic(), validColumnNames);
1567 }
1568 }
1569
1570 ////////////////////////////////////////////////////////////////////////////
1571 /// \brief Return a TStatistic object, filled once per event (*lazy action*)
1572 ///
1573 /// \tparam V The type of the value column
1574 /// \tparam W The type of the weight column
1575 /// \param[in] value The name of the column with the values to fill the statistics with.
1576 /// \param[in] weight The name of the column with the weights to fill the statistics with.
1577 /// \return the filled TStatistic object wrapped in a `RResultPtr`.
1578 ///
1579 /// ### Example usage:
1580 /// ~~~{.cpp}
1581 /// // Deduce column types (this invocation needs jitting internally)
1582 /// auto stats0 = myDf.Stats("values", "weights");
1583 /// // Explicit column types
1584 /// auto stats1 = myDf.Stats<int, float>("values", "weights");
1585 /// ~~~
1586 ///
1587 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1588 RResultPtr<TStatistic> Stats(std::string_view value, std::string_view weight)
1589 {
1590 ColumnNames_t columns{std::string(value), std::string(weight)};
1591 constexpr auto vIsInferred = std::is_same<V, RDFDetail::RInferredType>::value;
1592 constexpr auto wIsInferred = std::is_same<W, RDFDetail::RInferredType>::value;
1593 const auto validColumnNames = GetValidatedColumnNames(2, columns);
1594 // We have 3 cases:
1595 // 1. Both types are inferred: we use Fill and let the jit kick in.
1596 // 2. One of the two types is explicit and the other one is inferred: the case is not supported.
1597 // 3. Both types are explicit: we invoke the fully compiled Fill method.
1598 if (vIsInferred && wIsInferred) {
1599 return Fill(TStatistic(), validColumnNames);
1600 } else if (vIsInferred != wIsInferred) {
1601 std::string error("The ");
1602 error += vIsInferred ? "value " : "weight ";
1603 error += "column type is explicit, while the ";
1604 error += vIsInferred ? "weight " : "value ";
1605 error += " is specified to be inferred. This case is not supported: please specify both types or none.";
1606 throw std::runtime_error(error);
1607 } else {
1608 return Fill<V, W>(TStatistic(), validColumnNames);
1609 }
1610 }
1611
1612 ////////////////////////////////////////////////////////////////////////////
1613 /// \brief Return the minimum of processed column values (*lazy action*)
1614 /// \tparam T The type of the branch/column.
1615 /// \param[in] columnName The name of the branch/column to be treated.
1616 /// \return the minimum value of the selected column wrapped in a `RResultPtr`.
1617 ///
1618 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
1619 /// template specialization of this method.
1620 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
1621 ///
1622 /// This action is *lazy*: upon invocation of this method the calculation is
1623 /// booked but not executed. See RResultPtr documentation.
1624 ///
1625 /// ### Example usage:
1626 /// ~~~{.cpp}
1627 /// // Deduce column type (this invocation needs jitting internally)
1628 /// auto minVal0 = myDf.Min("values");
1629 /// // Explicit column type
1630 /// auto minVal1 = myDf.Min<double>("values");
1631 /// ~~~
1632 ///
1633 template <typename T = RDFDetail::RInferredType>
1634 RResultPtr<RDFDetail::MinReturnType_t<T>> Min(std::string_view columnName = "")
1635 {
1636 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
1637 using RetType_t = RDFDetail::MinReturnType_t<T>;
1638 auto minV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::max());
1639 return CreateAction<RDFInternal::ActionTags::Min, T>(userColumns, minV, minV);
1640 }
1641
1642 ////////////////////////////////////////////////////////////////////////////
1643 /// \brief Return the maximum of processed column values (*lazy action*)
1644 /// \tparam T The type of the branch/column.
1645 /// \param[in] columnName The name of the branch/column to be treated.
1646 /// \return the maximum value of the selected column wrapped in a `RResultPtr`.
1647 ///
1648 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
1649 /// template specialization of this method.
1650 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
1651 ///
1652 /// This action is *lazy*: upon invocation of this method the calculation is
1653 /// booked but not executed. See RResultPtr documentation.
1654 ///
1655 /// ### Example usage:
1656 /// ~~~{.cpp}
1657 /// // Deduce column type (this invocation needs jitting internally)
1658 /// auto maxVal0 = myDf.Max("values");
1659 /// // Explicit column type
1660 /// auto maxVal1 = myDf.Max<double>("values");
1661 /// ~~~
1662 ///
1663 template <typename T = RDFDetail::RInferredType>
1664 RResultPtr<RDFDetail::MaxReturnType_t<T>> Max(std::string_view columnName = "")
1665 {
1666 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
1667 using RetType_t = RDFDetail::MaxReturnType_t<T>;
1668 auto maxV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::lowest());
1669 return CreateAction<RDFInternal::ActionTags::Max, T>(userColumns, maxV, maxV);
1670 }
1671
1672 ////////////////////////////////////////////////////////////////////////////
1673 /// \brief Return the mean of processed column values (*lazy action*)
1674 /// \tparam T The type of the branch/column.
1675 /// \param[in] columnName The name of the branch/column to be treated.
1676 /// \return the mean value of the selected column wrapped in a `RResultPtr`.
1677 ///
1678 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
1679 /// template specialization of this method.
1680 ///
1681 /// This action is *lazy*: upon invocation of this method the calculation is
1682 /// booked but not executed. See RResultPtr documentation.
1683 ///
1684 /// ### Example usage:
1685 /// ~~~{.cpp}
1686 /// // Deduce column type (this invocation needs jitting internally)
1687 /// auto meanVal0 = myDf.Mean("values");
1688 /// // Explicit column type
1689 /// auto meanVal1 = myDf.Mean<double>("values");
1690 /// ~~~
1691 ///
1692 template <typename T = RDFDetail::RInferredType>
1693 RResultPtr<double> Mean(std::string_view columnName = "")
1694 {
1695 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
1696 auto meanV = std::make_shared<double>(0);
1697 return CreateAction<RDFInternal::ActionTags::Mean, T>(userColumns, meanV, meanV);
1698 }
1699
1700 ////////////////////////////////////////////////////////////////////////////
1701 /// \brief Return the unbiased standard deviation of processed column values (*lazy action*)
1702 /// \tparam T The type of the branch/column.
1703 /// \param[in] columnName The name of the branch/column to be treated.
1704 /// \return the standard deviation value of the selected column wrapped in a `RResultPtr`.
1705 ///
1706 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
1707 /// template specialization of this method.
1708 ///
1709 /// This action is *lazy*: upon invocation of this method the calculation is
1710 /// booked but not executed. See RResultPtr documentation.
1711 ///
1712 /// ### Example usage:
1713 /// ~~~{.cpp}
1714 /// // Deduce column type (this invocation needs jitting internally)
1715 /// auto stdDev0 = myDf.StdDev("values");
1716 /// // Explicit column type
1717 /// auto stdDev1 = myDf.StdDev<double>("values");
1718 /// ~~~
1719 ///
1720 template <typename T = RDFDetail::RInferredType>
1721 RResultPtr<double> StdDev(std::string_view columnName = "")
1722 {
1723 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
1724 auto stdDeviationV = std::make_shared<double>(0);
1725 return CreateAction<RDFInternal::ActionTags::StdDev, T>(userColumns, stdDeviationV, stdDeviationV);
1726 }
1727
1728 // clang-format off
1729 ////////////////////////////////////////////////////////////////////////////
1730 /// \brief Return the sum of processed column values (*lazy action*)
1731 /// \tparam T The type of the branch/column.
1732 /// \param[in] columnName The name of the branch/column.
1733 /// \param[in] initValue Optional initial value for the sum. If not present, the column values must be default-constructible.
1734 /// \return the sum of the selected column wrapped in a `RResultPtr`.
1735 ///
1736 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
1737 /// template specialization of this method.
1738 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
1739 ///
1740 /// This action is *lazy*: upon invocation of this method the calculation is
1741 /// booked but not executed. See RResultPtr documentation.
1742 ///
1743 /// ### Example usage:
1744 /// ~~~{.cpp}
1745 /// // Deduce column type (this invocation needs jitting internally)
1746 /// auto sum0 = myDf.Sum("values");
1747 /// // Explicit column type
1748 /// auto sum1 = myDf.Sum<double>("values");
1749 /// ~~~
1750 ///
1751 template <typename T = RDFDetail::RInferredType>
1753 Sum(std::string_view columnName = "",
1754 const RDFDetail::SumReturnType_t<T> &initValue = RDFDetail::SumReturnType_t<T>{})
1755 {
1756 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
1757 auto sumV = std::make_shared<RDFDetail::SumReturnType_t<T>>(initValue);
1758 return CreateAction<RDFInternal::ActionTags::Sum, T>(userColumns, sumV, sumV);
1759 }
1760 // clang-format on
1761
1762 ////////////////////////////////////////////////////////////////////////////
1763 /// \brief Gather filtering statistics
1764 /// \return the resulting `RCutFlowReport` instance wrapped in a `RResultPtr`.
1765 ///
1766 /// Calling `Report` on the main `RDataFrame` object gathers stats for
1767 /// all named filters in the call graph. Calling this method on a
1768 /// stored chain state (i.e. a graph node different from the first) gathers
1769 /// the stats for all named filters in the chain section between the original
1770 /// `RDataFrame` and that node (included). Stats are gathered in the same
1771 /// order as the named filters have been added to the graph.
1772 /// A RResultPtr<RCutFlowReport> is returned to allow inspection of the
1773 /// effects cuts had.
1774 ///
1775 /// This action is *lazy*: upon invocation of
1776 /// this method the calculation is booked but not executed. See RResultPtr
1777 /// documentation.
1778 ///
1779 /// ### Example usage:
1780 /// ~~~{.cpp}
1781 /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2");
1782 /// auto cutReport = filtered.Report();
1783 /// cutReport->Print();
1784 /// ~~~
1785 ///
// Body of the Report action: builds a shared RCutFlowReport, wraps it in a ReportHelper-based
// action, books that action on the loop manager and returns the report as a result pointer.
// NOTE(review): three listing lines (1786, 1799, 1802) are missing from this extract: the method
// declaration, the line following the `Helper_t` alias (presumably the `Action_t` alias used
// below) and the remaining arguments of the `std::make_unique<Action_t>` call. Restore them from
// the original header before compiling.
1787 {
1788 bool returnEmptyReport = false;
1789 // if this is a RInterface<RLoopManager> on which `Define` has been called, users
1790 // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which
1791 // certainly does not contain named filters.
1792 // The number 4 takes into account the implicit columns for entry and slot number
1793 // and their aliases (2 + 2, i.e. {r,t}dfentry_ and {r,t}dfslot_)
1794 if (std::is_same<Proxied, RLoopManager>::value && fDefines.GetNames().size() > 4)
1795 returnEmptyReport = true;
1796
// `rep` is handed both to the helper and to MakeResultPtr, so the two share the same report.
1797 auto rep = std::make_shared<RCutFlowReport>();
1798 using Helper_t = RDFInternal::ReportHelper<Proxied>;
1800
// The action is created with an empty column list.
1801 auto action = std::make_unique<Action_t>(Helper_t(rep, fProxiedPtr, returnEmptyReport), ColumnNames_t({}),
1803
// Book the action through its raw pointer first; ownership is then moved into MakeResultPtr.
1804 fLoopManager->Book(action.get());
1805 return MakeResultPtr(rep, *fLoopManager, std::move(action));
1806 }
1807
1808 /////////////////////////////////////////////////////////////////////////////
1809 /// \brief Returns the names of the available columns
1810 /// \return the container of column names.
1811 ///
1812 /// This is not an action nor a transformation, just a query to the RDataFrame object.
1813 ///
1814 /// ### Example usage:
1815 /// ~~~{.cpp}
1816 /// auto colNames = d.GetColumnNames();
1817 /// // Print columns' names
1818 /// for (auto &&colName : colNames) std::cout << colName << std::endl;
1819 /// ~~~
1820 ///
1822 {
1823 ColumnNames_t allColumns;
1824
1825 auto addIfNotInternal = [&allColumns](std::string_view colName) {
1826 if (!RDFInternal::IsInternalColumn(colName))
1827 allColumns.emplace_back(colName);
1828 };
1829
1830 auto columnNames = fDefines.GetNames();
1831
1832 std::for_each(columnNames.begin(), columnNames.end(), addIfNotInternal);
1833
1834 auto tree = fLoopManager->GetTree();
1835 if (tree) {
1836 auto branchNames = RDFInternal::GetBranchNames(*tree, /*allowDuplicates=*/false);
1837 allColumns.insert(allColumns.end(), branchNames.begin(), branchNames.end());
1838 }
1839
1840 if (fDataSource) {
1841 auto &dsColNames = fDataSource->GetColumnNames();
1842 allColumns.insert(allColumns.end(), dsColNames.begin(), dsColNames.end());
1843 }
1844
1845 return allColumns;
1846 }
1847
1848 /////////////////////////////////////////////////////////////////////////////
1849 /// \brief Return the type of a given column as a string.
1850 /// \return the type of the required column.
1851 ///
1852 /// This is not an action nor a transformation, just a query to the RDataFrame object.
1853 ///
1854 /// ### Example usage:
1855 /// ~~~{.cpp}
1856 /// auto colType = d.GetColumnType("columnName");
1857 /// // Print column type
1858 /// std::cout << "Column " << "columnName" << " has type " << colType << std::endl;
1859 /// ~~~
1860 ///
// Returns the type name of `column` as a string, after resolving `column` through the loop
// manager's alias map.
// NOTE(review): listing line 1874 (the beginning of the return statement) is missing from this
// extract, so the helper that produces the type name is not visible here; restore it from the
// original header before compiling.
1861 std::string GetColumnType(std::string_view column)
1862 {
1863 auto col = std::string(column);
1864
1865 // if "col" is an alias, resolve it before doing anything else
// NOTE(review): `const auto` stores the alias map by value; if GetAliasMap() returns a
// reference this makes a copy - confirm whether `const auto &` was intended.
1866 const auto aliasMap = fLoopManager->GetAliasMap();
1867 const auto it = aliasMap.find(col);
1868 if (it != aliasMap.end())
1869 col = it->second;
1870
// Pointer to the matching Define node if `col` is a defined column, nullptr otherwise.
1871 RDFDetail::RDefineBase *define = fDefines.HasName(col) ? fDefines.GetColumns().at(col).get() : nullptr;
1872
// Passed as the last argument of the (missing) call below.
1873 const bool convertVector2RVec = true;
1875 convertVector2RVec);
1876 }
1877
1878 /// \brief Returns the names of the filters created.
1879 /// \return the container of filters names.
1880 ///
1881 /// If called on a root node, all the filters in the computation graph will
1882 /// be printed. For any other node, only the filters upstream of that node.
1883 /// Filters without a name are printed as "Unnamed Filter"
1884 /// This is not an action nor a transformation, just a query to the RDataFrame object.
1885 ///
1886 /// ### Example usage:
1887 /// ~~~{.cpp}
1888 /// auto filtNames = d.GetFilterNames();
1889 /// for (auto &&filtName : filtNames) std::cout << filtName << std::endl;
1890 /// ~~~
1891 ///
1892 std::vector<std::string> GetFilterNames() { return RDFInternal::GetFilterNames(fProxiedPtr); }
1893
1894 /// \brief Returns the names of the defined columns
1895 /// \return the container of the defined column names.
1896 ///
1897 /// This is not an action nor a transformation, just a simple utility to
1898 /// get the columns names that have been defined up to the node.
1899 /// If no custom column has been defined, e.g. on a root node, it returns an
1900 /// empty collection.
1901 ///
1902 /// ### Example usage:
1903 /// ~~~{.cpp}
1904 /// auto defColNames = d.GetDefinedColumnNames();
1905 /// // Print defined columns' names
1906 /// for (auto &&defColName : defColNames) std::cout << defColName << std::endl;
1907 /// ~~~
1908 ///
1910 {
1911 ColumnNames_t definedColumns;
1912
1913 auto columns = fDefines.GetColumns();
1914
1915 for (auto column : columns) {
1916 if (!RDFInternal::IsInternalColumn(column.first))
1917 definedColumns.emplace_back(column.first);
1918 }
1919
1920 return definedColumns;
1921 }
1922
1923 /// \brief Checks if a column is present in the dataset
1924 /// \return true if the column is available, false otherwise
1925 ///
1926 /// This method checks if a column is part of the input ROOT dataset, has
1927 /// been defined or can be provided by the data source.
1928 ///
1929 /// Example usage:
1930 /// ~~~{.cpp}
1931 /// ROOT::RDataFrame base(1);
1932 /// auto rdf = base.Define("definedColumn", [](){return 0;});
1933 /// rdf.HasColumn("definedColumn"); // true: we defined it
1934 /// rdf.HasColumn("rdfentry_"); // true: it's always there
1935 /// rdf.HasColumn("foo"); // false: it is not there
1936 /// ~~~
1937 bool HasColumn(std::string_view columnName)
1938 {
1939 if (fDefines.HasName(columnName))
1940 return true;
1941
1942 if (auto tree = fLoopManager->GetTree()) {
1943 const auto &branchNames = fLoopManager->GetBranchNames();
1944 const auto branchNamesEnd = branchNames.end();
1945 if (branchNamesEnd != std::find(branchNames.begin(), branchNamesEnd, columnName))
1946 return true;
1947 }
1948
1949 if (fDataSource && fDataSource->HasColumn(columnName))
1950 return true;
1951
1952 return false;
1953 }
1954
1955 /// \brief Gets the number of data processing slots
1956 /// \return The number of data processing slots used by this RDataFrame instance
1957 ///
1958 /// This method returns the number of data processing slots used by this RDataFrame
1959 /// instance. This number is influenced by the global switch ROOT::EnableImplicitMT().
1960 ///
1961 /// Example usage:
1962 /// ~~~{.cpp}
1963 /// ROOT::EnableImplicitMT(6);
1964 /// ROOT::RDataFrame df(1);
1965 /// std::cout << df.GetNSlots() << std::endl; // prints "6"
1966 /// ~~~
1967 unsigned int GetNSlots() const { return fLoopManager->GetNSlots(); }
1968
1969 /// \brief Gets the number of event loops run
1970 /// \return The number of event loops run by this RDataFrame instance
1971 ///
1972 /// This method returns the number of event loops run so far by this RDataFrame instance.
1973 ///
1974 /// Example usage:
1975 /// ~~~{.cpp}
1976 /// ROOT::RDataFrame df(1);
1977 /// std::cout << df.GetNRuns() << std::endl; // prints "0"
1978 /// df.Sum("rdfentry_").GetValue(); // trigger the event loop
1979 /// std::cout << df.GetNRuns() << std::endl; // prints "1"
1980 /// df.Sum("rdfentry_").GetValue(); // trigger another event loop
1981 /// std::cout << df.GetNRuns() << std::endl; // prints "2"
1982 /// ~~~
1983 unsigned int GetNRuns() const { return fLoopManager->GetNRuns(); }
1984
1985 // clang-format off
1986 ////////////////////////////////////////////////////////////////////////////
1987 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot
1988 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
1989 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
1990 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1991 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
1992 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
1993 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
1994 /// \param[in] aggIdentity The aggregator variable of each thread is initialised to this value (or is default-constructed if the parameter is omitted)
1995 /// \return the result of the aggregation wrapped in a `RResultPtr`.
1996 ///
1997 /// An aggregator callable takes two values, an aggregator variable and a column value. The aggregator variable is
1998 /// initialized to aggIdentity or default-constructed if aggIdentity is omitted.
1999 /// This action calls the aggregator callable for each processed entry, passing in the aggregator variable and
2000 /// the value of the column columnName.
2001 /// If the signature is `U(U,T)` the aggregator variable is then copy-assigned the result of the execution of the callable.
2002 /// Otherwise the signature of aggregator must be `void(U&,T)`.
2003 ///
2004 /// The merger callable is used to merge the partial accumulation results of each processing thread. It is only called in multi-thread executions.
2005 /// If its signature is `U(U,U)` the aggregator variables of each thread are merged two by two.
2006 /// If its signature is `void(std::vector<U>& a)` it is assumed that it merges all aggregators in a[0].
2007 ///
2008 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. See RResultPtr documentation.
2009 ///
2010 /// Example usage:
2011 /// ~~~{.cpp}
2012 /// auto aggregator = [](double acc, double x) { return acc * x; };
2013 /// ROOT::EnableImplicitMT();
2014 /// // If multithread is enabled, the aggregator function will be called by more threads
2015 /// // and will produce a vector of partial accumulators.
2016 /// // The merger function performs the final aggregation of these partial results.
2017 /// auto merger = [](std::vector<double> &accumulators) {
2018 /// for (auto i : ROOT::TSeqU(1u, accumulators.size())) {
2019 /// accumulators[0] *= accumulators[i];
2020 /// }
2021 /// };
2022 ///
2023 /// // The accumulator is initialized at this value by every thread.
2024 /// double initValue = 1.;
2025 ///
2026 /// // Multiplies all elements of the column "x"
2027 /// auto result = d.Aggregate(aggregator, merger, columnName, initValue);
2028 /// ~~~
2029 // clang-format on
2030 template <typename AccFun, typename MergeFun, typename R = typename TTraits::CallableTraits<AccFun>::ret_type,
2031 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
2032 typename ArgTypesNoDecay = typename TTraits::CallableTraits<AccFun>::arg_types_nodecay,
2033 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
2034 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
2035 RResultPtr<U> Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
2036 {
2037 RDFInternal::CheckAggregate<R, MergeFun>(ArgTypesNoDecay());
2038 const auto columns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2039
2040 const auto validColumnNames = GetValidatedColumnNames(1, columns);
2041 CheckAndFillDSColumns(validColumnNames, TTraits::TypeList<T>());
2042
2043 auto accObjPtr = std::make_shared<U>(aggIdentity);
2044 using Helper_t = RDFInternal::AggregateHelper<AccFun, MergeFun, R, T, U>;
2045 using Action_t = typename RDFInternal::RAction<Helper_t, Proxied>;
2046 auto action = std::make_unique<Action_t>(
2047 Helper_t(std::move(aggregator), std::move(merger), accObjPtr, fLoopManager->GetNSlots()), validColumnNames,
2049 fLoopManager->Book(action.get());
2050 return MakeResultPtr(accObjPtr, *fLoopManager, std::move(action));
2051 }
2052
2053 // clang-format off
2054 ////////////////////////////////////////////////////////////////////////////
2055 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot
2056 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
2057 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
2058 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
2059 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
2060 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
2061 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
2062 /// \return the result of the aggregation wrapped in a `RResultPtr`.
2063 ///
2064 /// See previous Aggregate overload for more information.
2065 // clang-format on
2066 template <typename AccFun, typename MergeFun, typename R = typename TTraits::CallableTraits<AccFun>::ret_type,
2067 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
2068 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
2069 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
2070 RResultPtr<U> Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName = "")
2071 {
2072 static_assert(
2073 std::is_default_constructible<U>::value,
2074 "aggregated object cannot be default-constructed. Please provide an initialisation value (aggIdentity)");
2075 return Aggregate(std::move(aggregator), std::move(merger), columnName, U());
2076 }
2077
2078 // clang-format off
2079 ////////////////////////////////////////////////////////////////////////////
2080 /// \brief Book execution of a custom action using a user-defined helper object.
2081 /// \tparam ColumnTypes List of types of columns used by this action.
2082 /// \tparam Helper The type of the user-defined helper. See below for the required interface it should expose.
2083 /// \param[in] helper The Action Helper to be scheduled.
2084 /// \param[in] columns The names of the columns on which the helper acts.
2085 /// \return the result of the helper wrapped in a `RResultPtr`.
2086 ///
2087 /// This method books a custom action for execution. The behavior of the action is completely dependent on the
2088 /// Helper object provided by the caller. The minimum required interface for the helper is the following (more
2089 /// methods can be present, e.g. a constructor that takes the number of worker threads is usually useful):
2090 ///
2091 /// * Helper must publicly inherit from ROOT::Detail::RDF::RActionImpl<Helper>
2092 /// * Helper(Helper &&): a move-constructor is required. Copy-constructors are discouraged.
2093 /// * Result_t: alias for the type of the result of this action helper. Must be default-constructible.
2094 /// * void Exec(unsigned int slot, ColumnTypes...columnValues): each working thread shall call this method
2095 /// during the event-loop, possibly concurrently. No two threads will ever call Exec with the same 'slot' value:
2096 /// this parameter is there to facilitate writing thread-safe helpers. The other arguments will be the values of
2097 /// the requested columns for the particular entry being processed.
2098 /// * void InitTask(TTreeReader *, unsigned int slot): each working thread shall call this method during the event
2099 /// loop, before processing a batch of entries (possibly read from the TTreeReader passed as argument, if not null).
2100 /// This method can be used e.g. to prepare the helper to process a batch of entries in a given thread. Can be no-op.
2101 /// * void Initialize(): this method is called once before starting the event-loop. Useful for setup operations. Can be no-op.
2102 /// * void Finalize(): this method is called at the end of the event loop. Commonly used to finalize the contents of the result.
2103 /// * Result_t &PartialUpdate(unsigned int slot): this method is optional, i.e. can be omitted. If present, it should
2104 /// return the value of the partial result of this action for the given 'slot'. Different threads might call this
2105 /// method concurrently, but will always pass different 'slot' numbers.
2106 /// * std::shared_ptr<Result_t> GetResultPtr() const: return a shared_ptr to the result of this action (of type
2107 /// Result_t). The RResultPtr returned by Book will point to this object.
2108 ///
2109 /// See ActionHelpers.hxx for the helpers used by standard RDF actions.
2110 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. See RResultPtr documentation.
2111 // clang-format on
2112 template <typename... ColumnTypes, typename Helper>
2113 RResultPtr<typename Helper::Result_t> Book(Helper &&helper, const ColumnNames_t &columns = {})
2114 {
2115 constexpr auto nColumns = sizeof...(ColumnTypes);
2116 RDFInternal::CheckTypesAndPars(sizeof...(ColumnTypes), columns.size());
2117
2118 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
2119
2120 // TODO add more static sanity checks on Helper
2121 using AH = RDFDetail::RActionImpl<Helper>;
2122 static_assert(std::is_base_of<AH, Helper>::value && std::is_convertible<Helper *, AH *>::value,
2123 "Action helper of type T must publicly inherit from ROOT::Detail::RDF::RActionImpl<T>");
2124
2125 using Action_t = typename RDFInternal::RAction<Helper, Proxied, TTraits::TypeList<ColumnTypes...>>;
2126 auto resPtr = helper.GetResultPtr();
2127
2129
2130 auto action = std::make_unique<Action_t>(Helper(std::forward<Helper>(helper)), validColumnNames, fProxiedPtr,
2131 fDefines);
2132 fLoopManager->Book(action.get());
2133 fLoopManager->AddDataBlockCallback(action->GetDataBlockCallback());
2134 return MakeResultPtr(resPtr, *fLoopManager, std::move(action));
2135 }
2136
2137 ////////////////////////////////////////////////////////////////////////////
2138 /// \brief Provides a representation of the columns in the dataset
2139 /// \tparam ColumnTypes variadic list of branch/column types.
2140 /// \param[in] columnList Names of the columns to be displayed.
2141 /// \param[in] nRows Number of events for each column to be displayed.
2142 /// \return the `RDisplay` instance wrapped in a `RResultPtr`.
2143 ///
2144 /// This function returns a `RResultPtr<RDisplay>` containing all the entries to be displayed, organized in a tabular
2145 /// form. RDisplay will either print on the standard output a summarized version through `Print()` or will return a
2146 /// complete version through `AsString()`.
2147 ///
2148 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. See RResultPtr documentation.
2149 ///
2150 /// Example usage:
2151 /// ~~~{.cpp}
2152 /// // Preparing the RResultPtr<RDisplay> object with all columns and default number of entries
2153 /// auto d1 = d.Display("");
2154 /// // Preparing the RResultPtr<RDisplay> object with two columns and 128 entries
2155 /// auto d2 = d.Display({"x", "y"}, 128);
2156 /// // Printing the short representations, the event loop will run
2157 /// d1->Print();
2158 /// d2->Print();
2159 /// ~~~
2160 template <typename... ColumnTypes>
2161 RResultPtr<RDisplay> Display(const ColumnNames_t &columnList, const int &nRows = 5)
2162 {
2163 CheckIMTDisabled("Display");
2164
2165 auto displayer = std::make_shared<RDFInternal::RDisplay>(columnList, GetColumnTypeNamesList(columnList), nRows);
2166 return CreateAction<RDFInternal::ActionTags::Display, ColumnTypes...>(columnList, displayer, displayer);
2167 }
2168
2169 ////////////////////////////////////////////////////////////////////////////
2170 /// \brief Provides a representation of the columns in the dataset
2171 /// \param[in] columnList Names of the columns to be displayed.
2172 /// \param[in] nRows Number of events for each column to be displayed.
2173 /// \return the `RDisplay` instance wrapped in a `RResultPtr`.
2174 ///
2175 /// This overload automatically infers the column types.
2176 /// See the previous overloads for further details.
2177 RResultPtr<RDisplay> Display(const ColumnNames_t &columnList, const int &nRows = 5)
2178 {
2179 CheckIMTDisabled("Display");
2180 auto displayer = std::make_shared<RDFInternal::RDisplay>(columnList, GetColumnTypeNamesList(columnList), nRows);
2181 return CreateAction<RDFInternal::ActionTags::Display, RDFDetail::RInferredType>(columnList, displayer, displayer,
2182 columnList.size());
2183 }
2184
2185 ////////////////////////////////////////////////////////////////////////////
2186 /// \brief Provides a representation of the columns in the dataset
2187 /// \param[in] columnNameRegexp A regular expression to select the columns.
2188 /// \param[in] nRows Number of events for each column to be displayed.
2189 /// \return the `RDisplay` instance wrapped in a `RResultPtr`.
2190 ///
2191 /// The existing columns are matched against the regular expression. If the string provided
2192 /// is empty, all columns are selected.
2193 /// See the previous overloads for further details.
2194 RResultPtr<RDisplay> Display(std::string_view columnNameRegexp = "", const int &nRows = 5)
2195 {
2196 const auto columnNames = GetColumnNames();
2197 const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Display");
2198 return Display(selectedColumns, nRows);
2199 }
2200
2201 ////////////////////////////////////////////////////////////////////////////
2202 /// \brief Provides a representation of the columns in the dataset
2203 /// \param[in] columnList Names of the columns to be displayed.
2204 /// \param[in] nRows Number of events for each column to be displayed.
2205 /// \return the `RDisplay` instance wrapped in a `RResultPtr`.
2206 ///
2207 /// See the previous overloads for further details.
2208 RResultPtr<RDisplay> Display(std::initializer_list<std::string> columnList, const int &nRows = 5)
2209 {
2210 ColumnNames_t selectedColumns(columnList);
2211 return Display(selectedColumns, nRows);
2212 }
2213
2214private:
2216 {
2218
2219 // Entry number column
2220 const std::string entryColName = "rdfentry_";
2221 const std::string entryColType = "ULong64_t";
2222 auto entryColGen = [](unsigned int, ULong64_t entry) { return entry; };
2223 using NewColEntry_t =
2224 RDFDetail::RDefine<decltype(entryColGen), RDFDetail::CustomColExtraArgs::SlotAndEntry>;
2225
2226 auto entryColumn = std::make_shared<NewColEntry_t>(entryColName, entryColType, std::move(entryColGen),
2227 ColumnNames_t{}, fLoopManager->GetNSlots(), newCols,
2229 newCols.AddColumn(entryColumn, entryColName);
2230
2231 // Slot number column
2232 const std::string slotColName = "rdfslot_";
2233 const std::string slotColType = "unsigned int";
2234 auto slotColGen = [](unsigned int slot) { return slot; };
2235 using NewColSlot_t = RDFDetail::RDefine<decltype(slotColGen), RDFDetail::CustomColExtraArgs::Slot>;
2236
2237 auto slotColumn = std::make_shared<NewColSlot_t>(slotColName, slotColType, std::move(slotColGen), ColumnNames_t{},
2238 fLoopManager->GetNSlots(), newCols,
2240 newCols.AddColumn(slotColumn, slotColName);
2241
2242 fDefines = std::move(newCols);
2243
2244 fLoopManager->AddColumnAlias("tdfentry_", entryColName);
2245 fDefines.AddName("tdfentry_");
2246 fLoopManager->AddColumnAlias("tdfslot_", slotColName);
2247 fDefines.AddName("tdfslot_");
2248 }
2249
2250 std::vector<std::string> GetColumnTypeNamesList(const ColumnNames_t &columnList)
2251 {
2252 std::vector<std::string> types;
2253
2254 for (auto column : columnList) {
2255 types.push_back(GetColumnType(column));
2256 }
2257 return types;
2258 }
2259
2260 void CheckIMTDisabled(std::string_view callerName)
2261 {
2263 std::string error(callerName);
2264 error += " was called with ImplicitMT enabled, but multi-thread is not supported.";
2265 throw std::runtime_error(error);
2266 }
2267 }
2268
2269 /// Create RAction object, return RResultPtr for the action
2270 /// Overload for the case in which all column types were specified (no jitting).
2271 /// For most actions, `r` and `helperArg` will refer to the same object, because the only argument to forward to
2272 /// the action helper is the result value itself. We need the distinction for actions such as Snapshot or Cache,
2273 /// for which the constructor arguments of the action helper are different from the returned value.
2274 template <typename ActionTag, typename... ColTypes, typename ActionResultType,
2275 typename HelperArgType = ActionResultType,
2276 typename std::enable_if<!RDFInternal::RNeedJitting<ColTypes...>::value, int>::type = 0>
2277 RResultPtr<ActionResultType> CreateAction(const ColumnNames_t &columns, const std::shared_ptr<ActionResultType> &r,
2278 const std::shared_ptr<HelperArgType> &helperArg)
2279 {
2280 constexpr auto nColumns = sizeof...(ColTypes);
2281
2282 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
2284
2285 const auto nSlots = fLoopManager->GetNSlots();
2286
2287 auto action =
2288 RDFInternal::BuildAction<ColTypes...>(validColumnNames, helperArg, nSlots, fProxiedPtr, ActionTag{}, fDefines);
2289 fLoopManager->Book(action.get());
2290 fLoopManager->AddDataBlockCallback(action->GetDataBlockCallback());
2291 return MakeResultPtr(r, *fLoopManager, std::move(action));
2292 }
2293
2294 /// Create RAction object, return RResultPtr for the action
2295 /// Overload for the case in which one or more column types were not specified (RTTI + jitting).
2296 /// This overload has a `nColumns` optional argument. If present, the number of required columns for
2297 /// this action is taken equal to nColumns, otherwise it is assumed to be sizeof...(ColTypes).
2298 template <typename ActionTag, typename... ColTypes, typename ActionResultType,
2299 typename HelperArgType = ActionResultType,
2300 typename std::enable_if<RDFInternal::RNeedJitting<ColTypes...>::value, int>::type = 0>
2301 RResultPtr<ActionResultType> CreateAction(const ColumnNames_t &columns, const std::shared_ptr<ActionResultType> &r,
2302 const std::shared_ptr<HelperArgType> &helperArg, const int nColumns = -1)
2303 {
2304 auto realNColumns = (nColumns > -1 ? nColumns : sizeof...(ColTypes));
2305
2306 const auto validColumnNames = GetValidatedColumnNames(realNColumns, columns);
2307 const unsigned int nSlots = fLoopManager->GetNSlots();
2308
2309 auto tree = fLoopManager->GetTree();
2310 auto helperArgOnHeap = RDFInternal::MakeSharedOnHeap(helperArg);
2311
2312 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
2313 using BaseNodeType_t = typename std::remove_pointer<decltype(upcastNodeOnHeap)>::type::element_type;
2314 RInterface<BaseNodeType_t> upcastInterface(*upcastNodeOnHeap, *fLoopManager, fDefines, fDataSource);
2315
2316 const auto jittedAction = std::make_shared<RDFInternal::RJittedAction>(*fLoopManager);
2317 auto jittedActionOnHeap = RDFInternal::MakeWeakOnHeap(jittedAction);
2318
2319 auto toJit = RDFInternal::JitBuildAction(
2320 validColumnNames, upcastNodeOnHeap, typeid(std::shared_ptr<HelperArgType>), typeid(ActionTag), helperArgOnHeap,
2321 tree, nSlots, fDefines, fDataSource, jittedActionOnHeap);
2322 fLoopManager->Book(jittedAction.get());
2323 fLoopManager->ToJitExec(toJit);
2324 return MakeResultPtr(r, *fLoopManager, std::move(jittedAction));
2325 }
2326
2327 template <typename F, typename DefineType, typename RetType = typename TTraits::CallableTraits<F>::ret_type>
2328 typename std::enable_if<std::is_default_constructible<RetType>::value, RInterface<Proxied, DS_t>>::type
2329 DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns)
2330 {
2334
2335 using ArgTypes_t = typename TTraits::CallableTraits<F>::arg_types;
2336 using ColTypesTmp_t = typename RDFInternal::RemoveFirstParameterIf<
2337 std::is_same<DefineType, RDFDetail::CustomColExtraArgs::Slot>::value, ArgTypes_t>::type;
2338 using ColTypes_t = typename RDFInternal::RemoveFirstTwoParametersIf<
2339 std::is_same<DefineType, RDFDetail::CustomColExtraArgs::SlotAndEntry>::value, ColTypesTmp_t>::type;
2340
2341 constexpr auto nColumns = ColTypes_t::list_size;
2342
2343 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
2344 CheckAndFillDSColumns(validColumnNames, ColTypes_t());
2345
2346 // Declare return type to the interpreter, for future use by jitted actions
2347 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType));
2348 if (retTypeName.empty()) {
2349 // The type is not known to the interpreter.
2350 // We must not error out here, but if/when this column is used in jitted code
2351 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType));
2352 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
2353 }
2354
2355 using NewCol_t = RDFDetail::RDefine<F, DefineType>;
2356 auto newColumn =
2357 std::make_shared<NewCol_t>(name, retTypeName, std::forward<F>(expression), validColumnNames,
2359
2361 newCols.AddColumn(newColumn, name);
2362
2363 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, newCols, fDataSource);
2364
2365 return newInterface;
2366 }
2367
2368 // This overload is chosen when the callable passed to Define or DefineSlot returns a type that is not
2369 // default-constructible (e.g. void). It simply fires a compile-time error. This is preferable to a static_assert in
2370 // the main `Define` overload because this way compilation of `Define` has no way to continue after the error.
2371 template <typename F, typename DefineType, typename RetType = typename TTraits::CallableTraits<F>::ret_type,
2372 bool IsFStringConv = std::is_convertible<F, std::string>::value,
2373 bool IsRetTypeDefConstr = std::is_default_constructible<RetType>::value>
2374 typename std::enable_if<!IsFStringConv && !IsRetTypeDefConstr, RInterface<Proxied, DS_t>>::type
2375 DefineImpl(std::string_view, F, const ColumnNames_t &)
2376 {
2377 static_assert(std::is_default_constructible<typename TTraits::CallableTraits<F>::ret_type>::value,
2378 "Error in `Define`: type returned by expression is not default-constructible");
2379 return *this; // never reached
2380 }
2381
2382 template <typename... ColumnTypes>
2383 RResultPtr<RInterface<RLoopManager>> SnapshotImpl(std::string_view fullTreeName, std::string_view filename,
2384 const ColumnNames_t &columnList, const RSnapshotOptions &options)
2385 {
2386 RDFInternal::CheckTypesAndPars(sizeof...(ColumnTypes), columnList.size());
2387
2388 const auto validCols = GetValidatedColumnNames(columnList.size(), columnList);
2390
2391 const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName);
2392 const auto &treename = parsedTreePath.fTreeName;
2393 const auto &dirname = parsedTreePath.fDirName;
2394
2395 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
2396 std::string(filename), std::string(dirname), std::string(treename), columnList, options});
2397
2399 auto newRDF = std::make_shared<ROOT::RDataFrame>(fullTreeName, filename, validCols);
2400
2401 auto resPtr = CreateAction<RDFInternal::ActionTags::Snapshot, ColumnTypes...>(validCols, newRDF, snapHelperArgs);
2402
2403 if (!options.fLazy)
2404 *resPtr;
2405 return resPtr;
2406 }
2407
2408 ////////////////////////////////////////////////////////////////////////////
2409 /// \brief Implementation of cache
2410 template <typename... ColTypes, std::size_t... S>
2411 RInterface<RLoopManager> CacheImpl(const ColumnNames_t &columnList, std::index_sequence<S...>)
2412 {
2413 // Check at compile time that the columns types are copy constructible
2414 constexpr bool areCopyConstructible =
2415 RDFInternal::TEvalAnd<std::is_copy_constructible<ColTypes>::value...>::value;
2416 static_assert(areCopyConstructible, "Columns of a type which is not copy constructible cannot be cached yet.");
2417
2418 RDFInternal::CheckTypesAndPars(sizeof...(ColTypes), columnList.size());
2419
2420 auto colHolders = std::make_tuple(Take<ColTypes>(columnList[S])...);
2421 auto ds = std::make_unique<RLazyDS<ColTypes...>>(std::make_pair(columnList[S], std::get<S>(colHolders))...);
2422
2423 RInterface<RLoopManager> cachedRDF(std::make_shared<RLoopManager>(std::move(ds), columnList));
2424
2425 return cachedRDF;
2426 }
2427
2428protected:
2429 RInterface(const std::shared_ptr<Proxied> &proxied, RLoopManager &lm,
2430 const RDFInternal::RBookedDefines &columns, RDataSource *ds)
2431 : fProxiedPtr(proxied), fLoopManager(&lm), fDataSource(ds), fDefines(columns)
2432 {
2433 }
2434
2436
2437 const std::shared_ptr<Proxied> &GetProxiedPtr() const { return fProxiedPtr; }
2438
2439 /// Prepare the call to the GetValidatedColumnNames routine, making sure that GetBranchNames,
2440 /// which is expensive in terms of runtime, is called at most once.
2441 ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
2442 {
2444 fDataSource);
2445 }
2446
2447 template <typename... ColumnTypes>
2449 {
2450 if (fDataSource != nullptr)
2451 RDFInternal::AddDSColumns(validCols, *fLoopManager, *fDataSource, typeList);
2452 }
2453};
2454
2455} // namespace RDF
2456
2457} // namespace ROOT
2458
2459#endif // ROOT_RDF_TINTERFACE
ROOT::R::TRInterface & r
Definition Object.C:4
#define f(i)
Definition RSha256.hxx:104
#define h(i)
Definition RSha256.hxx:106
unsigned int UInt_t
Definition RtypesCore.h:46
unsigned long long ULong64_t
Definition RtypesCore.h:74
const Int_t kError
Definition TError.h:48
char name[80]
Definition TGX11.cxx:110
int type
Definition TGX11.cxx:121
The head node of a RDF computation graph.
const std::map< std::string, std::string > & GetAliasMap() const
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
void ToJitExec(const std::string &) const
void Run()
Start the event loop with a different mechanism depending on IMT/no IMT, data source/no data source.
void AddColumnAlias(const std::string &alias, const std::string &colName)
const std::map< std::string, std::vector< void * > > & GetDSValuePtrs() const
RDataSource * GetDataSource() const
void Book(RDFInternal::RActionBase *actionPtr)
void Jit()
Add RDF nodes that require just-in-time compilation to the computation graph.
void AddDataBlockCallback(std::function< void(unsigned int)> &&callback)
Helper class that provides the operation graph nodes.
A RDataFrame node that produces a result.
Definition RAction.hxx:52
Encapsulates the columns defined by the user.
bool HasName(std::string_view name) const
Check if the provided name is tracked in the names list.
void AddColumn(const std::shared_ptr< RDFDetail::RDefineBase > &column, std::string_view name)
Add a new booked column.
const RDefineBasePtrMap_t & GetColumns() const
Returns the list of the pointers to the defined columns.
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
void AddName(std::string_view name)
Add a new name to the list returned by GetNames without booking a new column.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
The public interface to the RDataFrame federation of classes.
RInterface(const RInterface &)=default
Copy-ctor for RInterface.
RResultPtr<::TH1D > Histo1D(std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action)
RInterface(const std::shared_ptr< Proxied > &proxied, RLoopManager &lm, const RDFInternal::RBookedDefines &columns, RDataSource *ds)
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.})
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action)
RResultPtr< T > Fill(T &&model, const ColumnNames_t &columnList)
Return an object of type T on which T::Fill will be called once per event (lazy action)
RResultPtr<::TH2D > Histo2D(const TH2DModel &model)
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a one-dimensional profile (lazy action)
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp="", const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< TStatistic > Stats(std::string_view value="")
Return a TStatistic object, filled once per event (lazy action)
RLoopManager * GetLoopManager() const
RResultPtr<::TGraph > Graph(std::string_view v1Name="", std::string_view v2Name="")
Fill and return a graph (lazy action)
RInterface< Proxied, DS_t > DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Creates a custom column with a value dependent on the processing slot.
RResultPtr< double > StdDev(std::string_view columnName="")
Return the unbiased standard deviation of processed column values (lazy action)
unsigned int GetNSlots() const
Gets the number of data processing slots.
RInterface(const std::shared_ptr< Proxied > &proxied)
Only enabled when building an RInterface<RLoopManager>
void ForeachSlot(F f, const ColumnNames_t &columns={})
Execute a user-defined function requiring a processing slot index on each entry (instant action)
RResultPtr< ActionResultType > CreateAction(const ColumnNames_t &columns, const std::shared_ptr< ActionResultType > &r, const std::shared_ptr< HelperArgType > &helperArg, const int nColumns=-1)
Create RAction object, return RResultPtr for the action. Overload for the case in which one or more co...
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RResultPtr< RDisplay > Display(std::initializer_list< std::string > columnList, const int &nRows=5)
Provides a representation of the columns in the dataset.
RInterface< Proxied, DS_t > Define(std::string_view name, F expression, const ColumnNames_t &columns={})
Creates a custom column.
RResultPtr< TStatistic > Stats(std::string_view value, std::string_view weight)
Return a TStatistic object, filled once per event (lazy action)
RDataSource * fDataSource
Non-owning pointer to a data-source object. Null if no data-source. RLoopManager has ownership of the...
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a two-dimensional histogram (lazy action)
RResultPtr< RInterface< RLoopManager > > SnapshotImpl(std::string_view fullTreeName, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options)
RResultPtr< ActionResultType > CreateAction(const ColumnNames_t &columns, const std::shared_ptr< ActionResultType > &r, const std::shared_ptr< HelperArgType > &helperArg)
Create RAction object, return RResultPtr for the action. Overload for the case in which all column typ...
ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
Prepare the call to the GetValidatedColumnNames routine, making sure that GetBranchNames,...
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model)
RInterface< RDFDetail::RFilter< F, Proxied >, DS_t > Filter(F f, const std::initializer_list< std::string > &columns)
Append a filter to the call graph.
RResultPtr< double > Mean(std::string_view columnName="")
Return the mean of processed column values (lazy action)
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::initializer_list< std::string > columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
std::enable_if<!IsFStringConv &&!IsRetTypeDefConstr, RInterface< Proxied, DS_t > >::type DefineImpl(std::string_view, F, const ColumnNames_t &)
RInterface< Proxied, DS_t > Alias(std::string_view alias, std::string_view columnName)
Allows referring to a column by a different name.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RDFInternal::RBookedDefines fDefines
Contains the custom columns defined up to this node.
RInterface< RLoopManager > Cache(std::string_view columnNameRegexp="")
Save selected columns in memory.
RResultPtr< RDisplay > Display(std::string_view columnNameRegexp="", const int &nRows=5)
Provides a representation of the columns in the dataset.
RLoopManager * fLoopManager
friend class RDFInternal::GraphDrawing::GraphCreatorHelper
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a weighted two-dimensional histogram (lazy action)
RInterface & operator=(const RInterface &)=default
Copy-assignment operator for RInterface.
RResultPtr< RDFDetail::SumReturnType_t< T > > Sum(std::string_view columnName="", const RDFDetail::SumReturnType_t< T > &initValue=RDFDetail::SumReturnType_t< T >{})
Return the sum of processed column values (lazy action)
RResultPtr< ULong64_t > Count()
Return the number of entries processed (lazy action)
RResultPtr< T > Fill(T &&model, const ColumnNames_t &columnList)
Return an object of type T on which T::Fill will be called once per event (lazy action)
RInterface< Proxied, DS_t > Define(std::string_view name, std::string_view expression)
Creates a custom column.
std::shared_ptr< Proxied > fProxiedPtr
Smart pointer to the graph node encapsulated by this RInterface.
RResultPtr<::TH1D > Histo1D(std::string_view vName)
Fill and return a one-dimensional histogram with the values of a column (lazy action)
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, const int &nRows=5)
Provides a representation of the columns in the dataset.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action)
RInterface< RLoopManager > CacheImpl(const ColumnNames_t &columnList, std::index_sequence< S... >)
Implementation of cache.
RInterface< RDFDetail::RRange< Proxied >, DS_t > Range(unsigned int end)
Creates a node that filters entries based on range.
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default)
RInterface< RLoopManager > Cache(std::initializer_list< std::string > columnList)
Save selected columns in memory.
void CheckAndFillDSColumns(ColumnNames_t validCols, TTraits::TypeList< ColumnTypes... > typeList)
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a two-dimensional profile (lazy action)
RResultPtr< typename Helper::Result_t > Book(Helper &&helper, const ColumnNames_t &columns={})
Book execution of a custom action using a user-defined helper object.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, const int &nRows=5)
Provides a representation of the columns in the dataset.
const std::shared_ptr< Proxied > & GetProxiedPtr() const
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a three-dimensional histogram (lazy action)
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< RCutFlowReport > Report()
Gather filtering statistics.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a two-dimensional profile (lazy action)
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName="")
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RInterface< Proxied, DS_t > DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Creates a custom column with a value dependent on the processing slot and the current entry.
RResultPtr< RDFDetail::MinReturnType_t< T > > Min(std::string_view columnName="")
Return the minimum of processed column values (lazy action)
RResultPtr< T > Reduce(F f, std::string_view columnName="")
Execute a user-defined reduce operation on the values of a column.
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action)
RInterface< RDFDetail::RJittedFilter, DS_t > Filter(std::string_view expression, std::string_view name="")
Append a filter to the call graph.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model)
std::enable_if< std::is_default_constructible< RetType >::value, RInterface< Proxied, DS_t > >::type DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns)
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
ColumnNames_t GetDefinedColumnNames()
Returns the names of the defined columns.
RInterface< RDFDetail::RFilter< F, Proxied >, DS_t > Filter(F f, const ColumnNames_t &columns={}, std::string_view name="")
Append a filter to the call graph.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RInterface(RInterface &&)=default
Move-ctor for RInterface.
RResultPtr< T > Reduce(F f, std::string_view columnName, const T &redIdentity)
Execute a user-defined reduce operation on the values of a column.
void CheckIMTDisabled(std::string_view callerName)
unsigned int GetNRuns() const
Gets the number of event loops run.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a three-dimensional histogram (lazy action)
bool HasColumn(std::string_view columnName)
Checks if a column is present in the dataset.
RDFDetail::ColumnNames_t ColumnNames_t
RInterface< RDFDetail::RFilter< F, Proxied >, DS_t > Filter(F f, std::string_view name)
Append a filter to the call graph.
RInterface< RDFDetail::RRange< Proxied >, DS_t > Range(unsigned int begin, unsigned int end, unsigned int stride=1)
Creates a node that filters entries based on range: [begin, end)
std::vector< std::string > GetColumnTypeNamesList(const ColumnNames_t &columnList)
std::vector< std::string > GetFilterNames()
Returns the names of the filters created.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.}, std::string_view vName="")
Fill and return a one-dimensional histogram with the values of a column (lazy action)
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a one-dimensional profile (lazy action)
RResultPtr<::TH3D > Histo3D(const TH3DModel &model)
RResultPtr< RDFDetail::MaxReturnType_t< T > > Max(std::string_view columnName="")
Return the maximum of processed column values (lazy action)
An RDataSource implementation which is built on top of result proxies.
Smart pointer for the return type of actions.
ROOT's RDataFrame offers a high-level interface for analyses of data stored in TTrees,...
typename RemoveFirstParameter< T >::type RemoveFirstParameter_t
Small helper to keep current directory context.
Definition TDirectory.h:52
A TGraph is an object made of two arrays X and Y with npoints each.
Definition TGraph.h:41
Statistical variable, defined by its mean and variance (RMS).
Definition TStatistic.h:33
#define F(x, y, z)
std::vector< std::string > ColumnNames_t
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
std::vector< std::string > GetBranchNames(TTree &t, bool allowDuplicates=true)
Get all the branches names, including the ones of the friend trees.
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, RDataSource *ds, RDefineBase *define, bool vector2rvec)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:223
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:99
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string PrettyPrintAddr(const void *const addr)
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::string JitBuildAction(const ColumnNames_t &cols, std::shared_ptr< RDFDetail::RNodeBase > *prevNode, const std::type_info &helperArgType, const std::type_info &at, void *helperArgOnHeap, TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedDefines &customCols, RDataSource *ds, std::weak_ptr< RJittedAction > *jittedActionOnHeap)
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validDefines, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
bool IsInternalColumn(std::string_view colName)
Definition RDFUtils.cxx:347
void CheckDefine(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &dataSourceColumns)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
void BookFilterJit(const std::shared_ptr< RJittedFilter > &jittedFilter, std::shared_ptr< RDFDetail::RNodeBase > *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RDFInternal::RBookedDefines &customCols, TTree *tree, RDataSource *ds)
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RDFInternal::RBookedDefines &customCols, const ColumnNames_t &branches, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
RInterface<::ROOT::Detail::RDF::RNodeBase, void > RNode
ROOT type_traits extensions.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow forward-declaring tb...
void EnableImplicitMT(UInt_t numthreads=0)
Enable ROOT's implicit multi-threading for all objects and methods that provide an internal paralleli...
Definition TROOT.cxx:525
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:556
void DisableImplicitMT()
Disables the implicit multi-threading in ROOT (see EnableImplicitMT).
Definition TROOT.cxx:542
Definition graph.py:1
Definition tree.py:1
A collection of options to steer the creation of the dataset on file.
bool fLazy
Do not start the event loop when Snapshot is called.
A struct which stores the parameters of a TH1D.
std::shared_ptr<::TH1D > GetHistogram() const
A struct which stores the parameters of a TH2D.
std::shared_ptr<::TH2D > GetHistogram() const
A struct which stores the parameters of a TH3D.
std::shared_ptr<::TH3D > GetHistogram() const
A struct which stores the parameters of a TProfile.
std::shared_ptr<::TProfile > GetProfile() const
A struct which stores the parameters of a TProfile2D.
std::shared_ptr<::TProfile2D > GetProfile() const
Lightweight storage for a collection of types.