Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterface.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDF_TINTERFACE
12#define ROOT_RDF_TINTERFACE
13
14#include "ROOT/RDataSource.hxx"
20#include "ROOT/RDF/RDefine.hxx"
22#include "ROOT/RDF/RFilter.hxx"
27#include "ROOT/RDF/RRange.hxx"
29#include "ROOT/RDF/Utils.hxx"
32#include "ROOT/RResultPtr.hxx"
34#include <string_view>
35#include "ROOT/RVec.hxx"
36#include "ROOT/TypeTraits.hxx"
37#include "RtypesCore.h" // for ULong64_t
38#include "TDirectory.h"
39#include "TH1.h" // For Histo actions
40#include "TH2.h" // For Histo actions
41#include "TH3.h" // For Histo actions
42#include "THn.h"
43#include "TProfile.h"
44#include "TProfile2D.h"
45#include "TStatistic.h"
46
47#include <algorithm>
48#include <cstddef>
49#include <initializer_list>
50#include <iterator> // std::back_inserter
51#include <limits>
52#include <memory>
53#include <set>
54#include <sstream>
55#include <stdexcept>
56#include <string>
57#include <type_traits> // is_same, enable_if
58#include <typeinfo>
59#include <unordered_set>
60#include <utility> // std::index_sequence
61#include <vector>
62#include <any>
63
64class TGraph;
65
66// Windows requires a forward decl of printValue to accept it as a valid friend function in RInterface
67namespace ROOT {
71class RDataFrame;
72} // namespace ROOT
73namespace cling {
74std::string printValue(ROOT::RDataFrame *tdf);
75}
76
77namespace ROOT {
78namespace RDF {
81namespace TTraits = ROOT::TypeTraits;
82
83template <typename Proxied, typename DataSource>
84class RInterface;
85
87} // namespace RDF
88
89namespace Internal {
90namespace RDF {
92void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
93void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end);
96std::string GetDataSourceLabel(const ROOT::RDF::RNode &node);
97void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline);
98} // namespace RDF
99} // namespace Internal
100
101namespace RDF {
102
103// clang-format off
104/**
105 * \class ROOT::RDF::RInterface
106 * \ingroup dataframe
107 * \brief The public interface to the RDataFrame federation of classes.
108 * \tparam Proxied One of the "node" base types (e.g. RLoopManager, RFilterBase). The user never specifies this type manually.
109 * \tparam DataSource The type of the RDataSource which is providing the data to the data frame. There is no source by default.
110 *
111 * The documentation of each method features a one liner illustrating how to use the method, for example showing how
112 * the majority of the template parameters are automatically deduced requiring no or very little effort by the user.
113 */
114// clang-format on
115template <typename Proxied, typename DataSource = void>
121 friend std::string cling::printValue(::ROOT::RDataFrame *tdf); // For a nice printing at the prompt
123
124 template <typename T, typename W>
125 friend class RInterface;
126
128 friend void RDFInternal::ChangeEmptyEntryRange(const RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
129 friend void RDFInternal::ChangeBeginAndEndEntries(const RNode &node, Long64_t start, Long64_t end);
131 friend std::string ROOT::Internal::RDF::GetDataSourceLabel(const RNode &node);
133 std::shared_ptr<Proxied> fProxiedPtr; ///< Smart pointer to the graph node encapsulated by this RInterface.
134
135public:
136 ////////////////////////////////////////////////////////////////////////////
137 /// \brief Copy-assignment operator for RInterface.
138 RInterface &operator=(const RInterface &) = default;
139
140 ////////////////////////////////////////////////////////////////////////////
141 /// \brief Copy-ctor for RInterface.
142 RInterface(const RInterface &) = default;
143
144 ////////////////////////////////////////////////////////////////////////////
145 /// \brief Move-ctor for RInterface.
146 RInterface(RInterface &&) = default;
147
148 ////////////////////////////////////////////////////////////////////////////
149 /// \brief Move-assignment operator for RInterface.
151
152 ////////////////////////////////////////////////////////////////////////////
153 /// \brief Build a RInterface from a RLoopManager.
154 /// This constructor is only available for RInterface<RLoopManager>.
156 RInterface(const std::shared_ptr<RLoopManager> &proxied) : RInterfaceBase(proxied), fProxiedPtr(proxied)
157 {
// Intentionally empty: the same RLoopManager pointer is handed to the RInterfaceBase
// constructor and stored as the proxied node in the member initializer list above.
158 }
159
160 ////////////////////////////////////////////////////////////////////////////
161 /// \brief Cast any RDataFrame node to a common type ROOT::RDF::RNode.
162 /// Different RDataFrame methods return different C++ types. All nodes, however,
163 /// can be cast to this common type at the cost of a small performance penalty.
164 /// This allows, for example, storing RDataFrame nodes in a vector, or passing them
165 /// around via (non-template, C++11) helper functions.
166 /// Example usage:
167 /// ~~~{.cpp}
168 /// // a function that conditionally adds a Range to a RDataFrame node.
169 /// RNode MaybeAddRange(RNode df, bool mustAddRange)
170 /// {
171 /// return mustAddRange ? df.Range(1) : df;
172 /// }
173 /// // use as :
174 /// ROOT::RDataFrame df(10);
175 /// auto maybeRanged = MaybeAddRange(df, true);
176 /// ~~~
177 /// Note that it is not a problem to pass RNode's by value.
178 operator RNode() const
179 {
// Cast the proxied node pointer to the common RNodeBase type; the loop manager and the
// column register are handed to the new RNode unchanged.
180 return RNode(std::static_pointer_cast<::ROOT::Detail::RDF::RNodeBase>(fProxiedPtr), *fLoopManager, fColRegister);
181 }
182
183 ////////////////////////////////////////////////////////////////////////////
184 /// \brief Append a filter to the call graph.
185 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
186 /// signalling whether the event has passed the selection (true) or not (false).
187 /// \param[in] columns Names of the columns/branches in input to the filter function.
188 /// \param[in] name Optional name of this filter. See `Report`.
189 /// \return the filter node of the computation graph.
190 ///
191 /// Append a filter node at the point of the call graph corresponding to the
192 /// object this method is called on.
193 /// The callable `f` should not have side-effects (e.g. modification of an
194 /// external or static variable) to ensure correct results when implicit
195 /// multi-threading is active.
196 ///
197 /// RDataFrame only evaluates filters when necessary: if multiple filters
198 /// are chained one after another, they are executed in order and the first
199 /// one returning false causes the event to be discarded.
200 /// Even if multiple actions or transformations depend on the same filter,
201 /// it is executed once per entry. If its result is requested more than
202 /// once, the cached result is served.
203 ///
204 /// ### Example usage:
205 /// ~~~{.cpp}
206 /// // C++ callable (function, functor class, lambda...) that takes two parameters of the types of "x" and "y"
207 /// auto filtered = df.Filter(myCut, {"x", "y"});
208 ///
209 /// // String: it must contain valid C++ except that column names can be used instead of variable names
210 /// auto filtered = df.Filter("x*y > 0");
211 /// ~~~
212 ///
213 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
214 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
215 /// ~~~{.cpp}
216 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
217 /// ~~~
218 /// but instead this will:
219 /// ~~~{.cpp}
220 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
221 /// ~~~
224 Filter(F f, const ColumnNames_t &columns = {}, std::string_view name = "")
225 {
226 RDFInternal::CheckFilter(f);
227 using ColTypes_t = typename TTraits::CallableTraits<F>::arg_types;
228 constexpr auto nColumns = ColTypes_t::list_size;
231
233
234 auto filterPtr = std::make_shared<F_t>(std::move(f), validColumnNames, fProxiedPtr, fColRegister, name);
236 }
237
238 ////////////////////////////////////////////////////////////////////////////
239 /// \brief Append a filter to the call graph.
240 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
241 /// signalling whether the event has passed the selection (true) or not (false).
242 /// \param[in] name Optional name of this filter. See `Report`.
243 /// \return the filter node of the computation graph.
244 ///
245 /// Refer to the first overload of this method for the full documentation.
248 {
249 // The SFINAE constraint on this template exists so that a call with two strings selects the
250 // overload taking (expression, name) rather than this template method.
251 return Filter(f, {}, name); // forward with an empty column list
252 }
253
254 ////////////////////////////////////////////////////////////////////////////
255 /// \brief Append a filter to the call graph.
256 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
257 /// signalling whether the event has passed the selection (true) or not (false).
258 /// \param[in] columns Names of the columns/branches in input to the filter function.
259 /// \return the filter node of the computation graph.
260 ///
261 /// Refer to the first overload of this method for the full documentation.
262 template <typename F>
263 RInterface<RDFDetail::RFilter<F, Proxied>, DS_t> Filter(F f, const std::initializer_list<std::string> &columns)
264 {
// Convenience overload for brace-enclosed column lists: copy the names into a
// ColumnNames_t and delegate to the overload that takes one.
265 return Filter(f, ColumnNames_t{columns});
266 }
267
268 ////////////////////////////////////////////////////////////////////////////
269 /// \brief Append a filter to the call graph.
270 /// \param[in] expression The filter expression in C++
271 /// \param[in] name Optional name of this filter. See `Report`.
272 /// \return the filter node of the computation graph.
273 ///
274 /// The expression is just-in-time compiled and used to filter entries. It must
275 /// be valid C++ syntax in which variable names are substituted with the names
276 /// of branches/columns.
277 ///
278 /// ### Example usage:
279 /// ~~~{.cpp}
280 /// auto filtered_df = df.Filter("myCollection.size() > 3");
281 /// auto filtered_name_df = df.Filter("myCollection.size() > 3", "Minimum collection size");
282 /// ~~~
283 ///
284 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
285 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
286 /// ~~~{.cpp}
287 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
288 /// ~~~
289 /// but instead this will:
290 /// ~~~{.cpp}
291 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
292 /// ~~~
293 RInterface<RDFDetail::RJittedFilter, DS_t> Filter(std::string_view expression, std::string_view name = "")
294 {
295 // deleted by the jitted call to JitFilterHelper
296 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
297 using BaseNodeType_t = typename std::remove_pointer_t<decltype(upcastNodeOnHeap)>::element_type;
299 const auto jittedFilter =
301 fLoopManager->GetTree(), GetDataSource());
302
304 }
305
306 ////////////////////////////////////////////////////////////////////////////
307 /// \brief Discard entries with missing values
308 /// \param[in] column Column name whose entries with missing values should be discarded
309 /// \return The filter node of the computation graph
310 ///
311 /// This operation is useful in case an entry of the dataset is incomplete,
312 /// i.e. if one or more of the columns do not have valid values. If the value
313 /// of the input column is missing for an entry, the entire entry will be
314 /// discarded from the rest of this branch of the computation graph.
315 ///
316 /// Use cases include:
317 /// * When processing multiple files, one or more of them is missing a column
318 /// * In horizontal joining with entry matching, a certain dataset has no
319 /// match for the current entry.
320 ///
321 /// ### Example usage:
322 ///
323 /// \code{.py}
324 /// # Assume a dataset with columns [idx, x] matching another dataset with
325 /// # columns [idx, y]. For idx == 42, the right-hand dataset has no match
326 /// df = ROOT.RDataFrame(dataset)
327 /// df_nomissing = df.FilterAvailable("idx").Define("z", "x + y")
328 /// colz = df_nomissing.Take[int]("z")
329 /// \endcode
330 ///
331 /// \code{.cpp}
332 /// // Assume a dataset with columns [idx, x] matching another dataset with
333 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
334 /// ROOT::RDataFrame df{dataset};
335 /// auto df_nomissing = df.FilterAvailable("idx")
336 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
337 /// auto colz = df_nomissing.Take<int>("z");
338 /// \endcode
339 ///
340 /// \note See FilterMissing() if you want to keep only the entries with
341 /// missing values instead.
343 {
// Build the one-element column list from the whole view: std::string_view::data() is not
// guaranteed to be NUL-terminated, so constructing the name from the raw pointer could
// read past the end of the view. std::string(column) copies exactly column.size() chars.
344 const auto columns = ColumnNames_t{std::string(column)};
345 // For now disable this functionality in case of an empty data source and
346 // the column name was not defined previously.
// NOTE(review): the condition below only tests the data source label; it does not check
// whether the column was previously defined -- confirm this matches the comment above.
347 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
348 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
// discardEntry = true: this node is built in "discard" mode, i.e. for dropping entries
// whose value for the column is missing.
350 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ true, fProxiedPtr, fColRegister, columns);
353 }
354
355 ////////////////////////////////////////////////////////////////////////////
356 /// \brief Keep only the entries that have missing values.
357 /// \param[in] column Column name whose entries with missing values should be kept
358 /// \return The filter node of the computation graph
359 ///
360 /// This operation is useful in case an entry of the dataset is incomplete,
361 /// i.e. if one or more of the columns do not have valid values. It only
362 /// keeps the entries for which the value of the input column is missing.
363 ///
364 /// Use cases include:
365 /// * When processing multiple files, one or more of them is missing a column
366 /// * In horizontal joining with entry matching, a certain dataset has no
367 /// match for the current entry.
368 ///
369 /// ### Example usage:
370 ///
371 /// \code{.py}
372 /// # Assume a dataset made of two files vertically chained together, one has
373 /// # column "x" and the other has column "y"
374 /// df = ROOT.RDataFrame(dataset)
375 /// df_valid_col_x = df.FilterMissing("y")
376 /// df_valid_col_y = df.FilterMissing("x")
377 /// display_x = df_valid_col_x.Display(("x",))
378 /// display_y = df_valid_col_y.Display(("y",))
379 /// \endcode
380 ///
381 /// \code{.cpp}
382 /// // Assume a dataset made of two files vertically chained together, one has
383 /// // column "x" and the other has column "y"
384 /// ROOT::RDataFrame df{dataset};
385 /// auto df_valid_col_x = df.FilterMissing("y");
386 /// auto df_valid_col_y = df.FilterMissing("x");
387 /// auto display_x = df_valid_col_x.Display<int>({"x"});
388 /// auto display_y = df_valid_col_y.Display<int>({"y"});
389 /// \endcode
390 ///
391 /// \note See FilterAvailable() if you want to discard the entries in case
392 /// there is a missing value instead.
394 {
// Build the one-element column list from the whole view: std::string_view::data() is not
// guaranteed to be NUL-terminated, so constructing the name from the raw pointer could
// read past the end of the view. std::string(column) copies exactly column.size() chars.
395 const auto columns = ColumnNames_t{std::string(column)};
396 // For now disable this functionality in case of an empty data source and
397 // the column name was not defined previously.
// NOTE(review): the condition below only tests the data source label; it does not check
// whether the column was previously defined -- confirm this matches the comment above.
398 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
399 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
// discardEntry = false: this node is built in "keep" mode, i.e. for retaining only the
// entries whose value for the column is missing.
401 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ false, fProxiedPtr, fColRegister, columns);
404 }
405
406 // clang-format off
407 ////////////////////////////////////////////////////////////////////////////
408 /// \brief Define a new column.
409 /// \param[in] name The name of the defined column.
410 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
411 /// \param[in] columns Names of the columns/branches in input to the producer function.
412 /// \return the first node of the computation graph for which the new quantity is defined.
413 ///
414 /// Define a column that will be visible from all subsequent nodes
415 /// of the functional chain. The `expression` is only evaluated for entries that pass
416 /// all the preceding filters.
417 /// A new variable is created called `name`, accessible as if it was contained
418 /// in the dataset from subsequent transformations/actions.
419 ///
420 /// Use cases include:
421 /// * caching the results of complex calculations for easy and efficient multiple access
422 /// * extraction of quantities of interest from complex objects
423 ///
424 /// An exception is thrown if the name of the new column is already in use in this branch of the computation graph.
425 ///
426 /// ### Example usage:
427 /// ~~~{.cpp}
428 /// // assuming a function with signature:
429 /// double myComplexCalculation(const RVec<float> &muon_pts);
430 /// // we can pass it directly to Define
431 /// auto df_with_define = df.Define("newColumn", myComplexCalculation, {"muon_pts"});
432 /// // alternatively, we can pass the body of the function as a string, as in Filter:
433 /// auto df_with_define = df.Define("newColumn", "x*x + y*y");
434 /// ~~~
435 ///
436 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
437 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
438 /// ~~~{.cpp}
439 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
440 /// ~~~
441 /// but instead this will:
442 /// ~~~{.cpp}
443 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
444 /// ~~~
446 RInterface<Proxied, DS_t> Define(std::string_view name, F expression, const ColumnNames_t &columns = {})
447 {
// Delegate to the shared DefineImpl helper. ExtraArgsForDefine::None -- judging by its name,
// the callable receives only the column values, with no extra leading arguments. The trailing
// "Define" string names the calling method (presumably for diagnostics; confirm in DefineImpl).
448 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Define");
449 }
450 // clang-format on
451
452 // clang-format off
453 ////////////////////////////////////////////////////////////////////////////
454 /// \brief Define a new column with a value dependent on the processing slot.
455 /// \param[in] name The name of the defined column.
456 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
457 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding the slot number).
458 /// \return the first node of the computation graph for which the new quantity is defined.
459 ///
460 /// This alternative implementation of `Define` is meant as a helper to evaluate new column values in a thread-safe manner.
461 /// The expression must be a callable of signature R(unsigned int, T1, T2, ...) where `T1, T2...` are the types
462 /// of the columns that the expression takes as input. The first parameter is reserved for an unsigned integer
463 /// representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
464 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
465 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
466 ///
467 /// The following two calls are equivalent, although `DefineSlot` is slightly more performant:
468 /// ~~~{.cpp}
469 /// int function(unsigned int, double, double);
470 /// df.Define("x", function, {"rdfslot_", "column1", "column2"})
471 /// df.DefineSlot("x", function, {"column1", "column2"})
472 /// ~~~
473 ///
474 /// See Define() for more information.
475 template <typename F>
476 RInterface<Proxied, DS_t> DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
477 {
// Same DefineImpl helper as Define, instantiated with ExtraArgsForDefine::Slot: the callable's
// leading slot-number parameter is not listed in `columns`.
478 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "DefineSlot");
479 }
480 // clang-format on
481
482 // clang-format off
483 ////////////////////////////////////////////////////////////////////////////
484 /// \brief Define a new column with a value dependent on the processing slot and the current entry.
485 /// \param[in] name The name of the defined column.
486 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
487 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
488 /// \return the first node of the computation graph for which the new quantity is defined.
489 ///
490 /// This alternative implementation of `Define` is meant as a helper in writing entry-specific, thread-safe custom
491 /// columns. The expression must be a callable of signature R(unsigned int, ULong64_t, T1, T2, ...) where `T1, T2...`
492 /// are the types of the columns that the expression takes as input. The first parameter is reserved for an unsigned
493 /// integer representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
494 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
495 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
496 /// The second parameter is reserved for a `ULong64_t` representing the current entry being processed by the current thread.
497 ///
498 /// The following two `Define`s are equivalent, although `DefineSlotEntry` is slightly more performant:
499 /// ~~~{.cpp}
500 /// int function(unsigned int, ULong64_t, double, double);
501 /// Define("x", function, {"rdfslot_", "rdfentry_", "column1", "column2"})
502 /// DefineSlotEntry("x", function, {"column1", "column2"})
503 /// ~~~
504 ///
505 /// See Define() for more information.
506 template <typename F>
507 RInterface<Proxied, DS_t> DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
508 {
510 "DefineSlotEntry");
511 }
512 // clang-format on
513
514 ////////////////////////////////////////////////////////////////////////////
515 /// \brief Define a new column.
516 /// \param[in] name The name of the defined column.
517 /// \param[in] expression An expression in C++ which represents the defined value
518 /// \return the first node of the computation graph for which the new quantity is defined.
519 ///
520 /// The expression is just-in-time compiled and used to produce the column entries.
521 /// It must be valid C++ syntax in which variable names are substituted with the names
522 /// of branches/columns.
523 ///
524 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
525 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
526 /// ~~~{.cpp}
527 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
528 /// ~~~
529 /// but instead this will:
530 /// ~~~{.cpp}
531 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
532 /// ~~~
533 ///
534 /// Refer to the first overload of this method for the full documentation.
535 RInterface<Proxied, DS_t> Define(std::string_view name, std::string_view expression)
536 {
537 constexpr auto where = "Define";
539 // these checks must be done before jitting lest we throw exceptions in jitted code
542
543 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
545 fLoopManager->GetBranchNames(), upcastNodeOnHeap);
546
548 newCols.AddDefine(std::move(jittedDefine));
549
551
552 return newInterface;
553 }
554
555 ////////////////////////////////////////////////////////////////////////////
556 /// \brief Overwrite the value and/or type of an existing column.
557 /// \param[in] name The name of the column to redefine.
558 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
559 /// \param[in] columns Names of the columns/branches in input to the expression.
560 /// \return the first node of the computation graph for which the quantity is redefined.
561 ///
562 /// The old value of the column can be used as an input for the expression.
563 ///
564 /// An exception is thrown in case the column to redefine does not already exist.
565 /// See Define() for more information.
567 RInterface<Proxied, DS_t> Redefine(std::string_view name, F expression, const ColumnNames_t &columns = {})
568 {
// Same call as Define except for the method name handed to DefineImpl ("Redefine").
// NOTE(review): presumably DefineImpl uses that name to require that the column already
// exists rather than rejecting it -- confirm in DefineImpl.
569 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Redefine");
570 }
571
572 // clang-format off
573 ////////////////////////////////////////////////////////////////////////////
574 /// \brief Overwrite the value and/or type of an existing column.
575 /// \param[in] name The name of the column to redefine.
576 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
577 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot).
578 /// \return the first node of the computation graph for which the new quantity is defined.
579 ///
580 /// The old value of the column can be used as an input for the expression.
581 /// An exception is thrown in case the column to redefine does not already exist.
582 ///
583 /// See DefineSlot() for more information.
584 // clang-format on
585 template <typename F>
586 RInterface<Proxied, DS_t> RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
587 {
// Same call as DefineSlot except for the method name handed to DefineImpl ("RedefineSlot");
// the callable's leading slot-number parameter is not listed in `columns`.
588 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "RedefineSlot");
589 }
590
591 // clang-format off
592 ////////////////////////////////////////////////////////////////////////////
593 /// \brief Overwrite the value and/or type of an existing column.
594 /// \param[in] name The name of the column to redefine.
595 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
596 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
597 /// \return the first node of the computation graph for which the new quantity is defined.
598 ///
599 /// The old value of the column can be used as an input for the expression.
600 /// An exception is thrown in case the column to re-define does not already exist.
601 ///
602 /// See DefineSlotEntry() for more information.
603 // clang-format on
604 template <typename F>
605 RInterface<Proxied, DS_t> RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
606 {
608 "RedefineSlotEntry");
609 }
610
611 ////////////////////////////////////////////////////////////////////////////
612 /// \brief Overwrite the value and/or type of an existing column.
613 /// \param[in] name The name of the column to redefine.
614 /// \param[in] expression An expression in C++ which represents the defined value
615 /// \return the first node of the computation graph for which the new quantity is defined.
616 ///
617 /// The expression is just-in-time compiled and used to produce the column entries.
618 /// It must be valid C++ syntax in which variable names are substituted with the names
619 /// of branches/columns.
620 ///
621 /// The old value of the column can be used as an input for the expression.
622 /// An exception is thrown in case the column to re-define does not already exist.
623 ///
624 /// Aliases cannot be overridden. See the corresponding Define() overload for more information.
625 RInterface<Proxied, DS_t> Redefine(std::string_view name, std::string_view expression)
626 {
627 constexpr auto where = "Redefine";
632
633 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
635 fLoopManager->GetBranchNames(), upcastNodeOnHeap);
636
638 newCols.AddDefine(std::move(jittedDefine));
639
641
642 return newInterface;
643 }
644
645 ////////////////////////////////////////////////////////////////////////////
646 /// \brief In case the value in the given column is missing, provide a default value
647 /// \tparam T The type of the column
648 /// \param[in] column Column name where missing values should be replaced by the given default value
649 /// \param[in] defaultValue Value to provide instead of a missing value
650 /// \return The node of the graph that will provide a default value
651 ///
652 /// This operation is useful in case an entry of the dataset is incomplete,
653 /// i.e. if one or more of the columns do not have valid values. It does not
654 /// modify the values of the column, but in case any entry is missing, it
655 /// will provide the default value to downstream nodes instead.
656 ///
657 /// Use cases include:
658 /// * When processing multiple files, one or more of them is missing a column
659 /// * In horizontal joining with entry matching, a certain dataset has no
660 /// match for the current entry.
661 ///
662 /// ### Example usage:
663 ///
664 /// \code{.cpp}
665 /// // Assume a dataset with columns [idx, x] matching another dataset with
666 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
667 /// ROOT::RDataFrame df{dataset};
668 /// auto df_default = df.DefaultValueFor("y", 33)
669 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
670 /// auto colz = df_default.Take<int>("z");
671 /// \endcode
672 ///
673 /// \code{.py}
674 /// df = ROOT.RDataFrame(dataset)
675 /// df_default = df.DefaultValueFor("y", 33).Define("z", "x + y")
676 /// colz = df_default.Take[int]("z")
677 /// \endcode
678 template <typename T>
679 RInterface<Proxied, DS_t> DefaultValueFor(std::string_view column, const T &defaultValue)
680 {
681 constexpr auto where{"DefaultValueFor"};
683 // For now disable this functionality in case of an empty data source and
684 // the column name was not defined previously.
685 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
688
689 // Declare return type to the interpreter, for future use by jitted actions
691 if (retTypeName.empty()) {
692 // The type is not known to the interpreter.
693 // We must not error out here, but only if/when this column is used in jitted code;
// until then a recognizable placeholder type name is stored instead.
694 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(T));
695 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
696 }
697
// Build the one-element column list from the whole view: std::string_view::data() is not
// guaranteed to be NUL-terminated, so constructing the name from the raw pointer could
// read past the end of the view. std::string(column) copies exactly column.size() chars.
698 const auto validColumnNames = ColumnNames_t{std::string(column)};
699 auto newColumn = std::make_shared<ROOT::Internal::RDF::RDefaultValueFor<T>>(
700 column, retTypeName, defaultValue, validColumnNames, fColRegister, *fLoopManager);
702
704 newCols.AddDefine(std::move(newColumn));
705
707
708 return newInterface;
709 }
710
711 // clang-format off
712 ////////////////////////////////////////////////////////////////////////////
713 /// \brief Define a new column that is updated when the input sample changes.
714 /// \param[in] name The name of the defined column.
715 /// \param[in] expression A C++ callable that computes the new value of the defined column.
716 /// \return the first node of the computation graph for which the new quantity is defined.
717 ///
718 /// The signature of the callable passed as second argument should be `T(unsigned int slot, const ROOT::RDF::RSampleInfo &id)`
719 /// where:
720 /// - `T` is the type of the defined column
721 /// - `slot` is a number in the range [0, nThreads) that is different for each processing thread. This can simplify
722 /// the definition of thread-safe callables if you are interested in using parallel capabilities of RDataFrame.
723 /// - `id` is an instance of a ROOT::RDF::RSampleInfo object which contains information about the sample which is
724 /// being processed (see the class docs for more information).
725 ///
726 /// DefinePerSample() is useful to e.g. define a quantity that depends on which TTree in which TFile is being
727 /// processed or to inject a callback into the event loop that is only called when the processing of a new sample
728 /// starts rather than at every entry.
729 ///
730 /// The callable will be invoked once per input TTree or once per multi-thread task, whichever is more often.
731 ///
732 /// ### Example usage:
733 /// ~~~{.cpp}
734 /// ROOT::RDataFrame df{"mytree", {"sample1.root","sample2.root"}};
735 /// df.DefinePerSample("weightbysample",
736 /// [](unsigned int slot, const ROOT::RDF::RSampleInfo &id)
737 /// { return id.Contains("sample1") ? 1.0f : 2.0f; });
738 /// ~~~
739 // clang-format on
740 // TODO we could SFINAE on F's signature to provide friendlier compilation errors in case of signature mismatch
742 RInterface<Proxied, DS_t> DefinePerSample(std::string_view name, F expression)
743 {
744 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
745 RDFInternal::CheckForRedefinition("DefinePerSample", name, fColRegister, fLoopManager->GetBranchNames(),
747
748 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType_t));
749 if (retTypeName.empty()) {
750 // The type is not known to the interpreter.
751 // We must not error out here, but if/when this column is used in jitted code
752 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType_t));
753 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
754 }
755
756 auto newColumn =
757 std::make_shared<RDFDetail::RDefinePerSample<F>>(name, retTypeName, std::move(expression), *fLoopManager);
758
760 newCols.AddDefine(std::move(newColumn));
762 return newInterface;
763 }
764
765 // clang-format off
766 ////////////////////////////////////////////////////////////////////////////
767 /// \brief Define a new column that is updated when the input sample changes.
768 /// \param[in] name The name of the defined column.
769 /// \param[in] expression A valid C++ expression as a string, which will be used to compute the defined value.
770 /// \return the first node of the computation graph for which the new quantity is defined.
771 ///
772 /// The expression is just-in-time compiled and used to produce the column entries.
773 /// It must be valid C++ syntax and the usage of the special variable names `rdfslot_` and `rdfsampleinfo_` is
774 /// permitted, where these variables will take the same values as the `slot` and `id` parameters described at the
775 /// DefinePerSample(std::string_view name, F expression) overload. See the documentation of that overload for more information.
776 ///
777 /// ### Example usage:
778 /// ~~~{.py}
779 /// df = ROOT.RDataFrame('mytree', ['sample1.root','sample2.root'])
780 /// df.DefinePerSample('weightbysample', 'rdfsampleinfo_.Contains("sample1") ? 1.0f : 2.0f')
781 /// ~~~
782 ///
783 /// \note
784 /// If you have declared some C++ function to the interpreter, the correct syntax to call that function with this
785 /// overload of DefinePerSample is by calling it explicitly with the special names `rdfslot_` and `rdfsampleinfo_` as
786 /// input parameters. This is for example the correct way to call this overload when working in PyROOT:
787 /// ~~~{.py}
788 /// ROOT.gInterpreter.Declare(
789 /// """
790 /// float weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
791 /// return id.Contains("sample1") ? 1.0f : 2.0f;
792 /// }
793 /// """)
794 /// df = ROOT.RDataFrame("mytree", ["sample1.root","sample2.root"])
795 /// df.DefinePerSample("weightsbysample", "weights(rdfslot_, rdfsampleinfo_)")
796 /// ~~~
797 ///
798 /// \note
799 /// Differently from what happens in Define(), the string expression passed to DefinePerSample cannot contain
800 /// column names other than those mentioned above: the expression is evaluated once before the processing of the
801 /// sample even starts, so column values are not accessible.
802 // clang-format on
803 RInterface<Proxied, DS_t> DefinePerSample(std::string_view name, std::string_view expression)
804 {
805 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
806 // these checks must be done before jitting lest we throw exceptions in jitted code
807 RDFInternal::CheckForRedefinition("DefinePerSample", name, fColRegister, fLoopManager->GetBranchNames(),
809
810 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
811 auto jittedDefine =
813
815 newCols.AddDefine(std::move(jittedDefine));
816
818
819 return newInterface;
820 }
821
822 /// \brief Register systematic variations for a single existing column using custom variation tags.
823 /// \param[in] colName name of the column for which varied values are provided.
824 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
825 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
826 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
827 /// \param[in] inputColumns the names of the columns to be passed to the callable.
828 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
829 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
830 ///
831 /// Vary provides a natural and flexible syntax to define systematic variations that automatically propagate to
832 /// Filters, Defines and results. RDataFrame usage of columns with attached variations does not change, but for
833 /// results that depend on any varied quantity, a map/dictionary of varied results can be produced with
834 /// ROOT::RDF::Experimental::VariationsFor (see the example below).
835 ///
836 /// The dictionary will contain a "nominal" value (accessed with the "nominal" key) for the unchanged result, and
837 /// values for each of the systematic variations that affected the result (via upstream Filters or via direct or
838 /// indirect dependencies of the column values on some registered variations). The keys will be a composition of
839 /// variation names and tags, e.g. "pt:up" and "pt:down" for the example below.
840 ///
841 /// In the following example we add up/down variations of pt and fill a histogram with a quantity that depends on pt.
842 /// We automatically obtain three histograms in output ("nominal", "pt:up" and "pt:down"):
843 /// ~~~{.cpp}
844 /// auto nominal_hx =
845 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, {"down", "up"})
846 /// .Filter("pt > k")
847 /// .Define("x", someFunc, {"pt"})
848 /// .Histo1D("x");
849 ///
850 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
851 /// hx["nominal"].Draw();
852 /// hx["pt:down"].Draw("SAME");
853 /// hx["pt:up"].Draw("SAME");
854 /// ~~~
855 /// RDataFrame computes all variations as part of a single loop over the data.
856 /// In particular, this means that I/O and computation of values shared
857 /// among variations only happen once for all variations. Thus, the event loop
858 /// run-time typically scales much better than linearly with the number of
859 /// variations.
860 ///
861 /// RDataFrame lazily computes the varied values required to produce the
862 /// outputs of \ref ROOT::RDF::Experimental::VariationsFor "VariationsFor()". If \ref
863 /// ROOT::RDF::Experimental::VariationsFor "VariationsFor()" was not called for a result, the computations are only
864 /// run for the nominal case.
865 ///
866 /// See other overloads for examples when variations are added for multiple existing columns,
867 /// or when the tags are auto-generated instead of being directly defined.
868 template <typename F>
869 RInterface<Proxied, DS_t> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
870 const std::vector<std::string> &variationTags, std::string_view variationName = "")
871 {
872 std::vector<std::string> colNames{{std::string(colName)}};
873 const std::string theVariationName{variationName.empty() ? colName : variationName};
874
875 return VaryImpl<true>(std::move(colNames), std::forward<F>(expression), inputColumns, variationTags,
877 }
878
879 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
880 /// \param[in] colName name of the column for which varied values are provided.
881 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
882 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
883 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
884 /// \param[in] inputColumns the names of the columns to be passed to the callable.
885 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
886 /// `"1"`, etc.
887 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
888 /// colName is used if none is provided.
889 ///
890 /// This overload of Vary takes an nVariations parameter instead of a list of tag names.
891 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
892 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
893 ///
894 /// Example usage:
895 /// ~~~{.cpp}
896 /// auto nominal_hx =
897 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, 2)
898 /// .Histo1D("x");
899 ///
900 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
901 /// hx["nominal"].Draw();
902 /// hx["pt:0"].Draw("SAME");
903 /// hx["pt:1"].Draw("SAME");
904 /// ~~~
905 ///
906 /// \note See also This Vary() overload for more information.
907 template <typename F>
908 RInterface<Proxied, DS_t> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
909 std::size_t nVariations, std::string_view variationName = "")
910 {
911 R__ASSERT(nVariations > 0 && "Must have at least one variation.");
912
913 std::vector<std::string> variationTags;
914 variationTags.reserve(nVariations);
915 for (std::size_t i = 0u; i < nVariations; ++i)
916 variationTags.emplace_back(std::to_string(i));
917
918 const std::string theVariationName{variationName.empty() ? colName : variationName};
919
920 return Vary(colName, std::forward<F>(expression), inputColumns, std::move(variationTags), theVariationName);
921 }
922
923 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
924 /// \param[in] colNames set of names of the columns for which varied values are provided.
925 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
926 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
927 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
928 /// \param[in] inputColumns the names of the columns to be passed to the callable.
929 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
930 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`
931 ///
932 /// This overload of Vary takes a list of column names as first argument and
933 /// requires that the expression returns an RVec of RVecs of values: one inner RVec for the variations of each
934 /// affected column. The `variationTags` are defined as `{"down", "up"}`.
935 ///
936 /// Example usage:
937 /// ~~~{.cpp}
938 /// // produce variations "ptAndEta:down" and "ptAndEta:up"
939 /// auto nominal_hx =
940 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
941 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
942 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
943 /// {"down", "up"}, // variation tags
944 /// "ptAndEta") // variation name
945 /// .Histo1D("pt", "eta");
946 ///
947 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
948 /// hx["nominal"].Draw();
949 /// hx["ptAndEta:down"].Draw("SAME");
950 /// hx["ptAndEta:up"].Draw("SAME");
951 /// ~~~
952 ///
953 /// \note See also This Vary() overload for more information.
954
955 template <typename F>
957 Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
958 const std::vector<std::string> &variationTags, std::string_view variationName)
959 {
960 return VaryImpl<false>(colNames, std::forward<F>(expression), inputColumns, variationTags, variationName);
961 }
962
963 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
964 /// \param[in] colNames set of names of the columns for which varied values are provided.
965 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
966 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
967 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
968 /// \param[in] inputColumns the names of the columns to be passed to the callable.
969 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
970 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
971 /// colName is used if none is provided.
972 ///
973 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
974 /// is avoided.
975 ///
976 /// \note See also This Vary() overload for more information.
977 template <typename F>
979 Vary(std::initializer_list<std::string> colNames, F &&expression, const ColumnNames_t &inputColumns,
980 const std::vector<std::string> &variationTags, std::string_view variationName)
981 {
982 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, variationTags, variationName);
983 }
984
985 /// \brief Register systematic variations for multiple existing columns using auto-generated tags.
986 /// \param[in] colNames set of names of the columns for which varied values are provided.
987 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
988 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
989 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
990 /// \param[in] inputColumns the names of the columns to be passed to the callable.
991 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
992 /// `"1"`, etc.
993 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
994 /// colName is used if none is provided.
995 ///
996 /// This overload of Vary takes a list of column names as first argument.
997 /// It takes an `nVariations` parameter instead of a list of tag names (`variationTags`). Tag names
998 /// will be auto-generated as the sequence 0...``nVariations-1``.
999 ///
1000 /// Example usage:
1001 /// ~~~{.cpp}
1002 /// auto nominal_hx =
1003 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
1004 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
1005 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
1006 /// 2, // auto-generated variation tags
1007 /// "ptAndEta") // variation name
1008 /// .Histo1D("pt", "eta");
1009 ///
1010 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1011 /// hx["nominal"].Draw();
1012 /// hx["ptAndEta:0"].Draw("SAME");
1013 /// hx["ptAndEta:1"].Draw("SAME");
1014 /// ~~~
1015 ///
1016 /// \note See also This Vary() overload for more information.
1017 template <typename F>
1019 Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
1020 std::size_t nVariations, std::string_view variationName)
1021 {
1022 R__ASSERT(nVariations > 0 && "Must have at least one variation.");
1023
1024 std::vector<std::string> variationTags;
1025 variationTags.reserve(nVariations);
1026 for (std::size_t i = 0u; i < nVariations; ++i)
1027 variationTags.emplace_back(std::to_string(i));
1028
1029 return Vary(colNames, std::forward<F>(expression), inputColumns, std::move(variationTags), variationName);
1030 }
1031
1032 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
1033 /// \param[in] colNames set of names of the columns for which varied values are provided.
1034 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
1035 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
1036 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
1037 /// \param[in] inputColumns the names of the columns to be passed to the callable.
1039 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1040 /// `"1"`, etc.
1041 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1042 /// colName is used if none is provided.
1043 ///
1044 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1045 /// is avoided.
1046 ///
1047 /// \note See also This Vary() overload for more information.
1048 template <typename F>
1050 Vary(std::initializer_list<std::string> colNames, F &&expression, const ColumnNames_t &inputColumns,
1051 std::size_t nVariations, std::string_view variationName)
1052 {
1053 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, nVariations, variationName);
1054 }
1055
1056 /// \brief Register systematic variations for a single existing column using custom variation tags.
1057 /// \param[in] colName name of the column for which varied values are provided.
1058 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1059 /// values for the specified column.
1060 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1061 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1062 /// colName is used if none is provided.
1063 ///
1064 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1065 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1066 /// defined as `{"down", "up"}`.
1067 /// ~~~{.cpp}
1068 /// auto nominal_hx =
1069 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", {"down", "up"})
1070 /// .Filter("pt > k")
1071 /// .Define("x", someFunc, {"pt"})
1072 /// .Histo1D("x");
1073 ///
1074 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1075 /// hx["nominal"].Draw();
1076 /// hx["pt:down"].Draw("SAME");
1077 /// hx["pt:up"].Draw("SAME");
1078 /// ~~~
1079 ///
1080 /// \note See also This Vary() overload for more information.
1081 RInterface<Proxied, DS_t> Vary(std::string_view colName, std::string_view expression,
1082 const std::vector<std::string> &variationTags, std::string_view variationName = "")
1083 {
1084 std::vector<std::string> colNames{{std::string(colName)}};
1085 const std::string theVariationName{variationName.empty() ? colName : variationName};
1086
1087 return JittedVaryImpl(colNames, expression, variationTags, theVariationName, /*isSingleColumn=*/true);
1088 }
1089
1090 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
1091 /// \param[in] colName name of the column for which varied values are provided.
1092 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1093 /// values for the specified column.
1094 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1095 /// `"1"`, etc.
1096 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1097 /// colName is used if none is provided.
1098 ///
1099 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1100 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1101 /// auto-generated.
1102 /// ~~~{.cpp}
1103 /// auto nominal_hx =
1104 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", 2)
1105 /// .Histo1D("pt");
1106 ///
1107 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1108 /// hx["nominal"].Draw();
1109 /// hx["pt:0"].Draw("SAME");
1110 /// hx["pt:1"].Draw("SAME");
1111 /// ~~~
1112 ///
1113 /// \note See also This Vary() overload for more information.
1114 RInterface<Proxied, DS_t> Vary(std::string_view colName, std::string_view expression, std::size_t nVariations,
1115 std::string_view variationName = "")
1116 {
1117 std::vector<std::string> variationTags;
1118 variationTags.reserve(nVariations);
1119 for (std::size_t i = 0u; i < nVariations; ++i)
1120 variationTags.emplace_back(std::to_string(i));
1121
1122 return Vary(colName, expression, std::move(variationTags), variationName);
1123 }
1124
1125 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1126 /// \param[in] colNames set of names of the columns for which varied values are provided.
1127 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1128 /// values for the specified columns.
1129 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1130 /// `"1"`, etc.
1131 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1132 ///
1133 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1134 /// compiled. It takes an nVariations parameter instead of a list of tag names.
1135 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
1136 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
1137 /// The example below shows how Vary() is used while dealing with multiple columns.
1138 ///
1139 /// ~~~{.cpp}
1140 /// auto nominal_hx =
1141 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", 2, "xy")
1142 /// .Histo1D("x", "y");
1143 ///
1144 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1145 /// hx["nominal"].Draw();
1146 /// hx["xy:0"].Draw("SAME");
1147 /// hx["xy:1"].Draw("SAME");
1148 /// ~~~
1149 ///
1150 /// \note See also This Vary() overload for more information.
1151 RInterface<Proxied, DS_t> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1152 std::size_t nVariations, std::string_view variationName)
1153 {
1154 std::vector<std::string> variationTags;
1155 variationTags.reserve(nVariations);
1156 for (std::size_t i = 0u; i < nVariations; ++i)
1157 variationTags.emplace_back(std::to_string(i));
1158
1159 return Vary(colNames, expression, std::move(variationTags), variationName);
1160 }
1161
1162 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1163 /// \param[in] colNames set of names of the columns for which varied values are provided.
1164 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1165 /// values for the specified column.
1166 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1167 /// `"1"`, etc.
1168 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1169 /// colName is used if none is provided.
1170 ///
1171 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1172 /// is avoided.
1173 ///
1174 /// \note See also This Vary() overload for more information.
1175 RInterface<Proxied, DS_t> Vary(std::initializer_list<std::string> colNames, std::string_view expression,
1176 std::size_t nVariations, std::string_view variationName)
1177 {
1178 return Vary(std::vector<std::string>(colNames), expression, nVariations, variationName);
1179 }
1180
1181 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
1182 /// \param[in] colNames set of names of the columns for which varied values are provided.
1183 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1184 /// values for the specified columns.
1185 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1186 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1187 ///
1188 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1189 /// compiled. The example below shows how Vary() is used while dealing with multiple columns. The tags are defined as
1190 /// `{"down", "up"}`.
1191 /// ~~~{.cpp}
1192 /// auto nominal_hx =
1193 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", {"down", "up"}, "xy")
1194 /// .Histo1D("x", "y");
1195 ///
1196 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1197 /// hx["nominal"].Draw();
1198 /// hx["xy:down"].Draw("SAME");
1199 /// hx["xy:up"].Draw("SAME");
1200 /// ~~~
1201 ///
1202 /// \note See also This Vary() overload for more information.
1203 RInterface<Proxied, DS_t> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1204 const std::vector<std::string> &variationTags, std::string_view variationName)
1205 {
1206 return JittedVaryImpl(colNames, expression, variationTags, variationName, /*isSingleColumn=*/false);
1207 }
1208
1209 ////////////////////////////////////////////////////////////////////////////
1210 /// \brief Allow to refer to a column with a different name.
1211 /// \param[in] alias name of the column alias
1212 /// \param[in] columnName name of the column to be aliased
1213 /// \return the first node of the computation graph for which the alias is available.
1214 ///
1215 /// Aliasing an alias is supported.
1216 ///
1217 /// ### Example usage:
1218 /// ~~~{.cpp}
1219 /// auto df_with_alias = df.Alias("simple_name", "very_long&complex_name!!!");
1220 /// ~~~
1221 RInterface<Proxied, DS_t> Alias(std::string_view alias, std::string_view columnName)
1222 {
1223 // The symmetry with Define is clear. We want to:
1224 // - Create globally the alias and return this very node, unchanged
1225 // - Make aliases accessible based on chains and not globally
1226
1227 // Helper to find out if a name is a column
1229
1230 constexpr auto where = "Alias";
1232 // If the alias name is a column name, there is a problem
1234
1235 const auto validColumnName = GetValidatedColumnNames(1, {std::string(columnName)})[0];
1236
1238 newCols.AddAlias(alias, validColumnName);
1239
1241
1242 return newInterface;
1243 }
1244
1245 ////////////////////////////////////////////////////////////////////////////
1246 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
1247 /// \tparam ColumnTypes variadic list of branch/column types.
1248 /// \param[in] treename The name of the output TTree.
1249 /// \param[in] filename The name of the output TFile.
1250 /// \param[in] columnList The list of names of the columns/branches to be written.
1251 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree.
1252 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1253 ///
1254 /// Support for writing of nested branches is limited (although RDataFrame is able to read them) and dot ('.')
1255 /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot.
1256 /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also
1257 /// written out and it appears before the array in the columnList.
1258 ///
1259 /// By default, in case of TTree or TChain inputs, Snapshot will try to write out all top-level branches. For other
1260 /// types of inputs, all columns returned by GetColumnNames() will be written out. If friend trees or chains are
1261 /// present, by default all friend top-level branches that have names that do not collide with
1262 /// names of branches in the main TTree/TChain will be written out. Since v6.24, Snapshot will also write out
1263 /// friend branches with the same names of branches in the main TTree/TChain with names of the form
1264 /// `<friendname>_<branchname>` in order to differentiate them from the branches in the main tree/chain.
1265 ///
1266 /// ### Writing to a sub-directory
1267 ///
1268 /// Snapshot supports writing the TTree in a sub-directory inside the TFile. It is sufficient to specify the path to
1269 /// the TTree as part of the TTree name, e.g. `df.Snapshot("subdir/t", "f.root")` writes TTree `t` in the
1270 /// sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed).
1271 ///
1272 /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of
1273 /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled
1274 /// with respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in
1275 /// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
1276 /// error out if such a "shuffled" TTree is used in a friendship.
1277 ///
1278 /// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the
1279 /// requested output TTree to the file, with all the branches requested to preserve the dataset schema.
1280 ///
1281 /// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns
1282 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
1283 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
1284 /// Alias(): `df.Alias("nbar", "#bar").Snapshot(..., {"nbar"})`.
1285 ///
1286 /// ### Example invocations:
1287 ///
1288 /// ~~~{.cpp}
1289 /// // without specifying template parameters (column types automatically deduced)
1290 /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"});
1291 ///
1292 /// // specifying template parameters ("x" is `int`, "y" is `float`)
1293 /// df.Snapshot<int, float>("outputTree", "outputFile.root", {"x", "y"});
1294 /// ~~~
1295 ///
1296 /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in
1297 /// `RSnapshotOptions`:
1298 /// ~~~{.cpp}
1299 /// RSnapshotOptions opts;
1300 /// opts.fLazy = true;
1301 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
1302 /// ~~~
1303 template <typename... ColumnTypes>
1305 Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList,
1306 const RSnapshotOptions &options = RSnapshotOptions())
1307 {
1308 return SnapshotImpl<ColumnTypes...>(treename, filename, columnList, options);
1309 }
1310
1311 ////////////////////////////////////////////////////////////////////////////
1312 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
1313 /// \param[in] treename The name of the output TTree.
1314 /// \param[in] filename The name of the output TFile.
1315 /// \param[in] columnList The list of names of the columns/branches to be written.
1316 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree.
1317 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1318 ///
1319 /// This function returns a `RDataFrame` built with the output tree as a source.
1320 /// The types of the columns are automatically inferred and do not need to be specified.
1321 ///
1322 /// See above for a more complete description and example usages.
1325 const RSnapshotOptions &options = RSnapshotOptions())
1326 {
1327 // like columnList but with `#var` columns removed
1329 // like columnListWithoutSizeColumns but with aliases resolved
1332 // like validCols but with missing size branches required by array branches added in the right positions
1334 fLoopManager->GetBranchNames(), GetDataSource(), std::move(colListNoAliases), std::move(colListNoPoundSizes));
1337
1338 const auto fullTreeName = treename;
1340 treename = parsedTreePath.fTreeName;
1341 const auto &dirname = parsedTreePath.fDirName;
1342
1344
1346
1347 if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) {
1348 if (RDFInternal::GetDataSourceLabel(*this) == "TTreeDS") {
1349 throw std::runtime_error("Snapshotting from TTree to RNTuple is not yet supported. The current recommended "
1350 "way to convert TTrees to RNTuple is through the RNTupleImporter.");
1351 }
1352
1353 // The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one
1354 // without a data source for now, and set it once the actual data source can be created (i.e., after
1355 // writing the RNTuple).
1356 auto newRDF = std::make_shared<RInterface<RLoopManager>>(std::make_shared<RLoopManager>(colListNoPoundSizes));
1357
1358 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
1359 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
1360 options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */});
1361
1362 // The Snapshot helper will use colListNoAliasesWithSizeBranches (with aliases resolved) as input columns, and
1363 // colListWithAliasesAndSizeBranches (still with aliases in it, passed through snapHelperArgs) as output column
1364 // names.
1368 } else {
1369 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" &&
1370 options.fOutputFormat == ESnapshotOutputFormat::kDefault) {
1371 Warning("Snapshot",
1372 "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you "
1373 "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in "
1374 "RSnapshotOptions. Note that this current default behaviour might change in the future.");
1375 }
1376
1377 // We create an RLoopManager without a data source. This needs to be initialised when the output TTree dataset
1378 // has actually been created and written to TFile, i.e. at the end of the Snapshot execution.
1379 auto newRDF = std::make_shared<RInterface<RLoopManager>>(
1380 std::make_shared<RLoopManager>(colListNoAliasesWithSizeBranches));
1381
1382 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
1383 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
1384 options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */});
1385
1388 colListNoAliasesWithSizeBranches.size(), options.fVector2RVec);
1389 }
1390
1391 if (!options.fLazy)
1392 *resPtr;
1393 return resPtr;
1394 }
1395
1396 // clang-format off
1397 ////////////////////////////////////////////////////////////////////////////
1398 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
1399 /// \param[in] treename The name of the output TTree.
1400 /// \param[in] filename The name of the output TFile.
1401 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
1402 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree
1403 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1404 ///
1405 /// This function returns a `RDataFrame` built with the output tree as a source.
1406 /// The types of the columns are automatically inferred and do not need to be specified.
1407 ///
1408 /// See above for a more complete description and example usages.
1410 std::string_view columnNameRegexp = "",
1411 const RSnapshotOptions &options = RSnapshotOptions())
1412 {
1414
1416 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those
1418 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
1419 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
1424
1425 // The only way we can get duplicate entries is if a column coming from a tree or data-source is Redefine'd.
1426 // RemoveDuplicates should preserve ordering of the columns: it might be meaningful.
1428
1430
1431 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") {
1433 }
1434
1435 return Snapshot(treename, filename, selectedColumns, options);
1436 }
1437 // clang-format on
1438
1439 // clang-format off
1440 ////////////////////////////////////////////////////////////////////////////
1441 /// \brief Save selected columns to disk, in a new TTree `treename` in file `filename`.
1442 /// \param[in] treename The name of the output TTree.
1443 /// \param[in] filename The name of the output TFile.
1444 /// \param[in] columnList The list of names of the columns/branches to be written.
1445 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree.
1446 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1447 ///
1448 /// This function returns a `RDataFrame` built with the output tree as a source.
1449 /// The types of the columns are automatically inferred and do not need to be specified.
1450 ///
1451 /// See above for a more complete description and example usages.
1453 std::initializer_list<std::string> columnList,
1454 const RSnapshotOptions &options = RSnapshotOptions())
1455 {
1457 return Snapshot(treename, filename, selectedColumns, options);
1458 }
1459 // clang-format on
1460
1461 ////////////////////////////////////////////////////////////////////////////
1462 /// \brief Save selected columns in memory.
1463 /// \tparam ColumnTypes variadic list of branch/column types.
1464 /// \param[in] columnList columns to be cached in memory.
1465 /// \return a `RDataFrame` that wraps the cached dataset.
1466 ///
1467 /// This action returns a new `RDataFrame` object, completely detached from
1468 /// the originating `RDataFrame`. The new dataframe only contains the cached
1469 /// columns and stores their content in memory for fast, zero-copy subsequent access.
1470 ///
1471 /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that
1472 /// fits in memory and that will be accessed many times.
1473 ///
1474 /// \note Cache will refuse to process columns with names of the form `#columnname`. These are special columns
1475 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
1476 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
1477 /// Alias(): `df.Alias("nbar", "#bar").Cache<std::size_t>(..., {"nbar"})`.
1478 ///
1479 /// ### Example usage:
1480 ///
1481 /// **Types and columns specified:**
1482 /// ~~~{.cpp}
1483 /// auto cache_some_cols_df = df.Cache<double, MyClass, int>({"col0", "col1", "col2"});
1484 /// ~~~
1485 ///
1486 /// **Types inferred and columns specified (this invocation relies on jitting):**
1487 /// ~~~{.cpp}
1488 /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"});
1489 /// ~~~
1490 ///
1491 /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):**
1492 /// ~~~{.cpp}
1493 /// auto cache_all_cols_df = df.Cache(myRegexp);
1494 /// ~~~
1495 template <typename... ColumnTypes>
1497 {
1498 auto staticSeq = std::make_index_sequence<sizeof...(ColumnTypes)>();
1500 }
1501
1502 ////////////////////////////////////////////////////////////////////////////
1503 /// \brief Save selected columns in memory.
1504 /// \param[in] columnList columns to be cached in memory
1505 /// \return a `RDataFrame` that wraps the cached dataset.
1506 ///
1507 /// See the previous overloads for more information.
1509 {
1510 // Early return: if the list of columns is empty, just return an empty RDF
1511 // If we proceed, the jitted call will not compile!
1512 if (columnList.empty()) {
1513 auto nEntries = *this->Count();
1514 RInterface<RLoopManager> emptyRDF(std::make_shared<RLoopManager>(nEntries));
1515 return emptyRDF;
1516 }
1517
1518 std::stringstream cacheCall;
1520 RInterface<TTraits::TakeFirstParameter_t<decltype(upcastNode)>> upcastInterface(fProxiedPtr, *fLoopManager,
1521 fColRegister);
1522 // build a string equivalent to
1523 // "(RInterface<nodetype*>*)(this)->Cache<Ts...>(*(ColumnNames_t*)(&columnList))"
1524 RInterface<RLoopManager> resRDF(std::make_shared<ROOT::Detail::RDF::RLoopManager>(0));
1525 cacheCall << "*reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RLoopManager>*>("
1527 << ") = reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RNodeBase>*>("
1529
1531
1532 const auto validColumnNames =
1534 const auto colTypes = GetValidatedArgTypes(validColumnNames, fColRegister, fLoopManager->GetTree(),
1535 GetDataSource(), "Cache", /*vector2RVec=*/false);
1536 for (const auto &colType : colTypes)
1537 cacheCall << colType << ", ";
1538 if (!columnListWithoutSizeColumns.empty())
1539 cacheCall.seekp(-2, cacheCall.cur); // remove the last ",
1540 cacheCall << ">(*reinterpret_cast<std::vector<std::string>*>(" // vector<string> should be ColumnNames_t
1542
1543 // book the code to jit with the RLoopManager and trigger the event loop
1544 fLoopManager->ToJitExec(cacheCall.str());
1545 fLoopManager->Jit();
1546
1547 return resRDF;
1548 }
1549
1550 ////////////////////////////////////////////////////////////////////////////
1551 /// \brief Save selected columns in memory.
1552 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
1553 /// \return a `RDataFrame` that wraps the cached dataset.
1554 ///
1555 /// The existing columns are matched against the regular expression. If the string provided
1556 /// is empty, all columns are selected. See the previous overloads for more information.
1558 {
1561 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Cache those
1563 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
1564 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
1566 columnNames.reserve(definedColumns.size() + dsColumns.size());
1570 return Cache(selectedColumns);
1571 }
1572
1573 ////////////////////////////////////////////////////////////////////////////
1574 /// \brief Save selected columns in memory.
1575 /// \param[in] columnList columns to be cached in memory.
1576 /// \return a `RDataFrame` that wraps the cached dataset.
1577 ///
1578 /// See the previous overloads for more information.
1579 RInterface<RLoopManager> Cache(std::initializer_list<std::string> columnList)
1580 {
1582 return Cache(selectedColumns);
1583 }
1584
1585 // clang-format off
1586 ////////////////////////////////////////////////////////////////////////////
1587 /// \brief Creates a node that filters entries based on range: [begin, end).
1588 /// \param[in] begin Initial entry number considered for this range.
1589 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1590 /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0.
1591 /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries.
1592 ///
1593 /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset.
1594 /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported.
1595 ///
1596 /// ### Example usage:
1597 /// ~~~{.cpp}
1598 /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries
1599 /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards
1600 /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3
1601 /// ~~~
1602 // clang-format on
1603 RInterface<RDFDetail::RRange<Proxied>, DS_t> Range(unsigned int begin, unsigned int end, unsigned int stride = 1)
1604 {
1605 // check invariants
1606 if (stride == 0 || (end != 0 && end < begin))
1607 throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin.");
1608 CheckIMTDisabled("Range");
1609
1610 using Range_t = RDFDetail::RRange<Proxied>;
1611 auto rangePtr = std::make_shared<Range_t>(begin, end, stride, fProxiedPtr);
1613 return newInterface;
1614 }
1615
1616 // clang-format off
1617 ////////////////////////////////////////////////////////////////////////////
1618 /// \brief Creates a node that filters entries based on range.
1619 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1620 /// \return a node of the computation graph for which the range is defined.
1621 ///
1622 /// See the other Range overload for a detailed description.
1623 // clang-format on
1624 RInterface<RDFDetail::RRange<Proxied>, DS_t> Range(unsigned int end) { return Range(0, end, 1); }
1625
1626 // clang-format off
1627 ////////////////////////////////////////////////////////////////////////////
1628 /// \brief Execute a user-defined function on each entry (*instant action*).
1629 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
1630 /// \param[in] columns Names of the columns/branches in input to the user function.
1631 ///
1632 /// The callable `f` is invoked once per entry. This is an *instant action*:
1633 /// upon invocation, an event loop as well as execution of all scheduled actions
1634 /// is triggered.
1635 /// Users are responsible for the thread-safety of this callable when executing
1636 /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT).
1637 ///
1638 /// ### Example usage:
1639 /// ~~~{.cpp}
1640 /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"});
1641 /// ~~~
1642 // clang-format on
1643 template <typename F>
1644 void Foreach(F f, const ColumnNames_t &columns = {})
1645 {
1646 using arg_types = typename TTraits::CallableTraits<decltype(f)>::arg_types_nodecay;
1647 using ret_type = typename TTraits::CallableTraits<decltype(f)>::ret_type;
1648 ForeachSlot(RDFInternal::AddSlotParameter<ret_type>(f, arg_types()), columns);
1649 }
1650
1651 // clang-format off
1652 ////////////////////////////////////////////////////////////////////////////
1653 /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*).
1654 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
1655 /// \param[in] columns Names of the columns/branches in input to the user function.
1656 ///
1657 /// Same as `Foreach`, but the user-defined function takes an extra
1658 /// `unsigned int` as its first parameter, the *processing slot index*.
1659 /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`,
1660 /// for each thread of execution.
1661 /// This is meant as a helper in writing thread-safe `Foreach`
1662 /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`.
1663 /// The user-defined processing callable is able to follow different
1664 /// *streams of processing* indexed by the first parameter.
1665 /// `ForeachSlot` works just as well with single-thread execution: in that
1666 /// case `slot` will always be `0`.
1667 ///
1668 /// ### Example usage:
1669 /// ~~~{.cpp}
1670 /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"});
1671 /// ~~~
1672 // clang-format on
1673 template <typename F>
1674 void ForeachSlot(F f, const ColumnNames_t &columns = {})
1675 {
1677 constexpr auto nColumns = ColTypes_t::list_size;
1678
1681
1682 using Helper_t = RDFInternal::ForeachSlotHelper<F>;
1684
1685 auto action = std::make_unique<Action_t>(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fColRegister);
1686
1687 fLoopManager->Run();
1688 }
1689
1690 // clang-format off
1691 ////////////////////////////////////////////////////////////////////////////
1692 /// \brief Execute a user-defined reduce operation on the values of a column.
1693 /// \tparam F The type of the reduce callable. Automatically deduced.
1694 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1695 /// \param[in] f A callable with signature `T(T,T)`
1696 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
1697 /// \return the reduced quantity wrapped in a ROOT::RDF::RResultPtr.
1698 ///
1699 /// A reduction takes two values of a column and merges them into one (e.g.
1700 /// by summing them, taking the maximum, etc). This action performs the
1701 /// specified reduction operation on all processed column values, returning
1702 /// a single value of the same type. The callable f must satisfy the general
1703 /// requirements of a *processing function* besides having signature `T(T,T)`
1704 /// where `T` is the type of column columnName.
1705 ///
1706 /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a
1707 /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific
1708 /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this
1709 /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce`
1710 /// overload.
1711 ///
1712 /// ### Example usage:
1713 /// ~~~{.cpp}
1714 /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol");
1715 /// ~~~
1716 ///
1717 /// This action is *lazy*: upon invocation of this method the calculation is
1718 /// booked but not executed. Also see RResultPtr.
1719 // clang-format on
1721 RResultPtr<T> Reduce(F f, std::string_view columnName = "")
1722 {
1723 static_assert(
1724 std::is_default_constructible<T>::value,
1725 "reduce object cannot be default-constructed. Please provide an initialisation value (redIdentity)");
1726 return Reduce(std::move(f), columnName, T());
1727 }
1728
1729 ////////////////////////////////////////////////////////////////////////////
1730 /// \brief Execute a user-defined reduce operation on the values of a column.
1731 /// \tparam F The type of the reduce callable. Automatically deduced.
1732 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1733 /// \param[in] f A callable with signature `T(T,T)`
1734 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
1735 /// \param[in] redIdentity The reduced object of each thread is initialized to this value.
1736 /// \return the reduced quantity wrapped in a RResultPtr.
1737 ///
1738 /// ### Example usage:
1739 /// ~~~{.cpp}
1740 /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42);
1741 /// ~~~
1742 /// See the description of the first Reduce overload for more information.
1744 RResultPtr<T> Reduce(F f, std::string_view columnName, const T &redIdentity)
1745 {
1746 return Aggregate(f, f, columnName, redIdentity);
1747 }
1748
1749 ////////////////////////////////////////////////////////////////////////////
1750 /// \brief Return the number of entries processed (*lazy action*).
1751 /// \return the number of entries wrapped in a RResultPtr.
1752 ///
1753 /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`).
1754 /// This action is *lazy*: upon invocation of this method the calculation is
1755 /// booked but not executed. Also see RResultPtr.
1756 ///
1757 /// ### Example usage:
1758 /// ~~~{.cpp}
1759 /// auto nEntriesAfterCuts = myFilteredDf.Count();
1760 /// ~~~
1761 ///
1763 {
1764 const auto nSlots = fLoopManager->GetNSlots();
1765 auto cSPtr = std::make_shared<ULong64_t>(0);
1766 using Helper_t = RDFInternal::CountHelper;
1768 auto action = std::make_unique<Action_t>(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr,
1770 return MakeResultPtr(cSPtr, *fLoopManager, std::move(action));
1771 }
1772
1773 ////////////////////////////////////////////////////////////////////////////
1774 /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default).
1775 /// \tparam T The type of the column.
1776 /// \tparam COLL The type of collection used to store the values.
1777 /// \param[in] column The name of the column to collect the values of.
1778 /// \return the content of the selected column wrapped in a RResultPtr.
1779 ///
1780 /// The collection type to be specified for C-style array columns is `RVec<T>`:
1781 /// in this case the returned collection is a `std::vector<RVec<T>>`.
1782 /// ### Example usage:
1783 /// ~~~{.cpp}
1784 /// // In this case intCol is a std::vector<int>
1785 /// auto intCol = rdf.Take<int>("integerColumn");
1786 /// // Same content as above but in this case taken as a RVec<int>
1787 /// auto intColAsRVec = rdf.Take<int, RVec<int>>("integerColumn");
1788 /// // In this case intCol is a std::vector<RVec<int>>, a collection of collections
1789 /// auto cArrayIntCol = rdf.Take<RVec<int>>("cArrayInt");
1790 /// ~~~
1791 /// This action is *lazy*: upon invocation of this method the calculation is
1792 /// booked but not executed. Also see RResultPtr.
1793 template <typename T, typename COLL = std::vector<T>>
1794 RResultPtr<COLL> Take(std::string_view column = "")
1795 {
1796 const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)});
1797
1800
1801 using Helper_t = RDFInternal::TakeHelper<T, T, COLL>;
1803 auto valuesPtr = std::make_shared<COLL>();
1804 const auto nSlots = fLoopManager->GetNSlots();
1805
1806 auto action =
1807 std::make_unique<Action_t>(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fColRegister);
1808 return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action));
1809 }
1810
1811 ////////////////////////////////////////////////////////////////////////////
1812 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1813 /// \tparam V The type of the column used to fill the histogram.
1814 /// \param[in] model The returned histogram will be constructed using this as a model.
1815 /// \param[in] vName The name of the column that will fill the histogram.
1816 /// \return the monodimensional histogram wrapped in a RResultPtr.
1817 ///
1818 /// Columns can be of a container type (e.g. `std::vector<double>`), in which case the histogram
1819 /// is filled with each one of the elements of the container. In case multiple columns of container type
1820 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1821 /// possibly different lengths between events).
1822 /// This action is *lazy*: upon invocation of this method the calculation is
1823 /// booked but not executed. Also see RResultPtr.
1824 ///
1825 /// ### Example usage:
1826 /// ~~~{.cpp}
1827 /// // Deduce column type (this invocation needs jitting internally)
1828 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1829 /// // Explicit column type
1830 /// auto myHist2 = myDf.Histo1D<float>({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1831 /// ~~~
1832 ///
1833 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1834 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1835 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1836 template <typename V = RDFDetail::RInferredType>
1837 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "")
1838 {
1839 const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)});
1840
1842
1843 std::shared_ptr<::TH1D> h(nullptr);
1844 {
1845 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1846 h = model.GetHistogram();
1847 h->SetDirectory(nullptr);
1848 }
1849
1850 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
1851 RDFInternal::HistoUtils<::TH1D>::SetCanExtendAllAxes(*h);
1853 }
1854
1855 ////////////////////////////////////////////////////////////////////////////
1856 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1857 /// \tparam V The type of the column used to fill the histogram.
1858 /// \param[in] vName The name of the column that will fill the histogram.
1859 /// \return the monodimensional histogram wrapped in a RResultPtr.
1860 ///
1861 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1862 /// The "name" and "title" strings are built starting from the input column name.
1863 /// See the description of the first Histo1D() overload for more details.
1864 ///
1865 /// ### Example usage:
1866 /// ~~~{.cpp}
1867 /// // Deduce column type (this invocation needs jitting internally)
1868 /// auto myHist1 = myDf.Histo1D("myColumn");
1869 /// // Explicit column type
1870 /// auto myHist2 = myDf.Histo1D<float>("myColumn");
1871 /// ~~~
1872 template <typename V = RDFDetail::RInferredType>
1874 {
1875 const auto h_name = std::string(vName);
1876 const auto h_title = h_name + ";" + h_name + ";count";
1877 return Histo1D<V>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName);
1878 }
1879
1880 ////////////////////////////////////////////////////////////////////////////
1881 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1882 /// \tparam V The type of the column used to fill the histogram.
1883 /// \tparam W The type of the column used as weights.
1884 /// \param[in] model The returned histogram will be constructed using this as a model.
1885 /// \param[in] vName The name of the column that will fill the histogram.
1886 /// \param[in] wName The name of the column that will provide the weights.
1887 /// \return the monodimensional histogram wrapped in a RResultPtr.
1888 ///
1889 /// See the description of the first Histo1D() overload for more details.
1890 ///
1891 /// ### Example usage:
1892 /// ~~~{.cpp}
1893 /// // Deduce column type (this invocation needs jitting internally)
1894 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
1895 /// // Explicit column type
1896 /// auto myHist2 = myDf.Histo1D<float, int>({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
1897 /// ~~~
1898 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1899 RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
1900 {
1901 const std::vector<std::string_view> columnViews = {vName, wName};
1903 ? ColumnNames_t()
1905 std::shared_ptr<::TH1D> h(nullptr);
1906 {
1907 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1908 h = model.GetHistogram();
1909 }
1911 }
1912
1913 ////////////////////////////////////////////////////////////////////////////
1914 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1915 /// \tparam V The type of the column used to fill the histogram.
1916 /// \tparam W The type of the column used as weights.
1917 /// \param[in] vName The name of the column that will fill the histogram.
1918 /// \param[in] wName The name of the column that will provide the weights.
1919 /// \return the monodimensional histogram wrapped in a RResultPtr.
1920 ///
1921 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1922 /// The "name" and "title" strings are built starting from the input column names.
1923 /// See the description of the first Histo1D() overload for more details.
1924 ///
1925 /// ### Example usage:
1926 /// ~~~{.cpp}
1927 /// // Deduce column types (this invocation needs jitting internally)
1928 /// auto myHist1 = myDf.Histo1D("myValue", "myweight");
1929 /// // Explicit column types
1930 /// auto myHist2 = myDf.Histo1D<float, int>("myValue", "myweight");
1931 /// ~~~
1932 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1933 RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName)
1934 {
1935 // We build name and title based on the value and weight column names
1936 std::string str_vName{vName};
1937 std::string str_wName{wName};
1938 const auto h_name = str_vName + "_weighted_" + str_wName;
1939 const auto h_title = str_vName + ", weights: " + str_wName + ";" + str_vName + ";count * " + str_wName;
1940 return Histo1D<V, W>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName, wName);
1941 }
1942
1943 ////////////////////////////////////////////////////////////////////////////
1944 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1945 /// \tparam V The type of the column used to fill the histogram.
1946 /// \tparam W The type of the column used as weights.
1947 /// \param[in] model The returned histogram will be constructed using this as a model.
1948 /// \return the monodimensional histogram wrapped in a RResultPtr.
1949 ///
1950 /// This overload will use the first two default columns as column names.
1951 /// See the description of the first Histo1D() overload for more details.
1952 template <typename V, typename W>
1953 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.})
1954 {
1955 return Histo1D<V, W>(model, "", "");
1956 }
1957
1958 ////////////////////////////////////////////////////////////////////////////
1959 /// \brief Fill and return a two-dimensional histogram (*lazy action*).
1960 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
1961 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
1962 /// \param[in] model The returned histogram will be constructed using this as a model.
1963 /// \param[in] v1Name The name of the column that will fill the x axis.
1964 /// \param[in] v2Name The name of the column that will fill the y axis.
1965 /// \return the bidimensional histogram wrapped in a RResultPtr.
1966 ///
1967 /// Columns can be of a container type (e.g. std::vector<double>), in which case the histogram
1968 /// is filled with each one of the elements of the container. In case multiple columns of container type
1969 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1970 /// possibly different lengths between events).
1971 /// This action is *lazy*: upon invocation of this method the calculation is
1972 /// booked but not executed. Also see RResultPtr.
1973 ///
1974 /// ### Example usage:
1975 /// ~~~{.cpp}
1976 /// // Deduce column types (this invocation needs jitting internally)
1977 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
1978 /// // Explicit column types
1979 /// auto myHist2 = myDf.Histo2D<float, float>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
1980 /// ~~~
1981 ///
1982 ///
1983 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1984 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1985 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1986 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
1987 RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
1988 {
// Instantiate the histogram from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
1989 std::shared_ptr<::TH2D> h(nullptr);
1990 {
1991 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1992 h = model.GetHistogram();
1993 }
// A model whose axes have no limits is rejected with std::runtime_error.
1994 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
1995 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
1996 }
// Column names in fill order: x axis first, then y axis.
1997 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
// NOTE(review): listing lines 1998 and 2000-2001 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
1999 ? ColumnNames_t()
2002 }
2003
2004 ////////////////////////////////////////////////////////////////////////////
2005 /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*).
2006 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
2007 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
2008 /// \tparam W The type of the column used for the weights of the histogram.
2009 /// \param[in] model The returned histogram will be constructed using this as a model.
2010 /// \param[in] v1Name The name of the column that will fill the x axis.
2011 /// \param[in] v2Name The name of the column that will fill the y axis.
2012 /// \param[in] wName The name of the column that will provide the weights.
2013 /// \return the bidimensional histogram wrapped in a RResultPtr.
2014 ///
2015 /// This action is *lazy*: upon invocation of this method the calculation is
2016 /// booked but not executed. Also see RResultPtr.
2017 ///
2018 /// ### Example usage:
2019 /// ~~~{.cpp}
2020 /// // Deduce column types (this invocation needs jitting internally)
2021 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
2022 /// // Explicit column types
2023 /// auto myHist2 = myDf.Histo2D<float, float, double>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
2024 /// ~~~
2025 ///
2026 /// See the documentation of the first Histo2D() overload for more details.
2027 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2028 typename W = RDFDetail::RInferredType>
// NOTE(review): listing line 2029 (between the template header and the function name) is missing from this extract.
2030 Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
2031 {
// Instantiate the histogram from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2032 std::shared_ptr<::TH2D> h(nullptr);
2033 {
2034 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2035 h = model.GetHistogram();
2036 }
// A model whose axes have no limits is rejected with std::runtime_error.
2037 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
2038 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
2039 }
// Column names in fill order: x axis, y axis, then the weight column.
2040 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
// NOTE(review): listing lines 2041 and 2043-2044 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2042 ? ColumnNames_t()
2045 }
2046
/// \brief Overload that forwards `model` to the weighted Histo2D() overload with all three column names empty.
/// Presumably the empty names select the default columns, as documented for the analogous Histo1D() overload
/// -- confirm.
// NOTE(review): listing line 2048 (the function declaration) is missing from this extract.
2047 template <typename V1, typename V2, typename W>
2049 {
2050 return Histo2D<V1, V2, W>(model, "", "", "");
2051 }
2052
2053 ////////////////////////////////////////////////////////////////////////////
2054 /// \brief Fill and return a three-dimensional histogram (*lazy action*).
2055 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2056 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2057 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2058 /// \param[in] model The returned histogram will be constructed using this as a model.
2059 /// \param[in] v1Name The name of the column that will fill the x axis.
2060 /// \param[in] v2Name The name of the column that will fill the y axis.
2061 /// \param[in] v3Name The name of the column that will fill the z axis.
2062 /// \return the tridimensional histogram wrapped in a RResultPtr.
2063 ///
2064 /// This action is *lazy*: upon invocation of this method the calculation is
2065 /// booked but not executed. Also see RResultPtr.
2066 ///
2067 /// ### Example usage:
2068 /// ~~~{.cpp}
2069 /// // Deduce column types (this invocation needs jitting internally)
2070 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2071 /// "myValueX", "myValueY", "myValueZ");
2072 /// // Explicit column types
2073 /// auto myHist2 = myDf.Histo3D<double, double, float>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2074 /// "myValueX", "myValueY", "myValueZ");
2075 /// ~~~
2076 /// \note If three-dimensional histograms consume too much memory in multithreaded runs, the cloning of TH3D
2077 /// per thread can be reduced using ROOT::RDF::Experimental::ThreadsPerTH3(). See the section "Memory Usage" in
2078 /// the RDataFrame description.
2079 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
2080 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2081 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2082 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2083 typename V3 = RDFDetail::RInferredType>
2084 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "",
2085 std::string_view v3Name = "")
2086 {
// Instantiate the histogram from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2087 std::shared_ptr<::TH3D> h(nullptr);
2088 {
2089 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2090 h = model.GetHistogram();
2091 }
// A model whose axes have no limits is rejected with std::runtime_error.
2092 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
2093 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
2094 }
// Column names in fill order: x axis, y axis, z axis.
2095 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
// NOTE(review): listing lines 2096 and 2098-2099 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2097 ? ColumnNames_t()
2100 }
2101
2102 ////////////////////////////////////////////////////////////////////////////
2103 /// \brief Fill and return a weighted three-dimensional histogram (*lazy action*).
2104 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2105 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2106 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2107 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
2108 /// \param[in] model The returned histogram will be constructed using this as a model.
2109 /// \param[in] v1Name The name of the column that will fill the x axis.
2110 /// \param[in] v2Name The name of the column that will fill the y axis.
2111 /// \param[in] v3Name The name of the column that will fill the z axis.
2112 /// \param[in] wName The name of the column that will provide the weights.
2113 /// \return the tridimensional histogram wrapped in a RResultPtr.
2114 ///
2115 /// This action is *lazy*: upon invocation of this method the calculation is
2116 /// booked but not executed. Also see RResultPtr.
2117 ///
2118 /// ### Example usage:
2119 /// ~~~{.cpp}
2120 /// // Deduce column types (this invocation needs jitting internally)
2121 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2122 /// "myValueX", "myValueY", "myValueZ", "myWeight");
2123 /// // Explicit column types
2124 /// using d_t = double;
2125 /// auto myHist2 = myDf.Histo3D<d_t, d_t, float, d_t>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2126 /// "myValueX", "myValueY", "myValueZ", "myWeight");
2127 /// ~~~
2128 ///
2129 ///
2130 /// See the documentation of the first Histo3D() overload for more details.
2131 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2132 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2133 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name,
2134 std::string_view v3Name, std::string_view wName)
2135 {
// Instantiate the histogram from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2136 std::shared_ptr<::TH3D> h(nullptr);
2137 {
2138 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2139 h = model.GetHistogram();
2140 }
// A model whose axes have no limits is rejected with std::runtime_error.
2141 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
2142 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
2143 }
// Column names in fill order: x axis, y axis, z axis, then the weight column.
2144 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
// NOTE(review): listing lines 2145 and 2147-2148 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2146 ? ColumnNames_t()
2149 }
2150
/// \brief Overload that forwards `model` to the weighted Histo3D() overload with all four column names empty.
/// Presumably the empty names select the default columns, as documented for the analogous Histo1D() overload
/// -- confirm.
// NOTE(review): listing line 2152 (the function declaration) is missing from this extract.
2151 template <typename V1, typename V2, typename V3, typename W>
2153 {
2154 return Histo3D<V1, V2, V3, W>(model, "", "", "", "");
2155 }
2156
2157 ////////////////////////////////////////////////////////////////////////////
2158 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
2159 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
2160 /// present.
2161 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
2162 /// object.
2163 /// \param[in] model The returned histogram will be constructed using this as a model.
2164 /// \param[in] columnList
2165 /// A list containing the names of the columns that will be passed when calling `Fill`.
2166 /// (N columns for unweighted filling, or N+1 columns for weighted filling)
2167 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2168 ///
2169 /// This action is *lazy*: upon invocation of this method the calculation is
2170 /// booked but not executed. See RResultPtr documentation.
2171 ///
2172 /// ### Example usage:
2173 /// ~~~{.cpp}
2174 /// auto myFilledObj = myDf.HistoND<float, float, float, float>({"name","title", 4,
2175 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2176 /// {"col0", "col1", "col2", "col3"});
2177 /// ~~~
2178 ///
2179 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
// NOTE(review): listing line 2180 (the function declaration) is missing from this extract.
2181 {
2182 std::shared_ptr<::THnD> h(nullptr);
2183 {
// The RIgnoreErrorLevelRAII guard (built with kError) covers both the model instantiation and the
// column-count check below; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2184 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2185 h = model.GetHistogram();
2186
// One column more than the number of axes is the weighted case (N+1 columns), for which Sumw2() is
// called on the histogram; any count other than N or N+1 is rejected with std::runtime_error.
2187 if (int(columnList.size()) == (h->GetNdimensions() + 1)) {
2188 h->Sumw2();
2189 } else if (int(columnList.size()) != h->GetNdimensions()) {
2190 throw std::runtime_error("Wrong number of columns for the specified number of histogram axes.");
2191 }
2192 }
// The same shared pointer is passed twice to CreateAction, followed by fProxiedPtr.
2193 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(columnList, h, h,
2194 fProxiedPtr);
2195 }
2196
2197 ////////////////////////////////////////////////////////////////////////////
2198 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
2199 /// \param[in] model The returned histogram will be constructed using this as a model.
2200 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2201 /// (N columns for unweighted filling, or N+1 columns for weighted filling)
2202 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2203 ///
2204 /// This action is *lazy*: upon invocation of this method the calculation is
2205 /// booked but not executed. Also see RResultPtr.
2206 ///
2207 /// ### Example usage:
2208 /// ~~~{.cpp}
2209 /// auto myFilledObj = myDf.HistoND({"name","title", 4,
2210 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2211 /// {"col0", "col1", "col2", "col3"});
2212 /// ~~~
2213 ///
// NOTE(review): listing line 2214 (the function declaration) is missing from this extract.
2215 {
2216 std::shared_ptr<::THnD> h(nullptr);
2217 {
// The RIgnoreErrorLevelRAII guard (built with kError) covers both the model instantiation and the
// column-count check below; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2218 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2219 h = model.GetHistogram();
2220
// One column more than the number of axes is the weighted case (N+1 columns), for which Sumw2() is
// called on the histogram; any count other than N or N+1 is rejected with std::runtime_error.
2221 if (int(columnList.size()) == (h->GetNdimensions() + 1)) {
2222 h->Sumw2();
2223 } else if (int(columnList.size()) != h->GetNdimensions()) {
2224 throw std::runtime_error("Wrong number of columns for the specified number of histogram axes.");
2225 }
2226 }
// NOTE(review): listing line 2227 (the start of the return statement) is missing from this extract; only its
// trailing argument, the number of columns, is visible.
2228 columnList.size());
2229 }
2230
2231 ////////////////////////////////////////////////////////////////////////////
2232 /// \brief Fill and return a TGraph object (*lazy action*).
2233 /// \tparam X The type of the column used to fill the x axis.
2234 /// \tparam Y The type of the column used to fill the y axis.
2235 /// \param[in] x The name of the column that will fill the x axis.
2236 /// \param[in] y The name of the column that will fill the y axis.
2237 /// \return the TGraph wrapped in a RResultPtr.
2238 ///
2239 /// Columns can be of a container type (e.g. std::vector<double>), in which case the TGraph
2240 /// is filled with each one of the elements of the container.
2241 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2242 /// If the graph is going to be drawn, it is recommended to sort its points by x before drawing.
2243 /// The TGraph is given a default name and title based on the input column names.
2244 ///
2245 /// This action is *lazy*: upon invocation of this method the calculation is
2246 /// booked but not executed. Also see RResultPtr.
2247 ///
2248 /// ### Example usage:
2249 /// ~~~{.cpp}
2250 /// // Deduce column types (this invocation needs jitting internally)
2251 /// auto myGraph1 = myDf.Graph("xValues", "yValues");
2252 /// // Explicit column types
2253 /// auto myGraph2 = myDf.Graph<int, float>("xValues", "yValues");
2254 /// ~~~
2255 ///
2256 /// \note Differently from other ROOT interfaces, the returned TGraph is not associated to gDirectory
2257 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2258 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2259 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType>
2260 RResultPtr<::TGraph> Graph(std::string_view x = "", std::string_view y = "")
2261 {
// The graph starts out default-constructed; the column names are collected in (x, y) order.
2262 auto graph = std::make_shared<::TGraph>();
2263 const std::vector<std::string_view> columnViews = {x, y};
// NOTE(review): listing lines 2264, 2266 and 2268 are missing from this extract; the statement that consumes
// columnViews and the definition of validatedColumns are not visible.
2265 ? ColumnNames_t()
2267
2269
2270 // We build a default name and title based on the input columns
// Name is "<y>_vs_<x>" and title "<y> vs <x>": validatedColumns[0] labels the x axis, [1] the y axis.
2271 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2272 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2273 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
2274 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2275 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2276
// NOTE(review): listing line 2277 (the return statement) is missing from this extract.
2278 }
2279
2280 ////////////////////////////////////////////////////////////////////////////
2281 /// \brief Fill and return a TGraphAsymmErrors object (*lazy action*).
2282 /// \param[in] x The name of the column that will fill the x axis.
2283 /// \param[in] y The name of the column that will fill the y axis.
2284 /// \param[in] exl The name of the column of X low errors
2285 /// \param[in] exh The name of the column of X high errors
2286 /// \param[in] eyl The name of the column of Y low errors
2287 /// \param[in] eyh The name of the column of Y high errors
2288 /// \return the TGraphAsymmErrors wrapped in a RResultPtr.
2289 ///
2290 /// Columns can be of a container type (e.g. std::vector<double>), in which case the graph
2291 /// is filled with each one of the elements of the container.
2292 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2293 ///
2294 /// This action is *lazy*: upon invocation of this method the calculation is
2295 /// booked but not executed. Also see RResultPtr.
2296 ///
2297 /// ### Example usage:
2298 /// ~~~{.cpp}
2299 /// // Deduce column types (this invocation needs jitting internally)
2300 /// auto myGAE1 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2301 /// // Explicit column types
2302 /// using f = float;
2303 /// auto myGAE2 = myDf.GraphAsymmErrors<f, f, f, f, f, f>("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2304 /// ~~~
2305 ///
2306 /// `GraphAsymmErrors` should also be used when only the values on one of the axes have associated
2307 /// errors, for example when only `ey` exists and `ex` is equal to zero.
2308 /// In such cases, the user should do the following:
2309 /// ~~~{.cpp}
2310 /// // Create a column of zeros in RDataFrame
2311 /// auto rdf_withzeros = rdf.Define("zero", "0");
2312 /// // or alternatively:
2313 /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;});
2314 /// // Create the graph with y errors only
2315 /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh");
2316 /// ~~~
2317 ///
2318 /// \note Differently from other ROOT interfaces, the returned TGraphAsymmErrors is not associated to gDirectory
2319 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2320 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2321 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType,
// NOTE(review): listing lines 2322-2324 (the rest of the template header and the line before the function
// name) are missing from this extract.
2325 GraphAsymmErrors(std::string_view x = "", std::string_view y = "", std::string_view exl = "",
2326 std::string_view exh = "", std::string_view eyl = "", std::string_view eyh = "")
2327 {
// The graph starts out default-constructed; the column names are collected in the order
// x, y, x low error, x high error, y low error, y high error.
2328 auto graph = std::make_shared<::TGraphAsymmErrors>();
2329 const std::vector<std::string_view> columnViews = {x, y, exl, exh, eyl, eyh};
// NOTE(review): listing lines 2330, 2332 and 2334 are missing from this extract; the statement that consumes
// columnViews and the definition of validatedColumns are not visible.
2331 ? ColumnNames_t()
2333
2335
2336 // We build a default name and title based on the input columns
// Name is "<y>_vs_<x>" and title "<y> vs <x>": validatedColumns[0] labels the x axis, [1] the y axis.
2337 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2338 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2339 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
2340 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2341 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2342
// NOTE(review): listing line 2343 (the start of the return statement) is missing from this extract; only its
// trailing arguments are visible.
2344 graph, fProxiedPtr);
2345 }
2346
2347 ////////////////////////////////////////////////////////////////////////////
2348 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2349 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
2350 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
2351 /// \param[in] model The model to be considered to build the new return value.
2352 /// \param[in] v1Name The name of the column that will fill the x axis.
2353 /// \param[in] v2Name The name of the column that will fill the y axis.
2354 /// \return the monodimensional profile wrapped in a RResultPtr.
2355 ///
2356 /// This action is *lazy*: upon invocation of this method the calculation is
2357 /// booked but not executed. Also see RResultPtr.
2358 ///
2359 /// ### Example usage:
2360 /// ~~~{.cpp}
2361 /// // Deduce column types (this invocation needs jitting internally)
2362 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2363 /// // Explicit column types
2364 /// auto myProf2 = myDf.Profile1D<int, float>({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2365 /// ~~~
2366 ///
2367 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
2368 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2369 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2370 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
// NOTE(review): listing line 2371 (between the template header and the function name) is missing from this extract.
2372 Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
2373 {
// Instantiate the profile from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2374 std::shared_ptr<::TProfile> h(nullptr);
2375 {
2376 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2377 h = model.GetProfile();
2378 }
2379
// A model whose axes have no limits is rejected with std::runtime_error.
2380 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2381 throw std::runtime_error("Profiles with no axes limits are not supported yet.");
2382 }
// Column names in fill order: x axis first, then y axis.
2383 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
// NOTE(review): listing lines 2384 and 2386-2387 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2385 ? ColumnNames_t()
2388 }
2389
2390 ////////////////////////////////////////////////////////////////////////////
2391 /// \brief Fill and return a weighted one-dimensional profile (*lazy action*).
2392 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
2393 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
2394 /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present.
2395 /// \param[in] model The model to be considered to build the new return value.
2396 /// \param[in] v1Name The name of the column that will fill the x axis.
2397 /// \param[in] v2Name The name of the column that will fill the y axis.
2398 /// \param[in] wName The name of the column that will provide the weights.
2399 /// \return the monodimensional profile wrapped in a RResultPtr.
2400 ///
2401 /// This action is *lazy*: upon invocation of this method the calculation is
2402 /// booked but not executed. Also see RResultPtr.
2403 ///
2404 /// ### Example usage:
2405 /// ~~~{.cpp}
2406 /// // Deduce column types (this invocation needs jitting internally)
2407 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight");
2408 /// // Explicit column types
2409 /// auto myProf2 = myDf.Profile1D<int, float, double>({"profName", "profTitle", 64u, -4., 4.},
2410 /// "xValues", "yValues", "weight");
2411 /// ~~~
2412 ///
2413 /// See the first Profile1D() overload for more details.
2414 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2415 typename W = RDFDetail::RInferredType>
// NOTE(review): listing line 2416 (between the template header and the function name) is missing from this extract.
2417 Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
2418 {
// Instantiate the profile from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2419 std::shared_ptr<::TProfile> h(nullptr);
2420 {
2421 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2422 h = model.GetProfile();
2423 }
2424
// A model whose axes have no limits is rejected with std::runtime_error.
2425 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2426 throw std::runtime_error("Profile histograms with no axes limits are not supported yet.");
2427 }
// Column names in fill order: x axis, y axis, then the weight column.
2428 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
// NOTE(review): listing lines 2429 and 2431-2432 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2430 ? ColumnNames_t()
2433 }
2434
2435 ////////////////////////////////////////////////////////////////////////////
2436 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2437 /// See the first Profile1D() overload for more details.
2438 template <typename V1, typename V2, typename W>
// NOTE(review): listing line 2439 (the function declaration) is missing from this extract.
2440 {
// Forward to the weighted Profile1D() overload with all three column names left empty.
2441 return Profile1D<V1, V2, W>(model, "", "", "");
2442 }
2443
2444 ////////////////////////////////////////////////////////////////////////////
2445 /// \brief Fill and return a two-dimensional profile (*lazy action*).
2446 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2447 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2448 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2449 /// \param[in] model The returned profile will be constructed using this as a model.
2450 /// \param[in] v1Name The name of the column that will fill the x axis.
2451 /// \param[in] v2Name The name of the column that will fill the y axis.
2452 /// \param[in] v3Name The name of the column that will fill the z axis.
2453 /// \return the bidimensional profile wrapped in a RResultPtr.
2454 ///
2455 /// This action is *lazy*: upon invocation of this method the calculation is
2456 /// booked but not executed. Also see RResultPtr.
2457 ///
2458 /// ### Example usage:
2459 /// ~~~{.cpp}
2460 /// // Deduce column types (this invocation needs jitting internally)
2461 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2462 /// "xValues", "yValues", "zValues");
2463 /// // Explicit column types
2464 /// auto myProf2 = myDf.Profile2D<int, float, double>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2465 /// "xValues", "yValues", "zValues");
2466 /// ~~~
2467 ///
2468 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
2469 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2470 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2471 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2472 typename V3 = RDFDetail::RInferredType>
2473 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "",
2474 std::string_view v2Name = "", std::string_view v3Name = "")
2475 {
// Instantiate the profile from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2476 std::shared_ptr<::TProfile2D> h(nullptr);
2477 {
2478 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2479 h = model.GetProfile();
2480 }
2481
// A model whose axes have no limits is rejected with std::runtime_error.
2482 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
2483 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
2484 }
// Column names in fill order: x axis, y axis, z axis.
2485 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
// NOTE(review): listing lines 2486 and 2488-2489 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2487 ? ColumnNames_t()
2490 }
2491
2492 ////////////////////////////////////////////////////////////////////////////
2493 /// \brief Fill and return a weighted two-dimensional profile (*lazy action*).
2494 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2495 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2496 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2497 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
2498 /// \param[in] model The returned profile will be constructed using this as a model.
2499 /// \param[in] v1Name The name of the column that will fill the x axis.
2500 /// \param[in] v2Name The name of the column that will fill the y axis.
2501 /// \param[in] v3Name The name of the column that will fill the z axis.
2502 /// \param[in] wName The name of the column that will provide the weights.
2503 /// \return the bidimensional profile wrapped in a RResultPtr.
2504 ///
2505 /// This action is *lazy*: upon invocation of this method the calculation is
2506 /// booked but not executed. Also see RResultPtr.
2507 ///
2508 /// ### Example usage:
2509 /// ~~~{.cpp}
2510 /// // Deduce column types (this invocation needs jitting internally)
2511 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2512 /// "xValues", "yValues", "zValues", "weight");
2513 /// // Explicit column types
2514 /// auto myProf2 = myDf.Profile2D<int, float, double, int>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2515 /// "xValues", "yValues", "zValues", "weight");
2516 /// ~~~
2517 ///
2518 /// See the first Profile2D() overload for more details.
2519 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2520 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2521 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name,
2522 std::string_view v3Name, std::string_view wName)
2523 {
// Instantiate the profile from the model. The RIgnoreErrorLevelRAII guard (built with kError) is scoped
// to this inner block only; presumably it mutes lower-severity ROOT diagnostics meanwhile -- confirm.
2524 std::shared_ptr<::TProfile2D> h(nullptr);
2525 {
2526 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2527 h = model.GetProfile();
2528 }
2529
// A model whose axes have no limits is rejected with std::runtime_error.
2530 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
2531 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
2532 }
// Column names in fill order: x axis, y axis, z axis, then the weight column.
2533 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
// NOTE(review): listing lines 2534 and 2536-2537 are missing from this extract; the statement that consumes
// columnViews (only its `? ColumnNames_t()` branch remains) and the return statement are not visible.
2535 ? ColumnNames_t()
2538 }
2539
2540 /// \brief Fill and return a two-dimensional profile (*lazy action*).
2541 /// See the first Profile2D() overload for more details.
2542 template <typename V1, typename V2, typename V3, typename W>
// NOTE(review): listing line 2543 (the function declaration) is missing from this extract.
2544 {
// Forward to the weighted Profile2D() overload with all four column names left empty.
2545 return Profile2D<V1, V2, V3, W>(model, "", "", "", "");
2546 }
2547
2548 ////////////////////////////////////////////////////////////////////////////
2549 /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*).
2550 ///
2551 /// Type T must provide at least:
2552 /// - a copy-constructor
2553 /// - a `Fill` method that accepts as many arguments and with same types as the column names passed as columnList
2554 /// (these types can also be passed as template parameters to this method)
2555 /// - a `Merge` method with signature `Merge(TCollection *)` or `Merge(const std::vector<T *>&)` that merges the
2556 /// objects passed as argument into the object on which `Merge` was called (an analogous of TH1::Merge). Note that
2557 /// if the signature that takes a `TCollection*` is used, then T must inherit from TObject (to allow insertion in
2558 /// the TCollection*).
2559 ///
2560 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred together with OtherColumns if not present.
2561 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the object.
2562 /// \tparam T The type of the object to fill. Automatically deduced.
2563 /// \param[in] model The model to be considered to build the new return value.
2564 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2565 /// \return the filled object wrapped in a RResultPtr.
2566 ///
2567 /// The user gives up ownership of the model object.
2568 /// The list of column names to be used for filling must always be specified.
2569 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed.
2570 /// Also see RResultPtr.
2571 ///
2572 /// ### Example usage:
2573 /// ~~~{.cpp}
2574 /// MyClass obj;
2575 /// // Deduce column types (this invocation needs jitting internally, and in this case
2576 /// // MyClass needs to be known to the interpreter)
2577 /// auto myFilledObj0 = myDf.Fill(obj, {"col0", "col1"});
2578 /// // Explicit column types
2579 /// auto myFilledObj1 = myDf.Fill<float, float>(obj, {"col0", "col1"});
2580 /// ~~~
2581 ///
2582 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename T>
2584 {
2585 auto h = std::make_shared<std::decay_t<T>>(std::forward<T>(model));
2586 if (!RDFInternal::HistoUtils<T>::HasAxisLimits(*h)) {
2587 throw std::runtime_error("The absence of axes limits is not supported yet.");
2588 }
2589 return CreateAction<RDFInternal::ActionTags::Fill, FirstColumn, OtherColumns...>(columnList, h, h, fProxiedPtr,
2590 columnList.size());
2591 }
2592
2593 ////////////////////////////////////////////////////////////////////////////
2594 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
2595 ///
2596 /// \tparam V The type of the value column
2597 /// \param[in] value The name of the column with the values to fill the statistics with.
2598 /// \return the filled TStatistic object wrapped in a RResultPtr.
2599 ///
2600 /// ### Example usage:
2601 /// ~~~{.cpp}
2602 /// // Deduce column type (this invocation needs jitting internally)
2603 /// auto stats0 = myDf.Stats("values");
2604 /// // Explicit column type
2605 /// auto stats1 = myDf.Stats<float>("values");
2606 /// ~~~
2607 ///
2608 template <typename V = RDFDetail::RInferredType>
2609 RResultPtr<TStatistic> Stats(std::string_view value = "")
2610 {
2612 if (!value.empty()) {
2613 columns.emplace_back(std::string(value));
2614 }
2616 if (std::is_same<V, RDFDetail::RInferredType>::value) {
2617 return Fill(TStatistic(), validColumnNames);
2618 } else {
2620 }
2621 }
2622
2623 ////////////////////////////////////////////////////////////////////////////
2624 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
2625 ///
2626 /// \tparam V The type of the value column
2627 /// \tparam W The type of the weight column
2628 /// \param[in] value The name of the column with the values to fill the statistics with.
2629 /// \param[in] weight The name of the column with the weights to fill the statistics with.
2630 /// \return the filled TStatistic object wrapped in a RResultPtr.
2631 ///
2632 /// ### Example usage:
2633 /// ~~~{.cpp}
2634 /// // Deduce column types (this invocation needs jitting internally)
2635 /// auto stats0 = myDf.Stats("values", "weights");
2636 /// // Explicit column types
2637 /// auto stats1 = myDf.Stats<int, float>("values", "weights");
2638 /// ~~~
2639 ///
2640 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2641 RResultPtr<TStatistic> Stats(std::string_view value, std::string_view weight)
2642 {
2643 ColumnNames_t columns{std::string(value), std::string(weight)};
2644 constexpr auto vIsInferred = std::is_same<V, RDFDetail::RInferredType>::value;
2645 constexpr auto wIsInferred = std::is_same<W, RDFDetail::RInferredType>::value;
2647 // We have 3 cases:
2648 // 1. Both types are inferred: we use Fill and let the jit kick in.
2649 // 2. One of the two types is explicit and the other one is inferred: the case is not supported (std::runtime_error is thrown).
2650 // 3. Both types are explicit: we invoke the fully compiled Fill method.
2651 if (vIsInferred && wIsInferred) {
2652 return Fill(TStatistic(), validColumnNames);
2653 } else if (vIsInferred != wIsInferred) {
2654 std::string error("The ");
2655 error += vIsInferred ? "weight " : "value "; // the explicit column is the one whose type is NOT inferred
2656 error += "column type is explicit, while the ";
2657 error += vIsInferred ? "value " : "weight "; // the inferred column is the other one
2658 error += "column type is specified to be inferred. This case is not supported: please specify both types or none.";
2659 throw std::runtime_error(error);
2660 } else {
2662 }
2663 }
2664
2665 ////////////////////////////////////////////////////////////////////////////
2666 /// \brief Return the minimum of processed column values (*lazy action*).
2667 /// \tparam T The type of the branch/column.
2668 /// \param[in] columnName The name of the branch/column to be treated.
2669 /// \return the minimum value of the selected column wrapped in a RResultPtr.
2670 ///
2671 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2672 /// template specialization of this method.
2673 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
2674 ///
2675 /// This action is *lazy*: upon invocation of this method the calculation is
2676 /// booked but not executed. Also see RResultPtr.
2677 ///
2678 /// ### Example usage:
2679 /// ~~~{.cpp}
2680 /// // Deduce column type (this invocation needs jitting internally)
2681 /// auto minVal0 = myDf.Min("values");
2682 /// // Explicit column type
2683 /// auto minVal1 = myDf.Min<double>("values");
2684 /// ~~~
2685 ///
2686 template <typename T = RDFDetail::RInferredType>
2688 {
2689 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2690 using RetType_t = RDFDetail::MinReturnType_t<T>;
2691 auto minV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::max());
2693 }
2694
2695 ////////////////////////////////////////////////////////////////////////////
2696 /// \brief Return the maximum of processed column values (*lazy action*).
2697 /// \tparam T The type of the branch/column.
2698 /// \param[in] columnName The name of the branch/column to be treated.
2699 /// \return the maximum value of the selected column wrapped in a RResultPtr.
2700 ///
2701 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2702 /// template specialization of this method.
2703 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
2704 ///
2705 /// This action is *lazy*: upon invocation of this method the calculation is
2706 /// booked but not executed. Also see RResultPtr.
2707 ///
2708 /// ### Example usage:
2709 /// ~~~{.cpp}
2710 /// // Deduce column type (this invocation needs jitting internally)
2711 /// auto maxVal0 = myDf.Max("values");
2712 /// // Explicit column type
2713 /// auto maxVal1 = myDf.Max<double>("values");
2714 /// ~~~
2715 ///
2716 template <typename T = RDFDetail::RInferredType>
2718 {
2719 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2720 using RetType_t = RDFDetail::MaxReturnType_t<T>;
2721 auto maxV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::lowest());
2723 }
2724
2725 ////////////////////////////////////////////////////////////////////////////
2726 /// \brief Return the mean of processed column values (*lazy action*).
2727 /// \tparam T The type of the branch/column.
2728 /// \param[in] columnName The name of the branch/column to be treated.
2729 /// \return the mean value of the selected column wrapped in a RResultPtr.
2730 ///
2731 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2732 /// template specialization of this method.
2733 ///
2734 /// This action is *lazy*: upon invocation of this method the calculation is
2735 /// booked but not executed. Also see RResultPtr.
2736 ///
2737 /// ### Example usage:
2738 /// ~~~{.cpp}
2739 /// // Deduce column type (this invocation needs jitting internally)
2740 /// auto meanVal0 = myDf.Mean("values");
2741 /// // Explicit column type
2742 /// auto meanVal1 = myDf.Mean<double>("values");
2743 /// ~~~
2744 ///
2745 template <typename T = RDFDetail::RInferredType>
2746 RResultPtr<double> Mean(std::string_view columnName = "")
2747 {
2748 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2749 auto meanV = std::make_shared<double>(0);
2751 }
2752
2753 ////////////////////////////////////////////////////////////////////////////
2754 /// \brief Return the unbiased standard deviation of processed column values (*lazy action*).
2755 /// \tparam T The type of the branch/column.
2756 /// \param[in] columnName The name of the branch/column to be treated.
2757 /// \return the standard deviation value of the selected column wrapped in a RResultPtr.
2758 ///
2759 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2760 /// template specialization of this method.
2761 ///
2762 /// This action is *lazy*: upon invocation of this method the calculation is
2763 /// booked but not executed. Also see RResultPtr.
2764 ///
2765 /// ### Example usage:
2766 /// ~~~{.cpp}
2767 /// // Deduce column type (this invocation needs jitting internally)
2768 /// auto stdDev0 = myDf.StdDev("values");
2769 /// // Explicit column type
2770 /// auto stdDev1 = myDf.StdDev<double>("values");
2771 /// ~~~
2772 ///
2773 template <typename T = RDFDetail::RInferredType>
2774 RResultPtr<double> StdDev(std::string_view columnName = "")
2775 {
2776 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2777 auto stdDeviationV = std::make_shared<double>(0);
2779 }
2780
2781 // clang-format off
2782 ////////////////////////////////////////////////////////////////////////////
2783 /// \brief Return the sum of processed column values (*lazy action*).
2784 /// \tparam T The type of the branch/column.
2785 /// \param[in] columnName The name of the branch/column.
2786 /// \param[in] initValue Optional initial value for the sum. If not present, the column values must be default-constructible.
2787 /// \return the sum of the selected column wrapped in a RResultPtr.
2788 ///
2789 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2790 /// template specialization of this method.
2791 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
2792 ///
2793 /// This action is *lazy*: upon invocation of this method the calculation is
2794 /// booked but not executed. Also see RResultPtr.
2795 ///
2796 /// ### Example usage:
2797 /// ~~~{.cpp}
2798 /// // Deduce column type (this invocation needs jitting internally)
2799 /// auto sum0 = myDf.Sum("values");
2800 /// // Explicit column type
2801 /// auto sum1 = myDf.Sum<double>("values");
2802 /// ~~~
2803 ///
2804 template <typename T = RDFDetail::RInferredType>
2806 Sum(std::string_view columnName = "",
2807 const RDFDetail::SumReturnType_t<T> &initValue = RDFDetail::SumReturnType_t<T>{})
2808 {
2809 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2810 auto sumV = std::make_shared<RDFDetail::SumReturnType_t<T>>(initValue);
2812 }
2813 // clang-format on
2814
2815 ////////////////////////////////////////////////////////////////////////////
2816 /// \brief Gather filtering statistics.
2817 /// \return the resulting `RCutFlowReport` instance wrapped in a RResultPtr.
2818 ///
2819 /// Calling `Report` on the main `RDataFrame` object gathers stats for
2820 /// all named filters in the call graph. Calling this method on a
2821 /// stored chain state (i.e. a graph node different from the first) gathers
2822 /// the stats for all named filters in the chain section between the original
2823 /// `RDataFrame` and that node (included). Stats are gathered in the same
2824 /// order as the named filters have been added to the graph.
2825 /// A RResultPtr<RCutFlowReport> is returned to allow inspection of the
2826 /// effects cuts had.
2827 ///
2828 /// This action is *lazy*: upon invocation of
2829 /// this method the calculation is booked but not executed. See RResultPtr
2830 /// documentation.
2831 ///
2832 /// ### Example usage:
2833 /// ~~~{.cpp}
2834 /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2");
2835 /// auto cutReport = filtered.Report();
2836 /// cutReport->Print();
2837 /// ~~~
2838 ///
2840 {
2841 bool returnEmptyReport = false;
2842 // if this is a RInterface<RLoopManager> on which `Define` has been called, users
2843 // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which
2844 // certainly does not contain named filters.
2845 // The number 4 takes into account the implicit columns for entry and slot number
2846 // and their aliases (2 + 2, i.e. {r,t}dfentry_ and {r,t}dfslot_)
2847 if (std::is_same<Proxied, RLoopManager>::value && fColRegister.GenerateColumnNames().size() > 4)
2848 returnEmptyReport = true;
2849
2850 auto rep = std::make_shared<RCutFlowReport>();
2851 using Helper_t = RDFInternal::ReportHelper<Proxied>;
2853
2854 auto action = std::make_unique<Action_t>(Helper_t(rep, fProxiedPtr.get(), returnEmptyReport), ColumnNames_t({}),
2856
2857 return MakeResultPtr(rep, *fLoopManager, std::move(action));
2858 }
2859
2860 /// \brief Returns the names of the filters created.
2861 /// \return a vector containing the filter names.
2862 ///
2863 /// If called on a root node, the names of all the filters in the computation graph
2864 /// are returned. For any other node, only those of the filters upstream of that node.
2865 /// Filters without a name are reported as "Unnamed Filter".
2866 /// This is neither an action nor a transformation, just a query to the RDataFrame object.
2867 ///
2868 /// ### Example usage:
2869 /// ~~~{.cpp}
2870 /// auto filtNames = d.GetFilterNames();
2871 /// for (auto &&filtName : filtNames) std::cout << filtName << std::endl;
2872 /// ~~~
2873 ///
2874 std::vector<std::string> GetFilterNames() { return RDFInternal::GetFilterNames(fProxiedPtr); }
2875
2876 // clang-format off
2877 ////////////////////////////////////////////////////////////////////////////
2878 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
2879 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
2880 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
2881 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
2882 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
2883 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
2884 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
2885 /// \param[in] aggIdentity The aggregator variable of each thread is initialized to this value (or is default-constructed if the parameter is omitted)
2886 /// \return the result of the aggregation wrapped in a RResultPtr.
2887 ///
2888 /// An aggregator callable takes two values, an aggregator variable and a column value. The aggregator variable is
2889 /// initialized to aggIdentity or default-constructed if aggIdentity is omitted.
2890 /// This action calls the aggregator callable for each processed entry, passing in the aggregator variable and
2891 /// the value of the column columnName.
2892 /// If the signature is `U(U,T)` the aggregator variable is then copy-assigned the result of the execution of the callable.
2893 /// Otherwise the signature of aggregator must be `void(U&,T)`.
2894 ///
2895 /// The merger callable is used to merge the partial accumulation results of each processing thread. It is only called in multi-thread executions.
2896 /// If its signature is `U(U,U)` the aggregator variables of each thread are merged two by two.
2897 /// If its signature is `void(std::vector<U>& a)` it is assumed that it merges all aggregators in a[0].
2898 ///
2899 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
2900 ///
2901 /// Example usage:
2902 /// ~~~{.cpp}
2903 /// auto aggregator = [](double acc, double x) { return acc * x; };
2904 /// ROOT::EnableImplicitMT();
2905 /// // If multithreading is enabled, the aggregator function will be called by multiple threads
2906 /// // and will produce a vector of partial accumulators.
2907 /// // The merger function performs the final aggregation of these partial results.
2908 /// auto merger = [](std::vector<double> &accumulators) {
2909 /// for (auto i : ROOT::TSeqU(1u, accumulators.size())) {
2910 /// accumulators[0] *= accumulators[i];
2911 /// }
2912 /// };
2913 ///
2914 /// // The accumulator is initialized at this value by every thread.
2915 /// double initValue = 1.;
2916 ///
2917 /// // Multiplies all elements of the column "x"
2918 /// auto result = d.Aggregate(aggregator, merger, "x", initValue);
2919 /// ~~~
2920 // clang-format on
2922 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
2923 typename ArgTypesNoDecay = typename TTraits::CallableTraits<AccFun>::arg_types_nodecay,
2924 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
2925 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
2927 {
2928 RDFInternal::CheckAggregate<R, MergeFun>(ArgTypesNoDecay());
2929 const auto columns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2930
2933
2934 auto accObjPtr = std::make_shared<U>(aggIdentity);
2935 using Helper_t = RDFInternal::AggregateHelper<AccFun, MergeFun, R, T, U>;
2937 auto action = std::make_unique<Action_t>(
2938 Helper_t(std::move(aggregator), std::move(merger), accObjPtr, fLoopManager->GetNSlots()), validColumnNames,
2940 return MakeResultPtr(accObjPtr, *fLoopManager, std::move(action));
2941 }
2942
2943 // clang-format off
2944 ////////////////////////////////////////////////////////////////////////////
2945 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
2946 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
2947 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
2948 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
2949 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
2950 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
2951 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
2952 /// \return the result of the aggregation wrapped in a RResultPtr.
2953 ///
2954 /// See previous Aggregate overload for more information.
2955 // clang-format on
2957 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
2958 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
2959 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
2961 {
2962 static_assert(
2963 std::is_default_constructible<U>::value,
2964 "aggregated object cannot be default-constructed. Please provide an initialisation value (aggIdentity)");
2965 return Aggregate(std::move(aggregator), std::move(merger), columnName, U());
2966 }
2967
2968 // clang-format off
2969 ////////////////////////////////////////////////////////////////////////////
2970 /// \brief Book execution of a custom action using a user-defined helper object.
2971 /// \tparam FirstColumn The type of the first column used by this action. Inferred together with OtherColumns if not present.
2972 /// \tparam OtherColumns A list of the types of the other columns used by this action
2973 /// \tparam Helper The type of the user-defined helper. See below for the required interface it should expose.
2974 /// \param[in] helper The Action Helper to be scheduled.
2975 /// \param[in] columns The names of the columns on which the helper acts.
2976 /// \return the result of the helper wrapped in a RResultPtr.
2977 ///
2978 /// This method books a custom action for execution. The behavior of the action is completely dependent on the
2979 /// Helper object provided by the caller. The required interface for the helper is described below (more
2980 /// methods than the ones required can be present, e.g. a constructor that takes the number of worker threads is usually useful):
2981 ///
2982 /// ### Mandatory interface
2983 ///
2984 /// * `Helper` must publicly inherit from `ROOT::Detail::RDF::RActionImpl<Helper>`
2985 /// * `Helper::Result_t`: public alias for the type of the result of this action helper. `Result_t` must be default-constructible.
2986 /// * `Helper(Helper &&)`: a move-constructor is required. Copy-constructors are discouraged.
2987 /// * `std::shared_ptr<Result_t> GetResultPtr() const`: return a shared_ptr to the result of this action (of type
2988 /// Result_t). The RResultPtr returned by Book will point to this object. Note that this method can be called
2989 /// _before_ Initialize(), because the RResultPtr is constructed before the event loop is started.
2990 /// * `void Initialize()`: this method is called once before starting the event-loop. Useful for setup operations.
2991 /// It must reset the state of the helper to the expected state at the beginning of the event loop: the same helper,
2992 /// or copies of it, might be used for multiple event loops (e.g. in the presence of systematic variations).
2993 /// * `void InitTask(TTreeReader *, unsigned int slot)`: each working thread shall call this method during the event
2994 /// loop, before processing a batch of entries. The pointer passed as argument, if not null, will point to the TTreeReader
2995 /// that RDataFrame has set up to read the task's batch of entries. It is passed to the helper to allow certain advanced optimizations;
2996 /// it should not usually serve any purpose for the Helper. This method is often a no-op for simple helpers.
2997 /// * `void Exec(unsigned int slot, ColumnTypes...columnValues)`: each working thread shall call this method
2998 /// during the event-loop, possibly concurrently. No two threads will ever call Exec with the same 'slot' value:
2999 /// this parameter is there to facilitate writing thread-safe helpers. The other arguments will be the values of
3000 /// the requested columns for the particular entry being processed.
3001 /// * `void Finalize()`: this method is called at the end of the event loop. Commonly used to finalize the contents of the result.
3002 /// * `std::string GetActionName()`: it returns a string identifier for this type of action that RDataFrame will use in
3003 /// diagnostics, SaveGraph(), etc.
3004 ///
3005 /// ### Optional methods
3006 ///
3007 /// If these methods are implemented they enable extra functionality as per the description below.
3008 ///
3009 /// * `Result_t &PartialUpdate(unsigned int slot)`: if present, it must return the value of the partial result of this action for the given 'slot'.
3010 /// Different threads might call this method concurrently, but will do so with different 'slot' numbers.
3011 /// RDataFrame leverages this method to implement RResultPtr::OnPartialResult().
3012 /// * `ROOT::RDF::SampleCallback_t GetSampleCallback()`: if present, it must return a callable with the
3013 /// appropriate signature (see ROOT::RDF::SampleCallback_t) that will be invoked at the beginning of the processing
3014 /// of every sample, as in DefinePerSample().
3015 /// * `Helper MakeNew(void *newResult, std::string_view variation = "nominal")`: if implemented, it enables varying
3016 /// the action's result with VariationsFor(). It takes a type-erased new result that can be safely cast to a
3017 /// `std::shared_ptr<Result_t> *` (a pointer to shared pointer) and should be used as the action's output result.
3018 /// The function optionally takes the name of the current variation which could be useful in customizing its behaviour.
3019 ///
3020 /// In case Book is called without specifying column types as template arguments, corresponding typed code will be just-in-time compiled
3021 /// by RDataFrame. In that case the Helper class needs to be known to the ROOT interpreter.
3022 ///
3023 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3024 ///
3025 /// ### Examples
3026 /// See [this tutorial](https://root.cern/doc/master/df018__customActions_8C.html) for an example implementation of an action helper.
3027 ///
3028 /// It is also possible to inspect the code used by built-in RDataFrame actions at ActionHelpers.hxx.
3029 ///
3030 // clang-format on
3031 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename Helper>
3033 {
3034 using HelperT = std::decay_t<Helper>;
3035 // TODO add more static sanity checks on Helper
3037 static_assert(std::is_base_of<AH, HelperT>::value && std::is_convertible<HelperT *, AH *>::value,
3038 "Action helper of type T must publicly inherit from ROOT::Detail::RDF::RActionImpl<T>");
3039
3040 auto hPtr = std::make_shared<HelperT>(std::forward<Helper>(helper));
3041 auto resPtr = hPtr->GetResultPtr();
3042
3043 if (std::is_same<FirstColumn, RDFDetail::RInferredType>::value && columns.empty()) {
3045 } else {
3046 return CreateAction<RDFInternal::ActionTags::Book, FirstColumn, OtherColumns...>(columns, resPtr, hPtr,
3047 fProxiedPtr, columns.size());
3048 }
3049 }
3050
3051 ////////////////////////////////////////////////////////////////////////////
3052 /// \brief Provides a representation of the columns in the dataset.
3053 /// \tparam ColumnTypes variadic list of branch/column types.
3054 /// \param[in] columnList Names of the columns to be displayed.
3055 /// \param[in] nRows Number of events for each column to be displayed.
3056 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3057 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3058 ///
3059 /// This function returns a `RResultPtr<RDisplay>` containing all the entries to be displayed, organized in a tabular
3060 /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will
3061 /// return a complete version through `RDisplay::AsString()`.
3062 ///
3063 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see
3064 /// RResultPtr.
3065 ///
3066 /// Example usage:
3067 /// ~~~{.cpp}
3068 /// // Preparing the RResultPtr<RDisplay> object with all columns and default number of entries
3069 /// auto d1 = rdf.Display("");
3070 /// // Preparing the RResultPtr<RDisplay> object with two columns and 128 entries
3071 /// auto d2 = rdf.Display({"x", "y"}, 128);
3072 /// // Printing the short representations, the event loop will run
3073 /// d1->Print();
3074 /// d2->Print();
3075 /// ~~~
3076 template <typename... ColumnTypes>
3078 {
3079 CheckIMTDisabled("Display");
3080 auto newCols = columnList;
3081 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
3082 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
3083 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
3084 // Need to add ULong64_t type corresponding to the first column rdfentry_
3085 return CreateAction<RDFInternal::ActionTags::Display, ULong64_t, ColumnTypes...>(
3086 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr);
3087 }
3088
3089 ////////////////////////////////////////////////////////////////////////////
3090 /// \brief Provides a representation of the columns in the dataset.
3091 /// \param[in] columnList Names of the columns to be displayed.
3092 /// \param[in] nRows Number of events for each column to be displayed.
3093 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3094 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3095 ///
3096 /// This overload automatically infers the column types.
3097 /// See the previous overloads for further details.
3098 ///
3099 /// Invoked when no types are specified to Display
3101 {
3102 CheckIMTDisabled("Display");
3103 auto newCols = columnList;
3104 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
3105 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
3106 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
3108 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr,
3109 columnList.size() + 1);
3110 }
3111
3112 ////////////////////////////////////////////////////////////////////////////
3113 /// \brief Provides a representation of the columns in the dataset.
3114 /// \param[in] columnNameRegexp A regular expression to select the columns.
3115 /// \param[in] nRows Number of events for each column to be displayed.
3116 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3117 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3118 ///
3119 /// The existing columns are matched against the regular expression. If the string provided
3120 /// is empty, all columns are selected.
3121 /// See the previous overloads for further details.
3123 Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10)
3124 {
3125 const auto columnNames = GetColumnNames();
3128 }
3129
3130 ////////////////////////////////////////////////////////////////////////////
3131 /// \brief Provides a representation of the columns in the dataset.
3132 /// \param[in] columnList Names of the columns to be displayed.
3133 /// \param[in] nRows Number of events for each column to be displayed.
3134 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3135 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3136 ///
3137 /// See the previous overloads for further details.
3139 Display(std::initializer_list<std::string> columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10)
3140 {
3143 }
3144
3145private:
3147 std::enable_if_t<std::is_default_constructible<RetType>::value, RInterface<Proxied, DS_t>>
3148 DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
3149 {
3150 if (where.compare(0, 8, "Redefine") != 0) { // not a Redefine
3154 } else {
3158 }
3159
3160 using ArgTypes_t = typename TTraits::CallableTraits<F>::arg_types;
3162 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::Slot>::value, ArgTypes_t>::type;
3164 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::SlotAndEntry>::value, ColTypesTmp_t>::type;
3165
3166 constexpr auto nColumns = ColTypes_t::list_size;
3167
3170
3171 // Declare return type to the interpreter, for future use by jitted actions
3173 if (retTypeName.empty()) {
3174 // The type is not known to the interpreter.
3175 // We must not error out here, but if/when this column is used in jitted code
3177 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3178 }
3179
3181 auto newColumn = std::make_shared<NewCol_t>(name, retTypeName, std::forward<F>(expression), validColumnNames,
3183
3185 newCols.AddDefine(std::move(newColumn));
3186
3188
3189 return newInterface;
3190 }
3191
3192 // This overload is chosen when the callable passed to Define or DefineSlot returns a type that is not
3193 // default-constructible (e.g. void). It simply fires a compile-time error. This is preferable to a static_assert in the main `Define` overload because
3194 // this way compilation of `Define` has no way to continue after throwing the error.
3196 bool IsFStringConv = std::is_convertible<F, std::string>::value,
3197 bool IsRetTypeDefConstr = std::is_default_constructible<RetType>::value>
3198 std::enable_if_t<!IsFStringConv && !IsRetTypeDefConstr, RInterface<Proxied, DS_t>>
3199 DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
3200 {
3201 static_assert(std::is_default_constructible<typename TTraits::CallableTraits<F>::ret_type>::value,
3202 "Error in `Define`: type returned by expression is not default-constructible");
3203 return *this; // never reached
3204 }
3205
// Implementation of Snapshot. Per the generated reference the full signature is
//   RResultPtr<RInterface<RLoopManager>> SnapshotImpl(std::string_view fullTreeName, std::string_view filename,
//                                                     const ColumnNames_t &columnList,
//                                                     const RSnapshotOptions &options)
// Both branches below build a fresh RInterface<RLoopManager> for the output dataset, pack the output location,
// column names and options into SnapshotHelperArgs, and book a Snapshot action through CreateAction. They differ
// only in the trailing fToRNTuple flag (true in the first branch, false in the else branch).
// NOTE(review): this listing elides several lines (the line numbering jumps), including the first signature line,
// the declarations of validCols, columnListWithoutSizeColumns, parsedTreePath and resPtr, and the `if` condition
// that selects the first branch.
3206 template <typename... ColumnTypes>
3208 const ColumnNames_t &columnList, const RSnapshotOptions &options)
3209 {
3211
3213 // validCols has aliases resolved, while columnListWithoutSizeColumns still has aliases in it.
3217
3219 const auto &treename = parsedTreePath.fTreeName;
3220 const auto &dirname = parsedTreePath.fDirName;
3221
3223
3225
// In this branch a data source labelled "TTreeDS" is rejected with a std::runtime_error.
3227 if (RDFInternal::GetDataSourceLabel(*this) == "TTreeDS") {
3228 throw std::runtime_error("Snapshotting from TTree to RNTuple is not yet supported. The current recommended "
3229 "way to convert TTrees to RNTuple is through the RNTupleImporter.");
3230 }
3231
3232 auto newRDF =
3233 std::make_shared<RInterface<RLoopManager>>(std::make_shared<RLoopManager>(columnListWithoutSizeColumns));
3234
3235 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
3236 std::string(filename), std::string(dirname), std::string(treename), columnListWithoutSizeColumns, options,
3237 newRDF->GetLoopManager(), GetLoopManager(), true /* fToRNTuple */});
3238
3239 // The Snapshot helper will use validCols (with aliases resolved) as input columns, and
3240 // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column
3241 // names.
3242 resPtr = CreateAction<RDFInternal::ActionTags::Snapshot, ColumnTypes...>(validCols, newRDF, snapHelperArgs,
3243 fProxiedPtr);
3244 } else {
// A warning is emitted when the source is labelled "RNTupleDS"; the second half of this condition is elided.
3245 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" &&
3247 Warning("Snapshot",
3248 "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you "
3249 "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in "
3250 "RSnapshotOptions. Note that this current default behaviour might change in the future.");
3251 }
3252
3253 // We create an RLoopManager without a data source. This needs to be initialised when the output TTree dataset
3254 // has actually been created and written to TFile, i.e. at the end of the Snapshot execution.
3255 auto newRDF =
3256 std::make_shared<RInterface<RLoopManager>>(std::make_shared<RLoopManager>(columnListWithoutSizeColumns));
3257
3258 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
3259 std::string(filename), std::string(dirname), std::string(treename), columnListWithoutSizeColumns, options,
3260 newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */});
3261
3262 // The Snapshot helper will use validCols (with aliases resolved) as input columns, and
3263 // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column
3264 // names.
3265 resPtr = CreateAction<RDFInternal::ActionTags::Snapshot, ColumnTypes...>(validCols, newRDF, snapHelperArgs,
3266 fProxiedPtr);
3267 }
3268
// Unless options.fLazy is set, the result pointer is dereferenced right away and the value discarded.
// NOTE(review): presumably this is what starts the event loop for a non-lazy Snapshot -- confirm in RResultPtr.
3269 if (!options.fLazy)
3270 *resPtr;
3271 return resPtr;
3272 }
3273
3274 ////////////////////////////////////////////////////////////////////////////
3275 /// \brief Implementation of cache.
///
/// Books one Take per requested column, pairs each column name with its Take result inside an RLazyDS, and returns
/// an RInterface<RLoopManager> whose loop manager owns that data source.
/// Per the generated reference the signature is
///   RInterface<RLoopManager> CacheImpl(const ColumnNames_t &columnList, std::index_sequence<S...>)
/// NOTE(review): this listing elides the signature line and the line declaring columnListWithoutSizeColumns.
3276 template <typename... ColTypes, std::size_t... S>
3278 {
3280
3281 // Check at compile time that the column types are copy constructible
3282 constexpr bool areCopyConstructible =
3283 RDFInternal::TEvalAnd<std::is_copy_constructible<ColTypes>::value...>::value;
3284 static_assert(areCopyConstructible, "Columns of a type which is not copy constructible cannot be cached yet.");
3285
3287
// The packs ColTypes and S are expanded in lockstep: column S is taken with element type ColTypes.
3288 auto colHolders = std::make_tuple(Take<ColTypes>(columnListWithoutSizeColumns[S])...);
// One (name, Take result) pair per column is handed to the lazy data source.
3289 auto ds = std::make_unique<RLazyDS<ColTypes...>>(
3290 std::make_pair(columnListWithoutSizeColumns[S], std::get<S>(colHolders))...);
3291
// Ownership of the data source is moved into the new loop manager.
3292 RInterface<RLoopManager> cachedRDF(std::make_shared<RLoopManager>(std::move(ds), columnListWithoutSizeColumns));
3293
3294 return cachedRDF;
3295 }
3296
// Implementation of Vary for a C++ callable. It builds an RVariation from the callable and the variation tags,
// registers it through AddVariation and returns the resulting interface object.
// Per the generated reference the return type is RInterface<Proxied, DS_t>.
// IsSingleColumn is forwarded unchanged as the second template argument of RVariation.
// NOTE(review): this listing elides several lines (the line numbering jumps), including the return-type line, the
// declarations of retTypeName, newCols and newInterface, and the tail of the make_shared argument list.
3297 template <bool IsSingleColumn, typename F>
3299 VaryImpl(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
3300 const std::vector<std::string> &variationTags, std::string_view variationName)
3301 {
// Strip references and cv-qualifiers before inspecting the callable's argument and return types.
3302 using F_t = std::decay_t<F>;
3303 using ColTypes_t = typename TTraits::CallableTraits<F_t>::arg_types;
3304 using RetType = typename TTraits::CallableTraits<F_t>::ret_type;
3305 constexpr auto nColumns = ColTypes_t::list_size;
3306
3308
3311
3313 if (retTypeName.empty()) {
3314 // The type is not known to the interpreter, but we don't want to error out
3315 // here, rather if/when this column is used in jitted code, so we inject a broken but telling type name.
3317 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3318 }
3319
// The callable is perfectly forwarded into the variation object.
3320 auto variation = std::make_shared<RDFInternal::RVariation<F_t, IsSingleColumn>>(
3321 colNames, variationName, std::forward<F>(expression), variationTags, retTypeName, fColRegister, *fLoopManager,
3323
3325 newCols.AddVariation(std::move(variation));
3326
3328
3329 return newInterface;
3330 }
3331
// Implementation of Vary for an expression given as a string. It checks the inputs, registers a jitted variation
// through AddVariation and returns the resulting interface object.
// Preconditions checked with R__ASSERT: variationTags, colNames and variationName must all be non-empty.
// Throws std::logic_error if more than one column is passed and the same name appears twice.
// NOTE(review): this listing elides several lines (the line numbering jumps), including the body of the loop over
// colNames and the declarations of jittedVariation, newColRegister and newInterface.
3332 RInterface<Proxied, DS_t> JittedVaryImpl(const std::vector<std::string> &colNames, std::string_view expression,
3333 const std::vector<std::string> &variationTags,
3334 std::string_view variationName, bool isSingleColumn)
3335 {
3336 R__ASSERT(!variationTags.empty() && "Must have at least one variation.");
3337 R__ASSERT(!colNames.empty() && "Must have at least one varied column.");
3338 R__ASSERT(!variationName.empty() && "Must provide a variation name.");
3339
3340 for (auto &colName : colNames) {
3344 }
3346
3347 // when varying multiple columns, they must be different columns
3348 if (colNames.size() > 1) {
// A std::set drops duplicates, so a size mismatch means some name was repeated.
3349 std::set<std::string> uniqueCols(colNames.begin(), colNames.end());
3350 if (uniqueCols.size() != colNames.size())
3351 throw std::logic_error("A column name was passed to the same Vary invocation multiple times.");
3352 }
3353
3354 auto upcastNodeOnHeap = RDFInternal::MakeSharedOnHeap(RDFInternal::UpcastNode(fProxiedPtr));
// NOTE(review): the line below looks like the tail of the BookVariationJit call, whose last parameters are the
// branch names, the upcast node and isSingleColumn -- confirm against the elided lines.
3357 fLoopManager->GetBranchNames(), upcastNodeOnHeap, isSingleColumn);
3358
3360 newColRegister.AddVariation(std::move(jittedVariation));
3361
3363
3364 return newInterface;
3365 }
3366
// Overload that takes part in overload resolution only if `hPtr->Exec(0u)` is a well-formed expression: that call
// is the left operand of the comma operator inside the trailing decltype, so the declared return type is
// RResultPtr<ActionResultType>.
// Per the generated reference the third parameter is TTraits::TypeList<RDFDetail::RInferredType>.
// NOTE(review): this listing elides the third parameter line and the function body.
3367 template <typename Helper, typename ActionResultType>
3368 auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &resPtr,
3369 const std::shared_ptr<Helper> &hPtr,
3371 -> decltype(hPtr->Exec(0u), RResultPtr<ActionResultType>{})
3372 {
3374 }
3375
// Fallback overload accepting any trailing arguments (Others...). It ignores all of its arguments and always throws
// std::logic_error; the message ends with typeid(Helper).name() to identify the helper type.
// Per the generated reference the return type is RResultPtr<ActionResultType>.
// NOTE(review): the return-type line is elided from this listing.
3376 template <typename Helper, typename ActionResultType, typename... Others>
3378 CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &,
3379 const std::shared_ptr<Helper>& /*hPtr*/,
3380 Others...)
3381 {
3382 throw std::logic_error(std::string("An action was booked with no input columns, but the action requires "
3383 "columns! The action helper type was ") +
3384 typeid(Helper).name());
// Unreachable because of the unconditional throw above.
3385 return {};
3386 }
3387
3388protected:
// Constructor taking the proxied graph node and the loop manager. Per the generated reference the full signature is
//   RInterface(const std::shared_ptr<Proxied> &proxied, RLoopManager &lm,
//              const RDFInternal::RColumnRegister &colRegister)
// The constructor body is empty.
// NOTE(review): the last parameter and the member-initializer list are elided from this listing.
3389 RInterface(const std::shared_ptr<Proxied> &proxied, RLoopManager &lm,
3392 {
3393 }
3394
// Returns a const reference to fProxiedPtr, the smart pointer to the graph node encapsulated by this RInterface.
3395 const std::shared_ptr<Proxied> &GetProxiedPtr() const { return fProxiedPtr; }
3396};
3397
3398} // namespace RDF
3399
3400} // namespace ROOT
3401
3402#endif // ROOT_RDF_TINTERFACE
#define f(i)
Definition RSha256.hxx:104
#define h(i)
Definition RSha256.hxx:106
unsigned int UInt_t
Definition RtypesCore.h:46
long long Long64_t
Definition RtypesCore.h:69
unsigned long long ULong64_t
Definition RtypesCore.h:70
#define X(type, name)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:229
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char filename
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
char name[80]
Definition TGX11.cxx:110
Base class for action helpers, see RInterface::Book() for more information.
implementation of FilterAvailable and FilterMissing operations
The head node of a RDF computation graph.
Helper class that provides the operation graph nodes.
A RDataFrame node that produces a result.
Definition RAction.hxx:53
A binder for user-defined columns, variations and aliases.
std::vector< std::string_view > GenerateColumnNames() const
Return the list of the names of the defined columns (Defines + Aliases).
The dataset specification for RDataFrame.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
The RLoopManager at the root of this computation graph. Never null.
RResultPtr< ActionResultType > CreateAction(const ColumnNames_t &columns, const std::shared_ptr< ActionResultType > &r, const std::shared_ptr< HelperArgType > &helperArg, const std::shared_ptr< RDFNode > &proxiedPtr, const int=-1)
Create RAction object, return RResultPtr for the action. Overload for the case in which all column typ...
RDataSource * GetDataSource() const
void CheckAndFillDSColumns(ColumnNames_t validCols, TTraits::TypeList< ColumnTypes... > typeList)
void CheckIMTDisabled(std::string_view callerName)
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
RDFDetail::RLoopManager * GetLoopManager() const
RDFInternal::RColumnRegister fColRegister
Contains the columns defined up to this node.
The public interface to the RDataFrame federation of classes.
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList)
Fill and return an N-dimensional histogram (lazy action).
RInterface(const RInterface &)=default
Copy-ctor for RInterface.
RResultPtr<::TH1D > Histo1D(std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RInterface(const std::shared_ptr< Proxied > &proxied, RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister)
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.})
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model)
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a one-dimensional profile (lazy action).
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList)
Fill and return an N-dimensional histogram (lazy action).
std::enable_if_t<!IsFStringConv &&!IsRetTypeDefConstr, RInterface< Proxied, DS_t > > DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp="", const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< TStatistic > Stats(std::string_view value="")
Return a TStatistic object, filled once per event (lazy action).
RInterface< Proxied, DS_t > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RInterface< Proxied, DS_t > Vary(std::string_view colName, std::string_view expression, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RResultPtr<::TGraph > Graph(std::string_view x="", std::string_view y="")
Fill and return a TGraph object (lazy action).
RResultPtr< ActionResultType > CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &, const std::shared_ptr< Helper > &, Others...)
RInterface< Proxied, DS_t > DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot.
RResultPtr< double > StdDev(std::string_view columnName="")
Return the unbiased standard deviation of processed column values (lazy action).
std::enable_if_t< std::is_default_constructible< RetType >::value, RInterface< Proxied, DS_t > > DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
RInterface< Proxied, DS_t > DefinePerSample(std::string_view name, F expression)
Define a new column that is updated when the input sample changes.
RInterface & operator=(RInterface &&)=default
Move-assignment operator for RInterface.
RInterface< Proxied, DS_t > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated tags.
void ForeachSlot(F f, const ColumnNames_t &columns={})
Execute a user-defined function requiring a processing slot index on each entry (instant action).
RInterface< Proxied, DS_t > Vary(std::string_view colName, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RInterface< Proxied, DS_t > Define(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column.
RResultPtr< TStatistic > Stats(std::string_view value, std::string_view weight)
Return a TStatistic object, filled once per event (lazy action).
RInterface< Proxied, DS_t > Redefine(std::string_view name, std::string_view expression)
Overwrite the value and/or type of an existing column.
auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &resPtr, const std::shared_ptr< Helper > &hPtr, TTraits::TypeList< RDFDetail::RInferredType >) -> decltype(hPtr->Exec(0u), RResultPtr< ActionResultType >{})
RInterface< Proxied, DS_t > Vary(const std::vector< std::string > &colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a two-dimensional histogram (lazy action).
RResultPtr< RInterface< RLoopManager > > SnapshotImpl(std::string_view fullTreeName, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options)
RInterface< Proxied, DS_t > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model)
Fill and return a one-dimensional profile (lazy action).
RInterface(const std::shared_ptr< RLoopManager > &proxied)
Build a RInterface from a RLoopManager.
RInterface< RDFDetail::RFilter< F, Proxied >, DS_t > Filter(F f, const std::initializer_list< std::string > &columns)
Append a filter to the call graph.
RInterface< Proxied, DS_t > DefinePerSample(std::string_view name, std::string_view expression)
Define a new column that is updated when the input sample changes.
RResultPtr< double > Mean(std::string_view columnName="")
Return the mean of processed column values (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::initializer_list< std::string > columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< RDisplay > Display(std::initializer_list< std::string > columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface< Proxied, DS_t > Alias(std::string_view alias, std::string_view columnName)
Allow referring to a column by a different name.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RInterface< Proxied, DS_t > Redefine(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
RInterface< RLoopManager > Cache(std::string_view columnNameRegexp="")
Save selected columns in memory.
RInterface< Proxied, DS_t > VaryImpl(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
RResultPtr< typename std::decay_t< Helper >::Result_t > Book(Helper &&helper, const ColumnNames_t &columns={})
Book execution of a custom action using a user-defined helper object.
RResultPtr< RDisplay > Display(std::string_view columnNameRegexp="", size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied >, DS_t > FilterAvailable(std::string_view column)
Discard entries with missing values.
friend class RDFInternal::GraphDrawing::GraphCreatorHelper
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a weighted two-dimensional histogram (lazy action).
RInterface & operator=(const RInterface &)=default
Copy-assignment operator for RInterface.
RResultPtr< RDFDetail::SumReturnType_t< T > > Sum(std::string_view columnName="", const RDFDetail::SumReturnType_t< T > &initValue=RDFDetail::SumReturnType_t< T >{})
Return the sum of processed column values (lazy action).
RInterface< Proxied, DS_t > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RResultPtr< ULong64_t > Count()
Return the number of entries processed (lazy action).
RInterface< Proxied, DS_t > Vary(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< Proxied, DS_t > Define(std::string_view name, std::string_view expression)
Define a new column.
std::shared_ptr< Proxied > fProxiedPtr
Smart pointer to the graph node encapsulated by this RInterface.
RResultPtr<::TH1D > Histo1D(std::string_view vName)
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< Proxied, DS_t > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< Proxied, DS_t > RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RInterface< RLoopManager > CacheImpl(const ColumnNames_t &columnList, std::index_sequence< S... >)
Implementation of cache.
RInterface< RDFDetail::RRange< Proxied >, DS_t > Range(unsigned int end)
Creates a node that filters entries based on range.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied >, DS_t > FilterMissing(std::string_view column)
Keep only the entries that have missing values.
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
RInterface< RLoopManager > Cache(std::initializer_list< std::string > columnList)
Save selected columns in memory.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a two-dimensional profile (lazy action).
const std::shared_ptr< Proxied > & GetProxiedPtr() const
RInterface< Proxied, DS_t > JittedVaryImpl(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName, bool isSingleColumn)
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a three-dimensional histogram (lazy action).
RInterface< Proxied, DS_t > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr< std::decay_t< T > > Fill(T &&model, const ColumnNames_t &columnList)
Return an object of type T on which T::Fill will be called once per event (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr< RCutFlowReport > Report()
Gather filtering statistics.
RInterface< Proxied, DS_t > RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a two-dimensional profile (lazy action).
RResultPtr<::TGraphAsymmErrors > GraphAsymmErrors(std::string_view x="", std::string_view y="", std::string_view exl="", std::string_view exh="", std::string_view eyl="", std::string_view eyh="")
Fill and return a TGraphAsymmErrors object (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree treename in file filename.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName="")
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RInterface< Proxied, DS_t > DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot and the current entry.
RResultPtr< RDFDetail::MinReturnType_t< T > > Min(std::string_view columnName="")
Return the minimum of processed column values (lazy action).
RResultPtr< T > Reduce(F f, std::string_view columnName="")
Execute a user-defined reduce operation on the values of a column.
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
RInterface< RDFDetail::RJittedFilter, DS_t > Filter(std::string_view expression, std::string_view name="")
Append a filter to the call graph.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model)
Fill and return a two-dimensional profile (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied >, DS_t > Filter(F f, const ColumnNames_t &columns={}, std::string_view name="")
Append a filter to the call graph.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RInterface(RInterface &&)=default
Move-ctor for RInterface.
RResultPtr< T > Reduce(F f, std::string_view columnName, const T &redIdentity)
Execute a user-defined reduce operation on the values of a column.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a three-dimensional histogram (lazy action).
RInterface< Proxied, DS_t > DefaultValueFor(std::string_view column, const T &defaultValue)
In case the value in the given column is missing, provide a default value.
RInterface< RDFDetail::RFilter< F, Proxied >, DS_t > Filter(F f, std::string_view name)
Append a filter to the call graph.
RInterface< RDFDetail::RRange< Proxied >, DS_t > Range(unsigned int begin, unsigned int end, unsigned int stride=1)
Creates a node that filters entries based on range: [begin, end).
std::vector< std::string > GetFilterNames()
Returns the names of the filters created.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.}, std::string_view vName="")
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a one-dimensional profile (lazy action).
RResultPtr<::TH3D > Histo3D(const TH3DModel &model)
RResultPtr< RDFDetail::MaxReturnType_t< T > > Max(std::string_view columnName="")
Return the maximum of processed column values (lazy action).
RInterface< Proxied, DS_t > Vary(std::initializer_list< std::string > colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
A RDataSource implementation which is built on top of result proxies.
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
const_iterator begin() const
const_iterator end() const
typename RemoveFirstParameter< T >::type RemoveFirstParameter_t
TDirectory::TContext keeps track of and restores the current directory.
Definition TDirectory.h:89
A TGraph is an object made of two arrays X and Y with npoints each.
Definition TGraph.h:41
Statistical variable, defined by its mean and variance (RMS).
Definition TStatistic.h:33
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
void CheckForNoVariations(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister)
Throw if the column has systematic variations attached.
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
void CheckForRedefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &treeColumns, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is already there.
void CheckForDefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &treeColumns, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is not already there.
void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair< ULong64_t, ULong64_t > &&newRange)
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister, const ColumnNames_t &branches, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
Book the jitting of a Define call.
void CheckValidCppVarName(std::string_view var, const std::string &where)
void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the input dataset specification of an RDataFrame.
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:559
void RemoveDuplicates(ColumnNames_t &columnNames)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:129
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string GetDataSourceLabel(const ROOT::RDF::RNode &node)
std::string PrettyPrintAddr(const void *const addr)
void TriggerRun(ROOT::RDF::RNode node)
Trigger the execution of an RDataFrame computation graph.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::pair< std::vector< std::string >, std::vector< std::string > > AddSizeBranches(const std::vector< std::string > &branches, ROOT::RDF::RDataSource *ds, std::vector< std::string > &&colsWithoutAliases, std::vector< std::string > &&colsWithAliases)
Return copies of colsWithoutAliases and colsWithAliases with size branches for variable-sized array b...
void RemoveRNTupleSubFields(ColumnNames_t &columnNames)
void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline)
std::shared_ptr< RDFDetail::RJittedFilter > BookFilterJit(std::shared_ptr< RDFDetail::RNodeBase > *prevNodeOnHeap, std::string_view name, std::string_view expression, const ColumnNames_t &branches, const RColumnRegister &colRegister, TTree *tree, RDataSource *ds)
Book the jitting of a Filter call.
ColumnNames_t FilterArraySizeColNames(const ColumnNames_t &columnNames, const std::string &action)
Take a list of column names, return that list with entries starting with '#' filtered out.
std::shared_ptr< RJittedVariation > BookVariationJit(const std::vector< std::string > &colNames, std::string_view variationName, const std::vector< std::string > &variationTags, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister, const ColumnNames_t &branches, std::shared_ptr< RNodeBase > *upcastNodeOnHeap, bool isSingleColumn)
Book the jitting of a Vary call.
void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
std::shared_ptr< RJittedDefine > BookDefinePerSampleJit(std::string_view name, std::string_view expression, RLoopManager &lm, const RColumnRegister &colRegister, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
Book the jitting of a DefinePerSample call.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
RInterface<::ROOT::Detail::RDF::RNodeBase, void > RNode
std::vector< std::string > ColumnNames_t
ROOT type_traits extensions.
tbb::task_arena is an alias of tbb::interface7::task_arena, which does not allow forward-declaring tb...
void EnableImplicitMT(UInt_t numthreads=0)
Enable ROOT's implicit multi-threading for all objects and methods that provide an internal paralleli...
Definition TROOT.cxx:539
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:595
@ kError
An error.
void DisableImplicitMT()
Disables the implicit multi-threading in ROOT (see EnableImplicitMT).
Definition TROOT.cxx:581
type is TypeList if MustRemove is false, otherwise it is a TypeList with the first type removed
Definition Utils.hxx:152
A collection of options to steer the creation of the dataset on file.
ESnapshotOutputFormat fOutputFormat
Which data format to write to.
bool fLazy
Do not start the event loop when Snapshot is called.
A struct which stores some basic parameters of a TH1D.
std::shared_ptr<::TH1D > GetHistogram() const
A struct which stores some basic parameters of a TH2D.
std::shared_ptr<::TH2D > GetHistogram() const
A struct which stores some basic parameters of a TH3D.
std::shared_ptr<::TH3D > GetHistogram() const
A struct which stores some basic parameters of a THnD.
std::shared_ptr<::THnD > GetHistogram() const
A struct which stores some basic parameters of a TProfile.
std::shared_ptr<::TProfile > GetProfile() const
A struct which stores some basic parameters of a TProfile2D.
std::shared_ptr<::TProfile2D > GetProfile() const
Lightweight storage for a collection of types.