Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterface.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDF_TINTERFACE
12#define ROOT_RDF_TINTERFACE
13
14#include "ROOT/RDataSource.hxx"
20#include "ROOT/RDF/RDefine.hxx"
22#include "ROOT/RDF/RFilter.hxx"
27#include "ROOT/RDF/RRange.hxx"
29#include "ROOT/RDF/Utils.hxx"
32#include "ROOT/RResultPtr.hxx"
34#include <string_view>
35#include "ROOT/RVec.hxx"
36#include "ROOT/TypeTraits.hxx"
37#include "RtypesCore.h" // for ULong64_t
38#include "TDirectory.h"
39#include "TH1.h" // For Histo actions
40#include "TH2.h" // For Histo actions
41#include "TH3.h" // For Histo actions
42#include "THn.h"
43#include "THnSparse.h"
44#include "TProfile.h"
45#include "TProfile2D.h"
46#include "TStatistic.h"
47
48#include "RConfigure.h" // for R__HAS_ROOT7
49#ifdef R__HAS_ROOT7
51#include <ROOT/RHist.hxx>
52#include <ROOT/RHistEngine.hxx>
53#endif
54
55#include <algorithm>
56#include <cstddef>
57#include <initializer_list>
58#include <iterator> // std::back_inserter
59#include <limits>
60#include <memory>
61#include <set>
62#include <sstream>
63#include <stdexcept>
64#include <string>
65#include <type_traits> // is_same, enable_if
66#include <typeinfo>
67#include <unordered_set>
68#include <utility> // std::index_sequence
69#include <vector>
70#include <any>
71
72class TGraph;
73
74// Windows requires a forward decl of printValue to accept it as a valid friend function in RInterface
75namespace ROOT {
79class RDataFrame;
80} // namespace ROOT
81namespace cling {
82std::string printValue(ROOT::RDataFrame *tdf);
83}
84
85namespace ROOT {
86namespace RDF {
89namespace TTraits = ROOT::TypeTraits;
90
91template <typename Proxied>
92class RInterface;
93
95} // namespace RDF
96
97namespace Internal {
98namespace RDF {
100void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
101void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end);
104std::string GetDataSourceLabel(const ROOT::RDF::RNode &node);
105void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline);
106} // namespace RDF
107} // namespace Internal
108
109namespace RDF {
110
111// clang-format off
112/**
113 * \class ROOT::RDF::RInterface
114 * \ingroup dataframe
115 * \brief The public interface to the RDataFrame federation of classes.
116 * \tparam Proxied One of the "node" base types (e.g. RLoopManager, RFilterBase). The user never specifies this type manually.
117 *
118 * The documentation of each method features a one-liner illustrating how to use the method, for example showing how
119 * the majority of the template parameters are automatically deduced requiring no or very little effort by the user.
120 */
121// clang-format on
122template <typename Proxied>
127 friend std::string cling::printValue(::ROOT::RDataFrame *tdf); // For a nice printing at the prompt
129
130 template <typename T>
131 friend class RInterface;
132
134 friend void RDFInternal::ChangeEmptyEntryRange(const RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
135 friend void RDFInternal::ChangeBeginAndEndEntries(const RNode &node, Long64_t start, Long64_t end);
137 friend std::string ROOT::Internal::RDF::GetDataSourceLabel(const RNode &node);
139 std::shared_ptr<Proxied> fProxiedPtr; ///< Smart pointer to the graph node encapsulated by this RInterface.
140
141public:
142 ////////////////////////////////////////////////////////////////////////////
143 /// \brief Copy-assignment operator for RInterface.
144 RInterface &operator=(const RInterface &) = default;
145
146 ////////////////////////////////////////////////////////////////////////////
147 /// \brief Copy-ctor for RInterface.
148 RInterface(const RInterface &) = default;
149
150 ////////////////////////////////////////////////////////////////////////////
151 /// \brief Move-ctor for RInterface.
152 RInterface(RInterface &&) = default;
153
154 ////////////////////////////////////////////////////////////////////////////
155 /// \brief Move-assignment operator for RInterface.
157
158 ////////////////////////////////////////////////////////////////////////////
159 /// \brief Build a RInterface from a RLoopManager.
160 /// This constructor is only available for RInterface<RLoopManager>.
162 RInterface(const std::shared_ptr<RLoopManager> &proxied) : RInterfaceBase(proxied), fProxiedPtr(proxied)
163 {
164 }
165
166 ////////////////////////////////////////////////////////////////////////////
167 /// \brief Cast any RDataFrame node to a common type ROOT::RDF::RNode.
168 /// Different RDataFrame methods return different C++ types. All nodes, however,
169 /// can be cast to this common type at the cost of a small performance penalty.
170 /// This allows, for example, storing RDataFrame nodes in a vector, or passing them
171 /// around via (non-template, C++11) helper functions.
172 /// Example usage:
173 /// ~~~{.cpp}
174 /// // a function that conditionally adds a Range to a RDataFrame node.
175 /// RNode MaybeAddRange(RNode df, bool mustAddRange)
176 /// {
177 /// return mustAddRange ? df.Range(1) : df;
178 /// }
179 /// // use as :
180 /// ROOT::RDataFrame df(10);
181 /// auto maybeRanged = MaybeAddRange(df, true);
182 /// ~~~
183 /// Note that it is not a problem to pass RNode's by value.
184 operator RNode() const
185 {
186 return RNode(std::static_pointer_cast<::ROOT::Detail::RDF::RNodeBase>(fProxiedPtr), *fLoopManager, fColRegister);
187 }
188
189 ////////////////////////////////////////////////////////////////////////////
190 /// \brief Append a filter to the call graph.
191 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
192 /// signalling whether the event has passed the selection (true) or not (false).
193 /// \param[in] columns Names of the columns/branches in input to the filter function.
194 /// \param[in] name Optional name of this filter. See `Report`.
195 /// \return the filter node of the computation graph.
196 ///
197 /// Append a filter node at the point of the call graph corresponding to the
198 /// object this method is called on.
199 /// The callable `f` should not have side-effects (e.g. modification of an
200 /// external or static variable) to ensure correct results when implicit
201 /// multi-threading is active.
202 ///
203 /// RDataFrame only evaluates filters when necessary: if multiple filters
204 /// are chained one after another, they are executed in order and the first
205 /// one returning false causes the event to be discarded.
206 /// Even if multiple actions or transformations depend on the same filter,
207 /// it is executed once per entry. If its result is requested more than
208 /// once, the cached result is served.
209 ///
210 /// ### Example usage:
211 /// ~~~{.cpp}
212 /// // C++ callable (function, functor class, lambda...) that takes two parameters of the types of "x" and "y"
213 /// auto filtered = df.Filter(myCut, {"x", "y"});
214 ///
215 /// // String: it must contain valid C++ except that column names can be used instead of variable names
216 /// auto filtered = df.Filter("x*y > 0");
217 /// ~~~
218 ///
219 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
220 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
221 /// ~~~{.cpp}
222 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
223 /// ~~~
224 /// but instead this will:
225 /// ~~~{.cpp}
226 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
227 /// ~~~
230 {
231 RDFInternal::CheckFilter(f);
232 using ColTypes_t = typename TTraits::CallableTraits<F>::arg_types;
233 constexpr auto nColumns = ColTypes_t::list_size;
236
238
239 auto filterPtr = std::make_shared<F_t>(std::move(f), validColumnNames, fProxiedPtr, fColRegister, name);
241 }
242
243 ////////////////////////////////////////////////////////////////////////////
244 /// \brief Append a filter to the call graph.
245 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
246 /// signalling whether the event has passed the selection (true) or not (false).
247 /// \param[in] name Optional name of this filter. See `Report`.
248 /// \return the filter node of the computation graph.
249 ///
250 /// Refer to the first overload of this method for the full documentation.
253 {
254 // The sfinae is there in order to pick up the overloaded method which accepts two strings
255 // rather than this template method.
256 return Filter(f, {}, name);
257 }
258
259 ////////////////////////////////////////////////////////////////////////////
260 /// \brief Append a filter to the call graph.
261 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
262 /// signalling whether the event has passed the selection (true) or not (false).
263 /// \param[in] columns Names of the columns/branches in input to the filter function.
264 /// \return the filter node of the computation graph.
265 ///
266 /// Refer to the first overload of this method for the full documentation.
267 template <typename F>
268 RInterface<RDFDetail::RFilter<F, Proxied>> Filter(F f, const std::initializer_list<std::string> &columns)
269 {
270 return Filter(f, ColumnNames_t{columns});
271 }
272
273 ////////////////////////////////////////////////////////////////////////////
274 /// \brief Append a filter to the call graph.
275 /// \param[in] expression The filter expression in C++
276 /// \param[in] name Optional name of this filter. See `Report`.
277 /// \return the filter node of the computation graph.
278 ///
279 /// The expression is just-in-time compiled and used to filter entries. It must
280 /// be valid C++ syntax in which variable names are substituted with the names
281 /// of branches/columns.
282 ///
283 /// ### Example usage:
284 /// ~~~{.cpp}
285 /// auto filtered_df = df.Filter("myCollection.size() > 3");
286 /// auto filtered_name_df = df.Filter("myCollection.size() > 3", "Minimum collection size");
287 /// ~~~
288 ///
289 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
290 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
291 /// ~~~{.cpp}
292 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
293 /// ~~~
294 /// but instead this will:
295 /// ~~~{.cpp}
296 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
297 /// ~~~
298 RInterface<RDFDetail::RJittedFilter> Filter(std::string_view expression, std::string_view name = "")
299 {
301 fColRegister, nullptr, GetDataSource());
302
304 }
305
306 ////////////////////////////////////////////////////////////////////////////
307 /// \brief Discard entries with missing values
308 /// \param[in] column Column name whose entries with missing values should be discarded
309 /// \return The filter node of the computation graph
310 ///
311 /// This operation is useful in case an entry of the dataset is incomplete,
312 /// i.e. if one or more of the columns do not have valid values. If the value
313 /// of the input column is missing for an entry, the entire entry will be
314 /// discarded from the rest of this branch of the computation graph.
315 ///
316 /// Use cases include:
317 /// * When processing multiple files, one or more of them is missing a column
318 /// * In horizontal joining with entry matching, a certain dataset has no
319 /// match for the current entry.
320 ///
321 /// ### Example usage:
322 ///
323 /// \code{.py}
324 /// # Assume a dataset with columns [idx, x] matching another dataset with
325 /// # columns [idx, y]. For idx == 42, the right-hand dataset has no match
326 /// df = ROOT.RDataFrame(dataset)
327 /// df_nomissing = df.FilterAvailable("idx").Define("z", "x + y")
328 /// colz = df_nomissing.Take[int]("z")
329 /// \endcode
330 ///
331 /// \code{.cpp}
332 /// // Assume a dataset with columns [idx, x] matching another dataset with
333 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
334 /// ROOT::RDataFrame df{dataset};
335 /// auto df_nomissing = df.FilterAvailable("idx")
336 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
337 /// auto colz = df_nomissing.Take<int>("z");
338 /// \endcode
339 ///
340 /// \note See FilterMissing() if you want to keep only the entries with
341 /// missing values instead.
343 {
344 const auto columns = ColumnNames_t{column.data()};
345 // For now disable this functionality in case of an empty data source and
346 // the column name was not defined previously.
347 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
348 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
350 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ true, fProxiedPtr, fColRegister, columns);
353 }
354
355 ////////////////////////////////////////////////////////////////////////////
356 /// \brief Keep only the entries that have missing values.
357 /// \param[in] column Column name whose entries with missing values should be kept
358 /// \return The filter node of the computation graph
359 ///
360 /// This operation is useful in case an entry of the dataset is incomplete,
361 /// i.e. if one or more of the columns do not have valid values. It only
362 /// keeps the entries for which the value of the input column is missing.
363 ///
364 /// Use cases include:
365 /// * When processing multiple files, one or more of them is missing a column
366 /// * In horizontal joining with entry matching, a certain dataset has no
367 /// match for the current entry.
368 ///
369 /// ### Example usage:
370 ///
371 /// \code{.py}
372 /// # Assume a dataset made of two files vertically chained together, one has
373 /// # column "x" and the other has column "y"
374 /// df = ROOT.RDataFrame(dataset)
375 /// df_valid_col_x = df.FilterMissing("y")
376 /// df_valid_col_y = df.FilterMissing("x")
377 /// display_x = df_valid_col_x.Display(("x",))
378 /// display_y = df_valid_col_y.Display(("y",))
379 /// \endcode
380 ///
381 /// \code{.cpp}
382 /// // Assume a dataset made of two files vertically chained together, one has
383 /// // column "x" and the other has column "y"
384 /// ROOT::RDataFrame df{dataset};
385 /// auto df_valid_col_x = df.FilterMissing("y");
386 /// auto df_valid_col_y = df.FilterMissing("x");
387 /// auto display_x = df_valid_col_x.Display<int>({"x"});
388 /// auto display_y = df_valid_col_y.Display<int>({"y"});
389 /// \endcode
390 ///
391 /// \note See FilterAvailable() if you want to discard the entries in case
392 /// there is a missing value instead.
394 {
395 const auto columns = ColumnNames_t{column.data()};
396 // For now disable this functionality in case of an empty data source and
397 // the column name was not defined previously.
398 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
399 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
401 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ false, fProxiedPtr, fColRegister, columns);
404 }
405
406 // clang-format off
407 ////////////////////////////////////////////////////////////////////////////
408 /// \brief Define a new column.
409 /// \param[in] name The name of the defined column.
410 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column. This callable must be thread safe when used with multiple threads.
411 /// \param[in] columns Names of the columns/branches in input to the producer function.
412 /// \return the first node of the computation graph for which the new quantity is defined.
413 ///
414 /// Define a column that will be visible from all subsequent nodes
415 /// of the functional chain. The `expression` is only evaluated for entries that pass
416 /// all the preceding filters.
417 /// A new variable is created called `name`, accessible as if it was contained
418 /// in the dataset from subsequent transformations/actions.
419 ///
420 /// Use cases include:
421 /// * caching the results of complex calculations for easy and efficient multiple access
422 /// * extraction of quantities of interest from complex objects
423 ///
424 /// An exception is thrown if the name of the new column is already in use in this branch of the computation graph.
425 /// Note that the callable must be thread safe when called from multiple threads. Use DefineSlot() if needed.
426 ///
427 /// ### Example usage:
428 /// ~~~{.cpp}
429 /// // assuming a function with signature:
430 /// double myComplexCalculation(const RVec<float> &muon_pts);
431 /// // we can pass it directly to Define
432 /// auto df_with_define = df.Define("newColumn", myComplexCalculation, {"muon_pts"});
433 /// // alternatively, we can pass the body of the function as a string, as in Filter:
434 /// auto df_with_define = df.Define("newColumn", "x*x + y*y");
435 /// ~~~
436 ///
437 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
438 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
439 /// ~~~{.cpp}
440 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
441 /// ~~~
442 /// but instead this will:
443 /// ~~~{.cpp}
444 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
445 /// ~~~
447 RInterface<Proxied> Define(std::string_view name, F expression, const ColumnNames_t &columns = {})
448 {
449 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Define");
450 }
451 // clang-format on
452
453 // clang-format off
454 ////////////////////////////////////////////////////////////////////////////
455 /// \brief Define a new column with a value dependent on the processing slot.
456 /// \param[in] name The name of the defined column.
457 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
458 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding the slot number).
459 /// \return the first node of the computation graph for which the new quantity is defined.
460 ///
461 /// This alternative implementation of `Define` is meant as a helper to evaluate new column values in a thread-safe manner.
462 /// The expression must be a callable of signature R(unsigned int, T1, T2, ...) where `T1, T2...` are the types
463 /// of the columns that the expression takes as input. The first parameter is reserved for an unsigned integer
464 /// representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
465 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
466 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
467 ///
468 /// The following two calls are equivalent, although `DefineSlot` is slightly more performant:
469 /// ~~~{.cpp}
470 /// int function(unsigned int, double, double);
471 /// df.Define("x", function, {"rdfslot_", "column1", "column2"})
472 /// df.DefineSlot("x", function, {"column1", "column2"})
473 /// ~~~
474 ///
475 /// See Define() for more information.
476 template <typename F>
477 RInterface<Proxied> DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
478 {
479 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "DefineSlot");
480 }
481 // clang-format on
482
483 // clang-format off
484 ////////////////////////////////////////////////////////////////////////////
485 /// \brief Define a new column with a value dependent on the processing slot and the current entry.
486 /// \param[in] name The name of the defined column.
487 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
488 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
489 /// \return the first node of the computation graph for which the new quantity is defined.
490 ///
491 /// This alternative implementation of `Define` is meant as a helper in writing entry-specific, thread-safe custom
492 /// columns. The expression must be a callable of signature R(unsigned int, ULong64_t, T1, T2, ...) where `T1, T2...`
493 /// are the types of the columns that the expression takes as input. The first parameter is reserved for an unsigned
494 /// integer representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
495 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
496 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
497 /// The second parameter is reserved for a `ULong64_t` representing the current entry being processed by the current thread.
498 ///
499 /// The following two `Define`s are equivalent, although `DefineSlotEntry` is slightly more performant:
500 /// ~~~{.cpp}
501 /// int function(unsigned int, ULong64_t, double, double);
502 /// Define("x", function, {"rdfslot_", "rdfentry_", "column1", "column2"})
503 /// DefineSlotEntry("x", function, {"column1", "column2"})
504 /// ~~~
505 ///
506 /// See Define() for more information.
507 template <typename F>
508 RInterface<Proxied> DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
509 {
511 "DefineSlotEntry");
512 }
513 // clang-format on
514
515 ////////////////////////////////////////////////////////////////////////////
516 /// \brief Define a new column.
517 /// \param[in] name The name of the defined column.
518 /// \param[in] expression An expression in C++ which represents the defined value
519 /// \return the first node of the computation graph for which the new quantity is defined.
520 ///
521 /// The expression is just-in-time compiled and used to produce the column entries.
522 /// It must be valid C++ syntax in which variable names are substituted with the names
523 /// of branches/columns.
524 ///
525 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
526 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
527 /// ~~~{.cpp}
528 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
529 /// ~~~
530 /// but instead this will:
531 /// ~~~{.cpp}
532 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
533 /// ~~~
534 ///
535 /// Refer to the first overload of this method for the full documentation.
536 RInterface<Proxied> Define(std::string_view name, std::string_view expression)
537 {
538 constexpr auto where = "Define";
540 // these checks must be done before jitting lest we throw exceptions in jitted code
543
545
547 newCols.AddDefine(std::move(jittedDefine));
548
550
551 return newInterface;
552 }
553
554 ////////////////////////////////////////////////////////////////////////////
555 /// \brief Overwrite the value and/or type of an existing column.
556 /// \param[in] name The name of the column to redefine.
557 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
558 /// \param[in] columns Names of the columns/branches in input to the expression.
559 /// \return the first node of the computation graph for which the quantity is redefined.
560 ///
561 /// The old value of the column can be used as an input for the expression.
562 ///
563 /// An exception is thrown in case the column to redefine does not already exist.
564 /// See Define() for more information.
566 RInterface<Proxied> Redefine(std::string_view name, F expression, const ColumnNames_t &columns = {})
567 {
568 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Redefine");
569 }
570
571 // clang-format off
572 ////////////////////////////////////////////////////////////////////////////
573 /// \brief Overwrite the value and/or type of an existing column.
574 /// \param[in] name The name of the column to redefine.
575 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
576 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot).
577 /// \return the first node of the computation graph for which the new quantity is defined.
578 ///
579 /// The old value of the column can be used as an input for the expression.
580 /// An exception is thrown in case the column to redefine does not already exist.
581 ///
582 /// See DefineSlot() for more information.
583 // clang-format on
584 template <typename F>
585 RInterface<Proxied> RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
586 {
587 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "RedefineSlot");
588 }
589
590 // clang-format off
591 ////////////////////////////////////////////////////////////////////////////
592 /// \brief Overwrite the value and/or type of an existing column.
593 /// \param[in] name The name of the column to redefine.
594 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
595 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
596 /// \return the first node of the computation graph for which the new quantity is defined.
597 ///
598 /// The old value of the column can be used as an input for the expression.
599 /// An exception is thrown in case the column to redefine does not already exist.
600 ///
601 /// See DefineSlotEntry() for more information.
602 // clang-format on
603 template <typename F>
604 RInterface<Proxied> RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
605 {
607 "RedefineSlotEntry");
608 }
609
610 ////////////////////////////////////////////////////////////////////////////
611 /// \brief Overwrite the value and/or type of an existing column.
612 /// \param[in] name The name of the column to redefine.
613 /// \param[in] expression An expression in C++ which represents the defined value
614 /// \return the first node of the computation graph for which the new quantity is defined.
615 ///
616 /// The expression is just-in-time compiled and used to produce the column entries.
617 /// It must be valid C++ syntax in which variable names are substituted with the names
618 /// of branches/columns.
619 ///
620 /// The old value of the column can be used as an input for the expression.
621 /// An exception is thrown in case the column to redefine does not already exist.
622 ///
623 /// Aliases cannot be overridden. See the corresponding Define() overload for more information.
641
642 ////////////////////////////////////////////////////////////////////////////
643 /// \brief In case the value in the given column is missing, provide a default value
644 /// \tparam T The type of the column
645 /// \param[in] column Column name where missing values should be replaced by the given default value
646 /// \param[in] defaultValue Value to provide instead of a missing value
647 /// \return The node of the graph that will provide a default value
648 ///
649 /// This operation is useful in case an entry of the dataset is incomplete,
650 /// i.e. if one or more of the columns do not have valid values. It does not
651 /// modify the values of the column, but in case any entry is missing, it
652 /// will provide the default value to downstream nodes instead.
653 ///
654 /// Use cases include:
655 /// * When processing multiple files, one or more of them is missing a column
656 /// * In horizontal joining with entry matching, a certain dataset has no
657 /// match for the current entry.
658 ///
659 /// ### Example usage:
660 ///
661 /// \code{.cpp}
662 /// // Assume a dataset with columns [idx, x] matching another dataset with
663 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
664 /// ROOT::RDataFrame df{dataset};
665 /// auto df_default = df.DefaultValueFor("y", 33)
666 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
667 /// auto colz = df_default.Take<int>("z");
668 /// \endcode
669 ///
670 /// \code{.py}
671 /// df = ROOT.RDataFrame(dataset)
672 /// df_default = df.DefaultValueFor("y", 33).Define("z", "x + y")
673 /// colz = df_default.Take[int]("z")
674 /// \endcode
675 template <typename T>
676 RInterface<Proxied> DefaultValueFor(std::string_view column, const T &defaultValue)
677 {
678 constexpr auto where{"DefaultValueFor"};
680 // For now disable this functionality in case of an empty data source and
681 // the column name was not defined previously.
682 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
685
686 // Declare return type to the interpreter, for future use by jitted actions
688 if (retTypeName.empty()) {
689 // The type is not known to the interpreter.
690 // We must not error out here; if/when this column is used in jitted code, the placeholder name set below marks its type as unknown.
691 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(T));
692 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
693 }
694
695 const auto validColumnNames = ColumnNames_t{column.data()};
696 auto newColumn = std::make_shared<ROOT::Internal::RDF::RDefaultValueFor<T>>(
697 column, retTypeName, defaultValue, validColumnNames, fColRegister, *fLoopManager);
699
701 newCols.AddDefine(std::move(newColumn));
702
704
705 return newInterface;
706 }
707
708 // clang-format off
709 ////////////////////////////////////////////////////////////////////////////
710 /// \brief Define a new column that is updated when the input sample changes.
711 /// \param[in] name The name of the defined column.
712 /// \param[in] expression A C++ callable that computes the new value of the defined column.
713 /// \return the first node of the computation graph for which the new quantity is defined.
714 ///
715 /// The signature of the callable passed as second argument should be `T(unsigned int slot, const ROOT::RDF::RSampleInfo &id)`
716 /// where:
717 /// - `T` is the type of the defined column
718 /// - `slot` is a number in the range [0, nThreads) that is different for each processing thread. This can simplify
719 /// the definition of thread-safe callables if you are interested in using parallel capabilities of RDataFrame.
720 /// - `id` is an instance of a ROOT::RDF::RSampleInfo object which contains information about the sample which is
721 /// being processed (see the class docs for more information).
722 ///
723 /// DefinePerSample() is useful to e.g. define a quantity that depends on which TTree in which TFile is being
724 /// processed or to inject a callback into the event loop that is only called when the processing of a new sample
725 /// starts rather than at every entry.
726 ///
727 /// The callable will be invoked once per input TTree or once per multi-thread task, whichever is more often.
728 ///
729 /// ### Example usage:
730 /// ~~~{.cpp}
731 /// ROOT::RDataFrame df{"mytree", {"sample1.root","sample2.root"}};
732 /// df.DefinePerSample("weightbysample",
733 /// [](unsigned int slot, const ROOT::RDF::RSampleInfo &id)
734 /// { return id.Contains("sample1") ? 1.0f : 2.0f; });
735 /// ~~~
736 // clang-format on
737 // TODO we could SFINAE on F's signature to provide friendlier compilation errors in case of signature mismatch
739 RInterface<Proxied> DefinePerSample(std::string_view name, F expression)
740 {
741 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
744
745 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType_t));
746 if (retTypeName.empty()) {
747 // TypeID2TypeName returned an empty name: the type is not known to the interpreter.
748 // Do not error out here: fall back to a placeholder name, which only matters if/when this column is used in jitted code.
749 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType_t));
750 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType; // placeholder: fixed prefix + demangled C++ type name
751 }
752
753 auto newColumn =
754 std::make_shared<RDFDetail::RDefinePerSample<F>>(name, retTypeName, std::move(expression), *fLoopManager);
755
757 newCols.AddDefine(std::move(newColumn));
759 return newInterface;
760 }
761
762 // clang-format off
763 ////////////////////////////////////////////////////////////////////////////
764 /// \brief Define a new column that is updated when the input sample changes.
765 /// \param[in] name The name of the defined column.
766 /// \param[in] expression A valid C++ expression as a string, which will be used to compute the defined value.
767 /// \return the first node of the computation graph for which the new quantity is defined.
768 ///
769 /// The expression is just-in-time compiled and used to produce the column entries.
770 /// It must be valid C++ syntax and the usage of the special variable names `rdfslot_` and `rdfsampleinfo_` is
771 /// permitted, where these variables will take the same values as the `slot` and `id` parameters described at the
772 /// DefinePerSample(std::string_view name, F expression) overload. See the documentation of that overload for more information.
773 ///
774 /// ### Example usage:
775 /// ~~~{.py}
776 /// df = ROOT.RDataFrame('mytree', ['sample1.root','sample2.root'])
777 /// df.DefinePerSample('weightbysample', 'rdfsampleinfo_.Contains("sample1") ? 1.0f : 2.0f')
778 /// ~~~
779 ///
780 /// \note
781 /// If you have declared some C++ function to the interpreter, the correct syntax to call that function with this
782 /// overload of DefinePerSample is by calling it explicitly with the special names `rdfslot_` and `rdfsampleinfo_` as
783 /// input parameters. This is for example the correct way to call this overload when working in PyROOT:
784 /// ~~~{.py}
785 /// ROOT.gInterpreter.Declare(
786 /// """
787 /// float weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
788 /// return id.Contains("sample1") ? 1.0f : 2.0f;
789 /// }
790 /// """)
791 /// df = ROOT.RDataFrame("mytree", ["sample1.root","sample2.root"])
792 /// df.DefinePerSample("weightsbysample", "weights(rdfslot_, rdfsampleinfo_)")
793 /// ~~~
794 ///
795 /// \note
796 /// Differently from what happens in Define(), the string expression passed to DefinePerSample cannot contain
797 /// column names other than those mentioned above: the expression is evaluated once before the processing of the
798 /// sample even starts, so column values are not accessible.
799 // clang-format on
800 RInterface<Proxied> DefinePerSample(std::string_view name, std::string_view expression)
801 {
802 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
803 // these checks must be done before jitting lest we throw exceptions in jitted code
806
808
810 newCols.AddDefine(std::move(jittedDefine));
811
813
814 return newInterface;
815 }
816
817 /// \brief Register systematic variations for a single existing column using custom variation tags.
818 /// \param[in] colName name of the column for which varied values are provided.
819 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
820 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
821 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
822 /// \param[in] inputColumns the names of the columns to be passed to the callable.
823 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
824 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. colName is used if none is provided.
825 ///
826 /// Vary provides a natural and flexible syntax to define systematic variations that automatically propagate to
827 /// Filters, Defines and results. RDataFrame usage of columns with attached variations does not change, but for
828 /// results that depend on any varied quantity, a map/dictionary of varied results can be produced with
829 /// ROOT::RDF::Experimental::VariationsFor (see the example below).
830 ///
831 /// The dictionary will contain a "nominal" value (accessed with the "nominal" key) for the unchanged result, and
832 /// values for each of the systematic variations that affected the result (via upstream Filters or via direct or
833 /// indirect dependencies of the column values on some registered variations). The keys will be a composition of
834 /// variation names and tags, e.g. "pt:up" and "pt:down" for the example below.
835 ///
836 /// In the following example we add up/down variations of pt and fill a histogram with a quantity that depends on pt.
837 /// We automatically obtain three histograms in output ("nominal", "pt:up" and "pt:down"):
838 /// ~~~{.cpp}
839 /// auto nominal_hx =
840 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, {"down", "up"})
841 /// .Filter("pt > k")
842 /// .Define("x", someFunc, {"pt"})
843 /// .Histo1D("x");
844 ///
845 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
846 /// hx["nominal"].Draw();
847 /// hx["pt:down"].Draw("SAME");
848 /// hx["pt:up"].Draw("SAME");
849 /// ~~~
850 /// RDataFrame computes all variations as part of a single loop over the data.
851 /// In particular, this means that I/O and computation of values shared
852 /// among variations only happen once for all variations. Thus, the event loop
853 /// run-time typically scales much better than linearly with the number of
854 /// variations.
855 ///
856 /// RDataFrame lazily computes the varied values required to produce the
857 /// outputs of \ref ROOT::RDF::Experimental::VariationsFor "VariationsFor()". If \ref
858 /// ROOT::RDF::Experimental::VariationsFor "VariationsFor()" was not called for a result, the computations are only
859 /// run for the nominal case.
860 ///
861 /// See other overloads for examples when variations are added for multiple existing columns,
862 /// or when the tags are auto-generated instead of being directly defined.
863 template <typename F>
864 RInterface<Proxied> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
865 const std::vector<std::string> &variationTags, std::string_view variationName = "")
866 {
867 std::vector<std::string> colNames{{std::string(colName)}};
868 const std::string theVariationName{variationName.empty() ? colName : variationName};
869
870 return VaryImpl<true>(std::move(colNames), std::forward<F>(expression), inputColumns, variationTags,
872 }
873
874 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
875 /// \param[in] colName name of the column for which varied values are provided.
876 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
877 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
878 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
879 /// \param[in] inputColumns the names of the columns to be passed to the callable.
880 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
881 /// `"1"`, etc.
882 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
883 /// colName is used if none is provided.
884 ///
885 /// This overload of Vary takes an nVariations parameter instead of a list of tag names.
886 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
887 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
888 ///
889 /// Example usage (no variation name is passed, so the keys are built from the column name `pt`):
890 /// ~~~{.cpp}
891 /// auto nominal_hx =
892 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, 2)
893 /// .Histo1D("pt");
894 ///
895 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
896 /// hx["nominal"].Draw();
897 /// hx["pt:0"].Draw("SAME");
898 /// hx["pt:1"].Draw("SAME");
899 /// ~~~
900 ///
901 /// \note See also This Vary() overload for more information.
902 template <typename F>
903 RInterface<Proxied> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
904 std::size_t nVariations, std::string_view variationName = "")
905 {
906 R__ASSERT(nVariations > 0 && "Must have at least one variation.");
907
908 std::vector<std::string> variationTags; // filled with the decimal indices "0", "1", ..., nVariations - 1
909 variationTags.reserve(nVariations);
910 for (std::size_t i = 0u; i < nVariations; ++i)
911 variationTags.emplace_back(std::to_string(i));
912
913 const std::string theVariationName{variationName.empty() ? colName : variationName}; // fall back to the column name
914
915 return Vary(colName, std::forward<F>(expression), inputColumns, std::move(variationTags), theVariationName); // forward to the overload taking explicit tags
916 }
917
918 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
919 /// \param[in] colNames set of names of the columns for which varied values are provided.
920 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
921 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
922 /// return an RVec of RVecs: one inner RVec per varied column, holding one value per variation tag, in tag order.
923 /// \param[in] inputColumns the names of the columns to be passed to the callable.
924 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
925 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
926 ///
927 /// This overload of Vary takes a list of column names as first argument and
928 /// requires that the expression returns an RVec of RVecs of values: one inner RVec for the variations of each
929 /// affected column. In the example below the `variationTags` are defined as `{"down", "up"}`.
930 ///
931 /// Example usage:
932 /// ~~~{.cpp}
933 /// // produce variations "ptAndEta:down" and "ptAndEta:up"
934 /// auto nominal_hx =
935 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
936 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
937 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
938 /// {"down", "up"}, // variation tags
939 /// "ptAndEta") // variation name
940 /// .Histo1D("pt", "eta");
941 ///
942 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
943 /// hx["nominal"].Draw();
944 /// hx["ptAndEta:down"].Draw("SAME");
945 /// hx["ptAndEta:up"].Draw("SAME");
946 /// ~~~
947 ///
948 /// \note See also This Vary() overload for more information.
949
950 template <typename F>
951 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
952 const std::vector<std::string> &variationTags, std::string_view variationName)
953 {
954 return VaryImpl<false>(colNames, std::forward<F>(expression), inputColumns, variationTags, variationName); // NOTE(review): <false> presumably selects the multi-column mode (the single-column overload passes <true>) - confirm
955 }
956
957 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
958 /// \param[in] colNames set of names of the columns for which varied values are provided.
959 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
960 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
961 /// return an RVec of RVecs: one inner RVec per varied column, holding one value per variation tag, in tag order.
962 /// \param[in] inputColumns the names of the columns to be passed to the callable.
963 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
964 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
965 /// It has no default value in this overload and must always be passed.
966 ///
967 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
968 /// is avoided.
969 ///
970 /// \note See also This Vary() overload for more information.
971 template <typename F>
973 Vary(std::initializer_list<std::string> colNames, F &&expression, const ColumnNames_t &inputColumns,
974 const std::vector<std::string> &variationTags, std::string_view variationName)
975 {
976 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, variationTags, variationName);
977 }
978
979 /// \brief Register systematic variations for multiple existing columns using auto-generated tags.
980 /// \param[in] colNames set of names of the columns for which varied values are provided.
981 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
982 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
983 /// return an RVec of RVecs: one inner RVec per varied column, holding one value per variation tag, in tag order.
984 /// \param[in] inputColumns the names of the columns to be passed to the callable.
985 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
986 /// `"1"`, etc.
987 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
988 /// It has no default value in this overload and must always be passed.
989 ///
990 /// This overload of Vary takes a list of column names as first argument.
991 /// It takes an `nVariations` parameter instead of a list of tag names (`variationTags`). Tag names
992 /// will be auto-generated as the sequence 0...``nVariations-1``.
993 ///
994 /// Example usage:
995 /// ~~~{.cpp}
996 /// auto nominal_hx =
997 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
998 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
999 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
1000 /// 2, // auto-generated variation tags
1001 /// "ptAndEta") // variation name
1002 /// .Histo1D("pt", "eta");
1003 ///
1004 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1005 /// hx["nominal"].Draw();
1006 /// hx["ptAndEta:0"].Draw("SAME");
1007 /// hx["ptAndEta:1"].Draw("SAME");
1008 /// ~~~
1009 ///
1010 /// \note See also This Vary() overload for more information.
1011 template <typename F>
1012 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
1013 std::size_t nVariations, std::string_view variationName)
1014 {
1015 R__ASSERT(nVariations > 0 && "Must have at least one variation.");
1016
1017 std::vector<std::string> variationTags; // filled with the decimal indices "0", "1", ..., nVariations - 1
1018 variationTags.reserve(nVariations);
1019 for (std::size_t i = 0u; i < nVariations; ++i)
1020 variationTags.emplace_back(std::to_string(i));
1021
1022 return Vary(colNames, std::forward<F>(expression), inputColumns, std::move(variationTags), variationName); // forward to the overload taking explicit tags
1023 }
1024
1025 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1026 /// \param[in] colNames set of names of the columns for which varied values are provided.
1027 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
1028 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
1029 /// return an RVec of RVecs: one inner RVec per varied column, holding one value per variation tag, in tag order.
1030 /// \param[in] inputColumns the names of the columns to be passed to the callable.
1031 /// They are independent of which columns are being varied.
1032 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1033 /// `"1"`, etc.
1034 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1035 /// It has no default value in this overload and must always be passed.
1036 ///
1037 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1038 /// is avoided.
1039 ///
1040 /// \note See also This Vary() overload for more information.
1041 template <typename F>
1042 RInterface<Proxied> Vary(std::initializer_list<std::string> colNames, F &&expression,
1043 const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
1044 {
1045 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, nVariations, variationName); // copy the init list into a vector and forward to the std::vector overload
1046 }
1047
1048 /// \brief Register systematic variations for a single existing column using custom variation tags.
1049 /// \param[in] colName name of the column for which varied values are provided.
1050 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1051 /// values for the specified column.
1052 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1053 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1054 /// colName is used if none is provided.
1055 ///
1056 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1057 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1058 /// defined as `{"down", "up"}`.
1059 /// ~~~{.cpp}
1060 /// auto nominal_hx =
1061 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", {"down", "up"})
1062 /// .Filter("pt > k")
1063 /// .Define("x", someFunc, {"pt"})
1064 /// .Histo1D("x");
1065 ///
1066 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1067 /// hx["nominal"].Draw();
1068 /// hx["pt:down"].Draw("SAME");
1069 /// hx["pt:up"].Draw("SAME");
1070 /// ~~~
1071 ///
1072 /// \note See also This Vary() overload for more information.
1073 RInterface<Proxied> Vary(std::string_view colName, std::string_view expression,
1074 const std::vector<std::string> &variationTags, std::string_view variationName = "")
1075 {
1076 std::vector<std::string> colNames{{std::string(colName)}}; // wrap the single column name in a one-element list
1077 const std::string theVariationName{variationName.empty() ? colName : variationName}; // fall back to the column name
1078
1079 return JittedVaryImpl(colNames, expression, variationTags, theVariationName, /*isSingleColumn=*/true);
1080 }
1081
1082 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
1083 /// \param[in] colName name of the column for which varied values are provided.
1084 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1085 /// values for the specified column.
1086 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1087 /// `"1"`, etc.
1088 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1089 /// colName is used if none is provided.
1090 ///
1091 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1092 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1093 /// auto-generated.
1094 /// ~~~{.cpp}
1095 /// auto nominal_hx =
1096 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", 2)
1097 /// .Histo1D("pt");
1098 ///
1099 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1100 /// hx["nominal"].Draw();
1101 /// hx["pt:0"].Draw("SAME");
1102 /// hx["pt:1"].Draw("SAME");
1103 /// ~~~
1104 ///
1105 /// \note See also This Vary() overload for more information.
1106 RInterface<Proxied> Vary(std::string_view colName, std::string_view expression, std::size_t nVariations,
1107 std::string_view variationName = "")
1108 {
1109 std::vector<std::string> variationTags; // filled with the decimal indices "0", "1", ..., nVariations - 1
1110 variationTags.reserve(nVariations);
1111 for (std::size_t i = 0u; i < nVariations; ++i)
1112 variationTags.emplace_back(std::to_string(i));
1113
1114 return Vary(colName, expression, std::move(variationTags), variationName); // forward to the overload taking explicit tags
1115 }
1116
1117 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1118 /// \param[in] colNames set of names of the columns for which varied values are provided.
1119 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1120 /// values for the specified columns.
1121 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1122 /// `"1"`, etc.
1123 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1124 ///
1125 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1126 /// compiled. It takes an nVariations parameter instead of a list of tag names.
1127 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
1128 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
1129 /// The example below shows how Vary() is used while dealing with multiple columns.
1130 ///
1131 /// ~~~{.cpp}
1132 /// auto nominal_hx =
1133 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", 2, "xy")
1134 /// .Histo1D("x", "y");
1135 ///
1136 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1137 /// hx["nominal"].Draw();
1138 /// hx["xy:0"].Draw("SAME");
1139 /// hx["xy:1"].Draw("SAME");
1140 /// ~~~
1141 ///
1142 /// \note See also This Vary() overload for more information.
1143 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1144 std::size_t nVariations, std::string_view variationName)
1145 {
1146 std::vector<std::string> variationTags; // filled with the decimal indices "0", "1", ..., nVariations - 1
1147 variationTags.reserve(nVariations);
1148 for (std::size_t i = 0u; i < nVariations; ++i)
1149 variationTags.emplace_back(std::to_string(i));
1150
1151 return Vary(colNames, expression, std::move(variationTags), variationName); // forward to the overload taking explicit tags
1152 }
1153
1154 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1155 /// \param[in] colNames set of names of the columns for which varied values are provided.
1156 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1157 /// values for the specified columns.
1158 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1159 /// `"1"`, etc.
1160 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1161 /// It has no default value in this overload and must always be passed.
1162 ///
1163 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1164 /// is avoided.
1165 ///
1166 /// \note See also This Vary() overload for more information.
1167 RInterface<Proxied> Vary(std::initializer_list<std::string> colNames, std::string_view expression,
1168 std::size_t nVariations, std::string_view variationName)
1169 {
1170 return Vary(std::vector<std::string>(colNames), expression, nVariations, variationName); // copy the init list into a vector and forward to the std::vector overload
1171 }
1172
1173 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
1174 /// \param[in] colNames set of names of the columns for which varied values are provided.
1175 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1176 /// values for the specified columns.
1177 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1178 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1179 ///
1180 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1181 /// compiled. The example below shows how Vary() is used while dealing with multiple columns. The tags are defined as
1182 /// `{"down", "up"}`.
1183 /// ~~~{.cpp}
1184 /// auto nominal_hx =
1185 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", {"down", "up"}, "xy")
1186 /// .Histo1D("x", "y");
1187 ///
1188 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1189 /// hx["nominal"].Draw();
1190 /// hx["xy:down"].Draw("SAME");
1191 /// hx["xy:up"].Draw("SAME");
1192 /// ~~~
1193 ///
1194 /// \note See also This Vary() overload for more information.
1195 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1196 const std::vector<std::string> &variationTags, std::string_view variationName)
1197 {
1198 return JittedVaryImpl(colNames, expression, variationTags, variationName, /*isSingleColumn=*/false); // all arguments are passed through unchanged
1199 }
1200
1201 ////////////////////////////////////////////////////////////////////////////
1202 /// \brief Allow to refer to a column with a different name.
1203 /// \param[in] alias name of the column alias
1204 /// \param[in] columnName name of the column to be aliased
1205 /// \return the first node of the computation graph for which the alias is available.
1206 ///
1207 /// Aliasing an alias is supported.
1208 ///
1209 /// ### Example usage:
1210 /// ~~~{.cpp}
1211 /// auto df_with_alias = df.Alias("simple_name", "very_long&complex_name!!!");
1212 /// ~~~
1213 RInterface<Proxied> Alias(std::string_view alias, std::string_view columnName)
1214 {
1215 // The symmetry with Define is clear. We want to:
1216 // - Create globally the alias and return this very node, unchanged
1217 // - Make aliases accessible based on chains and not globally
1218
1219 // Helper to find out if a name is a column
1221
1222 constexpr auto where = "Alias";
1224 // If the alias name is a column name, there is a problem
1226
1227 const auto validColumnName = GetValidatedColumnNames(1, {std::string(columnName)})[0];
1228
1230 newCols.AddAlias(alias, validColumnName);
1231
1233
1234 return newInterface;
1235 }
1236
1237 template <typename... ColumnTypes>
1238 [[deprecated("Snapshot is not any more a template. You can safely remove the template parameters.")]]
1240 Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList,
1241 const RSnapshotOptions &options = RSnapshotOptions())
1242 {
1243 return Snapshot(treename, filename, columnList, options);
1244 }
1245
1246 ////////////////////////////////////////////////////////////////////////////
1247 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
1248 /// \param[in] treename The name of the output TTree or RNTuple.
1249 /// \param[in] filename The name of the output TFile.
1250 /// \param[in] columnList The list of names of the columns/branches/fields to be written.
1251 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
1252 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1253 ///
1254 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
1255 /// The types of the columns are automatically inferred and do not need to be specified.
1256 ///
1257 /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.')
1258 /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot.
1259 /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also
1260 /// written out and it appears before the array in the columnList.
1261 ///
1262 /// By default, in case of TTree, TChain or RNTuple inputs, Snapshot will try to write out all top-level branches.
1263 /// For other types of inputs, all columns returned by GetColumnNames() will be written out. Systematic variations of
1264 /// columns will be included if the corresponding flag is set in RSnapshotOptions. See \ref snapshot-with-variations
1265 /// "Snapshot with Variations" for more details. If friend trees or chains are present, by default all friend
1266 /// top-level branches that have names that do not collide with names of branches in the main TTree/TChain will be
1267 /// written out. Since v6.24, Snapshot will also write out friend branches with the same names of branches in the
1268 /// main TTree/TChain with names of the form
1269 /// `<friendname>_<branchname>` in order to differentiate them from the branches in the main tree/chain.
1270 ///
1271 /// ### Writing to a sub-directory
1272 ///
1273 /// Snapshot supports writing the TTree or RNTuple in a sub-directory inside the TFile. It is sufficient to specify
1274 /// the directory path as part of the TTree or RNTuple name, e.g. `df.Snapshot("subdir/t", "f.root")` writes TTree
1275 /// `t` in the sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed).
1276 ///
1277 /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of
1278 /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled
1279 /// with respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in
1280 /// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
1281 /// error out if such a "shuffled" TTree is used in a friendship.
1282 ///
1283 /// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the
1284 /// requested output TTree or RNTuple to the file, with all the branches requested to preserve the dataset schema.
1285 ///
1286 /// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns
1287 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
1288 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
1289 /// Alias(): `df.Alias("nbar", "#bar").Snapshot(..., {"nbar"})`.
1290 ///
1291 /// ### Example invocations:
1292 ///
1293 /// ~~~{.cpp}
1294 /// // No need to specify column types, they are automatically deduced thanks
1295 /// // to information coming from the data source
1296 /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"});
1297 /// ~~~
1298 ///
1299 /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in
1300 /// `RSnapshotOptions`:
1301 /// ~~~{.cpp}
1302 /// RSnapshotOptions opts;
1303 /// opts.fLazy = true;
1304 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
1305 /// ~~~
1306 ///
1307 /// To snapshot to the RNTuple data format, the `fOutputFormat` option in `RSnapshotOptions` needs to be set
1308 /// accordingly:
1309 /// ~~~{.cpp}
1310 /// RSnapshotOptions opts;
1311 /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
1312 /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts);
1313 /// ~~~
1314 ///
1315 /// Snapshot systematic variations resulting from a Vary() call (see details \ref snapshot-with-variations "here"):
1316 /// ~~~{.cpp}
1317 /// RSnapshotOptions opts;
1318 /// opts.fIncludeVariations = true;
1319 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
1320 /// ~~~
1323 const RSnapshotOptions &options = RSnapshotOptions())
1324 {
     // NOTE(review): this listing skips several source lines of this function (the embedded numbering
     // jumps, e.g. 1326, 1328-1329, 1332-1334, 1348, 1351). Names used below such as `parsedTreePath`,
     // `validColumnNames`, `nColumns`, `colTypeName`, `resPtr` and the `colList...` vectors are defined
     // on those omitted lines.
1325 // like columnList but with `#var` columns removed
1327 // like columnListWithoutSizeColumns but with aliases resolved
1330 // like validCols but with missing size branches required by array branches added in the right positions
1331 const auto pairOfColumnLists =
1335
     // Keep a copy of the caller-provided name: `treename` is overwritten just below with the tree-name
     // component of the parsed path, and `dirname` refers to its directory component.
1336 const auto fullTreeName = treename;
1338 treename = parsedTreePath.fTreeName;
1339 const auto &dirname = parsedTreePath.fDirName;
1340
1342
1344
     // Helper returning the std::type_info of a column given its name and type name.
     // A std::runtime_error whose message contains "Cannot extract type_info of type" is replaced by an
     // error that names the offending column; any other std::runtime_error is rethrown unchanged.
     // NOTE(review): the call inside the `try` block and the statement guarded by `if (isRNTuple)` are on
     // lines omitted from this listing.
1345 auto retrieveTypeID = [](const std::string &colName, const std::string &colTypeName,
1346 bool isRNTuple = false) -> const std::type_info * {
1347 try {
1349 } catch (const std::runtime_error &err) {
1350 if (isRNTuple)
1352
1353 if (std::string(err.what()).find("Cannot extract type_info of type") != std::string::npos) {
1354 // We could not find RTTI for this column, thus we cannot write it out at the moment.
1355 std::string trueTypeName{colTypeName};
     // "CLING_UNKNOWN_TYPE" is 18 characters long, so substr(19) also drops the character that follows
     // the prefix -- presumably a separator; TODO confirm.
1356 if (colTypeName.rfind("CLING_UNKNOWN_TYPE", 0) == 0)
1357 trueTypeName = colTypeName.substr(19);
1358 std::string msg{"No runtime type information is available for column \"" + colName +
1359 "\" with type name \"" + trueTypeName +
1360 "\". Thus, it cannot be written to disk with Snapshot. Make sure to generate and load "
1361 "ROOT dictionaries for the type of this column."};
1362
1363 throw std::runtime_error(msg);
1364 } else {
1365 throw;
1366 }
1367 }
1368 };
1369
1371
1372 if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) {
1373 // The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one
1374 // without a data source for now, and set it once the actual data source can be created (i.e., after
1375 // writing the RNTuple).
1376 auto newRDF = std::make_shared<RInterface<RLoopManager>>(std::make_shared<RLoopManager>(colListNoPoundSizes));
1377
     // NOTE(review): the last argument is hard-coded to `false` in this branch, whereas the TTree branch
     // below forwards `options.fIncludeVariations`.
1378 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
1379 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
1380 options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false});
1381
1384
     // Collect one std::type_info pointer per column to be written, in the order of validColumnNames.
1385 const auto nSlots = fLoopManager->GetNSlots();
1386 std::vector<const std::type_info *> colTypeIDs;
1387 colTypeIDs.reserve(nColumns);
1388 for (decltype(nColumns) i{}; i < nColumns; i++) {
1389 const auto &colName = validColumnNames[i];
1391 colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec);
1392 const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName, /*isRNTuple*/ true);
1393 colTypeIDs.push_back(colTypeID);
1394 }
1395 // Crucial e.g. if the column names do not correspond to already-available column readers created by the data
1396 // source
1398
1399 auto action =
1401 resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action));
1402 } else {
     // Warn only when the user left the output format at its default while reading from an RNTuple source.
1403 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" &&
1404 options.fOutputFormat == ESnapshotOutputFormat::kDefault) {
1405 Warning("Snapshot",
1406 "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you "
1407 "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in "
1408 "RSnapshotOptions. Note that this current default behaviour might change in the future.");
1409 }
1410
1411 // We create an RLoopManager without a data source. This needs to be initialised when the output TTree dataset
1412 // has actually been created and written to TFile, i.e. at the end of the Snapshot execution.
1413 auto newRDF = std::make_shared<RInterface<RLoopManager>>(
1414 std::make_shared<RLoopManager>(colListNoAliasesWithSizeBranches));
1415
1416 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
1417 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
1418 options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations});
1419
1422
     // Same per-column type lookup as in the RNTuple branch, with retrieveTypeID's default isRNTuple = false.
1423 const auto nSlots = fLoopManager->GetNSlots();
1424 std::vector<const std::type_info *> colTypeIDs;
1425 colTypeIDs.reserve(nColumns);
1426 for (decltype(nColumns) i{}; i < nColumns; i++) {
1427 const auto &colName = validColumnNames[i];
1429 colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec);
1430 const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName);
1431 colTypeIDs.push_back(colTypeID);
1432 }
1433 // Crucial e.g. if the column names do not correspond to already-available column readers created by the data
1434 // source
1436
1437 auto action =
1439 resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action));
1440 }
1441
     // Unless the snapshot was requested as lazy, dereference the result right away so that it is
     // produced before returning.
1442 if (!options.fLazy)
1443 *resPtr;
1444 return resPtr;
1445 }
1446
1447 // clang-format off
1448 ////////////////////////////////////////////////////////////////////////////
1449 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
1450 /// \param[in] treename The name of the output TTree or RNTuple.
1451 /// \param[in] filename The name of the output TFile.
1452 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
1453 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple
1454 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1455 ///
1456 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
1457 /// The types of the columns are automatically inferred and do not need to be specified.
1458 ///
1459 /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages.
1461 std::string_view columnNameRegexp = "",
1462 const RSnapshotOptions &options = RSnapshotOptions())
1463 {
     // NOTE(review): the first signature line and several statements of this overload are omitted from this
     // listing (embedded numbering jumps); `dsColumns`, `dsColumnsWithoutSizeColumns` and `selectedColumns`
     // are defined on those lines.
1465
1467 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those
     // (13 is the length of the "R_rdf_sizeof_" prefix; shorter names are kept without comparing)
1469 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
1470 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
1475
1476 // The only way we can get duplicate entries is if a column coming from a tree or data-source is Redefine'd.
1477 // RemoveDuplicates should preserve ordering of the columns: it might be meaningful.
1479
1481
     // NOTE(review): the body of this RNTupleDS-specific branch is on an omitted line.
1482 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") {
1484 }
1485
     // Delegate to the overload taking an explicit list of column names.
1486 return Snapshot(treename, filename, selectedColumns, options);
1487 }
1488 // clang-format on
1489
1490 // clang-format off
1491 ////////////////////////////////////////////////////////////////////////////
1492 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
1493 /// \param[in] treename The name of the output TTree or RNTuple.
1494 /// \param[in] filename The name of the output TFile.
1495 /// \param[in] columnList The list of names of the columns/branches to be written.
1496 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
1497 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1498 ///
1499 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
1500 /// The types of the columns are automatically inferred and do not need to be specified.
1501 ///
1502 /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages.
1504 std::initializer_list<std::string> columnList,
1505 const RSnapshotOptions &options = RSnapshotOptions())
1506 {
     // Delegates to the Snapshot overload taking a list of column names.
     // NOTE(review): `selectedColumns` is defined on a line omitted from this listing (1507), presumably
     // built from `columnList`.
1508 return Snapshot(treename, filename, selectedColumns, options);
1509 }
1510 // clang-format on
1511
1512 ////////////////////////////////////////////////////////////////////////////
1513 /// \brief Save selected columns in memory.
1514 /// \tparam ColumnTypes variadic list of branch/column types.
1515 /// \param[in] columnList columns to be cached in memory.
1516 /// \return a `RDataFrame` that wraps the cached dataset.
1517 ///
1518 /// This action returns a new `RDataFrame` object, completely detached from
1519 /// the originating `RDataFrame`. The new dataframe only contains the cached
1520 /// columns and stores their content in memory for fast, zero-copy subsequent access.
1521 ///
1522 /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that
1523 /// fits in memory and that will be accessed many times.
1524 ///
1525 /// \note Cache will refuse to process columns with names of the form `#columnname`. These are special columns
1526 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
1527 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
1528 /// Alias(): `df.Alias("nbar", "#bar").Cache<std::size_t>(..., {"nbar"})`.
1529 ///
1530 /// ### Example usage:
1531 ///
1532 /// **Types and columns specified:**
1533 /// ~~~{.cpp}
1534 /// auto cache_some_cols_df = df.Cache<double, MyClass, int>({"col0", "col1", "col2"});
1535 /// ~~~
1536 ///
1537 /// **Types inferred and columns specified (this invocation relies on jitting):**
1538 /// ~~~{.cpp}
1539 /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"});
1540 /// ~~~
1541 ///
1542 /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):**
1543 /// ~~~{.cpp}
1544 /// auto cache_all_cols_df = df.Cache(myRegexp);
1545 /// ~~~
1546 template <typename... ColumnTypes>
1548 {
     // Index sequence with one index per type in ColumnTypes.
     // NOTE(review): the signature (1547) and the statement consuming `staticSeq` (1550) are omitted
     // from this listing.
1549 auto staticSeq = std::make_index_sequence<sizeof...(ColumnTypes)>();
1551 }
1552
1553 ////////////////////////////////////////////////////////////////////////////
1554 /// \brief Save selected columns in memory.
1555 /// \param[in] columnList columns to be cached in memory
1556 /// \return a `RDataFrame` that wraps the cached dataset.
1557 ///
1558 /// See the previous overloads for more information.
1560 {
     // NOTE(review): the signature (1559) and several statements of this overload are omitted from this
     // listing; `upcastNode` and `columnListWithoutSizeColumns` are defined on those lines.
1561 // Early return: if the list of columns is empty, just return an empty RDF
1562 // If we proceed, the jitted call will not compile!
1563 if (columnList.empty()) {
     // The empty dataframe is built from the number of entries counted on this node.
1564 auto nEntries = *this->Count();
1565 RInterface<RLoopManager> emptyRDF(std::make_shared<RLoopManager>(nEntries));
1566 return emptyRDF;
1567 }
1568
     // `cacheCall` accumulates the C++ code that is handed to the RLoopManager for jitting below.
1569 std::stringstream cacheCall;
1571 RInterface<TTraits::TakeFirstParameter_t<decltype(upcastNode)>> upcastInterface(fProxiedPtr, *fLoopManager,
1572 fColRegister);
1573 // build a string equivalent to
1574 // "(RInterface<nodetype*>*)(this)->Cache<Ts...>(*(ColumnNames_t*)(&columnList))"
     // `resRDF` is the object the jitted code assigns into (see the generated "*reinterpret_cast<...>(...) = "
     // prefix); it is returned at the end of this function.
1575 RInterface<RLoopManager> resRDF(std::make_shared<ROOT::Detail::RDF::RLoopManager>(0));
1576 cacheCall << "*reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RLoopManager>*>("
1578 << ") = reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RNodeBase>*>("
1580
1582
1583 const auto validColumnNames =
1585 const auto colTypes =
1586 GetValidatedArgTypes(validColumnNames, fColRegister, nullptr, GetDataSource(), "Cache", /*vector2RVec=*/false);
     // Emit the column types as a comma-separated template argument list.
1587 for (const auto &colType : colTypes)
1588 cacheCall << colType << ", ";
1589 if (!columnListWithoutSizeColumns.empty())
1590 cacheCall.seekp(-2, cacheCall.cur); // step back over the trailing ", " so that the next write overwrites it
1591 cacheCall << ">(*reinterpret_cast<std::vector<std::string>*>(" // vector<string> should be ColumnNames_t
1593
1594 // book the code to jit with the RLoopManager and trigger the event loop
1595 fLoopManager->ToJitExec(cacheCall.str());
1596 fLoopManager->Jit();
1597
1598 return resRDF;
1599 }
1600
1601 ////////////////////////////////////////////////////////////////////////////
1602 /// \brief Save selected columns in memory.
1603 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
1604 /// \return a `RDataFrame` that wraps the cached dataset.
1605 ///
1606 /// The existing columns are matched against the regular expression. If the string provided
1607 /// is empty, all columns are selected. See the previous overloads for more information.
1609 {
     // NOTE(review): the signature (1608) and several statements are omitted from this listing;
     // `definedColumns`, `dsColumns`, `columnNames` and `selectedColumns` are defined on those lines.
1612 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Cache those
     // (13 is the length of the "R_rdf_sizeof_" prefix; shorter names are kept without comparing)
1614 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
1615 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
     // NOTE(review): the reservation uses the unfiltered dsColumns.size(), an upper bound on the number of
     // data-source columns that survive the filter above.
1617 columnNames.reserve(definedColumns.size() + dsColumns.size());
     // Delegate to the overload taking an explicit list of column names.
1621 return Cache(selectedColumns);
1622 }
1623
1624 ////////////////////////////////////////////////////////////////////////////
1625 /// \brief Save selected columns in memory.
1626 /// \param[in] columnList columns to be cached in memory.
1627 /// \return a `RDataFrame` that wraps the cached dataset.
1628 ///
1629 /// See the previous overloads for more information.
1630 RInterface<RLoopManager> Cache(std::initializer_list<std::string> columnList)
1631 {
     // Delegates to the Cache overload taking a list of column names.
     // NOTE(review): `selectedColumns` is defined on a line omitted from this listing (1632), presumably
     // built from `columnList`.
1633 return Cache(selectedColumns);
1634 }
1635
1636 // clang-format off
1637 ////////////////////////////////////////////////////////////////////////////
1638 /// \brief Creates a node that filters entries based on range: [begin, end).
1639 /// \param[in] begin Initial entry number considered for this range.
1640 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1641 /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0.
1642 /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries.
1643 ///
1644 /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset.
1645 /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported.
1646 ///
1647 /// ### Example usage:
1648 /// ~~~{.cpp}
1649 /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries
1650 /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards
1651 /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3
1652 /// ~~~
1653 // clang-format on
1654 RInterface<RDFDetail::RRange<Proxied>> Range(unsigned int begin, unsigned int end, unsigned int stride = 1)
1655 {
1656 // check invariants: reject a zero stride, and a non-zero `end` that precedes `begin`
     // (`end == 0` means "until the end of the dataset"; `end == begin` is accepted by this check)
1657 if (stride == 0 || (end != 0 && end < begin))
1658 throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin.");
     // Ranges are not supported with implicit multi-threading enabled.
1659 CheckIMTDisabled("Range");
1660
     // Create the range node on top of the current node.
     // NOTE(review): the construction of `newInterface` is on a line omitted from this listing (1663).
1661 using Range_t = RDFDetail::RRange<Proxied>;
1662 auto rangePtr = std::make_shared<Range_t>(begin, end, stride, fProxiedPtr);
1664 return newInterface;
1665 }
1666
1667 // clang-format off
1668 ////////////////////////////////////////////////////////////////////////////
1669 /// \brief Creates a node that filters entries based on range.
1670 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1671 /// \return a node of the computation graph for which the range is defined.
1672 ///
1673 /// See the other Range overload for a detailed description.
1674 // clang-format on
1675 RInterface<RDFDetail::RRange<Proxied>> Range(unsigned int end) { return Range(/*begin=*/0u, end, /*stride=*/1u); } // first `end` entries, no stride
1676
1677 // clang-format off
1678 ////////////////////////////////////////////////////////////////////////////
1679 /// \brief Execute a user-defined function on each entry (*instant action*).
1680 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
1681 /// \param[in] columns Names of the columns/branches in input to the user function.
1682 ///
1683 /// The callable `f` is invoked once per entry. This is an *instant action*:
1684 /// upon invocation, an event loop as well as execution of all scheduled actions
1685 /// is triggered.
1686 /// Users are responsible for the thread-safety of this callable when executing
1687 /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT).
1688 ///
1689 /// ### Example usage:
1690 /// ~~~{.cpp}
1691 /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"});
1692 /// ~~~
1693 // clang-format on
1694 template <typename F>
1695 void Foreach(F f, const ColumnNames_t &columns = {})
1696 {
1697 using Traits_t = TTraits::CallableTraits<F>; // signature information of the user callable
1698 using Args_t = typename Traits_t::arg_types_nodecay;
1699 ForeachSlot(RDFInternal::AddSlotParameter<typename Traits_t::ret_type>(f, Args_t()), columns); // adapt `f` to the slot-aware interface and run it
1700 }
1701
1702 // clang-format off
1703 ////////////////////////////////////////////////////////////////////////////
1704 /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*).
1705 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
1706 /// \param[in] columns Names of the columns/branches in input to the user function.
1707 ///
1708 /// Same as `Foreach`, but the user-defined function takes an extra
1709 /// `unsigned int` as its first parameter, the *processing slot index*.
1710 /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`,
1711 /// for each thread of execution.
1712 /// This is meant as a helper in writing thread-safe `Foreach`
1713 /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`.
1714 /// The user-defined processing callable is able to follow different
1715 /// *streams of processing* indexed by the first parameter.
1716 /// `ForeachSlot` works just as well with single-thread execution: in that
1717 /// case `slot` will always be `0`.
1718 ///
1719 /// ### Example usage:
1720 /// ~~~{.cpp}
1721 /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"});
1722 /// ~~~
1723 // clang-format on
1724 template <typename F>
1725 void ForeachSlot(F f, const ColumnNames_t &columns = {})
1726 {
     // NOTE(review): several lines of this function are omitted from this listing (1727, 1730-1731, 1734);
     // `ColTypes_t`, `validColumnNames` and `Action_t` are defined on those lines.
1728 constexpr auto nColumns = ColTypes_t::list_size;
1729
1732
1733 using Helper_t = RDFInternal::ForeachSlotHelper<F>;
1735
     // The callable is moved into the helper owned by the action.
1736 auto action = std::make_unique<Action_t>(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fColRegister);
1737
     // Instant action: run the event loop right away.
1738 fLoopManager->Run();
1739 }
1740
1741 // clang-format off
1742 ////////////////////////////////////////////////////////////////////////////
1743 /// \brief Execute a user-defined reduce operation on the values of a column.
1744 /// \tparam F The type of the reduce callable. Automatically deduced.
1745 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1746 /// \param[in] f A callable with signature `T(T,T)`
1747 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
1748 /// \return the reduced quantity wrapped in a ROOT::RDF::RResultPtr.
1749 ///
1750 /// A reduction takes two values of a column and merges them into one (e.g.
1751 /// by summing them, taking the maximum, etc). This action performs the
1752 /// specified reduction operation on all processed column values, returning
1753 /// a single value of the same type. The callable f must satisfy the general
1754 /// requirements of a *processing function* besides having signature `T(T,T)`
1755 /// where `T` is the type of column columnName.
1756 ///
1757 /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a
1758 /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific
1759 /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this
1760 /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce`
1761 /// overload.
1762 ///
1763 /// ### Example usage:
1764 /// ~~~{.cpp}
1765 /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol");
1766 /// ~~~
1767 ///
1768 /// This action is *lazy*: upon invocation of this method the calculation is
1769 /// booked but not executed. Also see RResultPtr.
1770 // clang-format on
1772 RResultPtr<T> Reduce(F f, std::string_view columnName = "")
1773 {
     // NOTE(review): the template header of this overload (1771) is omitted from this listing.
     // Without an explicit identity, T must be default-constructible: fail at compile time otherwise.
1774 static_assert(
1775 std::is_default_constructible<T>::value,
1776 "reduce object cannot be default-constructed. Please provide an initialisation value (redIdentity)");
     // Forward to the three-argument overload, using a value-initialised T as the initial value.
1777 return Reduce(std::move(f), columnName, T());
1778 }
1779
1780 ////////////////////////////////////////////////////////////////////////////
1781 /// \brief Execute a user-defined reduce operation on the values of a column.
1782 /// \tparam F The type of the reduce callable. Automatically deduced.
1783 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1784 /// \param[in] f A callable with signature `T(T,T)`
1785 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
1786 /// \param[in] redIdentity The reduced object of each thread is initialized to this value.
1787 /// \return the reduced quantity wrapped in a RResultPtr.
1788 ///
1789 /// ### Example usage:
1790 /// ~~~{.cpp}
1791 /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42);
1792 /// ~~~
1793 /// See the description of the first Reduce overload for more information.
1795 RResultPtr<T> Reduce(F f, std::string_view columnName, const T &redIdentity)
1796 {
     // NOTE(review): the template header of this overload (1794) is omitted from this listing.
     // The same callable is passed as both of Aggregate's callables -- presumably the per-entry aggregation
     // and the merging of partial results; confirm against Aggregate's signature.
1797 return Aggregate(f, f, columnName, redIdentity);
1798 }
1799
1800 ////////////////////////////////////////////////////////////////////////////
1801 /// \brief Return the number of entries processed (*lazy action*).
1802 /// \return the number of entries wrapped in a RResultPtr.
1803 ///
1804 /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`).
1805 /// This action is *lazy*: upon invocation of this method the calculation is
1806 /// booked but not executed. Also see RResultPtr.
1807 ///
1808 /// ### Example usage:
1809 /// ~~~{.cpp}
1810 /// auto nEntriesAfterCuts = myFilteredDf.Count();
1811 /// ~~~
1812 ///
1814 {
     // NOTE(review): the signature (1813), the `Action_t` alias (1818) and the end of the `action`
     // construction (1820) are omitted from this listing.
1815 const auto nSlots = fLoopManager->GetNSlots();
1816 auto cSPtr = std::make_shared<ULong64_t>(0); // shared result, initialised to 0
1817 using Helper_t = RDFInternal::CountHelper;
1819 auto action = std::make_unique<Action_t>(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr,
1821 return MakeResultPtr(cSPtr, *fLoopManager, std::move(action));
1822 }
1823
1824 ////////////////////////////////////////////////////////////////////////////
1825 /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default).
1826 /// \tparam T The type of the column.
1827 /// \tparam COLL The type of collection used to store the values.
1828 /// \param[in] column The name of the column to collect the values of.
1829 /// \return the content of the selected column wrapped in a RResultPtr.
1830 ///
1831 /// The collection type to be specified for C-style array columns is `RVec<T>`:
1832 /// in this case the returned collection is a `std::vector<RVec<T>>`.
1833 /// ### Example usage:
1834 /// ~~~{.cpp}
1835 /// // In this case intCol is a std::vector<int>
1836 /// auto intCol = rdf.Take<int>("integerColumn");
1837 /// // Same content as above but in this case taken as a RVec<int>
1838 /// auto intColAsRVec = rdf.Take<int, RVec<int>>("integerColumn");
1839 /// // In this case intCol is a std::vector<RVec<int>>, a collection of collections
1840 /// auto cArrayIntCol = rdf.Take<RVec<int>>("cArrayInt");
1841 /// ~~~
1842 /// This action is *lazy*: upon invocation of this method the calculation is
1843 /// booked but not executed. Also see RResultPtr.
1844 template <typename T, typename COLL = std::vector<T>>
1845 RResultPtr<COLL> Take(std::string_view column = "")
1846 {
     // An empty `column` yields an empty column list; otherwise a one-element list with that name.
1847 const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)});
1848
     // NOTE(review): lines 1849-1850 and 1853 are omitted from this listing; `validColumnNames` and
     // `Action_t` are defined there.
1851
1852 using Helper_t = RDFInternal::TakeHelper<T, T, COLL>;
     // The collection that is returned (wrapped in the RResultPtr) is created empty here.
1854 auto valuesPtr = std::make_shared<COLL>();
1855 const auto nSlots = fLoopManager->GetNSlots();
1856
1857 auto action =
1858 std::make_unique<Action_t>(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fColRegister);
1859 return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action));
1860 }
1861
1862 ////////////////////////////////////////////////////////////////////////////
1863 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1864 /// \tparam V The type of the column used to fill the histogram.
1865 /// \param[in] model The returned histogram will be constructed using this as a model.
1866 /// \param[in] vName The name of the column that will fill the histogram.
1867 /// \return the monodimensional histogram wrapped in a RResultPtr.
1868 ///
1869 /// Columns can be of a container type (e.g. `std::vector<double>`), in which case the histogram
1870 /// is filled with each one of the elements of the container. In case multiple columns of container type
1871 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1872 /// possibly different lengths between events).
1873 /// This action is *lazy*: upon invocation of this method the calculation is
1874 /// booked but not executed. Also see RResultPtr.
1875 ///
1876 /// ### Example usage:
1877 /// ~~~{.cpp}
1878 /// // Deduce column type (this invocation needs jitting internally)
1879 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1880 /// // Explicit column type
1881 /// auto myHist2 = myDf.Histo1D<float>({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1882 /// ~~~
1883 ///
1884 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1885 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1886 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1887 template <typename V = RDFDetail::RInferredType>
1888 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "")
1889 {
     // An empty `vName` yields an empty column list; otherwise a one-element list with that name.
1890 const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)});
1891
     // NOTE(review): lines 1892 and 1902 (including the return statement) are omitted from this listing.
1893
1894 std::shared_ptr<::TH1D> h(nullptr);
1895 {
     // The RAII object lives only for this scope, i.e. while the histogram is built from the model --
     // presumably to silence diagnostics below kError; TODO confirm.
1896 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1897 h = model.GetHistogram();
1898 }
1899
     // Equal x-axis limits (as in the default model, 0. and 0.) mean no range was given: allow the axes
     // to be extended.
1900 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
1901 h->SetCanExtend(::TH1::kAllAxes);
1903 }
1904
1905 ////////////////////////////////////////////////////////////////////////////
1906 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1907 /// \tparam V The type of the column used to fill the histogram.
1908 /// \param[in] vName The name of the column that will fill the histogram.
1909 /// \return the monodimensional histogram wrapped in a RResultPtr.
1910 ///
1911 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1912 /// The "name" and "title" strings are built starting from the input column name.
1913 /// See the description of the first Histo1D() overload for more details.
1914 ///
1915 /// ### Example usage:
1916 /// ~~~{.cpp}
1917 /// // Deduce column type (this invocation needs jitting internally)
1918 /// auto myHist1 = myDf.Histo1D("myColumn");
1919 /// // Explicit column type
1920 /// auto myHist2 = myDf.Histo1D<float>("myColumn");
1921 /// ~~~
1922 template <typename V = RDFDetail::RInferredType>
1924 {
     // NOTE(review): the signature of this overload (1923) is omitted from this listing.
     // The histogram is named after the column; the title string is "<column>;<column>;count".
1925 const auto h_name = std::string(vName);
1926 const auto h_title = h_name + ";" + h_name + ";count";
     // Forward to the model-based overload with 128 bins and equal (0., 0.) x-axis limits.
1927 return Histo1D<V>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName);
1928 }
1929
1930 ////////////////////////////////////////////////////////////////////////////
1931 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1932 /// \tparam V The type of the column used to fill the histogram.
1933 /// \tparam W The type of the column used as weights.
1934 /// \param[in] model The returned histogram will be constructed using this as a model.
1935 /// \param[in] vName The name of the column that will fill the histogram.
1936 /// \param[in] wName The name of the column that will provide the weights.
1937 /// \return the monodimensional histogram wrapped in a RResultPtr.
1938 ///
1939 /// See the description of the first Histo1D() overload for more details.
1940 ///
1941 /// ### Example usage:
1942 /// ~~~{.cpp}
1943 /// // Deduce column type (this invocation needs jitting internally)
1944 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
1945 /// // Explicit column type
1946 /// auto myHist2 = myDf.Histo1D<float, int>({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
1947 /// ~~~
1948 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1949 RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
1950 {
     // NOTE(review): lines 1952, 1954 and 1963 (including the return statement) are omitted from this
     // listing, so the expression around the `? ColumnNames_t()` fragment below is incomplete.
1951 const std::vector<std::string_view> columnViews = {vName, wName};
1953 ? ColumnNames_t()
1955 std::shared_ptr<::TH1D> h(nullptr);
1956 {
     // The RAII object lives only for this scope, i.e. while the histogram is built from the model --
     // presumably to silence diagnostics below kError; TODO confirm.
1957 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1958 h = model.GetHistogram();
1959 }
1960
     // Equal x-axis limits mean no range was given: allow the axes to be extended.
1961 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
1962 h->SetCanExtend(::TH1::kAllAxes);
1964 }
1965
1966 ////////////////////////////////////////////////////////////////////////////
1967 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1968 /// \tparam V The type of the column used to fill the histogram.
1969 /// \tparam W The type of the column used as weights.
1970 /// \param[in] vName The name of the column that will fill the histogram.
1971 /// \param[in] wName The name of the column that will provide the weights.
1972 /// \return the monodimensional histogram wrapped in a RResultPtr.
1973 ///
1974 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1975 /// The "name" and "title" strings are built starting from the input column names.
1976 /// See the description of the first Histo1D() overload for more details.
1977 ///
1978 /// ### Example usage:
1979 /// ~~~{.cpp}
1980 /// // Deduce column types (this invocation needs jitting internally)
1981 /// auto myHist1 = myDf.Histo1D("myValue", "myweight");
1982 /// // Explicit column types
1983 /// auto myHist2 = myDf.Histo1D<float, int>("myValue", "myweight");
1984 /// ~~~
1985 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1986 RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName)
1987 {
1988 // Derive the histogram name and title from the value and weight column names
1989 const std::string valueCol(vName);
1990 const std::string weightCol(wName);
1991 const std::string histName = valueCol + "_weighted_" + weightCol;
1992 const std::string histTitle = valueCol + ", weights: " + weightCol + ";" + valueCol + ";count * " + weightCol;
1993 return Histo1D<V, W>({histName.c_str(), histTitle.c_str(), 128u, 0., 0.}, vName, wName); // default model: 128 bins, equal limits
1994 }
1995
1996 ////////////////////////////////////////////////////////////////////////////
1997 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1998 /// \tparam V The type of the column used to fill the histogram.
1999 /// \tparam W The type of the column used as weights.
2000 /// \param[in] model The returned histogram will be constructed using this as a model.
2001 /// \return the monodimensional histogram wrapped in a RResultPtr.
2002 ///
2003 /// This overload will use the first two default columns as column names.
2004 /// See the description of the first Histo1D() overload for more details.
2005 template <typename V, typename W>
2006 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.})
2007 {
2008 return Histo1D<V, W>(model, "", "");
2009 }
2010
2011 ////////////////////////////////////////////////////////////////////////////
2012 /// \brief Fill and return a two-dimensional histogram (*lazy action*).
2013 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
2014 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
2015 /// \param[in] model The returned histogram will be constructed using this as a model.
2016 /// \param[in] v1Name The name of the column that will fill the x axis.
2017 /// \param[in] v2Name The name of the column that will fill the y axis.
2018 /// \return the bidimensional histogram wrapped in a RResultPtr.
2019 ///
2020 /// Columns can be of a container type (e.g. std::vector<double>), in which case the histogram
2021 /// is filled with each one of the elements of the container. In case multiple columns of container type
2022 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
2023 /// possibly different lengths between events).
2024 /// This action is *lazy*: upon invocation of this method the calculation is
2025 /// booked but not executed. Also see RResultPtr.
2026 ///
2027 /// ### Example usage:
2028 /// ~~~{.cpp}
2029 /// // Deduce column types (this invocation needs jitting internally)
2030 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
2031 /// // Explicit column types
2032 /// auto myHist2 = myDf.Histo2D<float, float>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
2033 /// ~~~
2034 ///
2035 ///
2036 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
2037 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2038 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
// Two-dimensional histogram booking: obtains the histogram described by `model`, rejects it with
// std::runtime_error when HasAxisLimits() reports false, then gathers the two column names.
// NOTE(review): the embedded listing numbers jump 2050 -> 2052 -> 2055. The head and the ':' branch of the
// conditional expression and the return statement are not visible in this listing; recover them from the
// original header before changing the logic.
2039 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
2040 RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
2041 {
2042 std::shared_ptr<::TH2D> h(nullptr);
2043 {
// `iel` only lives for this inner scope, i.e. for the duration of the GetHistogram() call.
// NOTE(review): presumably it mutes diagnostics below kError while the histogram is created -- confirm.
2044 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2045 h = model.GetHistogram();
2046 }
// Histograms without axis limits are refused up front.
2047 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
2048 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
2049 }
2050 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
// Only the branch that yields an empty ColumnNames_t is visible (see the note at the top of the block).
2052 ? ColumnNames_t()
2055 }
2056
2057 ////////////////////////////////////////////////////////////////////////////
2058 /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*).
2059 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
2060 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
2061 /// \tparam W The type of the column used for the weights of the histogram.
2062 /// \param[in] model The returned histogram will be constructed using this as a model.
2063 /// \param[in] v1Name The name of the column that will fill the x axis.
2064 /// \param[in] v2Name The name of the column that will fill the y axis.
2065 /// \param[in] wName The name of the column that will provide the weights.
2066 /// \return the bidimensional histogram wrapped in a RResultPtr.
2067 ///
2068 /// This action is *lazy*: upon invocation of this method the calculation is
2069 /// booked but not executed. Also see RResultPtr.
2070 ///
2071 /// ### Example usage:
2072 /// ~~~{.cpp}
2073 /// // Deduce column types (this invocation needs jitting internally)
2074 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
2075 /// // Explicit column types
2076 /// auto myHist2 = myDf.Histo2D<float, float, double>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
2077 /// ~~~
2078 ///
2079 /// See the documentation of the first Histo2D() overload for more details.
// Weighted two-dimensional histogram booking: obtains the histogram described by `model`, rejects it with
// std::runtime_error when HasAxisLimits() reports false, then gathers the x, y and weight column names.
// NOTE(review): the embedded listing numbers skip 2082, 2094, 2096 and 2097. The return type line, the head
// and the ':' branch of the conditional expression and the return statement are not visible in this listing;
// recover them from the original header before changing the logic.
2080 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2081 typename W = RDFDetail::RInferredType>
2083 Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
2084 {
2085 std::shared_ptr<::TH2D> h(nullptr);
2086 {
// `iel` only lives for this inner scope, i.e. for the duration of the GetHistogram() call.
// NOTE(review): presumably it mutes diagnostics below kError while the histogram is created -- confirm.
2087 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2088 h = model.GetHistogram();
2089 }
// Histograms without axis limits are refused up front.
2090 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
2091 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
2092 }
2093 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
// Only the branch that yields an empty ColumnNames_t is visible (see the note at the top of the block).
2095 ? ColumnNames_t()
2098 }
2099
// Fully typed convenience forwarder: passes `model` plus three empty column names to the
// Histo2D<V1, V2, W> overload that takes (model, v1Name, v2Name, wName).
// NOTE(review): listing number 2101 is missing between the template header and the opening brace, so the
// declarator (return type, name and parameter list) is not visible in this listing.
2100 template <typename V1, typename V2, typename W>
2102 {
2103 return Histo2D<V1, V2, W>(model, "", "", "");
2104 }
2105
2106 ////////////////////////////////////////////////////////////////////////////
2107 /// \brief Fill and return a three-dimensional histogram (*lazy action*).
2108 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2109 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2110 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2111 /// \param[in] model The returned histogram will be constructed using this as a model.
2112 /// \param[in] v1Name The name of the column that will fill the x axis.
2113 /// \param[in] v2Name The name of the column that will fill the y axis.
2114 /// \param[in] v3Name The name of the column that will fill the z axis.
2115 /// \return the tridimensional histogram wrapped in a RResultPtr.
2116 ///
2117 /// This action is *lazy*: upon invocation of this method the calculation is
2118 /// booked but not executed. Also see RResultPtr.
2119 ///
2120 /// ### Example usage:
2121 /// ~~~{.cpp}
2122 /// // Deduce column types (this invocation needs jitting internally)
2123 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2124 /// "myValueX", "myValueY", "myValueZ");
2125 /// // Explicit column types
2126 /// auto myHist2 = myDf.Histo3D<double, double, float>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2127 /// "myValueX", "myValueY", "myValueZ");
2128 /// ~~~
2129 /// \note If three-dimensional histograms consume too much memory in multithreaded runs, the cloning of TH3D
2130 /// per thread can be reduced using ROOT::RDF::Experimental::ThreadsPerTH3(). See the section "Memory Usage" in
2131 /// the RDataFrame description.
2132 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
2133 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2134 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
// Three-dimensional histogram booking: obtains the histogram described by `model`, rejects it with
// std::runtime_error when HasAxisLimits() reports false, then gathers the three column names.
// NOTE(review): the embedded listing numbers jump 2148 -> 2150 -> 2153. The head and the ':' branch of the
// conditional expression and the return statement are not visible in this listing; recover them from the
// original header before changing the logic.
2135 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2136 typename V3 = RDFDetail::RInferredType>
2137 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "",
2138 std::string_view v3Name = "")
2139 {
2140 std::shared_ptr<::TH3D> h(nullptr);
2141 {
// `iel` only lives for this inner scope, i.e. for the duration of the GetHistogram() call.
// NOTE(review): presumably it mutes diagnostics below kError while the histogram is created -- confirm.
2142 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2143 h = model.GetHistogram();
2144 }
// Histograms without axis limits are refused up front.
2145 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
2146 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
2147 }
2148 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
// Only the branch that yields an empty ColumnNames_t is visible (see the note at the top of the block).
2150 ? ColumnNames_t()
2153 }
2154
2155 ////////////////////////////////////////////////////////////////////////////
2156 /// \brief Fill and return a weighted three-dimensional histogram (*lazy action*).
2157 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2158 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2159 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2160 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
2161 /// \param[in] model The returned histogram will be constructed using this as a model.
2162 /// \param[in] v1Name The name of the column that will fill the x axis.
2163 /// \param[in] v2Name The name of the column that will fill the y axis.
2164 /// \param[in] v3Name The name of the column that will fill the z axis.
2165 /// \param[in] wName The name of the column that will provide the weights.
2166 /// \return the tridimensional histogram wrapped in a RResultPtr.
2167 ///
2168 /// This action is *lazy*: upon invocation of this method the calculation is
2169 /// booked but not executed. Also see RResultPtr.
2170 ///
2171 /// ### Example usage:
2172 /// ~~~{.cpp}
2173 /// // Deduce column types (this invocation needs jitting internally)
2174 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2175 /// "myValueX", "myValueY", "myValueZ", "myWeight");
2176 /// // Explicit column types
2177 /// using d_t = double;
2178 /// auto myHist2 = myDf.Histo3D<d_t, d_t, float, d_t>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2179 /// "myValueX", "myValueY", "myValueZ", "myWeight");
2180 /// ~~~
2181 ///
2182 ///
2183 /// See the documentation of the first Histo3D() overload for more details.
// Weighted three-dimensional histogram booking: obtains the histogram described by `model`, rejects it with
// std::runtime_error when HasAxisLimits() reports false, then gathers the x, y, z and weight column names.
// NOTE(review): the embedded listing numbers jump 2197 -> 2199 -> 2202. The head and the ':' branch of the
// conditional expression and the return statement are not visible in this listing; recover them from the
// original header before changing the logic.
2184 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2185 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2186 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name,
2187 std::string_view v3Name, std::string_view wName)
2188 {
2189 std::shared_ptr<::TH3D> h(nullptr);
2190 {
// `iel` only lives for this inner scope, i.e. for the duration of the GetHistogram() call.
// NOTE(review): presumably it mutes diagnostics below kError while the histogram is created -- confirm.
2191 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2192 h = model.GetHistogram();
2193 }
// Histograms without axis limits are refused up front.
2194 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
2195 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
2196 }
2197 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
// Only the branch that yields an empty ColumnNames_t is visible (see the note at the top of the block).
2199 ? ColumnNames_t()
2202 }
2203
// Fully typed convenience forwarder: passes `model` plus four empty column names to the
// Histo3D<V1, V2, V3, W> overload that takes (model, v1Name, v2Name, v3Name, wName).
// NOTE(review): listing number 2205 is missing between the template header and the opening brace, so the
// declarator (return type, name and parameter list) is not visible in this listing.
2204 template <typename V1, typename V2, typename V3, typename W>
2206 {
2207 return Histo3D<V1, V2, V3, W>(model, "", "", "", "");
2208 }
2209
2210 ////////////////////////////////////////////////////////////////////////////
2211 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
2212 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
2213 /// present.
2214 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
2215 /// object.
2216 /// \param[in] model The returned histogram will be constructed using this as a model.
2217 /// \param[in] columnList
2218 /// A list containing the names of the columns that will be passed when calling `Fill`.
2219 /// \param[in] wName The name of the column that will provide the weights.
2220 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2221 ///
2222 /// This action is *lazy*: upon invocation of this method the calculation is
2223 /// booked but not executed. See RResultPtr documentation.
2224 ///
2225 /// ### Example usage:
2226 /// ~~~{.cpp}
2227 /// auto myFilledObj = myDf.HistoND<float, float, float, float>({"name","title", 4,
2228 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2229 /// {"col0", "col1", "col2", "col3"});
2230 /// ~~~
2231 ///
2232 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2233 /// argument `wName`: `HistoND(model, cols, weightCol)`.
2234 ///
// Typed N-dimensional histogram booking. Validates the number of entries in `columnList` against the
// dimensionality of the histogram obtained from `model`, then books the action either with `columnList`
// as given (no weight column) or with `wName` appended as an extra, last column.
// Throws std::invalid_argument when a weight column is given both through `wName` and as an extra entry of
// `columnList`, or when the number of columns is neither hDims nor hDims + 1.
// NOTE(review): listing number 2264 is missing, so the declaration of `userColumns` used in the weighted
// branch is not visible in this listing.
2235 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
2236 RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2237 {
2238 std::shared_ptr<::THnD> h(nullptr);
2239 {
// `iel` stays alive until the end of this inner scope, so it also covers the checks and the Warning below.
// NOTE(review): if RIgnoreErrorLevelRAII(kError) hides messages below kError, the deprecation Warning in
// this scope would never be displayed -- confirm the intended behaviour.
2240 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2241 h = model.GetHistogram();
2242 const auto hDims = h->GetNdimensions();
// nCols takes the (const) type of hDims; the container size is converted to that type.
2243 decltype(hDims) nCols = columnList.size();
2244
// A weight column given twice (explicit `wName` plus one extra entry in the list) is rejected.
2245 if (!wName.empty() && nCols == hDims + 1)
2246 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2247 "input columns contains one column more than the number of dimensions of the "
2248 "histogram. Call as 'HistoND(model, cols, weightCol)'.");
2249
// One extra column in the list is still accepted, but flagged as deprecated.
2250 if (nCols == hDims + 1)
2251 Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. "
2252 "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'.");
2253
// Sumw2() is called whenever a weight column is present in either form.
2254 if (!wName.empty() || nCols == hDims + 1)
2255 h->Sumw2();
2256
// Any column count other than hDims or hDims + 1 is an error.
2257 if (nCols != hDims + 1 && nCols != hDims)
2258 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2259 }
2260
2261 if (!wName.empty()) {
2262 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2263 // passed arguments is one more than the number of dimensions of the histogram.
2265 userColumns.push_back(std::string{wName});
2266 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(userColumns, h, h,
2267 fProxiedPtr);
2268 }
// No explicit weight column: the list is passed on unchanged.
2269 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(columnList, h, h,
2270 fProxiedPtr);
2271 }
2272
2273 ////////////////////////////////////////////////////////////////////////////
2274 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
2275 /// \param[in] model The returned histogram will be constructed using this as a model.
2276 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2277 /// \param[in] wName The name of the column that will provide the weights.
2278 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2279 ///
2280 /// This action is *lazy*: upon invocation of this method the calculation is
2281 /// booked but not executed. Also see RResultPtr.
2282 ///
2283 /// ### Example usage:
2284 /// ~~~{.cpp}
2285 /// auto myFilledObj = myDf.HistoND({"name","title", 4,
2286 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2287 /// {"col0", "col1", "col2", "col3"});
2288 /// ~~~
2289 ///
2290 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2291 /// argument `wName`: `HistoND(model, cols, weightCol)`.
2292 ///
// Untyped N-dimensional histogram booking. Validates the number of entries in `columnList` against the
// dimensionality of the histogram obtained from `model`; the weighted branch appends `wName` to
// `userColumns` before booking.
// Throws std::invalid_argument when a weight column is given both through `wName` and as an extra entry of
// `columnList`, or when the number of columns is neither hDims nor hDims + 1.
// NOTE(review): listing numbers 2321, 2323 and 2326 are missing. The declaration of `userColumns` and the
// first lines of both return statements are not visible in this listing; only their trailing arguments
// (`userColumns.size()` / `columnList.size()`) are.
2293 RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2294 {
2295 std::shared_ptr<::THnD> h(nullptr);
2296 {
// `iel` stays alive until the end of this inner scope, so it also covers the checks and the Warning below.
// NOTE(review): if RIgnoreErrorLevelRAII(kError) hides messages below kError, the deprecation Warning in
// this scope would never be displayed -- confirm the intended behaviour.
2297 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2298 h = model.GetHistogram();
2299 const auto hDims = h->GetNdimensions();
// nCols takes the (const) type of hDims; the container size is converted to that type.
2300 decltype(hDims) nCols = columnList.size();
2301
// A weight column given twice (explicit `wName` plus one extra entry in the list) is rejected.
2302 if (!wName.empty() && nCols == hDims + 1)
2303 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2304 "input columns contains one column more than the number of dimensions of the "
2305 "histogram. Call as 'HistoND(model, cols, weightCol)'.");
2306
// One extra column in the list is still accepted, but flagged as deprecated.
2307 if (nCols == hDims + 1)
2308 Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. "
2309 "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'.");
2310
// Sumw2() is called whenever a weight column is present in either form.
2311 if (!wName.empty() || nCols == hDims + 1)
2312 h->Sumw2();
2313
// Any column count other than hDims or hDims + 1 is an error.
2314 if (nCols != hDims + 1 && nCols != hDims)
2315 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2316 }
2317
2318 if (!wName.empty()) {
2319 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2320 // passed arguments is one more than the number of dimensions of the histogram.
2322 userColumns.push_back(std::string{wName});
2324 userColumns.size());
2325 }
2327 columnList.size());
2328 }
2329
2330 ////////////////////////////////////////////////////////////////////////////
2331 /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*).
2332 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
2333 /// present.
2334 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
2335 /// object.
2336 /// \param[in] model The returned histogram will be constructed using this as a model.
2337 /// \param[in] columnList
2338 /// A list containing the names of the columns that will be passed when calling `Fill`.
2339 /// \param[in] wName The name of the column that will provide the weights.
2340 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2341 ///
2342 /// This action is *lazy*: upon invocation of this method the calculation is
2343 /// booked but not executed. See RResultPtr documentation.
2344 ///
2345 /// ### Example usage:
2346 /// ~~~{.cpp}
2347 /// auto myFilledObj = myDf.HistoNSparseD<float, float, float, float>({"name","title", 4,
2348 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2349 /// {"col0", "col1", "col2", "col3"});
2350 /// ~~~
2351 ///
2352 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2353 /// argument `wName`: `HistoNSparseD(model, cols, weightCol)`.
2354 ///
// Typed sparse N-dimensional histogram booking. Validates the number of entries in `columnList` against
// the dimensionality of the histogram obtained from `model`, then books the action either with
// `columnList` as given (no weight column) or with `wName` appended as an extra, last column.
// Throws std::invalid_argument when a weight column is given both through `wName` and as an extra entry of
// `columnList`, or when the number of columns is neither hDims nor hDims + 1.
// NOTE(review): listing numbers 2356 and 2386 are missing. The return type line and the declaration of
// `userColumns` used in the weighted branch are not visible in this listing.
2355 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
2357 HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2358 {
2359 std::shared_ptr<::THnSparseD> h(nullptr);
2360 {
// `iel` stays alive until the end of this inner scope, so it also covers the checks and the Warning below.
// NOTE(review): if RIgnoreErrorLevelRAII(kError) hides messages below kError, the deprecation Warning in
// this scope would never be displayed -- confirm the intended behaviour.
2361 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2362 h = model.GetHistogram();
2363 const auto hDims = h->GetNdimensions();
// nCols takes the (const) type of hDims; the container size is converted to that type.
2364 decltype(hDims) nCols = columnList.size();
2365
// A weight column given twice (explicit `wName` plus one extra entry in the list) is rejected.
2366 if (!wName.empty() && nCols == hDims + 1)
2367 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2368 "input columns contains one column more than the number of dimensions of the "
2369 "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'.");
2370
// One extra column in the list is still accepted, but flagged as deprecated.
2371 if (nCols == hDims + 1)
2372 Warning("HistoNSparseD",
2373 "Passing the column with the weights as the last column in the list is deprecated. "
2374 "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'.");
2375
// Sumw2() is called whenever a weight column is present in either form.
2376 if (!wName.empty() || nCols == hDims + 1)
2377 h->Sumw2();
2378
// Any column count other than hDims or hDims + 1 is an error.
2379 if (nCols != hDims + 1 && nCols != hDims)
2380 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2381 }
2382
2383 if (!wName.empty()) {
2384 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2385 // passed arguments is one more than the number of dimensions of the histogram.
2387 userColumns.push_back(std::string{wName});
2388 return CreateAction<RDFInternal::ActionTags::HistoNSparseD, FirstColumn, OtherColumns...>(userColumns, h, h,
2389 fProxiedPtr);
2390 }
// No explicit weight column: the list is passed on unchanged.
2391 return CreateAction<RDFInternal::ActionTags::HistoNSparseD, FirstColumn, OtherColumns...>(columnList, h, h,
2392 fProxiedPtr);
2393 }
2394
2395 ////////////////////////////////////////////////////////////////////////////
2396 /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*).
2397 /// \param[in] model The returned histogram will be constructed using this as a model.
2398 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2399 /// \param[in] wName The name of the column that will provide the weights.
2400 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2401 ///
2402 /// This action is *lazy*: upon invocation of this method the calculation is
2403 /// booked but not executed. Also see RResultPtr.
2404 ///
2405 /// ### Example usage:
2406 /// ~~~{.cpp}
2407 /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4,
2408 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2409 /// {"col0", "col1", "col2", "col3"});
2410 /// ~~~
2411 ///
2412 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2413 /// argument `wName`: `HistoNSparseD(model, cols, weightCol)`.
2414 ///
// Untyped sparse N-dimensional histogram booking. Validates the number of entries in `columnList` against
// the dimensionality of the histogram obtained from `model`; the weighted branch appends `wName` to
// `userColumns` before booking.
// Throws std::invalid_argument when a weight column is given both through `wName` and as an extra entry of
// `columnList`, or when the number of columns is neither hDims nor hDims + 1.
// NOTE(review): listing numbers 2415, 2445, 2447, 2448 and 2450 are missing. The return type line, the
// declaration of `userColumns`, the whole return statement of the weighted branch and the first line of the
// final return statement are not visible in this listing.
2416 HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2417 {
2418 std::shared_ptr<::THnSparseD> h(nullptr);
2419 {
// `iel` stays alive until the end of this inner scope, so it also covers the checks and the Warning below.
// NOTE(review): if RIgnoreErrorLevelRAII(kError) hides messages below kError, the deprecation Warning in
// this scope would never be displayed -- confirm the intended behaviour.
2420 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2421 h = model.GetHistogram();
2422 const auto hDims = h->GetNdimensions();
// nCols takes the (const) type of hDims; the container size is converted to that type.
2423 decltype(hDims) nCols = columnList.size();
2424
// A weight column given twice (explicit `wName` plus one extra entry in the list) is rejected.
2425 if (!wName.empty() && nCols == hDims + 1)
2426 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2427 "input columns contains one column more than the number of dimensions of the "
2428 "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'.");
2429
// One extra column in the list is still accepted, but flagged as deprecated.
2430 if (nCols == hDims + 1)
2431 Warning("HistoNSparseD",
2432 "Passing the column with the weights as the last column in the list is deprecated. "
2433 "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'.");
2434
// Sumw2() is called whenever a weight column is present in either form.
2435 if (!wName.empty() || nCols == hDims + 1)
2436 h->Sumw2();
2437
// Any column count other than hDims or hDims + 1 is an error.
2438 if (nCols != hDims + 1 && nCols != hDims)
2439 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2440 }
2441
2442 if (!wName.empty()) {
2443 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2444 // passed arguments is one more than the number of dimensions of the histogram.
2446 userColumns.push_back(std::string{wName});
2449 }
2451 columnList, h, h, fProxiedPtr, columnList.size());
2452 }
2453
2454#ifdef R__HAS_ROOT7
2455 ////////////////////////////////////////////////////////////////////////////
2456 /// \brief Fill and return a one-dimensional RHist (*lazy action*).
2457 /// \tparam BinContentType The bin content type of the returned RHist.
2458 /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins.
2459 /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive).
2460 /// \param[in] vName The name of the column that will fill the histogram.
2461 /// \return the histogram wrapped in a RResultPtr.
2462 ///
2463 /// This action is *lazy*: upon invocation of this method the calculation is
2464 /// booked but not executed. Also see RResultPtr.
2465 ///
2466 /// ### Example usage:
2467 /// ~~~{.cpp}
2468 /// auto myHist = myDf.Hist(10, {5, 15}, "col0");
2469 /// ~~~
// One-dimensional RHist convenience overload: creates a shared RHist<BinContentType> from `nNormalBins`
// and `interval`, wraps `vName` in a one-element column list and forwards both to Hist<V>(h, columnList).
// NOTE(review): listing number 2471 is missing between the template header and the declarator, so the
// return type line is not visible in this listing.
2470 template <typename BinContentType = double, typename V = RDFDetail::RInferredType>
2472 Hist(std::uint64_t nNormalBins, std::pair<double, double> interval, std::string_view vName)
2473 {
// The element type of the shared_ptr is deduced from the make_shared call.
2474 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(nNormalBins, interval);
2475
2476 const ColumnNames_t columnList = {std::string(vName)};
2477
2478 return Hist<V>(h, columnList);
2479 }
2480
2481 ////////////////////////////////////////////////////////////////////////////
2482 /// \brief Fill and return an RHist (*lazy action*).
2483 /// \tparam BinContentType The bin content type of the returned RHist.
2484 /// \param[in] axes The returned histogram will be constructed using these axes.
2485 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2486 /// \return the histogram wrapped in a RResultPtr.
2487 ///
2488 /// This action is *lazy*: upon invocation of this method the calculation is
2489 /// booked but not executed. Also see RResultPtr.
2490 ///
2491 /// ### Example usage:
2492 /// ~~~{.cpp}
2493 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2494 /// auto myHist = myDf.Hist({axis}, {"col0"});
2495 /// ~~~
// RHist overload taking only the bin content type as a template argument.
// NOTE(review): listing numbers 2497 and 2502 are missing. The return type line and the only statement of
// the body are not visible in this listing, so what this overload forwards to cannot be told from here.
2496 template <typename BinContentType = double>
2498 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList)
2499 {
2500 // Note: this overload works around limitations for automatic pythonization of variadic function templates with
2501 // template parameter packs.
2503 }
2504
2505 ////////////////////////////////////////////////////////////////////////////
2506 /// \brief Fill and return an RHist (*lazy action*).
2507 /// \tparam BinContentType The bin content type of the returned RHist.
2508 /// \param[in] axes The returned histogram will be constructed using these axes.
2509 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2510 /// \return the histogram wrapped in a RResultPtr.
2511 ///
2512 /// This action is *lazy*: upon invocation of this method the calculation is
2513 /// booked but not executed. Also see RResultPtr.
2514 ///
2515 /// ### Example usage:
2516 /// ~~~{.cpp}
2517 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2518 /// auto myHist = myDf.Hist<double, double>({axis}, {"col0"});
2519 /// ~~~
// RHist overload with explicit column types: checks that one column is given per axis, creates a shared
// RHist<BinContentType> from `axes` and forwards it to Hist<ColumnType, ColumnTypes...>(h, columnList).
// Throws std::invalid_argument when axes.size() differs from columnList.size().
// NOTE(review): listing number 2521 is missing between the template header and the declarator, so the
// return type line is not visible in this listing.
2520 template <typename BinContentType, typename ColumnType, typename... ColumnTypes>
2522 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList)
2523 {
// The message reports both the expected (number of axes) and the received (number of columns) count.
2524 if (axes.size() != columnList.size()) {
2525 std::string msg = "Wrong number of columns for the specified number of histogram axes: ";
2526 msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size());
2527 throw std::invalid_argument(msg);
2528 }
2529
// `axes` was taken by value and is moved into the histogram; it must not be used after this line.
2530 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(std::move(axes));
2531
2532 return Hist<ColumnType, ColumnTypes...>(h, columnList);
2533 }
2534
2535 ////////////////////////////////////////////////////////////////////////////
2536 /// \brief Fill the provided RHist (*lazy action*).
2537 /// \param[in] h The histogram that should be filled.
2538 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2539 /// \return the histogram wrapped in a RResultPtr.
2540 ///
2541 /// This action is *lazy*: upon invocation of this method the calculation is
2542 /// booked but not executed. Also see RResultPtr.
2543 ///
2544 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2545 /// allowed during concurrent filling.
2546 ///
2547 /// ### Example usage:
2548 /// ~~~{.cpp}
2549 /// auto h = std::make_shared<ROOT::Experimental::RHist<double>>(10, {5.0, 15.0});
2550 /// auto myHist = myDf.Hist(h, {"col0"});
2551 /// ~~~
// Books the filling of a caller-provided RHist: checks that the number of columns equals
// h->GetNDimensions() and then creates the Hist action, passing columnList.size() as last argument.
// Throws std::invalid_argument when the two counts differ.
// NOTE(review): listing numbers 2553 and 2556 are missing. The return type line and the first statement of
// the body are not visible in this listing.
2552 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
2554 Hist(std::shared_ptr<ROOT::Experimental::RHist<BinContentType>> h, const ColumnNames_t &columnList)
2555 {
2557
// The message reports both the expected (histogram dimensions) and the received (number of columns) count.
2558 if (h->GetNDimensions() != columnList.size()) {
2559 std::string msg = "Wrong number of columns for the passed histogram: ";
2560 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2561 throw std::invalid_argument(msg);
2562 }
2563
2564 return CreateAction<RDFInternal::ActionTags::Hist, ColumnType, ColumnTypes...>(columnList, h, h, fProxiedPtr,
2565 columnList.size());
2566 }
2567
2568 ////////////////////////////////////////////////////////////////////////////
2569 /// \brief Fill and return a one-dimensional RHist with weights (*lazy action*).
2570 /// \tparam BinContentType The bin content type of the returned RHist.
2571 /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins.
2572 /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive).
2573 /// \param[in] vName The name of the column that will fill the histogram.
2574 /// \param[in] wName The name of the column that will provide the weights.
2575 /// \return the histogram wrapped in a RResultPtr.
2576 ///
2577 /// This action is *lazy*: upon invocation of this method the calculation is
2578 /// booked but not executed. Also see RResultPtr.
2579 ///
2580 /// ### Example usage:
2581 /// ~~~{.cpp}
2582 /// auto myHist = myDf.Hist(10, {5, 15}, "col0", "colW");
2583 /// ~~~
// Weighted one-dimensional RHist convenience overload: creates a shared RHist<BinContentType> from
// `nNormalBins` and `interval`, wraps `vName` in a one-element column list and forwards it together with
// `wName` to Hist<V, W>(h, columnList, wName).
// NOTE(review): listing numbers 2584 and 2586 are missing. The first line of the template header (the
// line below is its continuation) and the return type line are not visible in this listing.
2585 typename W = RDFDetail::RInferredType>
2587 Hist(std::uint64_t nNormalBins, std::pair<double, double> interval, std::string_view vName, std::string_view wName)
2588 {
// The element type of the shared_ptr is deduced from the make_shared call.
2589 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(nNormalBins, interval);
2590
// Only the value column goes into the list; the weight column is passed separately.
2591 const ColumnNames_t columnList = {std::string(vName)};
2592
2593 return Hist<V, W>(h, columnList, wName);
2594 }
2595
2596 ////////////////////////////////////////////////////////////////////////////
2597 /// \brief Fill and return an RHist with weights (*lazy action*).
2598 /// \tparam BinContentType The bin content type of the returned RHist.
2599 /// \param[in] axes The returned histogram will be constructed using these axes.
2600 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2601 /// \param[in] wName The name of the column that will provide the weights.
2602 /// \return the histogram wrapped in a RResultPtr.
2603 ///
2604 /// This action is *lazy*: upon invocation of this method the calculation is
2605 /// booked but not executed. Also see RResultPtr.
2606 ///
2607 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2608 ///
2609 /// ### Example usage:
2610 /// ~~~{.cpp}
2611 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2612 /// auto myHist = myDf.Hist({axis}, {"col0"}, "colW");
2613 /// ~~~
// Weighted RHist overload taking only the bin content type as a template argument.
// NOTE(review): listing numbers 2615 and 2620 are missing. The return type line and the only statement of
// the body are not visible in this listing, so what this overload forwards to cannot be told from here.
2614 template <typename BinContentType = ROOT::Experimental::RBinWithError>
2616 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList, std::string_view wName)
2617 {
2618 // Note: this overload works around limitations for automatic pythonization of variadic function templates with
2619 // template parameter packs.
2621 }
2622
2623 ////////////////////////////////////////////////////////////////////////////
2624 /// \brief Fill and return an RHist with weights (*lazy action*).
2625 /// \tparam BinContentType The bin content type of the returned RHist.
2626 /// \param[in] axes The returned histogram will be constructed using these axes.
2627 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2628 /// \param[in] wName The name of the column that will provide the weights.
2629 /// \return the histogram wrapped in a RResultPtr.
2630 ///
2631 /// This action is *lazy*: upon invocation of this method the calculation is
2632 /// booked but not executed. Also see RResultPtr.
2633 ///
2634 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2635 ///
2636 /// ### Example usage:
2637 /// ~~~{.cpp}
2638 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2639 /// auto myHist = myDf.Hist<ROOT::Experimental::RBinWithError, double, double>({axis}, {"col0"}, "colW");
2640 /// ~~~
2641 template <typename BinContentType = ROOT::Experimental::RBinWithError, typename ColumnType, typename... ColumnTypes>
// NOTE(review): listing line 2642 is absent from this extract, so the return type is not visible here.
2643 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList, std::string_view wName)
2644 {
// NOTE(review): listing line 2645 is absent; the string literal below closes a statement whose opening is not
// visible (presumably a static_assert on the bin content type - confirm).
2646 "weighted filling is not supported for integral bin content types");
2647
// One column is required per histogram axis; a mismatch is reported before the histogram is created.
2648 if (axes.size() != columnList.size()) {
2649 std::string msg = "Wrong number of columns for the specified number of histogram axes: ";
2650 msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size());
2651 throw std::invalid_argument(msg);
2652 }
2653
// The by-value axes parameter is moved into the newly created histogram.
2654 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(std::move(axes));
2655
// Delegate to a Hist overload that takes an already constructed histogram, passing the column types through.
2656 return Hist<ColumnType, ColumnTypes...>(h, columnList, wName);
2657 }
2658
2659 ////////////////////////////////////////////////////////////////////////////
2660 /// \brief Fill the provided RHist with weights (*lazy action*).
2661 /// \param[in] h The histogram that should be filled.
2662 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2663 /// \param[in] wName The name of the column that will provide the weights.
2664 /// \return the histogram wrapped in a RResultPtr.
2665 ///
2666 /// This action is *lazy*: upon invocation of this method the calculation is
2667 /// booked but not executed. Also see RResultPtr.
2668 ///
2669 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2670 ///
2671 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2672 /// allowed during concurrent filling.
2673 ///
2674 /// ### Example usage:
2675 /// ~~~{.cpp}
2676 /// auto h = std::make_shared<ROOT::Experimental::RHist<double>>(10, {5.0, 15.0});
2677 /// auto myHist = myDf.Hist(h, {"col0"}, "colW");
2678 /// ~~~
2679 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing line 2680 is absent from this extract, so the return type is not visible here.
2681 Hist(std::shared_ptr<ROOT::Experimental::RHist<BinContentType>> h, const ColumnNames_t &columnList,
2682 std::string_view wName)
2683 {
// NOTE(review): listing line 2684 is absent; the string literal below closes a statement whose opening is not
// visible (presumably a static_assert on the bin content type - confirm).
2685 "weighted filling is not supported for integral bin content types");
2686
// NOTE(review): listing line 2687 is absent from this extract.
2688
// The number of value columns must match the histogram's dimensionality; the weight column is not counted
// here because it is only appended further down.
2689 if (h->GetNDimensions() != columnList.size()) {
2690 std::string msg = "Wrong number of columns for the passed histogram: ";
2691 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2692 throw std::invalid_argument(msg);
2693 }
2694
2695 // Add the weight column to the list of argument columns to pass it through the infrastructure.
// A copy is extended because columnList is a const reference.
2696 ColumnNames_t columnListWithWeights(columnList);
2697 columnListWithWeights.push_back(std::string(wName));
2698
2699 return CreateAction<RDFInternal::ActionTags::HistWithWeight, ColumnType, ColumnTypes...>(
// NOTE(review): listing line 2700, holding the arguments of this CreateAction call, is absent from this extract.
2701 }
2702
2703 ////////////////////////////////////////////////////////////////////////////
2704 /// \brief Fill the provided RHistEngine (*lazy action*).
2705 /// \param[in] h The histogram that should be filled.
2706 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2707 /// \return the histogram wrapped in a RResultPtr.
2708 ///
2709 /// This action is *lazy*: upon invocation of this method the calculation is
2710 /// booked but not executed. Also see RResultPtr.
2711 ///
2712 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2713 /// allowed during concurrent filling.
2714 ///
2715 /// ### Example usage:
2716 /// ~~~{.cpp}
2717 /// auto h = std::make_shared<ROOT::Experimental::RHistEngine<double>>(10, {5.0, 15.0});
2718 /// auto myHist = myDf.Hist(h, {"col0"});
2719 /// ~~~
2720 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing line 2721 is absent from this extract, so the return type is not visible here.
2722 Hist(std::shared_ptr<ROOT::Experimental::RHistEngine<BinContentType>> h, const ColumnNames_t &columnList)
2723 {
// NOTE(review): listing line 2724 is absent from this extract.
2725
// The number of columns must match the dimensionality of the passed histogram.
2726 if (h->GetNDimensions() != columnList.size()) {
2727 std::string msg = "Wrong number of columns for the passed histogram: ";
2728 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2729 throw std::invalid_argument(msg);
2730 }
2731
// h is passed twice and the number of columns last; the role of each argument is defined by CreateAction,
// which is outside this extract.
2732 return CreateAction<RDFInternal::ActionTags::Hist, ColumnType, ColumnTypes...>(columnList, h, h, fProxiedPtr,
2733 columnList.size());
2734 }
2735
2736 ////////////////////////////////////////////////////////////////////////////
2737 /// \brief Fill the provided RHistEngine with weights (*lazy action*).
2738 /// \param[in] h The histogram that should be filled.
2739 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2740 /// \param[in] wName The name of the column that will provide the weights.
2741 /// \return the histogram wrapped in a RResultPtr.
2742 ///
2743 /// This action is *lazy*: upon invocation of this method the calculation is
2744 /// booked but not executed. Also see RResultPtr.
2745 ///
2746 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2747 ///
2748 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2749 /// allowed during concurrent filling.
2750 ///
2751 /// ### Example usage:
2752 /// ~~~{.cpp}
2753 /// auto h = std::make_shared<ROOT::Experimental::RHistEngine<double>>(10, {5.0, 15.0});
2754 /// auto myHist = myDf.Hist(h, {"col0"}, "colW");
2755 /// ~~~
2756 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing line 2757 is absent from this extract, so the return type is not visible here.
2758 Hist(std::shared_ptr<ROOT::Experimental::RHistEngine<BinContentType>> h, const ColumnNames_t &columnList,
2759 std::string_view wName)
2760 {
// NOTE(review): listing line 2761 is absent; the string literal below closes a statement whose opening is not
// visible (presumably a static_assert on the bin content type - confirm).
2762 "weighted filling is not supported for integral bin content types");
2763
// NOTE(review): listing line 2764 is absent from this extract.
2765
// The number of value columns must match the histogram's dimensionality; the weight column is not counted
// here because it is only appended further down.
2766 if (h->GetNDimensions() != columnList.size()) {
2767 std::string msg = "Wrong number of columns for the passed histogram: ";
2768 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2769 throw std::invalid_argument(msg);
2770 }
2771
2772 // Add the weight column to the list of argument columns to pass it through the infrastructure.
// A copy is extended because columnList is a const reference.
2773 ColumnNames_t columnListWithWeights(columnList);
2774 columnListWithWeights.push_back(std::string(wName));
2775
2776 return CreateAction<RDFInternal::ActionTags::HistWithWeight, ColumnType, ColumnTypes...>(
// NOTE(review): listing line 2777, holding the arguments of this CreateAction call, is absent from this extract.
2778 }
2779#endif
2780
2781 ////////////////////////////////////////////////////////////////////////////
2782 /// \brief Fill and return a TGraph object (*lazy action*).
2783 /// \tparam X The type of the column used to fill the x axis.
2784 /// \tparam Y The type of the column used to fill the y axis.
2785 /// \param[in] x The name of the column that will fill the x axis.
2786 /// \param[in] y The name of the column that will fill the y axis.
2787 /// \return the TGraph wrapped in a RResultPtr.
2788 ///
2789 /// Columns can be of a container type (e.g. std::vector<double>), in which case the TGraph
2790 /// is filled with each one of the elements of the container.
2791 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2792 /// If the graph is going to be drawn, it is recommended to sort its points along x before drawing.
2793 /// The TGraph is given a default name and title based on the input column names.
2794 ///
2795 /// This action is *lazy*: upon invocation of this method the calculation is
2796 /// booked but not executed. Also see RResultPtr.
2797 ///
2798 /// ### Example usage:
2799 /// ~~~{.cpp}
2800 /// // Deduce column types (this invocation needs jitting internally)
2801 /// auto myGraph1 = myDf.Graph("xValues", "yValues");
2802 /// // Explicit column types
2803 /// auto myGraph2 = myDf.Graph<int, float>("xValues", "yValues");
2804 /// ~~~
2805 ///
2806 /// \note Differently from other ROOT interfaces, the returned TGraph is not associated to gDirectory
2807 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2808 /// if the returned graph goes out of scope before the end of the program, ROOT might display a blank canvas).
2809 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType>
2810 RResultPtr<::TGraph> Graph(std::string_view x = "", std::string_view y = "")
2811 {
2812 auto graph = std::make_shared<::TGraph>();
2813 const std::vector<std::string_view> columnViews = {x, y};
// NOTE(review): listing lines 2814, 2816 and 2818 are absent from this extract. The fragment below is the
// middle of a conditional expression, and the declaration of `validatedColumns` used further down is not
// visible here.
2815 ? ColumnNames_t()
2817
2819
2820 // We build a default name and title based on the input columns
// Element 0 labels the x axis and element 1 the y axis (see the axis titles below), so the name reads
// "<y>_vs_<x>" and the title "<y> vs <x>".
2821 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2822 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2823 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
2824 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2825 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2826
// NOTE(review): listing line 2827, where the result is returned, is absent from this extract.
2828 }
2829
2830 ////////////////////////////////////////////////////////////////////////////
2831 /// \brief Fill and return a TGraphAsymmErrors object (*lazy action*).
2832 /// \param[in] x The name of the column that will fill the x axis.
2833 /// \param[in] y The name of the column that will fill the y axis.
2834 /// \param[in] exl The name of the column of X low errors
2835 /// \param[in] exh The name of the column of X high errors
2836 /// \param[in] eyl The name of the column of Y low errors
2837 /// \param[in] eyh The name of the column of Y high errors
2838 /// \return the TGraphAsymmErrors wrapped in a RResultPtr.
2839 ///
2840 /// Columns can be of a container type (e.g. std::vector<double>), in which case the graph
2841 /// is filled with each one of the elements of the container.
2842 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2843 ///
2844 /// This action is *lazy*: upon invocation of this method the calculation is
2845 /// booked but not executed. Also see RResultPtr.
2846 ///
2847 /// ### Example usage:
2848 /// ~~~{.cpp}
2849 /// // Deduce column types (this invocation needs jitting internally)
2850 /// auto myGAE1 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2851 /// // Explicit column types
2852 /// using f = float;
2853 /// auto myGAE2 = myDf.GraphAsymmErrors<f, f, f, f, f, f>("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2854 /// ~~~
2855 ///
2856 /// `GraphAsymmErrors` should also be used for the cases in which values associated only with
2857 /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero.
2858 /// In such cases, user should do the following:
2859 /// ~~~{.cpp}
2860 /// // Create a column of zeros in RDataFrame
2861 /// auto rdf_withzeros = rdf.Define("zero", "0");
2862 /// // or alternatively:
2863 /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;});
2864 /// // Create the graph with y errors only
2865 /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh");
2866 /// ~~~
2867 ///
2868 /// \note Differently from other ROOT interfaces, the returned TGraphAsymmErrors is not associated to gDirectory
2869 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2870 /// if the returned graph goes out of scope before the end of the program, ROOT might display a blank canvas).
2871 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType,
// NOTE(review): listing lines 2872-2874 are absent from this extract, so the remaining template parameters and
// the return type are not visible here.
2875 GraphAsymmErrors(std::string_view x = "", std::string_view y = "", std::string_view exl = "",
2876 std::string_view exh = "", std::string_view eyl = "", std::string_view eyh = "")
2877 {
2878 auto graph = std::make_shared<::TGraphAsymmErrors>();
2879 const std::vector<std::string_view> columnViews = {x, y, exl, exh, eyl, eyh};
// NOTE(review): listing lines 2880, 2882 and 2884 are absent from this extract. The fragment below is the
// middle of a conditional expression, and the declaration of `validatedColumns` used further down is not
// visible here.
2881 ? ColumnNames_t()
2883
2885
2886 // We build a default name and title based on the input columns
// Element 0 labels the x axis and element 1 the y axis (see the axis titles below), so the name reads
// "<y>_vs_<x>" and the title "<y> vs <x>". The error columns do not contribute to name or title.
2887 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2888 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2889 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
2890 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2891 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2892
// NOTE(review): listing line 2893, the start of the return statement completed below, is absent from this
// extract.
2894 graph, fProxiedPtr);
2895 }
2896
2897 ////////////////////////////////////////////////////////////////////////////
2898 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2899 /// \tparam V1 The type of the column used to fill the x axis of the profile. Inferred if not present.
2900 /// \tparam V2 The type of the column used to fill the y axis of the profile. Inferred if not present.
2901 /// \param[in] model The model to be considered to build the new return value.
2902 /// \param[in] v1Name The name of the column that will fill the x axis.
2903 /// \param[in] v2Name The name of the column that will fill the y axis.
2904 /// \return the monodimensional profile wrapped in a RResultPtr.
2905 ///
2906 /// This action is *lazy*: upon invocation of this method the calculation is
2907 /// booked but not executed. Also see RResultPtr.
2908 ///
2909 /// ### Example usage:
2910 /// ~~~{.cpp}
2911 /// // Deduce column types (this invocation needs jitting internally)
2912 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2913 /// // Explicit column types
2914 /// auto myProf2 = myDf.Profile1D<int, float>({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2915 /// ~~~
2916 ///
2917 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
2918 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2919 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2920 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
// NOTE(review): listing line 2921 is absent from this extract, so the return type is not visible here.
2922 Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
2923 {
2924 std::shared_ptr<::TProfile> h(nullptr);
// The inner scope limits the lifetime of `iel` to the GetProfile() call.
// NOTE(review): presumably RIgnoreErrorLevelRAII mutes diagnostics below kError while it is alive - confirm
// against its definition.
2925 {
2926 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2927 h = model.GetProfile();
2928 }
2929
// Models without axis limits are rejected up front.
2930 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2931 throw std::runtime_error("Profiles with no axes limits are not supported yet.");
2932 }
2933 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
// NOTE(review): listing lines 2934, 2936 and 2937 are absent from this extract. The fragment below is the
// middle of a conditional expression, and the return statement is not visible here.
2935 ? ColumnNames_t()
2938 }
2939
2940 ////////////////////////////////////////////////////////////////////////////
2941 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2942 /// \tparam V1 The type of the column used to fill the x axis of the profile. Inferred if not present.
2943 /// \tparam V2 The type of the column used to fill the y axis of the profile. Inferred if not present.
2944 /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present.
2945 /// \param[in] model The model to be considered to build the new return value.
2946 /// \param[in] v1Name The name of the column that will fill the x axis.
2947 /// \param[in] v2Name The name of the column that will fill the y axis.
2948 /// \param[in] wName The name of the column that will provide the weights.
2949 /// \return the monodimensional profile wrapped in a RResultPtr.
2950 ///
2951 /// This action is *lazy*: upon invocation of this method the calculation is
2952 /// booked but not executed. Also see RResultPtr.
2953 ///
2954 /// ### Example usage:
2955 /// ~~~{.cpp}
2956 /// // Deduce column types (this invocation needs jitting internally)
2957 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight");
2958 /// // Explicit column types
2959 /// auto myProf2 = myDf.Profile1D<int, float, double>({"profName", "profTitle", 64u, -4., 4.},
2960 /// "xValues", "yValues", "weight");
2961 /// ~~~
2962 ///
2963 /// See the first Profile1D() overload for more details.
2964 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2965 typename W = RDFDetail::RInferredType>
// NOTE(review): listing line 2966 is absent from this extract, so the return type is not visible here.
2967 Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
2968 {
2969 std::shared_ptr<::TProfile> h(nullptr);
// The inner scope limits the lifetime of `iel` to the GetProfile() call.
// NOTE(review): presumably RIgnoreErrorLevelRAII mutes diagnostics below kError while it is alive - confirm
// against its definition.
2970 {
2971 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2972 h = model.GetProfile();
2973 }
2974
// Models without axis limits are rejected up front.
2975 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2976 throw std::runtime_error("Profile histograms with no axes limits are not supported yet.");
2977 }
// The weight column name travels in the same list as the two value columns, in last position.
2978 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
// NOTE(review): listing lines 2979, 2981 and 2982 are absent from this extract. The fragment below is the
// middle of a conditional expression, and the return statement is not visible here.
2980 ? ColumnNames_t()
2983 }
2984
2985 ////////////////////////////////////////////////////////////////////////////
2986 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2987 /// See the first Profile1D() overload for more details.
2988 template <typename V1, typename V2, typename W>
// NOTE(review): listing line 2989 is absent from this extract, so the declarator (return type, name and
// parameters) is not visible here; the body only uses a `model` argument.
2990 {
// Forward to the four-argument Profile1D overload with empty strings for all three column names.
2991 return Profile1D<V1, V2, W>(model, "", "", "");
2992 }
2993
2994 ////////////////////////////////////////////////////////////////////////////
2995 /// \brief Fill and return a two-dimensional profile (*lazy action*).
2996 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2997 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2998 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2999 /// \param[in] model The returned profile will be constructed using this as a model.
3000 /// \param[in] v1Name The name of the column that will fill the x axis.
3001 /// \param[in] v2Name The name of the column that will fill the y axis.
3002 /// \param[in] v3Name The name of the column that will fill the z axis.
3003 /// \return the bidimensional profile wrapped in a RResultPtr.
3004 ///
3005 /// This action is *lazy*: upon invocation of this method the calculation is
3006 /// booked but not executed. Also see RResultPtr.
3007 ///
3008 /// ### Example usage:
3009 /// ~~~{.cpp}
3010 /// // Deduce column types (this invocation needs jitting internally)
3011 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3012 /// "xValues", "yValues", "zValues");
3013 /// // Explicit column types
3014 /// auto myProf2 = myDf.Profile2D<int, float, double>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3015 /// "xValues", "yValues", "zValues");
3016 /// ~~~
3017 ///
3018 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
3019 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
3020 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
3021 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
3022 typename V3 = RDFDetail::RInferredType>
3023 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "",
3024 std::string_view v2Name = "", std::string_view v3Name = "")
3025 {
3026 std::shared_ptr<::TProfile2D> h(nullptr);
// The inner scope limits the lifetime of `iel` to the GetProfile() call.
// NOTE(review): presumably RIgnoreErrorLevelRAII mutes diagnostics below kError while it is alive - confirm
// against its definition.
3027 {
3028 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
3029 h = model.GetProfile();
3030 }
3031
// Models without axis limits are rejected up front.
3032 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
3033 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
3034 }
3035 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
// NOTE(review): listing lines 3036, 3038 and 3039 are absent from this extract. The fragment below is the
// middle of a conditional expression, and the return statement is not visible here.
3037 ? ColumnNames_t()
3040 }
3041
3042 ////////////////////////////////////////////////////////////////////////////
3043 /// \brief Fill and return a two-dimensional profile (*lazy action*).
3044 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
3045 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
3046 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
3047 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
3048 /// \param[in] model The returned histogram will be constructed using this as a model.
3049 /// \param[in] v1Name The name of the column that will fill the x axis.
3050 /// \param[in] v2Name The name of the column that will fill the y axis.
3051 /// \param[in] v3Name The name of the column that will fill the z axis.
3052 /// \param[in] wName The name of the column that will provide the weights.
3053 /// \return the bidimensional profile wrapped in a RResultPtr.
3054 ///
3055 /// This action is *lazy*: upon invocation of this method the calculation is
3056 /// booked but not executed. Also see RResultPtr.
3057 ///
3058 /// ### Example usage:
3059 /// ~~~{.cpp}
3060 /// // Deduce column types (this invocation needs jitting internally)
3061 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3062 /// "xValues", "yValues", "zValues", "weight");
3063 /// // Explicit column types
3064 /// auto myProf2 = myDf.Profile2D<int, float, double, int>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3065 /// "xValues", "yValues", "zValues", "weight");
3066 /// ~~~
3067 ///
3068 /// See the first Profile2D() overload for more details.
3069 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
3070 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
3071 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name,
3072 std::string_view v3Name, std::string_view wName)
3073 {
3074 std::shared_ptr<::TProfile2D> h(nullptr);
// The inner scope limits the lifetime of `iel` to the GetProfile() call.
// NOTE(review): presumably RIgnoreErrorLevelRAII mutes diagnostics below kError while it is alive - confirm
// against its definition.
3075 {
3076 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
3077 h = model.GetProfile();
3078 }
3079
// Models without axis limits are rejected up front.
3080 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
3081 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
3082 }
// The weight column name travels in the same list as the three value columns, in last position.
3083 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
// NOTE(review): listing lines 3084, 3086 and 3087 are absent from this extract. The fragment below is the
// middle of a conditional expression, and the return statement is not visible here.
3085 ? ColumnNames_t()
3088 }
3089
3090 /// \brief Fill and return a two-dimensional profile (*lazy action*).
3091 /// See the first Profile2D() overload for more details.
3092 template <typename V1, typename V2, typename V3, typename W>
// NOTE(review): listing line 3093 is absent from this extract, so the declarator (return type, name and
// parameters) is not visible here; the body only uses a `model` argument.
3094 {
// Forward to the five-argument Profile2D overload with empty strings for all four column names.
3095 return Profile2D<V1, V2, V3, W>(model, "", "", "", "");
3096 }
3097
3098 ////////////////////////////////////////////////////////////////////////////
3099 /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*).
3100 ///
3101 /// Type T must provide at least:
3102 /// - a copy-constructor
3103 /// - a `Fill` method that accepts as many arguments and with same types as the column names passed as columnList
3104 /// (these types can also be passed as template parameters to this method)
3105 /// - a `Merge` method with signature `Merge(TCollection *)` or `Merge(const std::vector<T *>&)` that merges the
3106 /// objects passed as argument into the object on which `Merge` was called (analogous to TH1::Merge). Note that
3107 /// if the signature that takes a `TCollection*` is used, then T must inherit from TObject (to allow insertion in
3108 /// the TCollection*).
3109 ///
3110 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred together with OtherColumns if not present.
3111 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the object.
3112 /// \tparam T The type of the object to fill. Automatically deduced.
3113 /// \param[in] model The model to be considered to build the new return value.
3114 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
3115 /// \return the filled object wrapped in a RResultPtr.
3116 ///
3117 /// The user gives up ownership of the model object.
3118 /// The list of column names to be used for filling must always be specified.
3119 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed.
3120 /// Also see RResultPtr.
3121 ///
3122 /// ### Example usage:
3123 /// ~~~{.cpp}
3124 /// MyClass obj;
3125 /// // Deduce column types (this invocation needs jitting internally, and in this case
3126 /// // MyClass needs to be known to the interpreter)
3127 /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"});
3128 /// // explicit column types
3129 /// auto myFilledObj = myDf.Fill<float, float>(obj, {"col0", "col1"});
3130 /// ~~~
3131 ///
3132 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename T>
// NOTE(review): listing line 3133 is absent from this extract, so the declarator (return type, name and
// parameters) is not visible here; the body uses `model` and `columnList`.
3134 {
// The result object is a new instance of the decayed model type, constructed from the forwarded model and
// owned by a shared_ptr.
3135 auto h = std::make_shared<std::decay_t<T>>(std::forward<T>(model));
// Objects for which HistoUtils reports no axis limits are rejected up front.
3136 if (!RDFInternal::HistoUtils<T>::HasAxisLimits(*h)) {
3137 throw std::runtime_error("The absence of axes limits is not supported yet.");
3138 }
// h is passed twice and the number of columns last; the role of each argument is defined by CreateAction,
// which is outside this extract.
3139 return CreateAction<RDFInternal::ActionTags::Fill, FirstColumn, OtherColumns...>(columnList, h, h, fProxiedPtr,
3140 columnList.size());
3141 }
3142
3143 ////////////////////////////////////////////////////////////////////////////
3144 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
3145 ///
3146 /// \tparam V The type of the value column
3147 /// \param[in] value The name of the column with the values to fill the statistics with.
3148 /// \return the filled TStatistic object wrapped in a RResultPtr.
3149 ///
3150 /// ### Example usage:
3151 /// ~~~{.cpp}
3152 /// // Deduce column type (this invocation needs jitting internally)
3153 /// auto stats0 = myDf.Stats("values");
3154 /// // Explicit column type
3155 /// auto stats1 = myDf.Stats<float>("values");
3156 /// ~~~
3157 ///
3158 template <typename V = RDFDetail::RInferredType>
3159 RResultPtr<TStatistic> Stats(std::string_view value = "")
3160 {
// NOTE(review): listing line 3161 is absent from this extract; `columns` is presumably declared there.
// An empty `value` adds nothing to the column list.
3162 if (!value.empty()) {
3163 columns.emplace_back(std::string(value));
3164 }
// NOTE(review): listing line 3165 is absent from this extract; `validColumnNames` is presumably declared there.
// When V is left at its default, Fill is called without explicit column types.
3166 if (std::is_same<V, RDFDetail::RInferredType>::value) {
3167 return Fill(TStatistic(), validColumnNames);
3168 } else {
// NOTE(review): listing line 3169, the statement of the explicit-type branch, is absent from this extract.
3170 }
3171 }
3172
3173 ////////////////////////////////////////////////////////////////////////////
3174 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
3175 ///
3176 /// \tparam V The type of the value column
3177 /// \tparam W The type of the weight column
3178 /// \param[in] value The name of the column with the values to fill the statistics with.
3179 /// \param[in] weight The name of the column with the weights to fill the statistics with.
3180 /// \return the filled TStatistic object wrapped in a RResultPtr.
3181 ///
3182 /// ### Example usage:
3183 /// ~~~{.cpp}
3184 /// // Deduce column types (this invocation needs jitting internally)
3185 /// auto stats0 = myDf.Stats("values", "weights");
3186 /// // Explicit column types
3187 /// auto stats1 = myDf.Stats<int, float>("values", "weights");
3188 /// ~~~
3189 ///
3190 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
3191 RResultPtr<TStatistic> Stats(std::string_view value, std::string_view weight)
3192 {
// The value column comes first and the weight column second.
3193 ColumnNames_t columns{std::string(value), std::string(weight)};
3194 constexpr auto vIsInferred = std::is_same<V, RDFDetail::RInferredType>::value;
3195 constexpr auto wIsInferred = std::is_same<W, RDFDetail::RInferredType>::value;
// NOTE(review): listing line 3196 is absent from this extract; `validColumnNames` is presumably declared there.
3197 // We have 3 cases:
3198 // 1. Both types are inferred: we use Fill and let the jit kick in.
3199 // 2. One of the two types is explicit and the other one is inferred: the case is not supported.
3200 // 3. Both types are explicit: we invoke the fully compiled Fill method.
3201 if (vIsInferred && wIsInferred) {
3202 return Fill(TStatistic(), validColumnNames);
3203 } else if (vIsInferred != wIsInferred) {
// Exactly one of the two types is inferred here. If the value type is the inferred one, the explicit one is
// the weight type, and vice versa; the message names the explicit column first and the inferred one second.
3204 std::string error("The ");
3205 error += vIsInferred ? "weight " : "value ";
3206 error += "column type is explicit, while the ";
3207 error += vIsInferred ? "value " : "weight ";
3208 error += "is specified to be inferred. This case is not supported: please specify both types or none.";
3209 throw std::runtime_error(error);
3210 } else {
// NOTE(review): listing line 3211, the statement of the fully explicit branch, is absent from this extract.
3212 }
3213 }
3214
3215 ////////////////////////////////////////////////////////////////////////////
3216 /// \brief Return the minimum of processed column values (*lazy action*).
3217 /// \tparam T The type of the branch/column.
3218 /// \param[in] columnName The name of the branch/column to be treated.
3219 /// \return the minimum value of the selected column wrapped in a RResultPtr.
3220 ///
3221 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3222 /// template specialization of this method.
3223 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
3224 ///
3225 /// This action is *lazy*: upon invocation of this method the calculation is
3226 /// booked but not executed. Also see RResultPtr.
3227 ///
3228 /// ### Example usage:
3229 /// ~~~{.cpp}
3230 /// // Deduce column type (this invocation needs jitting internally)
3231 /// auto minVal0 = myDf.Min("values");
3232 /// // Explicit column type
3233 /// auto minVal1 = myDf.Min<double>("values");
3234 /// ~~~
3235 ///
3236 template <typename T = RDFDetail::RInferredType>
// NOTE(review): listing line 3237 is absent from this extract, so the declarator (return type, name and
// parameters) is not visible here; the body uses a `columnName` argument.
3238 {
// An empty columnName yields an empty column list, otherwise a list holding just that name.
3239 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3240 using RetType_t = RDFDetail::MinReturnType_t<T>;
// The result starts at the largest representable RetType_t value, the neutral element of a running minimum.
3241 auto minV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::max());
// NOTE(review): listing line 3242, where userColumns and minV are consumed, is absent from this extract.
3243 }
3244
3245 ////////////////////////////////////////////////////////////////////////////
3246 /// \brief Return the maximum of processed column values (*lazy action*).
3247 /// \tparam T The type of the branch/column.
3248 /// \param[in] columnName The name of the branch/column to be treated.
3249 /// \return the maximum value of the selected column wrapped in a RResultPtr.
3250 ///
3251 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3252 /// template specialization of this method.
3253 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
3254 ///
3255 /// This action is *lazy*: upon invocation of this method the calculation is
3256 /// booked but not executed. Also see RResultPtr.
3257 ///
3258 /// ### Example usage:
3259 /// ~~~{.cpp}
3260 /// // Deduce column type (this invocation needs jitting internally)
3261 /// auto maxVal0 = myDf.Max("values");
3262 /// // Explicit column type
3263 /// auto maxVal1 = myDf.Max<double>("values");
3264 /// ~~~
3265 ///
3266 template <typename T = RDFDetail::RInferredType>
// NOTE(review): listing line 3267 is absent from this extract, so the declarator (return type, name and
// parameters) is not visible here; the body uses a `columnName` argument.
3268 {
// An empty columnName yields an empty column list, otherwise a list holding just that name.
3269 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3270 using RetType_t = RDFDetail::MaxReturnType_t<T>;
// The result starts at lowest(), the most negative finite RetType_t value (not min(), which for floating-point
// types is the smallest positive normal value); this is the neutral element of a running maximum.
3271 auto maxV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::lowest());
// NOTE(review): listing line 3272, where userColumns and maxV are consumed, is absent from this extract.
3273 }
3274
3275 ////////////////////////////////////////////////////////////////////////////
3276 /// \brief Return the mean of processed column values (*lazy action*).
3277 /// \tparam T The type of the branch/column.
3278 /// \param[in] columnName The name of the branch/column to be treated.
3279 /// \return the mean value of the selected column wrapped in a RResultPtr.
3280 ///
3281 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3282 /// template specialization of this method.
3283 /// Note that internally, the summations are executed with Kahan sums in double precision, irrespective
3284 /// of the type of column that is read.
3285 ///
3286 /// This action is *lazy*: upon invocation of this method the calculation is
3287 /// booked but not executed. Also see RResultPtr.
3288 ///
3289 /// ### Example usage:
3290 /// ~~~{.cpp}
3291 /// // Deduce column type (this invocation needs jitting internally)
3292 /// auto meanVal0 = myDf.Mean("values");
3293 /// // Explicit column type
3294 /// auto meanVal1 = myDf.Mean<double>("values");
3295 /// ~~~
3296 ///
3297 template <typename T = RDFDetail::RInferredType>
3298 RResultPtr<double> Mean(std::string_view columnName = "")
3299 {
3300 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3301 auto meanV = std::make_shared<double>(0);
3303 }
3304
3305 ////////////////////////////////////////////////////////////////////////////
3306 /// \brief Return the unbiased standard deviation of processed column values (*lazy action*).
3307 /// \tparam T The type of the branch/column.
3308 /// \param[in] columnName The name of the branch/column to be treated.
3309 /// \return the standard deviation value of the selected column wrapped in a RResultPtr.
3310 ///
3311 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3312 /// template specialization of this method.
3313 ///
3314 /// This action is *lazy*: upon invocation of this method the calculation is
3315 /// booked but not executed. Also see RResultPtr.
3316 ///
3317 /// ### Example usage:
3318 /// ~~~{.cpp}
3319 /// // Deduce column type (this invocation needs jitting internally)
3320 /// auto stdDev0 = myDf.StdDev("values");
3321 /// // Explicit column type
3322 /// auto stdDev1 = myDf.StdDev<double>("values");
3323 /// ~~~
3324 ///
3325 template <typename T = RDFDetail::RInferredType>
3326 RResultPtr<double> StdDev(std::string_view columnName = "")
3327 {
3328 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3329 auto stdDeviationV = std::make_shared<double>(0);
3331 }
3332
3333 // clang-format off
3334 ////////////////////////////////////////////////////////////////////////////
3335 /// \brief Return the sum of processed column values (*lazy action*).
3336 /// \tparam T The type of the branch/column.
3337 /// \param[in] columnName The name of the branch/column.
3338 /// \param[in] initValue Optional initial value for the sum. If not present, the column values must be default-constructible.
3339 /// \return the sum of the selected column wrapped in a RResultPtr.
3340 ///
3341 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3342 /// template specialization of this method.
3343 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
3344 ///
3345 /// This action is *lazy*: upon invocation of this method the calculation is
3346 /// booked but not executed. Also see RResultPtr.
3347 ///
3348 /// ### Example usage:
3349 /// ~~~{.cpp}
3350 /// // Deduce column type (this invocation needs jitting internally)
3351 /// auto sum0 = myDf.Sum("values");
3352 /// // Explicit column type
3353 /// auto sum1 = myDf.Sum<double>("values");
3354 /// ~~~
3355 ///
3356 template <typename T = RDFDetail::RInferredType>
3358 Sum(std::string_view columnName = "",
3359 const RDFDetail::SumReturnType_t<T> &initValue = RDFDetail::SumReturnType_t<T>{})
3360 {
3361 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3362 auto sumV = std::make_shared<RDFDetail::SumReturnType_t<T>>(initValue);
3364 }
3365 // clang-format on
3366
3367 ////////////////////////////////////////////////////////////////////////////
3368 /// \brief Gather filtering statistics.
3369 /// \return the resulting `RCutFlowReport` instance wrapped in a RResultPtr.
3370 ///
3371 /// Calling `Report` on the main `RDataFrame` object gathers stats for
3372 /// all named filters in the call graph. Calling this method on a
3373 /// stored chain state (i.e. a graph node different from the first) gathers
3374 /// the stats for all named filters in the chain section between the original
3375 /// `RDataFrame` and that node (included). Stats are gathered in the same
3376 /// order as the named filters have been added to the graph.
3377 /// A RResultPtr<RCutFlowReport> is returned to allow inspection of the
3378 /// effects cuts had.
3379 ///
3380 /// This action is *lazy*: upon invocation of
3381 /// this method the calculation is booked but not executed. See RResultPtr
3382 /// documentation.
3383 ///
3384 /// ### Example usage:
3385 /// ~~~{.cpp}
3386 /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2");
3387 /// auto cutReport = filtered.Report();
3388 /// cutReport->Print();
3389 /// ~~~
3390 ///
3392 {
3393 bool returnEmptyReport = false;
3394 // if this is a RInterface<RLoopManager> on which `Define` has been called, users
3395 // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which
3396 // certainly does not contain named filters.
3397 // The number 4 takes into account the implicit columns for entry and slot number
3398 // and their aliases (2 + 2, i.e. {r,t}dfentry_ and {r,t}dfslot_)
3399 if (std::is_same<Proxied, RLoopManager>::value && fColRegister.GenerateColumnNames().size() > 4)
3400 returnEmptyReport = true;
3401
3402 auto rep = std::make_shared<RCutFlowReport>();
3405
3406 auto action = std::make_unique<Action_t>(Helper_t(rep, fProxiedPtr.get(), returnEmptyReport), ColumnNames_t({}),
3408
3409 return MakeResultPtr(rep, *fLoopManager, std::move(action));
3410 }
3411
3412 /// \brief Returns the names of the filters created.
3413 /// \return the container of filters names.
3414 ///
3415 /// If called on a root node, the names of all the filters in the computation graph
3416 /// are returned. For any other node, only those of the filters upstream of that node.
3417 /// Filters without a name are reported as "Unnamed Filter".
3418 /// This is not an action nor a transformation, just a query to the RDataFrame object.
3419 ///
3420 /// ### Example usage:
3421 /// ~~~{.cpp}
3422 /// auto filtNames = d.GetFilterNames();
3423 /// for (auto &&filtName : filtNames) std::cout << filtName << std::endl;
3424 /// ~~~
3425 ///
3426 std::vector<std::string> GetFilterNames() { return RDFInternal::GetFilterNames(fProxiedPtr); }
3427
3428 // clang-format off
3429 ////////////////////////////////////////////////////////////////////////////
3430 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
3431 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
3432 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
3433 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3434 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
3435 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
3436 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
3437 /// \param[in] aggIdentity The aggregator variable of each thread is initialized to this value (or is default-constructed if the parameter is omitted)
3438 /// \return the result of the aggregation wrapped in a RResultPtr.
3439 ///
3440 /// An aggregator callable takes two values, an aggregator variable and a column value. The aggregator variable is
3441 /// initialized to aggIdentity or default-constructed if aggIdentity is omitted.
3442 /// This action calls the aggregator callable for each processed entry, passing in the aggregator variable and
3443 /// the value of the column columnName.
3444 /// If the signature is `U(U,T)` the aggregator variable is then copy-assigned the result of the execution of the callable.
3445 /// Otherwise the signature of aggregator must be `void(U&,T)`.
3446 ///
3447 /// The merger callable is used to merge the partial accumulation results of each processing thread. It is only called in multi-thread executions.
3448 /// If its signature is `U(U,U)` the aggregator variables of each thread are merged two by two.
3449 /// If its signature is `void(std::vector<U>& a)` it is assumed that it merges all aggregators in a[0].
3450 ///
3451 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3452 ///
3453 /// Example usage:
3454 /// ~~~{.cpp}
3455 /// auto aggregator = [](double acc, double x) { return acc * x; };
3456 /// ROOT::EnableImplicitMT();
3457 /// // If multithread is enabled, the aggregator function will be called by more threads
3458 /// // and will produce a vector of partial accumulators.
3459 /// // The merger function performs the final aggregation of these partial results.
3460 /// auto merger = [](std::vector<double> &accumulators) {
3461 /// for (auto i : ROOT::TSeqU(1u, accumulators.size())) {
3462 /// accumulators[0] *= accumulators[i];
3463 /// }
3464 /// };
3465 ///
3466 /// // The accumulator is initialized at this value by every thread.
3467 /// double initValue = 1.;
3468 ///
3469 /// // Multiplies all elements of the column "x"
3470 /// auto result = d.Aggregate(aggregator, merger, "x", initValue);
3471 /// ~~~
3472 // clang-format on
3474 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
3475 typename ArgTypesNoDecay = typename TTraits::CallableTraits<AccFun>::arg_types_nodecay,
3476 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
3477 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
3479 {
3480 RDFInternal::CheckAggregate<R, MergeFun>(ArgTypesNoDecay());
3481 const auto columns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3482
3485
3486 auto accObjPtr = std::make_shared<U>(aggIdentity);
3487 using Helper_t = RDFInternal::AggregateHelper<AccFun, MergeFun, R, T, U>;
3489 auto action = std::make_unique<Action_t>(
3490 Helper_t(std::move(aggregator), std::move(merger), accObjPtr, fLoopManager->GetNSlots()), validColumnNames,
3492 return MakeResultPtr(accObjPtr, *fLoopManager, std::move(action));
3493 }
3494
3495 // clang-format off
3496 ////////////////////////////////////////////////////////////////////////////
3497 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
3498 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
3499 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
3500 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3501 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
3502 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
3503 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
3504 /// \return the result of the aggregation wrapped in a RResultPtr.
3505 ///
3506 /// See previous Aggregate overload for more information.
3507 // clang-format on
3509 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
3510 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
3511 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
3513 {
3514 static_assert(
3515 std::is_default_constructible<U>::value,
3516 "aggregated object cannot be default-constructed. Please provide an initialisation value (aggIdentity)");
3517 return Aggregate(std::move(aggregator), std::move(merger), columnName, U());
3518 }
3519
3520 // clang-format off
3521 ////////////////////////////////////////////////////////////////////////////
3522 /// \brief Book execution of a custom action using a user-defined helper object.
3523 /// \tparam FirstColumn The type of the first column used by this action. Inferred together with OtherColumns if not present.
3524 /// \tparam OtherColumns A list of the types of the other columns used by this action
3525 /// \tparam Helper The type of the user-defined helper. See below for the required interface it should expose.
3526 /// \param[in] helper The Action Helper to be scheduled.
3527 /// \param[in] columns The names of the columns on which the helper acts.
3528 /// \return the result of the helper wrapped in a RResultPtr.
3529 ///
3530 /// This method books a custom action for execution. The behavior of the action is completely dependent on the
3531 /// Helper object provided by the caller. The required interface for the helper is described below (more
3532 /// methods than the ones required can be present, e.g. a constructor that takes the number of worker threads is usually useful):
3533 ///
3534 /// ### Mandatory interface
3535 ///
3536 /// * `Helper` must publicly inherit from `ROOT::Detail::RDF::RActionImpl<Helper>`
3537 /// * `Helper::Result_t`: public alias for the type of the result of this action helper. `Result_t` must be default-constructible.
3538 /// * `Helper(Helper &&)`: a move-constructor is required. Copy-constructors are discouraged.
3539 /// * `std::shared_ptr<Result_t> GetResultPtr() const`: return a shared_ptr to the result of this action (of type
3540 /// Result_t). The RResultPtr returned by Book will point to this object. Note that this method can be called
3541 /// _before_ Initialize(), because the RResultPtr is constructed before the event loop is started.
3542 /// * `void Initialize()`: this method is called once before starting the event-loop. Useful for setup operations.
3543 /// It must reset the state of the helper to the expected state at the beginning of the event loop: the same helper,
3544 /// or copies of it, might be used for multiple event loops (e.g. in the presence of systematic variations).
3545 /// * `void InitTask(TTreeReader *, unsigned int slot)`: each working thread shall call this method during the event
3546 /// loop, before processing a batch of entries. The pointer passed as argument, if not null, will point to the TTreeReader
3547 /// that RDataFrame has set up to read the task's batch of entries. It is passed to the helper to allow certain advanced optimizations;
3548 /// it should not usually serve any purpose for the Helper. This method is often a no-op for simple helpers.
3549 /// * `void Exec(unsigned int slot, ColumnTypes...columnValues)`: each working thread shall call this method
3550 /// during the event-loop, possibly concurrently. No two threads will ever call Exec with the same 'slot' value:
3551 /// this parameter is there to facilitate writing thread-safe helpers. The other arguments will be the values of
3552 /// the requested columns for the particular entry being processed.
3553 /// * `void Finalize()`: this method is called at the end of the event loop. Commonly used to finalize the contents of the result.
3554 /// * `std::string GetActionName()`: it returns a string identifier for this type of action that RDataFrame will use in
3555 /// diagnostics, SaveGraph(), etc.
3556 ///
3557 /// ### Optional methods
3558 ///
3559 /// If these methods are implemented they enable extra functionality as per the description below.
3560 ///
3561 /// * `Result_t &PartialUpdate(unsigned int slot)`: if present, it must return the value of the partial result of this action for the given 'slot'.
3562 /// Different threads might call this method concurrently, but will do so with different 'slot' numbers.
3563 /// RDataFrame leverages this method to implement RResultPtr::OnPartialResult().
3564 /// * `ROOT::RDF::SampleCallback_t GetSampleCallback()`: if present, it must return a callable with the
3565 /// appropriate signature (see ROOT::RDF::SampleCallback_t) that will be invoked at the beginning of the processing
3566 /// of every sample, as in DefinePerSample().
3567 /// * `Helper MakeNew(void *newResult, std::string_view variation = "nominal")`: if implemented, it enables varying
3568 /// the action's result with VariationsFor(). It takes a type-erased new result that can be safely cast to a
3569 /// `std::shared_ptr<Result_t> *` (a pointer to shared pointer) and should be used as the action's output result.
3570 /// The function optionally takes the name of the current variation which could be useful in customizing its behaviour.
3571 ///
3572 /// In case Book is called without specifying column types as template arguments, corresponding typed code will be just-in-time compiled
3573 /// by RDataFrame. In that case the Helper class needs to be known to the ROOT interpreter.
3574 ///
3575 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3576 ///
3577 /// ### Examples
3578 /// See [this tutorial](https://root.cern/doc/master/df018__customActions_8C.html) for an example implementation of an action helper.
3579 ///
3580 /// It is also possible to inspect the code used by built-in RDataFrame actions at ActionHelpers.hxx.
3581 ///
3582 // clang-format on
3583 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename Helper>
3585 {
3586 using HelperT = std::decay_t<Helper>;
3587 // TODO add more static sanity checks on Helper
3589 static_assert(std::is_base_of<AH, HelperT>::value && std::is_convertible<HelperT *, AH *>::value,
3590 "Action helper of type T must publicly inherit from ROOT::Detail::RDF::RActionImpl<T>");
3591
3592 auto hPtr = std::make_shared<HelperT>(std::forward<Helper>(helper));
3593 auto resPtr = hPtr->GetResultPtr();
3594
3595 if (std::is_same<FirstColumn, RDFDetail::RInferredType>::value && columns.empty()) {
3597 } else {
3598 return CreateAction<RDFInternal::ActionTags::Book, FirstColumn, OtherColumns...>(columns, resPtr, hPtr,
3599 fProxiedPtr, columns.size());
3600 }
3601 }
3602
3603 ////////////////////////////////////////////////////////////////////////////
3604 /// \brief Provides a representation of the columns in the dataset.
3605 /// \tparam ColumnTypes variadic list of branch/column types.
3606 /// \param[in] columnList Names of the columns to be displayed.
3607 /// \param[in] nRows Number of events for each column to be displayed.
3608 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3609 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3610 ///
3611 /// This function returns a `RResultPtr<RDisplay>` containing all the entries to be displayed, organized in a tabular
3612 /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will
3613 /// return a complete version through `RDisplay::AsString()`.
3614 ///
3615 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see
3616 /// RResultPtr.
3617 ///
3618 /// Example usage:
3619 /// ~~~{.cpp}
3620 /// // Preparing the RResultPtr<RDisplay> object with all columns and default number of entries
3621 /// auto d1 = d.Display("");
3622 /// // Preparing the RResultPtr<RDisplay> object with two columns and 128 entries
3623 /// auto d2 = d.Display({"x", "y"}, 128);
3624 /// // Printing the short representations, the event loop will run
3625 /// d1->Print();
3626 /// d2->Print();
3627 /// ~~~
3628 template <typename... ColumnTypes>
3630 {
3631 CheckIMTDisabled("Display");
3632 auto newCols = columnList;
3633 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
3634 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
3635 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
3636 // Need to add ULong64_t type corresponding to the first column rdfentry_
3637 return CreateAction<RDFInternal::ActionTags::Display, ULong64_t, ColumnTypes...>(
3638 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr);
3639 }
3640
3641 ////////////////////////////////////////////////////////////////////////////
3642 /// \brief Provides a representation of the columns in the dataset.
3643 /// \param[in] columnList Names of the columns to be displayed.
3644 /// \param[in] nRows Number of events for each column to be displayed.
3645 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3646 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3647 ///
3648 /// This overload automatically infers the column types.
3649 /// See the previous overloads for further details.
3650 ///
3651 /// Invoked when no types are specified to Display
3653 {
3654 CheckIMTDisabled("Display");
3655 auto newCols = columnList;
3656 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
3657 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
3658 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
3660 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr,
3661 columnList.size() + 1);
3662 }
3663
3664 ////////////////////////////////////////////////////////////////////////////
3665 /// \brief Provides a representation of the columns in the dataset.
3666 /// \param[in] columnNameRegexp A regular expression to select the columns.
3667 /// \param[in] nRows Number of events for each column to be displayed.
3668 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3669 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3670 ///
3671 /// The existing columns are matched against the regular expression. If the string provided
3672 /// is empty, all columns are selected.
3673 /// See the previous overloads for further details.
3675 Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10)
3676 {
3677 const auto columnNames = GetColumnNames();
3680 }
3681
3682 ////////////////////////////////////////////////////////////////////////////
3683 /// \brief Provides a representation of the columns in the dataset.
3684 /// \param[in] columnList Names of the columns to be displayed.
3685 /// \param[in] nRows Number of events for each column to be displayed.
3686 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3687 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3688 ///
3689 /// See the previous overloads for further details.
3691 Display(std::initializer_list<std::string> columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10)
3692 {
3695 }
3696
3697private:
3699 std::enable_if_t<std::is_default_constructible<RetType>::value, RInterface<Proxied>>
3700 DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
3701 {
3702 if (where.compare(0, 8, "Redefine") != 0) { // not a Redefine
3706 } else {
3710 }
3711
3712 using ArgTypes_t = typename TTraits::CallableTraits<F>::arg_types;
3714 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::Slot>::value, ArgTypes_t>::type;
3716 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::SlotAndEntry>::value, ColTypesTmp_t>::type;
3717
3718 constexpr auto nColumns = ColTypes_t::list_size;
3719
3722
3723 // Declare return type to the interpreter, for future use by jitted actions
3725 if (retTypeName.empty()) {
3726 // The type is not known to the interpreter.
3727 // We must not error out here, but if/when this column is used in jitted code
3729 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3730 }
3731
3733 auto newColumn = std::make_shared<NewCol_t>(name, retTypeName, std::forward<F>(expression), validColumnNames,
3735
3737 newCols.AddDefine(std::move(newColumn));
3738
3740
3741 return newInterface;
3742 }
3743
3744 // This overload is chosen when the callable passed to Define or DefineSlot returns a type that is not
3745 // default-constructible (e.g. void). It simply fires a compile-time error. This is preferable to a static_assert
3746 // in the main `Define` overload because this way compilation of `Define` has no way to continue after throwing the error.
3748 bool IsFStringConv = std::is_convertible<F, std::string>::value,
3749 bool IsRetTypeDefConstr = std::is_default_constructible<RetType>::value>
3750 std::enable_if_t<!IsFStringConv && !IsRetTypeDefConstr, RInterface<Proxied>>
3751 DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
3752 {
3753 static_assert(std::is_default_constructible<typename TTraits::CallableTraits<F>::ret_type>::value,
3754 "Error in `Define`: type returned by expression is not default-constructible");
3755 return *this; // never reached
3756 }
3757
3758 ////////////////////////////////////////////////////////////////////////////
3759 /// \brief Implementation of cache.
3760 template <typename... ColTypes, std::size_t... S>
3762 {
3764
3765 // Check at compile time that the columns types are copy constructible
3766 constexpr bool areCopyConstructible =
3767 RDFInternal::TEvalAnd<std::is_copy_constructible<ColTypes>::value...>::value;
3768 static_assert(areCopyConstructible, "Columns of a type which is not copy constructible cannot be cached yet.");
3769
3771
3772 auto colHolders = std::make_tuple(Take<ColTypes>(columnListWithoutSizeColumns[S])...);
3773 auto ds = std::make_unique<RLazyDS<ColTypes...>>(
3774 std::make_pair(columnListWithoutSizeColumns[S], std::get<S>(colHolders))...);
3775
3776 RInterface<RLoopManager> cachedRDF(std::make_shared<RLoopManager>(std::move(ds), columnListWithoutSizeColumns));
3777
3778 return cachedRDF;
3779 }
3780
3781 template <bool IsSingleColumn, typename F>
3783 VaryImpl(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
3784 const std::vector<std::string> &variationTags, std::string_view variationName)
3785 {
3786 using F_t = std::decay_t<F>;
3787 using ColTypes_t = typename TTraits::CallableTraits<F_t>::arg_types;
3788 using RetType = typename TTraits::CallableTraits<F_t>::ret_type;
3789 constexpr auto nColumns = ColTypes_t::list_size;
3790
3792
3795
3797 if (retTypeName.empty()) {
3798 // The type is not known to the interpreter, but we don't want to error out
3799 // here, rather if/when this column is used in jitted code, so we inject a broken but telling type name.
3801 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3802 }
3803
3804 auto variation = std::make_shared<RDFInternal::RVariation<F_t, IsSingleColumn>>(
3805 colNames, variationName, std::forward<F>(expression), variationTags, retTypeName, fColRegister, *fLoopManager,
3807
3809 newCols.AddVariation(std::move(variation));
3810
3812
3813 return newInterface;
3814 }
3815
3816 RInterface<Proxied> JittedVaryImpl(const std::vector<std::string> &colNames, std::string_view expression,
3817 const std::vector<std::string> &variationTags, std::string_view variationName,
3818 bool isSingleColumn)
3819 {
3820 R__ASSERT(!variationTags.empty() && "Must have at least one variation.");
3821 R__ASSERT(!colNames.empty() && "Must have at least one varied column.");
3822 R__ASSERT(!variationName.empty() && "Must provide a variation name.");
3823
3824 for (auto &colName : colNames) {
3828 }
3830
3831 // when varying multiple columns, they must be different columns
3832 if (colNames.size() > 1) {
3833 std::set<std::string> uniqueCols(colNames.begin(), colNames.end());
3834 if (uniqueCols.size() != colNames.size())
3835 throw std::logic_error("A column name was passed to the same Vary invocation multiple times.");
3836 }
3837
3838 auto jittedVariation =
3841
3843 newColRegister.AddVariation(std::move(jittedVariation));
3844
3846
3847 return newInterface;
3848 }
3849
3850 template <typename Helper, typename ActionResultType>
3851 auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &resPtr,
3852 const std::shared_ptr<Helper> &hPtr,
3854 -> decltype(hPtr->Exec(0u), RResultPtr<ActionResultType>{})
3855 {
3857 }
3858
3859 template <typename Helper, typename ActionResultType, typename... Others>
3861 CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &,
3862 const std::shared_ptr<Helper>& /*hPtr*/,
3863 Others...)
3864 {
3865 throw std::logic_error(std::string("An action was booked with no input columns, but the action requires "
3866 "columns! The action helper type was ") +
3867 typeid(Helper).name());
3868 return {};
3869 }
3870
3871protected:
3872 RInterface(const std::shared_ptr<Proxied> &proxied, RLoopManager &lm,
3875 {
3876 }
3877
3878 const std::shared_ptr<Proxied> &GetProxiedPtr() const { return fProxiedPtr; } ///< Read-only access to the stored fProxiedPtr shared pointer; returns a const reference, so no copy is made.
3879};
3880
3881} // namespace RDF
3882
3883} // namespace ROOT
3884
3885#endif // ROOT_RDF_TINTERFACE
#define f(i)
Definition RSha256.hxx:104
#define h(i)
Definition RSha256.hxx:106
Basic types used by ROOT and required by TInterpreter.
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int)
Definition RtypesCore.h:60
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
#define X(type, name)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char filename
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
char name[80]
Definition TGX11.cxx:157
Base class for action helpers, see RInterface::Book() for more information.
implementation of FilterAvailable and FilterMissing operations
The head node of a RDF computation graph.
A histogram data structure to bin data along multiple dimensions.
A histogram for aggregation of data along multiple dimensions.
Definition RHist.hxx:65
Helper class that provides the operation graph nodes.
A RDataFrame node that produces a result.
Definition RAction.hxx:53
A binder for user-defined columns, variations and aliases.
std::vector< std::string_view > GenerateColumnNames() const
Return the list of the names of the defined columns (Defines + Aliases).
RDFDetail::RDefineBase * GetDefine(std::string_view colName) const
Return the RDefine for the requested column name, or nullptr.
The dataset specification for RDataFrame.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
The base public interface to the RDataFrame federation of classes.
ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
The RLoopManager at the root of this computation graph. Never null.
RResultPtr< ActionResultType > CreateAction(const ColumnNames_t &columns, const std::shared_ptr< ActionResultType > &r, const std::shared_ptr< HelperArgType > &helperArg, const std::shared_ptr< RDFNode > &proxiedPtr, const int=-1)
Create RAction object, return RResultPtr for the action. Overload for the case in which all column typ...
RDataSource * GetDataSource() const
void CheckAndFillDSColumns(ColumnNames_t validCols, TTraits::TypeList< ColumnTypes... > typeList)
void CheckIMTDisabled(std::string_view callerName)
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
RDFDetail::RLoopManager * GetLoopManager() const
RDFInternal::RColumnRegister fColRegister
Contains the columns defined up to this node.
The public interface to the RDataFrame federation of classes.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a one-dimensional profile (lazy action).
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return an N-dimensional histogram (lazy action).
RResultPtr<::TGraph > Graph(std::string_view x="", std::string_view y="")
Fill and return a TGraph object (lazy action).
RInterface< Proxied > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RInterface(const RInterface &)=default
Copy-ctor for RInterface.
RResultPtr< RDFDetail::MaxReturnType_t< T > > Max(std::string_view columnName="")
Return the maximum of processed column values (lazy action).
auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &resPtr, const std::shared_ptr< Helper > &hPtr, TTraits::TypeList< RDFDetail::RInferredType >) -> decltype(hPtr->Exec(0u), RResultPtr< ActionResultType >{})
RInterface(RInterface &&)=default
Move-ctor for RInterface.
RInterface< Proxied > Vary(std::string_view colName, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, const std::initializer_list< std::string > &columns)
Append a filter to the call graph.
RInterface< RLoopManager > Cache(std::initializer_list< std::string > columnList)
Save selected columns in memory.
RInterface< Proxied > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a two-dimensional profile (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp="", const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr< RDisplay > Display(std::initializer_list< std::string > columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface(const std::shared_ptr< RLoopManager > &proxied)
Build a RInterface from a RLoopManager.
RResultPtr<::THnSparseD > HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return a sparse N-dimensional histogram (lazy action).
RInterface< Proxied > Redefine(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
std::shared_ptr< Proxied > fProxiedPtr
Smart pointer to the graph node encapsulated by this RInterface.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< Proxied > Vary(std::string_view colName, std::string_view expression, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RResultPtr<::TH1D > Histo1D(std::string_view vName)
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< RDFDetail::RRange< Proxied > > Range(unsigned int begin, unsigned int end, unsigned int stride=1)
Creates a node that filters entries based on range: [begin, end).
RResultPtr< typename std::decay_t< Helper >::Result_t > Book(Helper &&helper, const ColumnNames_t &columns={})
Book execution of a custom action using a user-defined helper object.
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a one-dimensional profile (lazy action).
const std::shared_ptr< Proxied > & GetProxiedPtr() const
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.})
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr< T > Reduce(F f, std::string_view columnName="")
Execute a user-defined reduce operation on the values of a column.
RResultPtr< T > Reduce(F f, std::string_view columnName, const T &redIdentity)
Execute a user-defined reduce operation on the values of a column.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr< RDisplay > Display(std::string_view columnNameRegexp="", size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface & operator=(const RInterface &)=default
Copy-assignment operator for RInterface.
RInterface< Proxied > VaryImpl(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
RResultPtr<::THnSparseD > HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return a sparse N-dimensional histogram (lazy action).
RInterface< Proxied > Define(std::string_view name, std::string_view expression)
Define a new column.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied > > FilterAvailable(std::string_view column)
Discard entries with missing values.
std::enable_if_t<!IsFStringConv &&!IsRetTypeDefConstr, RInterface< Proxied > > DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
RInterface< Proxied > Redefine(std::string_view name, std::string_view expression)
Overwrite the value and/or type of an existing column.
std::vector< std::string > GetFilterNames()
Returns the names of the filters created.
RInterface< RLoopManager > Cache(std::string_view columnNameRegexp="")
Save selected columns in memory.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.}, std::string_view vName="")
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a three-dimensional histogram (lazy action).
friend class RDFInternal::GraphDrawing::GraphCreatorHelper
RInterface< RLoopManager > CacheImpl(const ColumnNames_t &columnList, std::index_sequence< S... >)
Implementation of cache.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a two-dimensional profile (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, std::string_view name)
Append a filter to the call graph.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName="")
Execute a user-defined accumulation operation on the processed column values in each processing slot.
std::enable_if_t< std::is_default_constructible< RetType >::value, RInterface< Proxied > > DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
RInterface(const std::shared_ptr< Proxied > &proxied, RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister)
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
RInterface< Proxied > Alias(std::string_view alias, std::string_view columnName)
Allow referring to a column by a different name.
RResultPtr< RDFDetail::MinReturnType_t< T > > Min(std::string_view columnName="")
Return the minimum of processed column values (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RResultPtr< RCutFlowReport > Report()
Gather filtering statistics.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model)
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a three-dimensional histogram (lazy action).
RResultPtr<::TH1D > Histo1D(std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RInterface< Proxied > DefinePerSample(std::string_view name, std::string_view expression)
Define a new column that is updated when the input sample changes.
RInterface< Proxied > DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot and the current entry.
RResultPtr< std::decay_t< T > > Fill(T &&model, const ColumnNames_t &columnList)
Return an object of type T on which T::Fill will be called once per event (lazy action).
RInterface< Proxied > DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied > > FilterMissing(std::string_view column)
Keep only the entries that have missing values.
RResultPtr< TStatistic > Stats(std::string_view value="")
Return a TStatistic object, filled once per event (lazy action).
RInterface< Proxied > JittedVaryImpl(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName, bool isSingleColumn)
RInterface< Proxied > DefaultValueFor(std::string_view column, const T &defaultValue)
In case the value in the given column is missing, provide a default value.
RResultPtr< TStatistic > Stats(std::string_view value, std::string_view weight)
Return a TStatistic object, filled once per event (lazy action).
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model)
Fill and return a two-dimensional profile (lazy action).
RInterface< Proxied > RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a two-dimensional histogram (lazy action).
RResultPtr< ActionResultType > CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &, const std::shared_ptr< Helper > &, Others...)
RInterface< Proxied > Define(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column.
void ForeachSlot(F f, const ColumnNames_t &columns={})
Execute a user-defined function requiring a processing slot index on each entry (instant action).
RResultPtr<::TGraphAsymmErrors > GraphAsymmErrors(std::string_view x="", std::string_view y="", std::string_view exl="", std::string_view exh="", std::string_view eyl="", std::string_view eyh="")
Fill and return a TGraphAsymmErrors object (lazy action).
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model)
Fill and return a one-dimensional profile (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::initializer_list< std::string > columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RInterface & operator=(RInterface &&)=default
Move-assignment operator for RInterface.
RResultPtr<::TH2D > Histo2D(const TH2DModel &model)
RResultPtr< double > Mean(std::string_view columnName="")
Return the mean of processed column values (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, const ColumnNames_t &columns={}, std::string_view name="")
Append a filter to the call graph.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RInterface< Proxied > DefinePerSample(std::string_view name, F expression)
Define a new column that is updated when the input sample changes.
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RInterface< RDFDetail::RRange< Proxied > > Range(unsigned int end)
Creates a node that filters entries based on range.
RInterface< Proxied > RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
RInterface< RDFDetail::RJittedFilter > Filter(std::string_view expression, std::string_view name="")
Append a filter to the call graph.
RResultPtr< ULong64_t > Count()
Return the number of entries processed (lazy action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a weighted two-dimensional histogram (lazy action).
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated tags.
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return an N-dimensional histogram (lazy action).
RResultPtr< double > StdDev(std::string_view columnName="")
Return the unbiased standard deviation of processed column values (lazy action).
RResultPtr< RDFDetail::SumReturnType_t< T > > Sum(std::string_view columnName="", const RDFDetail::SumReturnType_t< T > &initValue=RDFDetail::SumReturnType_t< T >{})
Return the sum of processed column values (lazy action).
A RDataSource implementation which is built on top of result proxies.
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree,...
const_iterator begin() const
const_iterator end() const
typename RemoveFirstParameter< T >::type RemoveFirstParameter_t
TDirectory::TContext keeps track of and restores the current directory.
Definition TDirectory.h:89
A TGraph is an object made of two arrays X and Y with npoints each.
Definition TGraph.h:41
@ kAllAxes
Definition TH1.h:126
Statistical variable, defined by its mean and variance (RMS).
Definition TStatistic.h:33
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
void CheckForNoVariations(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister)
Throw if the column has systematic variations attached.
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
const std::type_info & TypeName2TypeID(const std::string &name)
Return the type_info associated to a name.
Definition RDFUtils.cxx:86
void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair< ULong64_t, ULong64_t > &&newRange)
std::shared_ptr< RJittedDefine > BookDefinePerSampleJit(std::string_view name, std::string_view expression, RLoopManager &lm, const RColumnRegister &colRegister)
Book the jitting of a DefinePerSample call.
void CheckValidCppVarName(std::string_view var, const std::string &where)
void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the input dataset specification of an RDataFrame.
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:645
void RemoveDuplicates(ColumnNames_t &columnNames)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:191
void CheckSnapshotOptionsFormatCompatibility(const ROOT::RDF::RSnapshotOptions &opts)
void CheckForDefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is not already there.
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string GetDataSourceLabel(const ROOT::RDF::RNode &node)
std::string PrettyPrintAddr(const void *const addr)
std::shared_ptr< RDFDetail::RJittedFilter > BookFilterJit(std::shared_ptr< RDFDetail::RNodeBase > prevNode, std::string_view name, std::string_view expression, const RColumnRegister &colRegister, TTree *tree, RDataSource *ds)
Book the jitting of a Filter call.
void TriggerRun(ROOT::RDF::RNode node)
Trigger the execution of an RDataFrame computation graph.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::pair< std::vector< std::string >, std::vector< std::string > > AddSizeBranches(ROOT::RDF::RDataSource *ds, std::vector< std::string > &&colsWithoutAliases, std::vector< std::string > &&colsWithAliases)
Return copies of colsWithoutAliases and colsWithAliases with size branches for variable-sized array b...
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2RVec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:325
void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline)
void RemoveRNTupleSubfields(ColumnNames_t &columnNames)
ColumnNames_t FilterArraySizeColNames(const ColumnNames_t &columnNames, const std::string &action)
Take a list of column names, return that list with entries starting with '#' filtered out.
std::shared_ptr< RJittedVariation > BookVariationJit(const std::vector< std::string > &colNames, std::string_view variationName, const std::vector< std::string > &variationTags, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister, bool isSingleColumn)
Book the jitting of a Vary call.
void WarnHist()
Warn once about experimental filling of RHist.
Definition RDFUtils.cxx:55
void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
void CheckForRedefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is already there.
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister)
Book the jitting of a Define call.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode
std::vector< std::string > ColumnNames_t
ROOT type_traits extensions.
void EnableImplicitMT(UInt_t numthreads=0)
Enable ROOT's implicit multi-threading for all objects and methods that provide an internal paralleli...
Definition TROOT.cxx:544
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:600
@ kError
An error.
void DisableImplicitMT()
Disables the implicit multi-threading in ROOT (see EnableImplicitMT).
Definition TROOT.cxx:586
A special bin content type to compute the bin error in weighted filling.
type is TypeList if MustRemove is false, otherwise it is a TypeList with the first type removed
Definition Utils.hxx:156
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:347
A collection of options to steer the creation of the dataset on disk through Snapshot().
A struct which stores some basic parameters of a TH1D.
std::shared_ptr<::TH1D > GetHistogram() const
A struct which stores some basic parameters of a TH2D.
std::shared_ptr<::TH2D > GetHistogram() const
A struct which stores some basic parameters of a TH3D.
std::shared_ptr<::TH3D > GetHistogram() const
A struct which stores some basic parameters of a THnD.
std::shared_ptr<::THnD > GetHistogram() const
A struct which stores some basic parameters of a THnSparseD.
std::shared_ptr<::THnSparseD > GetHistogram() const
A struct which stores some basic parameters of a TProfile.
std::shared_ptr<::TProfile > GetProfile() const
A struct which stores some basic parameters of a TProfile2D.
std::shared_ptr<::TProfile2D > GetProfile() const
Lightweight storage for a collection of types.