Logo ROOT   6.14/05
Reference Guide
RDataSource.hxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2016, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_RDATASOURCE
12 #define ROOT_RDATASOURCE
13 
14 #include "ROOT/RStringView.hxx"
15 #include "RtypesCore.h" // ULong64_t
16 #include <algorithm> // std::transform
17 #include <vector>
18 #include <typeinfo>
19 
20 namespace ROOT {
21 
22 namespace Internal {
23 namespace TDS {
24 
25 /// Mother class of TTypedPointerHolder. The instances
26 /// of this class can be put in a container. Upon destruction,
27 /// the correct deletion of the pointer is performed in the
28 /// derived class.
30 protected:
31  void *fPointer{nullptr};
32 
33 public:
34  TPointerHolder(void *ptr) : fPointer(ptr) {}
35  void *GetPointer() { return fPointer; }
36  void *GetPointerAddr() { return &fPointer; }
37  virtual TPointerHolder *GetDeepCopy() = 0;
38  virtual ~TPointerHolder(){};
39 };
40 
41 /// Class to wrap a pointer and delete the memory associated to it
42 /// correctly
43 template <typename T>
44 class TTypedPointerHolder final : public TPointerHolder {
45 public:
47 
49  {
50  const auto typedPtr = static_cast<T *>(fPointer);
51  return new TTypedPointerHolder(new T(*typedPtr));
52  }
53 
54  ~TTypedPointerHolder() { delete static_cast<T *>(fPointer); }
55 };
56 
57 } // ns TDS
58 } // ns Internal
59 
60 
61 namespace RDF {
62 
63 // clang-format off
64 /**
65 \class ROOT::RDF::RDataSource
66 \ingroup dataframe
67 \brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
68 
69 A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure virtual
70 methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data format.
71 RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
72 for selected columns and to advance the readers to the desired data entry.
73 
74 The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
75 
76  1. SetNSlots: inform RDataSource of the desired level of parallelism
77  2. GetColumnReaders: retrieve from RDataSource per-thread readers for the desired columns
78  3. Initialise: inform RDataSource that an event-loop is about to start
79  4. GetEntryRanges: retrieve from RDataSource a set of ranges of entries that can be processed concurrently
80  5. InitSlot: inform RDataSource that a certain thread is about to start working on a certain range of entries
81  6. SetEntry: inform RDataSource that a certain thread is about to start working on a certain entry
82  7. FinaliseSlot: inform RDataSource that a certain thread finished working on a certain range of entries
83  8. Finalise: inform RDataSource that an event-loop finished
84 
85 RDataSource implementations must support running multiple event-loops on the same dataset, one after the other (never concurrently).
86  - Method 1 is called once per RDataSource object, typically when it is associated to a RDataFrame.
87  - Method 2 can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
88  - Methods 3,8 are called once per event-loop, right before starting and right after finishing.
89  - Methods 5,6,7 can be called concurrently from multiple threads, multiple times per event-loop.
90 */
91 class RDataSource {
92  // clang-format on
93 protected:
94  using Record_t = std::vector<void *>;
95 
96 public:
97  virtual ~RDataSource() = default;
98 
99  // clang-format off
100  /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
101  /// Slot numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
102  /// pass different slot values when calling methods concurrently.
103  // clang-format on
104  virtual void SetNSlots(unsigned int nSlots) = 0;
105 
106  // clang-format off
107  /// \brief Returns a reference to the collection of the dataset's column names
108  // clang-format on
109  virtual const std::vector<std::string> &GetColumnNames() const = 0;
110 
111  /// \brief Checks if the dataset has a certain column
112  /// \param[in] columnName The name of the column
113  virtual bool HasColumn(std::string_view) const = 0;
114 
115  // clang-format off
116  /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
117  /// \param[in] columnName The name of the column
118  // clang-format on
119  virtual std::string GetTypeName(std::string_view) const = 0;
120 
121  // clang-format off
122  /// Called at most once per column by RDF. Return vector of pointers to pointers to column values - one per slot.
123  /// \tparam T The type of the data stored in the column
124  /// \param[in] columnName The name of the column
125  ///
126  /// These pointers are veritable cursors: it's a responsibility of the RDataSource implementation that they point to
127  /// the "right" memory region.
128  // clang-format on
129  template <typename T>
130  std::vector<T **> GetColumnReaders(std::string_view columnName)
131  {
132  auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T)); // Record_t (std::vector<void *>) supplied by the concrete data source
133  std::vector<T **> typedVec(typeErasedVec.size());
134  std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
135  [](void *p) { return static_cast<T **>(p); }); // cast each type-erased element back to T **; no type check is performed here
136  return typedVec;
137  }
138 
139  // clang-format off
140  /// \brief Return ranges of entries to distribute to tasks.
141  /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
142  /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
143  // clang-format on
144  virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
145 
146  // clang-format off
147  /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
148  /// \param[in] slot The data processing slot that needs to be considered
149  /// \param[in] entry The entry which needs to be pointed to by the reader pointers
150  /// Slots are adopted to accommodate parallel data processing. Different workers will loop over different ranges and
151  /// will be labelled by different "slot" values.
152  /// Returns *true* if the entry has to be processed, *false* otherwise.
153  // clang-format on
154  virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
155 
156  // clang-format off
157  /// \brief Convenience method called before starting an event-loop.
158  /// This method might be called multiple times over the lifetime of a RDataSource, since
159  /// users can run multiple event-loops with the same RDataFrame.
160  /// Ideally, `Initialise` should set the state of the RDataSource so that multiple identical event-loops
161  /// will produce identical results.
162  // clang-format on
163  virtual void Initialise() {}
164 
165  // clang-format off
166  /// \brief Convenience method called at the start of the data processing associated to a slot.
167  /// \param[in] slot The data processing slot which needs to be initialised
168  /// \param[in] firstEntry The first entry of the range that the task will process.
169  /// This method might be called multiple times per thread per event-loop.
170  // clang-format on
171  virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
172 
173  // clang-format off
174  /// \brief Convenience method called at the end of the data processing associated to a slot.
175  /// \param[in] slot The data processing slot which needs to be finalised
176  /// This method might be called multiple times per thread per event-loop.
177  // clang-format on
178  virtual void FinaliseSlot(unsigned int /*slot*/) {}
179 
180  // clang-format off
181  /// \brief Convenience method called after concluding an event-loop.
182  /// See Initialise for more details.
183  // clang-format on
184  virtual void Finalise() {}
185 
186 protected:
187  /// type-erased vector of pointers to pointers to column values - one per slot
188  virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
189 };
190 
191 } // ns RDF
192 
193 } // ns ROOT
194 
195 #endif // ROOT_RDATASOURCE
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
double T(double x)
Definition: ChebyshevPol.h:34
virtual void Initialise()
Convenience method called before starting an event-loop.
virtual TPointerHolder * GetDeepCopy()=0
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
std::vector< void * > Record_t
Definition: RDataSource.hxx:94
unsigned long long ULong64_t
Definition: RtypesCore.h:70
basic_string_view< char > string_view
Definition: RStringView.hxx:35
Class to wrap a pointer and delete the memory associated to it correctly.
Definition: RDataSource.hxx:44
typedef void((*Func_t)())
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by RDF.
virtual void FinaliseSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot. ...
Mother class of TTypedPointerHolder.
Definition: RDataSource.hxx:29
virtual TPointerHolder * GetDeepCopy()
Definition: RDataSource.hxx:48
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Definition: RDataSource.hxx:91
char name[80]
Definition: TGX11.cxx:109
virtual void Finalise()
Convenience method called after concluding an event-loop.