Logo ROOT  
Reference Guide
RDataSource.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDATASOURCE
12#define ROOT_RDATASOURCE
13
15#include "ROOT/RStringView.hxx"
16#include "ROOT/RConfig.hxx" // R__DEPRECATED
17#include "RtypesCore.h" // ULong64_t
18#include "TError.h" // Warning
19#include "TString.h"
20
21#include <algorithm> // std::transform
22#include <string>
23#include <typeinfo>
24#include <vector>
25
26namespace ROOT {
27namespace RDF {
28class RDataSource;
29}
30}
31
32/// Print a RDataSource at the prompt
33namespace cling {
34std::string printValue(ROOT::RDF::RDataSource *ds);
35} // namespace cling
36
37namespace ROOT {
38
39namespace Internal {
40namespace TDS {
41
42/// Mother class of TTypedPointerHolder. The instances
43/// of this class can be put in a container. Upon destruction,
44/// the correct deletion of the pointer is performed in the
45/// derived class.
47protected:
48 void *fPointer{nullptr};
49
50public:
51 TPointerHolder(void *ptr) : fPointer(ptr) {}
52 void *GetPointer() { return fPointer; }
53 void *GetPointerAddr() { return &fPointer; }
55 virtual ~TPointerHolder(){};
56};
57
58/// Class to wrap a pointer and delete the memory associated to it
59/// correctly
60template <typename T>
61class TTypedPointerHolder final : public TPointerHolder {
62public:
64
66 {
67 const auto typedPtr = static_cast<T *>(fPointer);
68 return new TTypedPointerHolder(new T(*typedPtr));
69 }
70
71 ~TTypedPointerHolder() { delete static_cast<T *>(fPointer); }
72};
73
74} // ns TDS
75} // ns Internal
76
77namespace RDF {
78
79// clang-format off
80/**
81\class ROOT::RDF::RDataSource
82\ingroup dataframe
83\brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
84
85A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure
86methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data formats.
87RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
88for selected columns and to advance the readers to the desired data entry.
89
90The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
91
92 - SetNSlots() : inform RDataSource of the desired level of parallelism
93 - GetColumnReaders() : retrieve from RDataSource per-thread readers for the desired columns
94 - Initialize() : inform RDataSource that an event-loop is about to start
95 - GetEntryRanges() : retrieve from RDataSource a set of ranges of entries that can be processed concurrently
96 - InitSlot() : inform RDataSource that a certain thread is about to start working on a certain range of entries
97 - SetEntry() : inform RDataSource that a certain thread is about to start working on a certain entry
98 - FinalizeSlot() : inform RDataSource that a certain thread finished working on a certain range of entries
99 - Finalize() : inform RDataSource that an event-loop finished
100
101RDataSource implementations must support running multiple event-loops consecutively (although sequentially) on the same dataset.
102 - \b SetNSlots() is called once per RDataSource object, typically when it is associated to a RDataFrame.
103 - \b GetColumnReaders() can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
104 - \b GetEntryRanges() will be called several times, including during an event loop, as additional ranges are needed. It will not be called concurrently.
105 - \b Initialize() and \b Finalize() are called once per event-loop, right before starting and right after finishing.
106 - \b InitSlot(), \b SetEntry(), and \b FinalizeSlot() can be called concurrently from multiple threads, multiple times per event-loop.
107
108 Advanced users that plan to implement a custom RDataSource can check out existing implementations, e.g. RCsvDS or RNTupleDS.
109 See the inheritance diagram below for the full list of existing concrete implementations.
110*/
112 // clang-format on
113private:
114 /// \cond
115 // Temporary boolean value used by the backwards compatibility code for the deprecated spellings Initialise,
116 // Finalise and FinaliseSlot.
117 bool fDeprecatedBaseCalled = false;
118 /// \endcond
119
120protected:
121 using Record_t = std::vector<void *>;
122 friend std::string cling::printValue(::ROOT::RDF::RDataSource *);
123
124 virtual std::string AsString() { return "generic data source"; };
125
126public:
127 virtual ~RDataSource() = default;
128
129 // clang-format off
130 /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
131 /// Slots numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
132 /// pass different slot values when calling methods concurrently.
133 // clang-format on
134 virtual void SetNSlots(unsigned int nSlots) = 0;
135
136 // clang-format off
137 /// \brief Returns a reference to the collection of the dataset's column names
138 // clang-format on
139 virtual const std::vector<std::string> &GetColumnNames() const = 0;
140
141 /// \brief Checks if the dataset has a certain column
142 /// \param[in] colName The name of the column
143 virtual bool HasColumn(std::string_view colName) const = 0;
144
145 // clang-format off
146 /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
147 /// \param[in] colName The name of the column
148 // clang-format on
149 virtual std::string GetTypeName(std::string_view colName) const = 0;
150
151 // clang-format off
152 /// Called at most once per column by RDF. Return vector of pointers to pointers to column values - one per slot.
153 /// \tparam T The type of the data stored in the column
154 /// \param[in] columnName The name of the column
155 ///
156 /// These pointers are veritable cursors: it's a responsibility of the RDataSource implementation that they point to
157 /// the "right" memory region.
158 // clang-format on
159 template <typename T>
160 std::vector<T **> GetColumnReaders(std::string_view columnName)
161 {
162 auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T));
163 std::vector<T **> typedVec(typeErasedVec.size());
164 std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
165 [](void *p) { return static_cast<T **>(p); });
166 return typedVec;
167 }
168
169 /// If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
170 /// \param[in] slot The data processing slot that needs to be considered
171 /// \param[in] name The name of the column for which a column reader needs to be returned
172 /// \param[in] tid A type_info
173 /// At least one of the two must return a non-empty/non-null value.
174 virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
175 GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &)
176 {
177 return {};
178 }
179
180 // clang-format off
181 /// \brief Return ranges of entries to distribute to tasks.
182 /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
183 /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
184 /// This function will be invoked repeatedly by RDataFrame as it needs additional entries to process.
185 /// The same entry range should not be returned more than once.
186 /// Returning an empty collection of ranges signals to RDataFrame that the processing can stop.
187 // clang-format on
188 virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
189
190 // clang-format off
191 /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
192 /// \param[in] slot The data processing slot that needs to be considered
193 /// \param[in] entry The entry which needs to be pointed to by the reader pointers
194 /// Slots are adopted to accommodate parallel data processing.
195 /// Different workers will loop over different ranges and
196 /// will be labelled by different "slot" values.
197 /// Returns *true* if the entry has to be processed, *false* otherwise.
198 // clang-format on
199 virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
200
201 // clang-format off
202 /// \brief Convenience method called before starting an event-loop.
203 /// This method might be called multiple times over the lifetime of a RDataSource, since
204 /// users can run multiple event-loops with the same RDataFrame.
205 /// Ideally, `Initialize` should set the state of the RDataSource so that multiple identical event-loops
206 /// will produce identical results.
207 // clang-format on
208 virtual void Initialize() {}
209
210 /// \cond
211 // Unused deprecated struct, it's here to remind us to remove the deprecated spellings Initialise, Finalise and
212 // FinaliseSlot. PR that removes the deprecated code: https://github.com/root-project/root/pull/9521 .
213 struct R__DEPRECATED(6, 30,
214 "Use Initialize, Finalize and FinalizeSlot instead of the corresponding british spellings.")
215 NeverUsedJustAReminder {
216 };
217
218 virtual void Initialise() { fDeprecatedBaseCalled = true; }
219
220 void CallInitialize()
221 {
222 fDeprecatedBaseCalled = false;
223 Initialise();
224 if (!fDeprecatedBaseCalled) {
225 Warning("RDataSource::Initialise", "Initialise is deprecated. Please rename it to \"Initialize\" (with a z).");
226 return;
227 }
228
229 // `Initialise()` was not overridden, the data source uses the new spelling: good!
230 Initialize();
231 }
232 /// \endcond
233
234 // clang-format off
235 /// \brief Convenience method called at the start of the data processing associated to a slot.
236 /// \param[in] slot The data processing slot which needs to be initialized
237 /// \param[in] firstEntry The first entry of the range that the task will process.
238 /// This method might be called multiple times per thread per event-loop.
239 // clang-format on
240 virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
241
242 // clang-format off
243 /// \brief Convenience method called at the end of the data processing associated to a slot.
244 /// \param[in] slot The data processing slot which needs to be finalized
245 /// This method might be called multiple times per thread per event-loop.
246 // clang-format on
247 virtual void FinalizeSlot(unsigned int /*slot*/) {}
248
249 /// \cond
250 virtual void FinaliseSlot(unsigned int) { fDeprecatedBaseCalled = true; }
251
252 void CallFinalizeSlot(unsigned int slot)
253 {
254 fDeprecatedBaseCalled = false;
255 FinaliseSlot(slot);
256 if (!fDeprecatedBaseCalled) {
257 Warning("RDataSource::FinaliseSlot",
258 "FinaliseSlot is deprecated. Please implement FinalizeSlot (with a z) instead of FinaliseSlot.");
259 return;
260 }
261
262 FinalizeSlot(slot);
263 }
264 /// \endcond
265
266 // clang-format off
267 /// \brief Convenience method called after concluding an event-loop.
268 /// See Initialize for more details.
269 // clang-format on
270 virtual void Finalize() {}
271
272 /// \cond
273 virtual void Finalise() { fDeprecatedBaseCalled = true; }
274
275 void CallFinalize()
276 {
277 fDeprecatedBaseCalled = false;
278 Finalise();
279 if (!fDeprecatedBaseCalled) {
280 Warning("RDataSource::FinaliseSlot",
281 "Finalise is deprecated. Please implement Finalize (with a z) instead of Finalise.");
282 return;
283 }
284
285 Finalize();
286 }
287 /// \endcond
288
289 /// \brief Return a string representation of the datasource type.
290 /// The returned string will be used by ROOT::RDF::SaveGraph() to represent
291 /// the datasource in the visualization of the computation graph.
292 /// Concrete datasources can override the default implementation.
293 virtual std::string GetLabel() { return "Custom Datasource"; }
294
295protected:
296 /// type-erased vector of pointers to pointers to column values - one per slot
297 virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
298};
299
300} // ns RDF
301
302} // ns ROOT
303
304/// Print a RDataSource at the prompt
305namespace cling {
306inline std::string printValue(ROOT::RDF::RDataSource *ds)
307{
308 return ds->AsString();
309}
310} // namespace cling
311
312#endif // ROOT_RDATASOURCE
#define R__DEPRECATED(MAJOR, MINOR, REASON)
Definition: RConfig.hxx:516
unsigned long long ULong64_t
Definition: RtypesCore.h:81
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition: TError.cxx:232
winID h TVirtualViewer3D TVirtualGLPainter p
char name[80]
Definition: TGX11.cxx:110
Mother class of TTypedPointerHolder.
Definition: RDataSource.hxx:46
virtual TPointerHolder * GetDeepCopy()=0
Class to wrap a pointer and delete the memory associated to it correctly.
Definition: RDataSource.hxx:61
TPointerHolder * GetDeepCopy() final
Definition: RDataSource.hxx:65
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
virtual void Finalize()
Convenience method called after concluding an event-loop.
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
virtual void FinalizeSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot.
virtual ~RDataSource()=default
virtual std::string AsString()
virtual bool SetEntry(unsigned int slot, ULong64_t entry)=0
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< void * > Record_t
virtual std::string GetLabel()
Return a string representation of the datasource type.
virtual void SetNSlots(unsigned int nSlots)=0
Inform RDataSource of the number of processing slots (i.e.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
virtual std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()=0
Return ranges of entries to distribute to tasks.
virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &)=0
type-erased vector of pointers to pointers to column values - one per slot
virtual std::string GetTypeName(std::string_view colName) const =0
Type of a column as a string, e.g.
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by RDF.
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &)
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
virtual void Initialize()
Convenience method called before starting an event-loop.
basic_string_view< char > string_view
void(off) SmallVectorTemplateBase< T
double T(double x)
Definition: ChebyshevPol.h:34
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.