Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RVecDS.hxx
Go to the documentation of this file.
1// Author: Stefan Wunsch CERN 04/2019
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/RDataFrame.hxx>
12#include <ROOT/RDataSource.hxx>
13#include <ROOT/RVec.hxx>
14#include <ROOT/TSeq.hxx>
15
16#include <algorithm>
17#include <functional>
18#include <map>
19#include <memory>
20#include <string>
21#include <tuple>
22#include <typeinfo>
23#include <utility>
24#include <vector>
25
26#ifndef ROOT_RVECDS
27#define ROOT_RVECDS
28
29namespace ROOT {
30
31namespace Internal {
32
33namespace RDF {
34
35////////////////////////////////////////////////////////////////////////////////////////////////
36/// \brief A RDataSource implementation which takes a collection of RVecs, which
37/// are able to adopt data from Numpy arrays
38///
39/// This component allows to create a data source on a set of columns with data
40/// coming from RVecs. The adoption of externally provided data, e.g., via Numpy
41/// arrays, with RVecs allows to read arbitrary data from memory.
42/// In addition, the data source has to keep a reference on the Python owned data
43/// so that the lifetime of the data is tied to the datasource.
44template <typename... ColumnTypes>
45class RVecDS final : public ROOT::RDF::RDataSource {
46 using PointerHolderPtrs_t = std::vector<ROOT::Internal::TDS::TPointerHolder *>;
47
48 std::tuple<ROOT::RVec<ColumnTypes>...> fColumns;
49 const std::vector<std::string> fColNames;
50 const std::map<std::string, std::string> fColTypesMap;
51 // The role of the fPointerHoldersModels is to be initialised with the pack
52 // of arguments in the constrcutor signature at construction time
53 // Once the number of slots is known, the fPointerHolders are initialised
54 // according to the models.
56 std::vector<PointerHolderPtrs_t> fPointerHolders;
57 std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
58 unsigned int fNSlots{0};
59 std::function<void()> fDeleteRVecs;
60
61 Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
62 {
63 auto colNameStr = std::string(colName);
64 // This could be optimised and done statically
65 const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id);
66 auto it = fColTypesMap.find(colNameStr);
67 if (fColTypesMap.end() == it) {
68 std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
69 throw std::runtime_error(err);
70 }
71
72 const auto colIdName = it->second;
73 if (colIdName != idName) {
74 std::string err = "Column " + colNameStr + " has type " + colIdName +
75 " while the id specified is associated to type " + idName;
76 throw std::runtime_error(err);
77 }
78
79 const auto colBegin = fColNames.begin();
80 const auto colEnd = fColNames.end();
81 const auto namesIt = std::find(colBegin, colEnd, colName);
82 const auto index = std::distance(colBegin, namesIt);
83
84 Record_t ret(fNSlots);
85 for (auto slot : ROOT::TSeqU(fNSlots)) {
86 ret[slot] = fPointerHolders[index][slot]->GetPointerAddr();
87 }
88 return ret;
89 }
90
91 size_t GetEntriesNumber() { return std::get<0>(fColumns).size(); }
92 template <std::size_t... S>
93 void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>)
94 {
95 std::initializer_list<int> expander{
96 (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = std::get<S>(fColumns)[entry], 0)...};
97 (void)expander; // avoid unused variable warnings
98 }
99
100 template <std::size_t... S>
101 void ColLengthChecker(std::index_sequence<S...>)
102 {
103 if (sizeof...(S) < 2)
104 return;
105
106 const std::vector<size_t> colLengths{std::get<S>(fColumns).size()...};
107 const auto expectedLen = colLengths[0];
108 std::string err;
109 for (auto i : TSeqI(1, colLengths.size())) {
110 if (expectedLen != colLengths[i]) {
111 err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] +
112 "\" have different lengths: " + std::to_string(expectedLen) + " and " +
113 std::to_string(colLengths[i]);
114 }
115 }
116 if (!err.empty()) {
117 throw std::runtime_error(err);
118 }
119 }
120
121protected:
122 std::string AsString() { return "Numpy data source"; };
123
124public:
125 RVecDS(std::function<void()> deleteRVecs, std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colsNameVals)
126 : fColumns(colsNameVals.second...),
127 fColNames{colsNameVals.first...},
128 fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}),
130 fDeleteRVecs(deleteRVecs)
131 {
132 }
133
134 // Rule of five
135 RVecDS(const RVecDS &) = delete;
136 RVecDS &operator=(const RVecDS &) = delete;
137 RVecDS(RVecDS &&) = delete;
138 RVecDS &operator=(RVecDS &&) = delete;
139 ~RVecDS() final
140 {
141 for (auto &&ptrHolderv : fPointerHolders) {
142 for (auto &&ptrHolder : ptrHolderv) {
143 delete ptrHolder;
144 }
145 }
146 // Release the data associated to this data source
147 fDeleteRVecs();
148 }
149
150 const std::vector<std::string> &GetColumnNames() const { return fColNames; }
151
152 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()
153 {
154 auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
155 return entryRanges;
156 }
157
158 std::string GetTypeName(std::string_view colName) const
159 {
160 const auto key = std::string(colName);
161 return fColTypesMap.at(key);
162 }
163
164 bool HasColumn(std::string_view colName) const
165 {
166 const auto key = std::string(colName);
167 const auto endIt = fColTypesMap.end();
168 return endIt != fColTypesMap.find(key);
169 }
170
171 bool SetEntry(unsigned int slot, ULong64_t entry)
172 {
173 SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>());
174 return true;
175 }
176
177 void SetNSlots(unsigned int nSlots)
178 {
179 fNSlots = nSlots;
180 const auto nCols = fColNames.size();
181 fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type
182 auto colIndex = 0U;
183 for (auto &&ptrHolderv : fPointerHolders) {
184 for (auto slot : ROOT::TSeqI(fNSlots)) {
185 auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy();
186 ptrHolderv.emplace_back(ptrHolder);
187 (void)slot;
188 }
189 colIndex++;
190 }
191 for (auto &&ptrHolder : fPointerHoldersModels)
192 delete ptrHolder;
193 }
194
196 {
197 ColLengthChecker(std::index_sequence_for<ColumnTypes...>());
198 const auto nEntries = GetEntriesNumber();
199 const auto nEntriesInRange = nEntries / fNSlots; // between integers. Should make smaller?
200 auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
201 fEntryRanges.resize(fNSlots);
202 auto init = 0ULL;
203 auto end = 0ULL;
204 for (auto &&range : fEntryRanges) {
205 end = init + nEntriesInRange;
206 if (0 != reminder) { // Distribute the reminder among the first chunks
207 reminder--;
208 end += 1;
209 }
210 range.first = init;
211 range.second = end;
212 init = end;
213 }
214 }
215
216 std::string GetLabel() { return "RVecDS"; }
217};
218
219// Factory to create datasource able to read Numpy arrays through RVecs.
220// \param pyRVecs Pointer to PyObject holding RVecs.
221// The RVecs itself hold a reference to the associated Numpy arrays so that
222// the data cannot go out of scope as long as the datasource survives.
223template <typename... ColumnTypes>
224std::unique_ptr<RDataFrame>
225MakeRVecDataFrame(std::function<void()> deleteRVecs,
226 std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colNameProxyPairs)
227{
228 return std::make_unique<RDataFrame>(std::make_unique<RVecDS<ColumnTypes...>>(deleteRVecs, colNameProxyPairs...));
229}
230
231} // namespace RDF
232} // namespace Internal
233} // namespace ROOT
234
235#endif // ROOT_RVECDS
unsigned long long ULong64_t
Definition RtypesCore.h:70
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
A RDataSource implementation which takes a collection of RVecs, which are able to adopt data from Numpy arrays.
Definition RVecDS.hxx:45
RVecDS(RVecDS &&)=delete
Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
type-erased vector of pointers to pointers to column values - one per slot
Definition RVecDS.hxx:61
std::vector< PointerHolderPtrs_t > fPointerHolders
Definition RVecDS.hxx:56
void Initialize()
Convenience method called before starting an event-loop.
Definition RVecDS.hxx:195
RVecDS & operator=(const RVecDS &)=delete
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g. "double".
Definition RVecDS.hxx:158
const PointerHolderPtrs_t fPointerHoldersModels
Definition RVecDS.hxx:55
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition RVecDS.hxx:150
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RVecDS.hxx:171
std::function< void()> fDeleteRVecs
Definition RVecDS.hxx:59
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
Definition RVecDS.hxx:177
std::vector< ROOT::Internal::TDS::TPointerHolder * > PointerHolderPtrs_t
Definition RVecDS.hxx:46
std::tuple< ROOT::RVec< ColumnTypes >... > fColumns
Definition RVecDS.hxx:48
std::string GetLabel()
Return a string representation of the datasource type.
Definition RVecDS.hxx:216
RVecDS & operator=(RVecDS &&)=delete
RVecDS(const RVecDS &)=delete
const std::vector< std::string > fColNames
Definition RVecDS.hxx:49
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition RVecDS.hxx:57
void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence< S... >)
Definition RVecDS.hxx:93
void ColLengthChecker(std::index_sequence< S... >)
Definition RVecDS.hxx:101
RVecDS(std::function< void()> deleteRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colsNameVals)
Definition RVecDS.hxx:125
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition RVecDS.hxx:164
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition RVecDS.hxx:152
const std::map< std::string, std::string > fColTypesMap
Definition RVecDS.hxx:50
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1529
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:119
std::unique_ptr< RDataFrame > MakeRVecDataFrame(std::function< void()> deleteRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colNameProxyPairs)
Definition RVecDS.hxx:225
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
TSeq< int > TSeqI
Definition TSeq.hxx:203