Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RVecDS.hxx
Go to the documentation of this file.
1// Author: Stefan Wunsch CERN 04/2019
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/RDataFrame.hxx>
12#include <ROOT/RDataSource.hxx>
13#include <ROOT/RVec.hxx>
14#include <ROOT/TSeq.hxx>
15
16#include <algorithm>
17#include <functional>
18#include <map>
19#include <memory>
20#include <string>
21#include <tuple>
22#include <typeinfo>
23#include <utility>
24#include <vector>
25
26#ifndef ROOT_RVECDS
27#define ROOT_RVECDS
28
29namespace ROOT {
30
31namespace Internal {
32
33namespace RDF {
34
35////////////////////////////////////////////////////////////////////////////////////////////////
36/// \brief A RDataSource implementation which takes a collection of RVecs, which
37/// are able to adopt data from Numpy arrays
38///
39/// This component makes it possible to create a data source on a set of columns with data
40/// coming from RVecs. Adopting externally provided data (e.g., Numpy arrays)
41/// with RVecs makes it possible to read arbitrary data from memory.
42/// In addition, the data source has to keep a reference to the Python-owned data
43/// so that the lifetime of the data is tied to the datasource.
44template <typename... ColumnTypes>
45class RVecDS final : public ROOT::RDF::RDataSource {
46 using PointerHolderPtrs_t = std::vector<ROOT::Internal::TDS::TPointerHolder *>;
47
48 std::tuple<ROOT::RVec<ColumnTypes>...> fColumns;
49 const std::vector<std::string> fColNames;
50 const std::map<std::string, std::string> fColTypesMap;
51 // The role of the fPointerHoldersModels is to be initialised with the pack
52 // of arguments in the constructor signature at construction time
53 // Once the number of slots is known, the fPointerHolders are initialised
54 // according to the models.
56 std::vector<PointerHolderPtrs_t> fPointerHolders;
57 std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
58 unsigned int fNSlots{0};
59 std::function<void()> fDeleteRVecs;
60
/// Build the per-slot reader addresses for one column.
///
/// \param colName Name of the requested column; must be a key of fColTypesMap.
/// \param id      type_info of the type the caller wants to read; its name (as produced by
///                TypeID2TypeName) must equal the type name stored for the column.
/// \return A Record_t with fNSlots elements; element `slot` is the value returned by
///         fPointerHolders[index][slot]->GetPointerAddr().
/// \throws std::runtime_error if the column is unknown or the type names differ.
///
/// While fNSlots is still 0 (its in-class default) the loop below is expected not to run and
/// an empty Record_t is returned.
/// NOTE(review): std::runtime_error is used here but <stdexcept> is not among the includes
/// visible at the top of this listing - confirm it arrives transitively.
61 Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
62 {
63 auto colNameStr = std::string(colName);
64 // This could be optimised and done statically
65 const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id);
// Step 1: the column must be registered in the name -> type-name map.
66 auto it = fColTypesMap.find(colNameStr);
67 if (fColTypesMap.end() == it) {
68 std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
69 throw std::runtime_error(err);
70 }
71
// Step 2: the requested type must match the registered one (compared by type-name string).
72 const auto colIdName = it->second;
73 if (colIdName != idName) {
74 std::string err = "Column " + colNameStr + " has type " + colIdName +
75 " while the id specified is associated to type " + idName;
76 throw std::runtime_error(err);
77 }
78
// Step 3: translate the name into its position in fColNames, which is used as the first
// index into fPointerHolders.
79 const auto colBegin = fColNames.begin();
80 const auto colEnd = fColNames.end();
81 const auto namesIt = std::find(colBegin, colEnd, colName);
82 const auto index = std::distance(colBegin, namesIt);
83
// Step 4: collect one address per slot.
84 Record_t ret(fNSlots);
85 for (auto slot : ROOT::TSeqU(fNSlots)) {
86 ret[slot] = fPointerHolders[index][slot]->GetPointerAddr();
87 }
88 return ret;
89 }
90
/// Number of entries, taken as the size of the first RVec stored in fColumns.
/// Only the first column is inspected here; ColLengthChecker is the routine that compares
/// the lengths of all columns.
91 size_t GetEntriesNumber() { return std::get<0>(fColumns).size(); }
/// Copy element `entry` of every column into that column's holder for `slot`.
///
/// \param slot  Second index into fPointerHolders (fPointerHolders[column][slot]).
/// \param entry Index of the element read from each RVec in fColumns.
/// The unnamed std::index_sequence argument only supplies the column indices S...
///
/// The packs ColumnTypes and S are expanded together: for column S the holder's GetPointer()
/// result is cast to ColumnTypes* and assigned std::get<S>(fColumns)[entry].
/// No range check on `slot` or `entry` is visible in this function.
92 template <std::size_t... S>
93 void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>)
94 {
// The braced initializer list forces one evaluation of the assignment per column; the
// trailing ", 0" gives every pack element the int type the list requires.
95 std::initializer_list<int> expander{
96 (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = std::get<S>(fColumns)[entry], 0)...};
97 (void)expander; // avoid unused variable warnings
98 }
99
/// Verify that every column in fColumns has the same length as the first one.
///
/// The unnamed std::index_sequence argument supplies the column indices S...
/// Returns immediately when there are fewer than two columns.
/// \throws std::runtime_error carrying one message per mismatching column.
///
/// NOTE(review): messages for several mismatching columns are appended to `err` with no
/// separator between them.
/// NOTE(review): each message names column i first and column 0 second, but prints the
/// length of column 0 (expectedLen) first and the length of column i second.
/// NOTE(review): colLengths.size() is a size_t passed to TSeqI (TSeq<int> per the index
/// of this page), i.e. an implicit narrowing conversion.
100 template <std::size_t... S>
101 void ColLengthChecker(std::index_sequence<S...>)
102 {
103 if (sizeof...(S) < 2)
104 return;
105
// One size per column, in tuple order.
106 const std::vector<size_t> colLengths{std::get<S>(fColumns).size()...};
107 const auto expectedLen = colLengths[0];
108 std::string err;
// Start at 1: column 0 is the reference the others are compared against.
109 for (auto i : TSeqI(1, colLengths.size())) {
110 if (expectedLen != colLengths[i]) {
111 err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] +
112 "\" have different lengths: " + std::to_string(expectedLen) + " and " +
113 std::to_string(colLengths[i]);
114 }
115 }
116 if (!err.empty()) {
117 throw std::runtime_error(err);
118 }
119 }
120
121protected:
/// Returns the fixed description string "Numpy data source".
122 std::string AsString() { return "Numpy data source"; };
123
124public:
125 RVecDS(std::function<void()> deleteRVecs, std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colsNameVals)
126 : fColumns(colsNameVals.second...),
127 fColNames{colsNameVals.first...},
128 fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}),
130 fDeleteRVecs(deleteRVecs)
131 {
132 }
133
135 {
136 for (auto &&ptrHolderv : fPointerHolders) {
137 for (auto &&ptrHolder : ptrHolderv) {
138 delete ptrHolder;
139 }
140 }
141 // Release the data associated to this data source
142 fDeleteRVecs();
143 }
144
/// Returns a const reference to fColNames, the column names passed to the constructor.
145 const std::vector<std::string> &GetColumnNames() const { return fColNames; }
146
/// Hand the stored entry ranges over to the caller.
///
/// \return The contents of fEntryRanges, as [first, second) pairs of entry numbers.
/// The member is moved from, so the ranges are given out once; per the original inline
/// comment the member is left empty afterwards.
147 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()
148 {
149 auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
150 return entryRanges;
151 }
152
/// Look up the type name registered for a column.
///
/// \param colName Column name; converted to std::string to serve as the map key.
/// \return The type-name string stored in fColTypesMap for that column.
/// \throws std::out_of_range (from std::map::at) if the column is not in the map.
153 std::string GetTypeName(std::string_view colName) const
154 {
155 const auto key = std::string(colName);
156 return fColTypesMap.at(key);
157 }
158
/// Tell whether a column name is registered in fColTypesMap.
///
/// \param colName Column name; converted to std::string to serve as the map key.
/// \return true if the key is found in fColTypesMap, false otherwise.
159 bool HasColumn(std::string_view colName) const
160 {
161 const auto key = std::string(colName);
162 const auto endIt = fColTypesMap.end();
163 return endIt != fColTypesMap.find(key);
164 }
165
/// Load entry `entry` of all columns into the holders of `slot`.
///
/// Forwards to SetEntryHelper with an index sequence covering every column type.
/// \return Always true.
166 bool SetEntry(unsigned int slot, ULong64_t entry)
167 {
168 SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>());
169 return true;
170 }
171
/// Record the number of slots and create one pointer holder per (column, slot).
///
/// \param nSlots Number of slots; stored in fNSlots.
///
/// fPointerHolders is resized to one inner vector per column name, and each inner vector
/// receives fNSlots holders obtained with GetDeepCopy() from that column's entry in
/// fPointerHoldersModels. Afterwards every model holder is deleted.
///
/// NOTE(review): the declaration of fPointerHoldersModels is not present in this listing
/// (the index of this page places it at line 55 as a const PointerHolderPtrs_t).
/// NOTE(review): the models are deleted but their pointers stay in the container, and the
/// inner vectors are appended to rather than reset - a second call would apparently
/// dereference deleted models; confirm this method is only ever called once.
/// NOTE(review): fNSlots is unsigned int while TSeqI is TSeq<int> per the index of this page.
172 void SetNSlots(unsigned int nSlots)
173 {
174 fNSlots = nSlots;
175 const auto nCols = fColNames.size();
176 fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type
177 auto colIndex = 0U;
178 for (auto &&ptrHolderv : fPointerHolders) {
// `slot` is only a loop counter; every slot gets its own deep copy of the column's model.
179 for (auto slot : ROOT::TSeqI(fNSlots)) {
180 auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy();
181 ptrHolderv.emplace_back(ptrHolder);
182 (void)slot;
183 }
184 colIndex++;
185 }
// The models have been copied for every slot; delete them here.
186 for (auto &&ptrHolder : fPointerHoldersModels)
187 delete ptrHolder;
188 }
189
191 {
192 ColLengthChecker(std::index_sequence_for<ColumnTypes...>());
193 const auto nEntries = GetEntriesNumber();
194 const auto nEntriesInRange = nEntries / fNSlots; // between integers. Should make smaller?
195 auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
196 fEntryRanges.resize(fNSlots);
197 auto init = 0ULL;
198 auto end = 0ULL;
199 for (auto &&range : fEntryRanges) {
200 end = init + nEntriesInRange;
201 if (0 != reminder) { // Distribute the reminder among the first chunks
202 reminder--;
203 end += 1;
204 }
205 range.first = init;
206 range.second = end;
207 init = end;
208 }
209 }
210
/// Returns the fixed label "RVecDS".
211 std::string GetLabel() { return "RVecDS"; }
212};
213
214// Factory to create a datasource able to read Numpy arrays through RVecs.
215// \param deleteRVecs Callback forwarded to the RVecDS constructor.
// \param colNameProxyPairs One (column name, RVec) pair per column, forwarded to the
//        RVecDS constructor.
// \return An RDataFrame, held by std::unique_ptr, constructed from a newly created
//         RVecDS<ColumnTypes...>.
216// The RVecs themselves hold a reference to the associated Numpy arrays so that
217// the data cannot go out of scope as long as the datasource survives.
// NOTE(review): this comment previously documented a parameter "pyRVecs" (pointer to a
// PyObject holding RVecs); no parameter of that name exists in the signature below.
218template <typename... ColumnTypes>
219std::unique_ptr<RDataFrame>
220MakeRVecDataFrame(std::function<void()> deleteRVecs,
221 std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colNameProxyPairs)
222{
223 return std::make_unique<RDataFrame>(std::make_unique<RVecDS<ColumnTypes...>>(deleteRVecs, colNameProxyPairs...));
224}
225
226} // namespace RDF
227} // namespace Internal
228} // namespace ROOT
229
230#endif // ROOT_RVECDS
unsigned long long ULong64_t
Definition RtypesCore.h:70
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
A RDataSource implementation which takes a collection of RVecs, which are able to adopt data from Num...
Definition RVecDS.hxx:45
Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
type-erased vector of pointers to pointers to column values - one per slot
Definition RVecDS.hxx:61
std::vector< PointerHolderPtrs_t > fPointerHolders
Definition RVecDS.hxx:56
void Initialize()
Convenience method called before starting an event-loop.
Definition RVecDS.hxx:190
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition RVecDS.hxx:153
const PointerHolderPtrs_t fPointerHoldersModels
Definition RVecDS.hxx:55
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition RVecDS.hxx:145
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RVecDS.hxx:166
std::function< void()> fDeleteRVecs
Definition RVecDS.hxx:59
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition RVecDS.hxx:172
std::vector< ROOT::Internal::TDS::TPointerHolder * > PointerHolderPtrs_t
Definition RVecDS.hxx:46
std::tuple< ROOT::RVec< ColumnTypes >... > fColumns
Definition RVecDS.hxx:48
std::string GetLabel()
Return a string representation of the datasource type.
Definition RVecDS.hxx:211
const std::vector< std::string > fColNames
Definition RVecDS.hxx:49
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition RVecDS.hxx:57
void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence< S... >)
Definition RVecDS.hxx:93
void ColLengthChecker(std::index_sequence< S... >)
Definition RVecDS.hxx:101
RVecDS(std::function< void()> deleteRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colsNameVals)
Definition RVecDS.hxx:125
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition RVecDS.hxx:159
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition RVecDS.hxx:147
const std::map< std::string, std::string > fColTypesMap
Definition RVecDS.hxx:50
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1529
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:119
std::unique_ptr< RDataFrame > MakeRVecDataFrame(std::function< void()> deleteRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colNameProxyPairs)
Definition RVecDS.hxx:220
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
TSeq< int > TSeqI
Definition TSeq.hxx:203