Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNumpyDS.hxx
Go to the documentation of this file.
1// Author: Stefan Wunsch CERN 04/2019
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// Include Python.h first before any standard header
12#include "Python.h"
13
14#include "ROOT/RDataSource.hxx"
15#include "ROOT/TSeq.hxx"
16#include "ROOT/RVec.hxx"
17
18#include <algorithm>
19#include <map>
20#include <memory>
21#include <tuple>
22#include <string>
23#include <typeinfo>
24#include <utility>
25#include <vector>
26
27#ifndef ROOT_RNUMPYDS
28#define ROOT_RNUMPYDS
29
30namespace ROOT {
31
32namespace Internal {
33
34namespace RDF {
35
36////////////////////////////////////////////////////////////////////////////////////////////////
37/// \brief A RDataSource implementation which takes a collection of RVecs, which
38/// are able to adopt data from Numpy arrays
39///
40/// This component allows to create a data source on a set of columns with data
41/// coming from RVecs. The adoption of externally provided data, e.g., via Numpy
42/// arrays, with RVecs allows to read arbitrary data from memory.
43/// In addition, the data source has to keep a reference on the Python owned data
44/// so that the lifetime of the data is tied to the datasource.
45template <typename... ColumnTypes>
46class RNumpyDS final : public ROOT::RDF::RDataSource {
47 using PointerHolderPtrs_t = std::vector<ROOT::Internal::TDS::TPointerHolder *>;
48
49 std::tuple<ROOT::RVec<ColumnTypes>*...> fColumns;
50 const std::vector<std::string> fColNames;
51 const std::map<std::string, std::string> fColTypesMap;
52 // The role of the fPointerHoldersModels is to be initialised with the pack
53 // of arguments in the constrcutor signature at construction time
54 // Once the number of slots is known, the fPointerHolders are initialised
55 // according to the models.
57 std::vector<PointerHolderPtrs_t> fPointerHolders;
58 std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
59 unsigned int fNSlots{0};
60 // Pointer to PyObject holding RVecs
61 // The RVecs itself hold a reference to the associated Numpy arrays so that
62 // the data cannot go out of scope as long as the datasource survives.
64
65 Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
66 {
67 auto colNameStr = std::string(colName);
68 // This could be optimised and done statically
69 const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id);
70 auto it = fColTypesMap.find(colNameStr);
71 if (fColTypesMap.end() == it) {
72 std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
73 throw std::runtime_error(err);
74 }
75
76 const auto colIdName = it->second;
77 if (colIdName != idName) {
78 std::string err = "Column " + colNameStr + " has type " + colIdName +
79 " while the id specified is associated to type " + idName;
80 throw std::runtime_error(err);
81 }
82
83 const auto colBegin = fColNames.begin();
84 const auto colEnd = fColNames.end();
85 const auto namesIt = std::find(colBegin, colEnd, colName);
86 const auto index = std::distance(colBegin, namesIt);
87
88 Record_t ret(fNSlots);
89 for (auto slot : ROOT::TSeqU(fNSlots)) {
90 ret[slot] = fPointerHolders[index][slot]->GetPointerAddr();
91 }
92 return ret;
93 }
94
95 size_t GetEntriesNumber() { return std::get<0>(fColumns)->size(); }
96 template <std::size_t... S>
97 void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>)
98 {
99 std::initializer_list<int> expander{
100 (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = (*std::get<S>(fColumns))[entry], 0)...};
101 (void)expander; // avoid unused variable warnings
102 }
103
104 template <std::size_t... S>
105 void ColLengthChecker(std::index_sequence<S...>)
106 {
107 if (sizeof...(S) < 2)
108 return;
109
110 const std::vector<size_t> colLengths{std::get<S>(fColumns)->size()...};
111 const auto expectedLen = colLengths[0];
112 std::string err;
113 for (auto i : TSeqI(1, colLengths.size())) {
114 if (expectedLen != colLengths[i]) {
115 err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] +
116 "\" have different lengths: " + std::to_string(expectedLen) + " and " +
117 std::to_string(colLengths[i]);
118 }
119 }
120 if (!err.empty()) {
121 throw std::runtime_error(err);
122 }
123 }
124
125protected:
126 std::string AsString() { return "Numpy data source"; };
127
128public:
130 std::pair<std::string, ROOT::RVec<ColumnTypes>*>... colsNameVals)
131 : fColumns(std::tuple<ROOT::RVec<ColumnTypes>*...>(colsNameVals.second...)),
132 fColNames({colsNameVals.first...}),
133 fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}),
135 fPyRVecs(pyRVecs)
136 {
137 // Take a reference to the data associated with this data source
138 Py_INCREF(fPyRVecs);
139 }
140
142 {
143 for (auto &&ptrHolderv : fPointerHolders) {
144 for (auto &&ptrHolder : ptrHolderv) {
145 delete ptrHolder;
146 }
147 }
148 // Release the data associated to this data source
149 Py_DECREF(fPyRVecs);
150 }
151
152 const std::vector<std::string> &GetColumnNames() const { return fColNames; }
153
154 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()
155 {
156 auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
157 return entryRanges;
158 }
159
160 std::string GetTypeName(std::string_view colName) const
161 {
162 const auto key = std::string(colName);
163 return fColTypesMap.at(key);
164 }
165
166 bool HasColumn(std::string_view colName) const
167 {
168 const auto key = std::string(colName);
169 const auto endIt = fColTypesMap.end();
170 return endIt != fColTypesMap.find(key);
171 }
172
173 bool SetEntry(unsigned int slot, ULong64_t entry)
174 {
175 SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>());
176 return true;
177 }
178
179 void SetNSlots(unsigned int nSlots)
180 {
181 fNSlots = nSlots;
182 const auto nCols = fColNames.size();
183 fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type
184 auto colIndex = 0U;
185 for (auto &&ptrHolderv : fPointerHolders) {
186 for (auto slot : ROOT::TSeqI(fNSlots)) {
187 auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy();
188 ptrHolderv.emplace_back(ptrHolder);
189 (void)slot;
190 }
191 colIndex++;
192 }
193 for (auto &&ptrHolder : fPointerHoldersModels)
194 delete ptrHolder;
195 }
196
198 {
199 ColLengthChecker(std::index_sequence_for<ColumnTypes...>());
200 const auto nEntries = GetEntriesNumber();
201 const auto nEntriesInRange = nEntries / fNSlots; // between integers. Should make smaller?
202 auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
203 fEntryRanges.resize(fNSlots);
204 auto init = 0ULL;
205 auto end = 0ULL;
206 for (auto &&range : fEntryRanges) {
207 end = init + nEntriesInRange;
208 if (0 != reminder) { // Distribute the reminder among the first chunks
209 reminder--;
210 end += 1;
211 }
212 range.first = init;
213 range.second = end;
214 init = end;
215 }
216 }
217
218 std::string GetLabel() { return "RNumpyDS"; }
219};
220
221// Factory to create datasource able to read Numpy arrays through RVecs
222// Note that we have to return the object on the heap so that the interpreter
223// does not clean it up during shutdown and causes a double delete.
224template <typename... ColumnTypes>
226 std::pair<std::string, ROOT::RVec<ColumnTypes>*> &&... colNameProxyPairs)
227{
228 return new RDataFrame(std::make_unique<RNumpyDS<ColumnTypes...>>(
229 std::forward<PyObject*>(pyRVecs),
230 std::forward<std::pair<std::string, ROOT::RVec<ColumnTypes>*>>(colNameProxyPairs)...));
231}
232
233} // namespace RDF
234} // namespace Internal
235} // namespace ROOT
236
237#endif // ROOT_RNUMPYDS
_object PyObject
unsigned long long ULong64_t
Definition RtypesCore.h:81
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
A RDataSource implementation which takes a collection of RVecs, which are able to adopt data from Num...
Definition RNumpyDS.hxx:46
const std::map< std::string, std::string > fColTypesMap
Definition RNumpyDS.hxx:51
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition RNumpyDS.hxx:58
std::vector< PointerHolderPtrs_t > fPointerHolders
Definition RNumpyDS.hxx:57
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition RNumpyDS.hxx:166
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RNumpyDS.hxx:173
std::tuple< ROOT::RVec< ColumnTypes > *... > fColumns
Definition RNumpyDS.hxx:49
void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence< S... >)
Definition RNumpyDS.hxx:97
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition RNumpyDS.hxx:154
std::vector< ROOT::Internal::TDS::TPointerHolder * > PointerHolderPtrs_t
Definition RNumpyDS.hxx:47
void ColLengthChecker(std::index_sequence< S... >)
Definition RNumpyDS.hxx:105
const PointerHolderPtrs_t fPointerHoldersModels
Definition RNumpyDS.hxx:56
const std::vector< std::string > fColNames
Definition RNumpyDS.hxx:50
void Initialize()
Convenience method called before starting an event-loop.
Definition RNumpyDS.hxx:197
RNumpyDS(PyObject *pyRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > * >... colsNameVals)
Definition RNumpyDS.hxx:129
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition RNumpyDS.hxx:179
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition RNumpyDS.hxx:160
std::string GetLabel()
Return a string representation of the datasource type.
Definition RNumpyDS.hxx:218
Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
type-erased vector of pointers to pointers to column values - one per slot
Definition RNumpyDS.hxx:65
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition RNumpyDS.hxx:152
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1492
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:99
RDataFrame * MakeNumpyDataFrame(PyObject *pyRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > * > &&... colNameProxyPairs)
Definition RNumpyDS.hxx:225
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
TSeq< int > TSeqI
Definition TSeq.hxx:203