Logo ROOT  
Reference Guide
RNumpyDS.hxx
Go to the documentation of this file.
1 // Author: Stefan Wunsch CERN 04/2019
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
// Include Python.h first before any standard header
#include "Python.h"

#include "ROOT/RIntegerSequence.hxx"
#include "ROOT/RMakeUnique.hxx"
#include "ROOT/RDataSource.hxx"
#include "ROOT/TSeq.hxx"
#include "ROOT/RVec.hxx"

#include <algorithm>
#include <cstddef>
#include <initializer_list>
#include <iterator>
#include <map>
#include <stdexcept>
#include <string>
#include <tuple>
#include <typeinfo>
#include <utility>
#include <vector>
26 
27 #ifndef ROOT_RNUMPYDS
28 #define ROOT_RNUMPYDS
29 
30 namespace ROOT {
31 
32 namespace Internal {
33 
34 namespace RDF {
35 
36 ////////////////////////////////////////////////////////////////////////////////////////////////
37 /// \brief A RDataSource implementation which takes a collection of RVecs, which
38 /// are able to adopt data from Numpy arrays
39 ///
40 /// This component allows to create a data source on a set of columns with data
41 /// coming from RVecs. The adoption of externally provided data, e.g., via Numpy
42 /// arrays, with RVecs allows to read arbitrary data from memory.
43 /// In addition, the data source has to keep a reference on the Python owned data
44 /// so that the lifetime of the data is tied to the datasource.
45 template <typename... ColumnTypes>
46 class RNumpyDS final : public ROOT::RDF::RDataSource {
47  using PointerHolderPtrs_t = std::vector<ROOT::Internal::TDS::TPointerHolder *>;
48 
49  std::tuple<ROOT::RVec<ColumnTypes>*...> fColumns;
50  const std::vector<std::string> fColNames;
51  const std::map<std::string, std::string> fColTypesMap;
52  // The role of the fPointerHoldersModels is to be initialised with the pack
53  // of arguments in the constrcutor signature at construction time
54  // Once the number of slots is known, the fPointerHolders are initialised
55  // according to the models.
57  std::vector<PointerHolderPtrs_t> fPointerHolders;
58  std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
59  unsigned int fNSlots{0};
60  // Pointer to PyObject holding RVecs
61  // The RVecs itself hold a reference to the associated Numpy arrays so that
62  // the data cannot go out of scope as long as the datasource survives.
64 
65  Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
66  {
67  auto colNameStr = std::string(colName);
68  // This could be optimised and done statically
69  const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id);
70  auto it = fColTypesMap.find(colNameStr);
71  if (fColTypesMap.end() == it) {
72  std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
73  throw std::runtime_error(err);
74  }
75 
76  const auto colIdName = it->second;
77  if (colIdName != idName) {
78  std::string err = "Column " + colNameStr + " has type " + colIdName +
79  " while the id specified is associated to type " + idName;
80  throw std::runtime_error(err);
81  }
82 
83  const auto colBegin = fColNames.begin();
84  const auto colEnd = fColNames.end();
85  const auto namesIt = std::find(colBegin, colEnd, colName);
86  const auto index = std::distance(colBegin, namesIt);
87 
88  Record_t ret(fNSlots);
89  for (auto slot : ROOT::TSeqU(fNSlots)) {
90  ret[slot] = fPointerHolders[index][slot]->GetPointerAddr();
91  }
92  return ret;
93  }
94 
95  size_t GetEntriesNumber() { return std::get<0>(fColumns)->size(); }
96  template <std::size_t... S>
97  void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>)
98  {
99  std::initializer_list<int> expander{
100  (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = (*std::get<S>(fColumns))[entry], 0)...};
101  (void)expander; // avoid unused variable warnings
102  }
103 
104  template <std::size_t... S>
105  void ColLenghtChecker(std::index_sequence<S...>)
106  {
107  if (sizeof...(S) < 2)
108  return;
109 
110  const std::vector<size_t> colLengths{std::get<S>(fColumns)->size()...};
111  const auto expectedLen = colLengths[0];
112  std::string err;
113  for (auto i : TSeqI(1, colLengths.size())) {
114  if (expectedLen != colLengths[i]) {
115  err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] +
116  "\" have different lengths: " + std::to_string(expectedLen) + " and " +
117  std::to_string(colLengths[i]);
118  }
119  }
120  if (!err.empty()) {
121  throw std::runtime_error(err);
122  }
123  }
124 
125 protected:
126  std::string AsString() { return "Numpy data source"; };
127 
128 public:
129  RNumpyDS(PyObject* pyRVecs,
130  std::pair<std::string, ROOT::RVec<ColumnTypes>*>... colsNameVals)
131  : fColumns(std::tuple<ROOT::RVec<ColumnTypes>*...>(colsNameVals.second...)),
132  fColNames({colsNameVals.first...}),
133  fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}),
135  fPyRVecs(pyRVecs)
136  {
137  // Take a reference to the data associated with this data source
138  Py_INCREF(fPyRVecs);
139  }
140 
141  ~RNumpyDS()
142  {
143  for (auto &&ptrHolderv : fPointerHolders) {
144  for (auto &&ptrHolder : ptrHolderv) {
145  delete ptrHolder;
146  }
147  }
148  // Release the data associated to this data source
149  Py_DECREF(fPyRVecs);
150  }
151 
152  const std::vector<std::string> &GetColumnNames() const { return fColNames; }
153 
154  std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()
155  {
156  auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
157  return entryRanges;
158  }
159 
160  std::string GetTypeName(std::string_view colName) const
161  {
162  const auto key = std::string(colName);
163  return fColTypesMap.at(key);
164  }
165 
166  bool HasColumn(std::string_view colName) const
167  {
168  const auto key = std::string(colName);
169  const auto endIt = fColTypesMap.end();
170  return endIt != fColTypesMap.find(key);
171  }
172 
173  bool SetEntry(unsigned int slot, ULong64_t entry)
174  {
175  SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>());
176  return true;
177  }
178 
179  void SetNSlots(unsigned int nSlots)
180  {
181  fNSlots = nSlots;
182  const auto nCols = fColNames.size();
183  fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type
184  auto colIndex = 0U;
185  for (auto &&ptrHolderv : fPointerHolders) {
186  for (auto slot : ROOT::TSeqI(fNSlots)) {
187  auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy();
188  ptrHolderv.emplace_back(ptrHolder);
189  (void)slot;
190  }
191  colIndex++;
192  }
193  for (auto &&ptrHolder : fPointerHoldersModels)
194  delete ptrHolder;
195  }
196 
197  void Initialise()
198  {
199  ColLenghtChecker(std::index_sequence_for<ColumnTypes...>());
200  const auto nEntries = GetEntriesNumber();
201  const auto nEntriesInRange = nEntries / fNSlots; // between integers. Should make smaller?
202  auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
204  auto init = 0ULL;
205  auto end = 0ULL;
206  for (auto &&range : fEntryRanges) {
207  end = init + nEntriesInRange;
208  if (0 != reminder) { // Distribute the reminder among the first chunks
209  reminder--;
210  end += 1;
211  }
212  range.first = init;
213  range.second = end;
214  init = end;
215  }
216  }
217 
218  std::string GetLabel() { return "RNumpyDS"; }
219 };
220 
221 // Factory to create datasource able to read Numpy arrays through RVecs
222 // Note that we have to return the object on the heap so that the interpreter
223 // does not clean it up during shutdown and causes a double delete.
224 template <typename... ColumnTypes>
226  std::pair<std::string, ROOT::RVec<ColumnTypes>*> &&... colNameProxyPairs)
227 {
228  return new RDataFrame(std::make_unique<RNumpyDS<ColumnTypes...>>(
229  std::forward<PyObject*>(pyRVecs),
230  std::forward<std::pair<std::string, ROOT::RVec<ColumnTypes>*>>(colNameProxyPairs)...));
231 }
232 
233 } // namespace RDF
234 } // namespace Internal
235 } // namespace ROOT
236 
237 #endif // ROOT_RNUMPYDS
ROOT::Internal::RDF::RNumpyDS::fColNames
const std::vector< std::string > fColNames
Definition: RNumpyDS.hxx:74
ROOT::Internal::RDF::RNumpyDS::SetNSlots
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots.
Definition: RNumpyDS.hxx:203
PyObject
_object PyObject
Definition: PyMethodBase.h:42
ROOT::Internal::RDF::RNumpyDS::fPyRVecs
PyObject * fPyRVecs
Definition: RNumpyDS.hxx:87
ROOT::Internal::RDF::RNumpyDS::Initialise
void Initialise()
Convenience method called before starting an event-loop.
Definition: RNumpyDS.hxx:221
ROOT::Internal::RDF::RNumpyDS::GetColumnReadersImpl
Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RNumpyDS.hxx:89
string_view
basic_string_view< char > string_view
Definition: libcpp_string_view.h:785
BatchHelpers::init
EvaluateInfo init(std::vector< RooRealProxy > parameters, std::vector< ArrayWrapper * > wrappers, std::vector< double * > arrays, size_t begin, size_t batchSize)
ROOT::TSeqI
TSeq< int > TSeqI
Definition: TSeq.hxx:194
ROOT::Internal::RDF::RNumpyDS::~RNumpyDS
~RNumpyDS()
Definition: RNumpyDS.hxx:165
RooFitShortHand::S
RooArgSet S(const RooAbsArg &v1)
Definition: RooGlobalFunc.cxx:348
RVec.hxx
ROOT::RDataFrame
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Definition: RDataFrame.hxx:42
ROOT::RDF::RDataSource
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Definition: RDataSource.hxx:105
ROOT::Internal::RDF::RNumpyDS
A RDataSource implementation which takes a collection of RVecs, which are able to adopt data from Numpy arrays.
Definition: RNumpyDS.hxx:64
ROOT::Internal::RDF::RNumpyDS::fPointerHolders
std::vector< PointerHolderPtrs_t > fPointerHolders
Definition: RNumpyDS.hxx:81
RDataSource.hxx
RIntegerSequence.hxx
ROOT::Internal::RDF::RNumpyDS::PointerHolderPtrs_t
std::vector< ROOT::Internal::TDS::TPointerHolder * > PointerHolderPtrs_t
Definition: RNumpyDS.hxx:71
TSeq.hxx
ROOT::Internal::RDF::RNumpyDS::GetLabel
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RNumpyDS.hxx:242
ROOT::Internal::RDF::MakeNumpyDataFrame
RDataFrame * MakeNumpyDataFrame(PyObject *pyRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > * > &&... colNameProxyPairs)
Definition: RNumpyDS.hxx:243
ROOT::Internal::RDF::RNumpyDS::fEntryRanges
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition: RNumpyDS.hxx:82
ROOT::Internal::RDF::RNumpyDS::SetEntryHelper
void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence< S... >)
Definition: RNumpyDS.hxx:121
ROOT::Internal::RDF::RNumpyDS::ColLenghtChecker
void ColLenghtChecker(std::index_sequence< S... >)
Definition: RNumpyDS.hxx:129
ROOT::Internal::TDS::TTypedPointerHolder
Class to wrap a pointer and delete the memory associated to it correctly.
Definition: RDataSource.hxx:58
ROOT::Internal::RDF::RNumpyDS::fColumns
std::tuple< ROOT::RVec< ColumnTypes > *... > fColumns
Definition: RNumpyDS.hxx:73
void
typedef void((*Func_t)())
ROOT::Internal::RDF::RNumpyDS::GetTypeName
std::string GetTypeName(std::string_view colName) const
Type of a column as a string.
Definition: RNumpyDS.hxx:184
ULong64_t
unsigned long long ULong64_t
Definition: RtypesCore.h:74
ROOT::RDF::RDataSource::Record_t
std::vector< void * > Record_t
Definition: RDataSource.hxx:108
ROOT::Internal::RDF::RNumpyDS::GetColumnNames
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RNumpyDS.hxx:176
ROOT::Internal::RDF::RNumpyDS::HasColumn
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RNumpyDS.hxx:190
TMVA::DNN::forward
void forward(const LAYERDATA &prevLayerData, LAYERDATA &currLayerData)
apply the weights (and functions) in forward direction of the DNN
Definition: NeuralNet.icc:546
ROOT::Internal::RDF::RNumpyDS::RNumpyDS
RNumpyDS(PyObject *pyRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > * >... colsNameVals)
Definition: RNumpyDS.hxx:153
RMakeUnique.hxx
ROOT::TSeq
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
ROOT::Internal::RDF::RNumpyDS::SetEntry
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RNumpyDS.hxx:197
ROOT::Internal::RDF::RNumpyDS::fColTypesMap
const std::map< std::string, std::string > fColTypesMap
Definition: RNumpyDS.hxx:75
ROOT::Internal::RDF::TypeID2TypeName
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition: RDFUtils.cxx:84
TGeant4Unit::second
static constexpr double second
Definition: TGeant4SystemOfUnits.h:157
ROOT
VSD Structures.
Definition: StringConv.hxx:21
ROOT::VecOps::RVec
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition: RVec.hxx:55
ROOT::Internal::RDF::RNumpyDS::AsString
std::string AsString()
Definition: RNumpyDS.hxx:150
ROOT::Internal::RDF::RNumpyDS::fNSlots
unsigned int fNSlots
Definition: RNumpyDS.hxx:83
ROOT::Internal::RDF::RNumpyDS::GetEntriesNumber
size_t GetEntriesNumber()
Definition: RNumpyDS.hxx:119
ROOT::Internal::RDF::RNumpyDS::GetEntryRanges
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RNumpyDS.hxx:178
ROOT::Internal::RDF::RNumpyDS::fPointerHoldersModels
const PointerHolderPtrs_t fPointerHoldersModels
Definition: RNumpyDS.hxx:80