Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RVecDS.hxx
Go to the documentation of this file.
1// Author: Stefan Wunsch CERN 04/2019
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/RDataFrame.hxx>
12#include <ROOT/RDataSource.hxx>
13#include <ROOT/RVec.hxx>
14#include <ROOT/TSeq.hxx>
15
16#include <algorithm>
17#include <any>
18#include <map>
19#include <memory>
20#include <string>
21#include <tuple>
22#include <typeinfo>
23#include <utility>
24#include <vector>
25
26#ifndef ROOT_RVECDS
27#define ROOT_RVECDS
28
29namespace ROOT::Internal::RDF {
30
38
39////////////////////////////////////////////////////////////////////////////////////////////////
40/// \brief A RDataSource implementation which takes a collection of RVecs, which
41/// are able to adopt data from Numpy arrays
42///
43/// This component allows creating a data source on a set of columns with data
44/// coming from RVecs. Adopting externally provided data, e.g., via Numpy
45/// arrays, with RVecs allows reading arbitrary data from memory.
46/// In addition, the data source has to keep a reference on the Python owned data
47/// so that the lifetime of the data is tied to the datasource.
48template <typename... ColumnTypes>
50 using PointerHolderPtrs_t = std::vector<ROOT::Internal::RDF::TPointerHolder *>;
51
52 std::tuple<ROOT::RVec<ColumnTypes>...> fColumns;
53 std::vector<std::string> fColNames;
54 std::unordered_map<std::string, std::string> fColTypesMap;
55 // The role of the fPointerHoldersModels is to be initialised with the pack
56 // of arguments in the constructor signature at construction time.
57 // Once the number of slots is known, the fPointerHolders are initialised
58 // according to the models.
60 std::vector<PointerHolderPtrs_t> fPointerHolders;
61 std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
62 std::any fLifeline;
63
// Type-erased reader hook. Both arguments are ignored and an empty Record_t is
// always returned; this class hands out readers through
// GetColumnReaders(slot, colName, id) instead.
64 Record_t GetColumnReadersImpl(std::string_view, const std::type_info &) { return {}; }
65
// Number of entries in the dataset, taken as the size of the first column
// (tuple element 0). The other columns are not consulted here; their lengths
// are compared against the first one in ColLengthChecker.
66 size_t GetEntriesNumber() { return std::get<0>(fColumns).size(); }
// Copies the value at row `entry` of every column into the object reachable
// through fPointerHolders[column][slot]->GetPointer().
// \param slot  Index of the processing slot whose buffers are updated.
// \param entry Row to read from each column.
// The index pack S enumerates the columns; it is expanded in lockstep with
// ColumnTypes, so column S is written through a ColumnTypes[S] pointer.
// No explicit bounds checks on `slot` or `entry` are performed here.
67 template <std::size_t... S>
68 void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>)
69 {
// Pack-expansion idiom: each initializer performs one assignment and then
// yields 0 through the comma operator; the list itself is never read.
70 std::initializer_list<int> expander{
71 (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = std::get<S>(fColumns)[entry], 0)...};
72 (void)expander; // avoid unused variable warnings
73 }
74
// Verifies that every column has the same number of elements as the first one.
// \throws std::runtime_error if at least one column differs in length from
//         column 0; the message accumulates one fragment per mismatching column.
// With fewer than two columns there is nothing to compare and the function
// returns immediately.
75 template <std::size_t... S>
76 void ColLengthChecker(std::index_sequence<S...>)
77 {
78 if (sizeof...(S) < 2)
79 return;
80
// Collect the sizes of all columns, in column order, so they can be indexed at run time.
81 const std::vector<size_t> colLengths{std::get<S>(fColumns).size()...};
82 const auto expectedLen = colLengths[0];
83 std::string err;
// Start at 1: column 0 is the reference length.
84 for (auto i : TSeqI(1, colLengths.size())) {
85 if (expectedLen != colLengths[i]) {
// NOTE(review): the names are printed as (column i, column 0) but the lengths
// as (column 0, column i), so the message pairs them in opposite order. Fragments
// for several mismatching columns are also appended without any separator.
86 err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] +
87 "\" have different lengths: " + std::to_string(expectedLen) + " and " +
88 std::to_string(colLengths[i]);
89 }
90 }
91 if (!err.empty()) {
92 throw std::runtime_error(err);
93 }
94 }
95
96protected:
// Returns the fixed description string "Numpy data source".
97 std::string AsString() { return "Numpy data source"; };
98
99public:
// Builds the data source from (column name, RVec) pairs.
// \param lifeline     Opaque object stored in fLifeline for as long as this
//                     data source exists.
// \param colsNameVals One pair per column: `first` is the column name,
//                     `second` the RVec holding the column values.
// Initialisation, in order of the member initialisers below:
//  - fColumns is copy-constructed from the RVecs of the pairs;
//  - fColNames receives the names, in argument order;
//  - fColTypesMap maps each name to TypeID2TypeName(typeid(ColumnTypes));
//  - fPointerHoldersModels gets one raw-`new` TTypedPointerHolder per column,
//    each wrapping a default-constructed ColumnTypes (these models are later
//    deep-copied and deleted in SetNSlots);
//  - fLifeline takes the lifeline by move.
// NOTE(review): the declaration of fPointerHoldersModels (listing line 59) is
// missing from this extract; the member order is assumed to match the
// initialiser order shown here.
100 RVecDS(std::any lifeline, std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colsNameVals)
101 : fColumns(colsNameVals.second...),
102 fColNames{colsNameVals.first...},
103 fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}),
104 fPointerHoldersModels({new ROOT::Internal::RDF::TTypedPointerHolder<ColumnTypes>(new ColumnTypes())...}),
105 fLifeline{std::move(lifeline)}
106 {
107 }
108
109 // Rule of five
110 RVecDS(const RVecDS &) = delete;
111 RVecDS &operator=(const RVecDS &) = delete;
112 RVecDS(RVecDS &&) = delete;
113 RVecDS &operator=(RVecDS &&) = delete;
115 {
// NOTE(review): the declaration line for this body (listing line 114) is
// missing from this extract; presumably this is the destructor, the fifth
// member of the "Rule of five" group above - confirm against the real header.
// Releases every per-column, per-slot pointer holder stored in fPointerHolders.
// fPointerHoldersModels is not touched here; those pointers are only deleted
// in SetNSlots.
116 for (auto &&ptrHolderv : fPointerHolders) {
117 for (auto &&ptrHolder : ptrHolderv) {
118 delete ptrHolder;
119 }
120 }
121 }
122
// Creates the column reader for column `colName` in processing slot `slot`.
// \param slot    Slot index; selects which per-slot pointer holder is wrapped.
// \param colName Name of the requested column.
// \param id      Type the caller wants to read the column as.
// \return A RVecDSColumnReader wrapping fPointerHolders[columnIndex][slot].
// \throws std::runtime_error if the column is unknown, if the requested type
//         name differs from the stored one, or if the name is absent from
//         fColNames.
123 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
124 GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &id) final
125 {
126 auto colNameStr = std::string(colName);
127
// First check: the column must be registered in the name -> type-name map.
128 auto it = fColTypesMap.find(colNameStr);
129 if (fColTypesMap.end() == it) {
130 std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
131 throw std::runtime_error(err);
132 }
133
134 const auto &colIdName = it->second;
// NOTE(review): listing line 135, which defines `idName`, is missing from this
// extract; presumably it is the type name derived from `id` - confirm.
136 if (colIdName != idName) {
137 std::string err = "Column " + colNameStr + " has type " + colIdName +
138 " while the id specified is associated to type " + idName;
139 throw std::runtime_error(err);
140 }
141
// The position of the name in fColNames is the column index used for fPointerHolders.
142 if (auto colNameIt = std::find(fColNames.begin(), fColNames.end(), colNameStr); colNameIt != fColNames.end()) {
143 const auto index = std::distance(fColNames.begin(), colNameIt);
144 return std::make_unique<ROOT::Internal::RDF::RVecDSColumnReader>(fPointerHolders[index][slot]);
145 }
146
147 throw std::runtime_error("Could not find column name \"" + colNameStr + "\" in available column names.");
148 }
149
// Returns a const reference to the stored column names (fColNames), in the
// order they were passed to the constructor.
150 const std::vector<std::string> &GetColumnNames() const { return fColNames; }
151
// Hands the entry ranges prepared by Initialize to the caller.
// The ranges are moved out of fEntryRanges, so a second call without a new
// Initialize in between returns an empty vector.
152 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()
153 {
154 auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
155 return entryRanges;
156 }
157
// Returns the type name recorded for column `colName` in fColTypesMap.
// \throws std::out_of_range (from unordered_map::at) if the column is unknown.
158 std::string GetTypeName(std::string_view colName) const
159 {
// The map is keyed by std::string, so the view is materialised for the lookup.
160 const auto key = std::string(colName);
161 return fColTypesMap.at(key);
162 }
163
// Returns true if `colName` is a key of fColTypesMap, false otherwise.
164 bool HasColumn(std::string_view colName) const
165 {
166 const auto key = std::string(colName);
167 const auto endIt = fColTypesMap.end();
168 return endIt != fColTypesMap.find(key);
169 }
170
// Loads row `entry` of every column into the buffers of processing slot `slot`
// by delegating to SetEntryHelper with one index per column.
// \return Always true.
171 bool SetEntry(unsigned int slot, ULong64_t entry)
172 {
173 SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>());
174 return true;
175 }
176
// Records the number of processing slots and allocates the per-slot buffers.
// \param nSlots Number of slots; stored in fNSlots.
// For every column, nSlots deep copies of that column's model pointer holder
// are appended to fPointerHolders[column]; afterwards the models are deleted.
177 void SetNSlots(unsigned int nSlots) final
178 {
179 fNSlots = nSlots;
180 const auto nCols = fColNames.size();
181 fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type
182 auto colIndex = 0U;
183 for (auto &&ptrHolderv : fPointerHolders) {
184 for (auto slot : ROOT::TSeqI(fNSlots)) {
185 auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy();
186 ptrHolderv.emplace_back(ptrHolder);
187 (void)slot;
188 }
189 colIndex++;
190 }
// NOTE(review): the models are deleted but the vector is not cleared, so it
// keeps dangling pointers. A second call to SetNSlots would dereference them
// on the GetDeepCopy line above and delete them again here.
191 for (auto &&ptrHolder : fPointerHoldersModels)
192 delete ptrHolder;
193 }
194
196 {
// NOTE(review): the declaration line for this body (listing line 195,
// `void Initialize()` according to the cross-reference entries of this page)
// is missing from this extract.
// Validates the column lengths, then splits the entries into fNSlots
// consecutive ranges stored in fEntryRanges.
197 ColLengthChecker(std::index_sequence_for<ColumnTypes...>());
198 const auto nEntries = GetEntriesNumber();
// NOTE(review): fNSlots is not declared in this listing; if it is still 0 at
// this point the division and the modulo below are undefined behaviour.
199 const auto nEntriesInRange = nEntries / fNSlots; // between integers. Should make smaller?
// `reminder` holds the remainder of the division: the entries left over after
// giving nEntriesInRange entries to every slot.
200 auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
201 fEntryRanges.resize(fNSlots);
202 auto init = 0ULL;
203 auto end = 0ULL;
// Each range starts where the previous one ended (presumably half-open
// [first, second) - confirm against the RDataSource contract).
204 for (auto &&range : fEntryRanges) {
205 end = init + nEntriesInRange;
206 if (0 != reminder) { // Distribute the reminder among the first chunks
207 reminder--;
208 end += 1;
209 }
210 range.first = init;
211 range.second = end;
212 init = end;
213 }
214 }
215
// Returns the fixed label "RVecDS" identifying this data source type.
216 std::string GetLabel() { return "RVecDS"; }
217};
218
219// Factory to create datasource able to read Numpy arrays through RVecs.
220// \param pyRVecs Pointer to PyObject holding RVecs.
221// The RVecs themselves hold a reference to the associated Numpy arrays so that
222// the data cannot go out of scope as long as the datasource survives.
// \param lifeline          Opaque object forwarded (by move) to the RVecDS
//                          constructor, which stores it for its own lifetime.
// \param colNameProxyPairs One (column name, RVec) pair per column, forwarded
//                          unchanged to the RVecDS constructor.
// \return A heap-allocated RDataFrame constructed from the new RVecDS.
// NOTE(review): the `\param pyRVecs` mentioned in the comment preceding this
// template does not correspond to any parameter of this function.
223template <typename... ColumnTypes>
224std::unique_ptr<RDataFrame>
225MakeRVecDataFrame(std::any lifeline, std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colNameProxyPairs)
226{
227 return std::make_unique<RDataFrame>(
228 std::make_unique<RVecDS<ColumnTypes...>>(std::move(lifeline), colNameProxyPairs...));
229}
230
231} // namespace ROOT::Internal::RDF
232
233#endif // ROOT_RVECDS
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
RVecDSColumnReader(TPointerHolder *ptrHolder)
Definition RVecDS.hxx:36
void * GetImpl(Long64_t) final
Definition RVecDS.hxx:33
A RDataSource implementation which takes a collection of RVecs, which are able to adopt data from Num...
Definition RVecDS.hxx:49
RVecDS(RVecDS &&)=delete
std::vector< PointerHolderPtrs_t > fPointerHolders
Definition RVecDS.hxx:60
void Initialize()
Convenience method called before starting an event-loop.
Definition RVecDS.hxx:195
RVecDS & operator=(const RVecDS &)=delete
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition RVecDS.hxx:158
std::unordered_map< std::string, std::string > fColTypesMap
Definition RVecDS.hxx:54
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition RVecDS.hxx:150
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RVecDS.hxx:171
std::tuple< ROOT::RVec< ColumnTypes >... > fColumns
Definition RVecDS.hxx:52
std::string GetLabel()
Return a string representation of the datasource type.
Definition RVecDS.hxx:216
RVecDS & operator=(RVecDS &&)=delete
std::vector< std::string > fColNames
Definition RVecDS.hxx:53
RVecDS(const RVecDS &)=delete
void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence< S... >)
Definition RVecDS.hxx:68
PointerHolderPtrs_t fPointerHoldersModels
Definition RVecDS.hxx:59
void ColLengthChecker(std::index_sequence< S... >)
Definition RVecDS.hxx:76
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &id) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
Definition RVecDS.hxx:124
Record_t GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition RVecDS.hxx:64
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition RVecDS.hxx:164
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition RVecDS.hxx:152
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition RVecDS.hxx:177
RVecDS(std::any lifeline, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colsNameVals)
Definition RVecDS.hxx:100
std::vector< ROOT::Internal::RDF::TPointerHolder * > PointerHolderPtrs_t
Definition RVecDS.hxx:50
Mother class of TTypedPointerHolder.
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1524
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:191
std::unique_ptr< RDataFrame > MakeRVecDataFrame(std::any lifeline, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colNameProxyPairs)
Definition RVecDS.hxx:225
TSeq< int > TSeqI
Definition TSeq.hxx:203