Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleDS.cxx
Go to the documentation of this file.
1/// \file RNTupleDS.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
18#include <ROOT/RField.hxx>
19#include <ROOT/RFieldValue.hxx>
21#include <ROOT/RNTupleDS.hxx>
22#include <ROOT/RNTupleUtil.hxx>
23#include <ROOT/RPageStorage.hxx>
24#include <ROOT/RStringView.hxx>
25
26#include <TError.h>
27
28#include <string>
29#include <vector>
30#include <typeinfo>
31#include <utility>
32
33namespace ROOT {
34namespace Experimental {
35namespace Internal {
36
37/// An artificial field that transforms an RNTuple column that contains the offset of collections into
38/// collection sizes. It is used to provide the "number of" RDF columns for collections, e.g.
39/// `R_rdf_sizeof_jets` for a collection named `jets`.
40///
41/// This field owns the collection offset field but instead of exposing the collection offsets it exposes
42/// the collection sizes (offset(N+1) - offset(N)). For the time being, we offer this functionality only in RDataFrame.
43/// TODO(jblomer): consider providing a general set of useful virtual fields as part of RNTuple.
45protected:
46 std::unique_ptr<ROOT::Experimental::Detail::RFieldBase> CloneImpl(std::string_view /* newName */) const final
47 {
48 return std::make_unique<RRDFCardinalityField>();
49 }
50
51public:
52 static std::string TypeName() { return "std::size_t"; }
54 : ROOT::Experimental::Detail::RFieldBase("", TypeName(), ENTupleStructure::kLeaf, false /* isSimple */) {}
58
59 // Field is only used for reading
60 void GenerateColumnsImpl() final { assert(false && "Cardinality fields must only be used for reading"); }
61
63 {
64 RColumnModel model(EColumnType::kIndex, true /* isSorted*/);
65 fColumns.emplace_back(std::unique_ptr<ROOT::Experimental::Detail::RColumn>(
66 ROOT::Experimental::Detail::RColumn::Create<ClusterSize_t, EColumnType::kIndex>(model, 0)));
67 fPrincipalColumn = fColumns[0].get();
68 }
69
71 {
72 return ROOT::Experimental::Detail::RFieldValue(this, static_cast<std::size_t *>(where));
73 }
75 {
76 return ROOT::Experimental::Detail::RFieldValue(true /* captureFlag */, this, where);
77 }
78 size_t GetValueSize() const final { return sizeof(std::size_t); }
79
80 /// Get the number of elements of the collection identified by globalIndex
81 void
83 {
84 RClusterIndex collectionStart;
86 fPrincipalColumn->GetCollectionInfo(globalIndex, &collectionStart, &size);
87 *value->Get<std::size_t>() = size;
88 }
89
90 /// Get the number of elements of the collection identified by clusterIndex
93 {
94 RClusterIndex collectionStart;
96 fPrincipalColumn->GetCollectionInfo(clusterIndex, &collectionStart, &size);
97 *value->Get<std::size_t>() = size;
98 }
99};
100
101/// Every RDF column is represented by exactly one RNTuple field
106
107 std::unique_ptr<RFieldBase> fField; ///< The field backing the RDF column
108 RFieldValue fValue; ///< The memory location used to read from fField
109 Long64_t fLastEntry; ///< Last entry number that was read
110
111public:
112 RNTupleColumnReader(std::unique_ptr<RFieldBase> f)
113 : fField(std::move(f)), fValue(fField->GenerateValue()), fLastEntry(-1)
114 {
115 }
116 virtual ~RNTupleColumnReader() { fField->DestroyValue(fValue); }
117
118 /// Column readers are created as prototype and then cloned for every slot
119 std::unique_ptr<RNTupleColumnReader> Clone()
120 {
121 return std::make_unique<RNTupleColumnReader>(fField->Clone(fField->GetName()));
122 }
123
124 /// Connect the field and its subfields to the page source
125 void Connect(RPageSource &source)
126 {
127 fField->ConnectPageSource(source);
128 for (auto &f : *fField)
129 f.ConnectPageSource(source);
130 }
131
132 void *GetImpl(Long64_t entry) final
133 {
134 if (entry != fLastEntry) {
135 fField->Read(entry, &fValue);
136 fLastEntry = entry;
137 }
138 return fValue.GetRawPtr();
139 }
140};
141
142} // namespace Internal
143
// Destructor defaulted out of line.
// NOTE(review): presumably it lives here rather than in the header because fColumnReaderPrototypes holds
// std::unique_ptr<Internal::RNTupleColumnReader>, a type that is only defined in this file — confirm
// against RNTupleDS.hxx.
144RNTupleDS::~RNTupleDS() = default;
145
146void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
147 std::vector<DescriptorId_t> skeinIDs)
148{
149 // As an example for the mapping of RNTuple fields to RDF columns, let's consider an RNTuple
150 // using the following types and with a top-level field named "event" of type Event:
151 //
152 // struct Event {
153 // int id;
154 // std::vector<Track> tracks;
155 // };
156 // struct Track {
157 // std::vector<Hit> hits;
158 // };
159 // struct Hit {
160 // float x;
161 // float y;
162 // };
163 //
164 // AddField() will be called from the constructor with the RNTuple root field (ENTupleStructure::kRecord).
165 // From there, we recurse into the "event" sub field (also ENTupleStructure::kRecord) and further down the
166 // tree of sub fields and expose the following RDF columns:
167 // TODO(jblomer): Collections should be exposed as RVec<T> instead of std::vector<T>
168 //
169 // "event" [Event]
170 // "event.id" [int]
171 // "event.tracks" [std::vector<Track>]
172 // "R_rdf_sizeof_event.tracks" [unsigned int]
173 // "event.tracks.hits" [std::vector<std::vector<Hit>>]
174 // "R_rdf_sizeof_event.tracks.hits" [std::vector<unsigned int>]
175 // "event.tracks.hits.x" [std::vector<std::vector<float>>]
176 // "R_rdf_sizeof_event.tracks.hits.x" [std::vector<unsigned int>]
177 // "event.tracks.hits.y" [std::vector<std::vector<float>>]
178 // "R_rdf_sizeof_event.tracks.hits.y" [std::vector<unsigned int>]
179
180 const auto &fieldDesc = desc.GetFieldDescriptor(fieldId);
181 if (fieldDesc.GetStructure() == ENTupleStructure::kCollection) {
182 // Inner fields of collections are provided as projected collections of only that inner field,
183 // E.g. we provide a projected collection vector<vector<float>> for "event.tracks.hits.x" in the example
184 // above.
185
186 // We open a new collection scope with fieldID being the inner most collection. E.g. for "event.tracks.hits",
187 // skeinIDs would already contain the fieldID of "event.tracks"
188 skeinIDs.emplace_back(fieldId);
189
190 if (fieldDesc.GetTypeName().empty()) {
191 // Anonymous collection with one or several sub fields
192 auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
193 cardinalityField->SetOnDiskId(fieldId);
194 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
195 fColumnTypes.emplace_back(cardinalityField->GetType());
196 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
197 std::move(cardinalityField));
198 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
199
200 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
201 AddField(desc, std::string(colName) + "." + f.GetFieldName(), f.GetId(), skeinIDs);
202 }
203 } else {
204 // std::vector or ROOT::RVec with exactly one sub field
205 const auto &f = *desc.GetFieldIterable(fieldDesc.GetId()).begin();
206 AddField(desc, colName, f.GetId(), skeinIDs);
207 }
208 // Note that at the end of the recursion, we handled the inner sub collections as well as the
209 // collection as whole, so we are done.
210 return;
211 } else if (fieldDesc.GetStructure() == ENTupleStructure::kRecord) {
212 // Inner fields of records are provided as individual RDF columns, e.g. "event.id"
213 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
214 auto innerName = colName.empty() ? f.GetFieldName() : (std::string(colName) + "." + f.GetFieldName());
215 AddField(desc, innerName, f.GetId(), skeinIDs);
216 }
217 }
218
219 // The fieldID could be the root field or the class of fieldId might not be loaded.
220 // In these cases, only the inner fields are exposed as RDF columns.
221 auto fieldOrException = Detail::RFieldBase::Create("", fieldDesc.GetTypeName());
222 if (!fieldOrException)
223 return;
224 auto valueField = fieldOrException.Unwrap();
225 valueField->SetOnDiskId(fieldId);
226 std::unique_ptr<Detail::RFieldBase> cardinalityField;
227 // Collections get the additional "number of" RDF column (e.g. "R_rdf_sizeof_tracks")
228 if (!skeinIDs.empty()) {
229 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
230 cardinalityField->SetOnDiskId(skeinIDs.back());
231 }
232
233 for (auto i = skeinIDs.rbegin(); i != skeinIDs.rend(); ++i) {
234 valueField = std::make_unique<ROOT::Experimental::RVectorField>("", std::move(valueField));
235 valueField->SetOnDiskId(*i);
236 // Skip the inner-most collection level to construct the cardinality column
237 if (i != skeinIDs.rbegin()) {
238 cardinalityField = std::make_unique<ROOT::Experimental::RVectorField>("", std::move(cardinalityField));
239 cardinalityField->SetOnDiskId(*i);
240 }
241 }
242
243 if (cardinalityField) {
244 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
245 fColumnTypes.emplace_back(cardinalityField->GetType());
246 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
247 std::move(cardinalityField));
248 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
249 }
250
251 skeinIDs.emplace_back(fieldId);
252 fColumnNames.emplace_back(colName);
253 fColumnTypes.emplace_back(valueField->GetType());
254 auto valColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(std::move(valueField));
255 fColumnReaderPrototypes.emplace_back(std::move(valColReader));
256}
257
258RNTupleDS::RNTupleDS(std::unique_ptr<Detail::RPageSource> pageSource)
259{
260 pageSource->Attach();
261 const auto &descriptor = pageSource->GetDescriptor();
262 fSources.emplace_back(std::move(pageSource));
263
264 AddField(descriptor, "", descriptor.GetFieldZeroId(), std::vector<DescriptorId_t>());
265}
266
267RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* name */, const std::type_info & /* ti */)
268{
269 // This datasource uses the GetColumnReaders2 API instead (better name in the works)
270 return {};
271}
272
273std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
274RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
275{
276 // at this point we can assume that `name` will be found in fColumnNames, RDF is in charge validation
277 // TODO(jblomer): check incoming type
278 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
279 auto clone = fColumnReaderPrototypes[index]->Clone();
280 clone->Connect(*fSources[slot]);
281 return clone;
282}
283
285{
286 return true;
287}
288
289std::vector<std::pair<ULong64_t, ULong64_t>> RNTupleDS::GetEntryRanges()
290{
291 // TODO(jblomer): use cluster boundaries for the entry ranges
292 std::vector<std::pair<ULong64_t, ULong64_t>> ranges;
294 return ranges;
295
296 auto nEntries = fSources[0]->GetNEntries();
297 const auto chunkSize = nEntries / fNSlots;
298 const auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
299 auto start = 0UL;
300 auto end = 0UL;
301 for (auto i : ROOT::TSeqU(fNSlots)) {
302 start = end;
303 end += chunkSize;
304 ranges.emplace_back(start, end);
305 (void)i;
306 }
307 ranges.back().second += reminder;
308 fHasSeenAllRanges = true;
309 return ranges;
310}
311
312std::string RNTupleDS::GetTypeName(std::string_view colName) const
313{
314 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), colName));
315 return fColumnTypes[index];
316}
317
318bool RNTupleDS::HasColumn(std::string_view colName) const
319{
320 return std::find(fColumnNames.begin(), fColumnNames.end(), colName) != fColumnNames.end();
321}
322
324{
325 fHasSeenAllRanges = false;
326}
327
329
330void RNTupleDS::SetNSlots(unsigned int nSlots)
331{
332 R__ASSERT(fNSlots == 0);
333 R__ASSERT(nSlots > 0);
334 fNSlots = nSlots;
335
336 for (unsigned int i = 1; i < fNSlots; ++i) {
337 fSources.emplace_back(fSources[0]->Clone());
338 assert(i == (fSources.size() - 1));
339 fSources[i]->Attach();
340 }
341}
342} // namespace Experimental
343} // namespace ROOT
344
345ROOT::RDataFrame ROOT::Experimental::MakeNTupleDataFrame(std::string_view ntupleName, std::string_view fileName)
346{
347 auto pageSource = ROOT::Experimental::Detail::RPageSource::Create(ntupleName, fileName);
348 ROOT::RDataFrame rdf(std::make_unique<RNTupleDS>(std::move(pageSource)));
349 return rdf;
350}
typedef void(GLAPIENTRYP _GLUfuncptr)(void)
#define f(i)
Definition RSha256.hxx:104
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
#define R__ASSERT(e)
Definition TError.h:118
char name[80]
Definition TGX11.cxx:110
void GetCollectionInfo(const NTupleSize_t globalIndex, RClusterIndex *collectionStart, ClusterSize_t *collectionSize)
For offset columns only, look at the two adjacent values that define a collection's coordinates.
Definition RColumn.hxx:264
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:149
std::vector< std::unique_ptr< RColumn > > fColumns
The columns are connected either to a sink or to a source (not to both); they are owned by the field.
Definition RField.hxx:103
RColumn * fPrincipalColumn
Points into fColumns.
Definition RField.hxx:101
Abstract interface to read data from an ntuple.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Every RDF column is represented by exactly one RNTuple field.
std::unique_ptr< RFieldBase > fField
The field backing the RDF column.
Long64_t fLastEntry
Last entry number that was read.
std::unique_ptr< RNTupleColumnReader > Clone()
Column readers are created as prototype and then cloned for every slot.
RNTupleColumnReader(std::unique_ptr< RFieldBase > f)
RFieldValue fValue
The memory location used to read from fField.
void Connect(RPageSource &source)
Connect the field and its subfields to the page source.
An artificial field that transforms an RNTuple column that contains the offset of collections into co...
Definition RNTupleDS.cxx:44
ROOT::Experimental::Detail::RFieldValue CaptureValue(void *where) final
Creates a value from a memory location with an already constructed object.
Definition RNTupleDS.cxx:74
void GenerateColumnsImpl(const RNTupleDescriptor &) final
Creates the backing columns corresponding to the field type for reading.
Definition RNTupleDS.cxx:62
RRDFCardinalityField(RRDFCardinalityField &&other)=default
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
Definition RNTupleDS.cxx:60
void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by globalIndex.
Definition RNTupleDS.cxx:82
ROOT::Experimental::Detail::RFieldValue GenerateValue(void *where) final
Generates a tree value in a given location of size at least GetValueSize().
Definition RNTupleDS.cxx:70
RRDFCardinalityField & operator=(RRDFCardinalityField &&other)=default
size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
Definition RNTupleDS.cxx:78
void ReadInClusterImpl(const ROOT::Experimental::RClusterIndex &clusterIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by clusterIndex.
Definition RNTupleDS.cxx:91
std::unique_ptr< ROOT::Experimental::Detail::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
Definition RNTupleDS.cxx:46
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Holds the static meta-data of a column in a tree.
A field translates read and write calls from/to underlying columns to/from tree values.
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< DescriptorId_t > skeinIDs)
Provides the RDF column "colName" given the field identified by fieldID.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< std::unique_ptr< ROOT::Experimental::Internal::RNTupleColumnReader > > fColumnReaderPrototypes
We prepare a column reader prototype for every column.
Definition RNTupleDS.hxx:52
std::vector< std::unique_ptr< ROOT::Experimental::Detail::RPageSource > > fSources
Clones of the first source, one for each slot.
Definition RNTupleDS.hxx:47
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
std::vector< std::string > fColumnNames
Definition RNTupleDS.hxx:53
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
RNTupleDS(std::unique_ptr< ROOT::Experimental::Detail::RPageSource > pageSource)
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void Finalise() final
Convenience method called after concluding an event-loop.
std::vector< std::string > fColumnTypes
Definition RNTupleDS.hxx:54
void Initialise() final
Convenience method called before starting an event-loop.
The on-storage meta-data of an ntuple.
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTree,...
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
RDataFrame MakeNTupleDataFrame(std::string_view ntupleName, std::string_view fileName)
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:202
Wrap the 32bit integer in a struct in order to avoid template specialization clash with std::uint32_t...