Logo ROOT  
Reference Guide
RNTupleDS.cxx
Go to the documentation of this file.
1/// \file RNTupleDS.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
18#include <ROOT/RField.hxx>
19#include <ROOT/RFieldValue.hxx>
21#include <ROOT/RNTupleDS.hxx>
22#include <ROOT/RNTupleUtil.hxx>
23#include <ROOT/RPageStorage.hxx>
24#include <ROOT/RStringView.hxx>
25
26#include <TError.h>
27
28#include <string>
29#include <vector>
30#include <typeinfo>
31#include <utility>
32
33namespace ROOT {
34namespace Experimental {
35namespace Internal {
36
37/// An artificial field that transforms an RNTuple column that contains the offset of collections into
38/// collection sizes. It is used to provide the "number of" RDF columns for collections, e.g.
39/// `R_rdf_sizeof_jets` for a collection named `jets`.
40///
41/// This field owns the collection offset field but instead of exposing the collection offsets it exposes
42/// the collection sizes (offset(N+1) - offset(N)). For the time being, we offer this functionality only in RDataFrame.
43/// TODO(jblomer): consider providing a general set of useful virtual fields as part of RNTuple.
45protected:
46 std::unique_ptr<ROOT::Experimental::Detail::RFieldBase> CloneImpl(std::string_view /* newName */) const final
47 {
48 return std::make_unique<RRDFCardinalityField>();
49 }
50
51public:
52 static std::string TypeName() { return "std::size_t"; }
54 : ROOT::Experimental::Detail::RFieldBase("", TypeName(), ENTupleStructure::kLeaf, false /* isSimple */) {}
58
59 // Field is only used for reading
60 void GenerateColumnsImpl() final { R__ASSERT(false && "Cardinality fields must only be used for reading"); }
61
63 {
64 RColumnModel model(EColumnType::kIndex, true /* isSorted*/);
65 fColumns.emplace_back(std::unique_ptr<ROOT::Experimental::Detail::RColumn>(
66 ROOT::Experimental::Detail::RColumn::Create<ClusterSize_t, EColumnType::kIndex>(model, 0)));
67 fPrincipalColumn = fColumns[0].get();
68 }
69
71 {
72 return ROOT::Experimental::Detail::RFieldValue(this, static_cast<std::size_t *>(where));
73 }
75 {
76 return ROOT::Experimental::Detail::RFieldValue(true /* captureFlag */, this, where);
77 }
78 size_t GetValueSize() const final { return sizeof(std::size_t); }
79
80 /// Get the number of elements of the collection identified by globalIndex
81 void
83 {
84 RClusterIndex collectionStart;
86 fPrincipalColumn->GetCollectionInfo(globalIndex, &collectionStart, &size);
87 *value->Get<std::size_t>() = size;
88 }
89
90 /// Get the number of elements of the collection identified by clusterIndex
93 {
94 RClusterIndex collectionStart;
96 fPrincipalColumn->GetCollectionInfo(clusterIndex, &collectionStart, &size);
97 *value->Get<std::size_t>() = size;
98 }
99};
100
101/// Every RDF column is represented by exactly one RNTuple field
106
107 std::unique_ptr<RFieldBase> fField; ///< The field backing the RDF column
108 RFieldValue fValue; ///< The memory location used to read from fField
109 Long64_t fLastEntry; ///< Last entry number that was read
110
111public:
112 RNTupleColumnReader(std::unique_ptr<RFieldBase> f)
113 : fField(std::move(f)), fValue(fField->GenerateValue()), fLastEntry(-1)
114 {
115 }
116 virtual ~RNTupleColumnReader() { fField->DestroyValue(fValue); }
117
118 /// Column readers are created as prototype and then cloned for every slot
119 std::unique_ptr<RNTupleColumnReader> Clone()
120 {
121 return std::make_unique<RNTupleColumnReader>(fField->Clone(fField->GetName()));
122 }
123
124 /// Connect the field and its subfields to the page source
125 void Connect(RPageSource &source)
126 {
127 fField->ConnectPageSource(source);
128 for (auto &f : *fField)
129 f.ConnectPageSource(source);
130 }
131
132 void *GetImpl(Long64_t entry) final
133 {
134 if (entry != fLastEntry) {
135 fField->Read(entry, &fValue);
136 fLastEntry = entry;
137 }
138 return fValue.GetRawPtr();
139 }
140};
141
142} // namespace Internal
143
// The destructor is defaulted here, in the implementation file.
// NOTE(review): presumably this is required because the data source stores
// std::unique_ptr<Internal::RNTupleColumnReader> and that class is only defined in this
// file -- confirm against RNTupleDS.hxx.
144RNTupleDS::~RNTupleDS() = default;
145
147 std::vector<DescriptorId_t> skeinIDs)
148{
149 // As an example for the mapping of RNTuple fields to RDF columns, let's consider an RNTuple
150 // using the following types and with a top-level field named "event" of type Event:
151 //
152 // struct Event {
153 // int id;
154 // std::vector<Track> tracks;
155 // };
156 // struct Track {
157 // std::vector<Hit> hits;
158 // };
159 // struct Hit {
160 // float x;
161 // float y;
162 // };
163 //
164 // AddField() will be called from the constructor with the RNTuple root field (ENTupleStructure::kRecord).
165 // From there, we recurse into the "event" sub field (also ENTupleStructure::kRecord) and further down the
166 // tree of sub fields and expose the following RDF columns:
167 // TODO(jblomer): Collections should be exposed as RVec<T> instead of std::vector<T>
168 //
169 // "event" [Event]
170 // "event.id" [int]
171 // "event.tracks" [std::vector<Track>]
172 // "R_rdf_sizeof_event.tracks" [unsigned int]
173 // "event.tracks.hits" [std::vector<std::vector<Hit>>]
174 // "R_rdf_sizeof_event.tracks.hits" [std::vector<unsigned int>]
175 // "event.tracks.hits.x" [std::vector<std::vector<float>>]
176 // "R_rdf_sizeof_event.tracks.hits.x" [std::vector<unsigned int>]
177 // "event.tracks.hits.y" [std::vector<std::vector<float>>]
178 // "R_rdf_sizeof_event.tracks.hits.y" [std::vector<unsigned int>]
179
180 const auto &fieldDesc = desc.GetFieldDescriptor(fieldId);
181 if (fieldDesc.GetStructure() == ENTupleStructure::kCollection) {
182 // Inner fields of collections are provided as projected collections of only that inner field,
183 // E.g. we provide a projected collection vector<vector<float>> for "event.tracks.hits.x" in the example
184 // above.
185
186 // We open a new collection scope with fieldID being the inner most collection. E.g. for "event.tracks.hits",
187 // skeinIDs would already contain the fieldID of "event.tracks"
188 skeinIDs.emplace_back(fieldId);
189 // There should only be one sub field but it's easiest to access via the sub field range
190 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
191 AddField(desc, colName, f.GetId(), skeinIDs);
192 }
193 // Note that at the end of the recursion, we handled the inner sub collections as well as the
194 // collection as whole, so we are done.
195 return;
196 } else if (fieldDesc.GetStructure() == ENTupleStructure::kRecord) {
197 // Inner fields of records are provided as individual RDF columns, e.g. "event.id"
198 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
199 auto innerName = colName.empty() ? f.GetFieldName() : (std::string(colName) + "." + f.GetFieldName());
200 AddField(desc, innerName, f.GetId(), skeinIDs);
201 }
202 }
203
204 // The fieldID could be the root field or the class of fieldId might not be loaded.
205 // In these cases, only the inner fields are exposed as RDF columns.
206 auto fieldOrException = Detail::RFieldBase::Create("", fieldDesc.GetTypeName());
207 if (!fieldOrException)
208 return;
209 auto valueField = fieldOrException.Unwrap();
210 valueField->SetOnDiskId(fieldId);
211 std::unique_ptr<Detail::RFieldBase> cardinalityField;
212 // Collections get the additional "number of" RDF column (e.g. "R_rdf_sizeof_tracks")
213 if (!skeinIDs.empty()) {
214 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
215 cardinalityField->SetOnDiskId(skeinIDs.back());
216 }
217
218 std::string typeName;
219 for (auto i = skeinIDs.rbegin(); i != skeinIDs.rend(); ++i) {
220 valueField = std::make_unique<ROOT::Experimental::RVectorField>("", std::move(valueField));
221 valueField->SetOnDiskId(*i);
222 // Skip the inner-most collection level to construct the cardinality column
223 if (i != skeinIDs.rbegin()) {
224 cardinalityField = std::make_unique<ROOT::Experimental::RVectorField>("", std::move(cardinalityField));
225 cardinalityField->SetOnDiskId(*i);
226 }
227 }
228
229 if (cardinalityField) {
230 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
231 fColumnTypes.emplace_back(cardinalityField->GetType());
232 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
233 std::move(cardinalityField));
234 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
235 }
236
237 skeinIDs.emplace_back(fieldId);
238 fColumnNames.emplace_back(colName);
239 fColumnTypes.emplace_back(valueField->GetType());
240 auto valColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(std::move(valueField));
241 fColumnReaderPrototypes.emplace_back(std::move(valColReader));
242}
243
244RNTupleDS::RNTupleDS(std::unique_ptr<Detail::RPageSource> pageSource)
245{
246 pageSource->Attach();
247 const auto &descriptor = pageSource->GetDescriptor();
248 fSources.emplace_back(std::move(pageSource));
249
250 AddField(descriptor, "", descriptor.GetFieldZeroId(), std::vector<DescriptorId_t>());
251}
252
254{
255 // This datasource uses the GetColumnReaders2 API instead (better name in the works)
256 return {};
257}
258
259std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
260RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
261{
262 // at this point we can assume that `name` will be found in fColumnNames, RDF is in charge validation
263 // TODO(jblomer): check incoming type
264 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
265 auto clone = fColumnReaderPrototypes[index]->Clone();
266 clone->Connect(*fSources[slot]);
267 return clone;
268}
269
271{
272 return true;
273}
274
275std::vector<std::pair<ULong64_t, ULong64_t>> RNTupleDS::GetEntryRanges()
276{
277 // TODO(jblomer): use cluster boundaries for the entry ranges
278 std::vector<std::pair<ULong64_t, ULong64_t>> ranges;
280 return ranges;
281
282 auto nEntries = fSources[0]->GetNEntries();
283 const auto chunkSize = nEntries / fNSlots;
284 const auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
285 auto start = 0UL;
286 auto end = 0UL;
287 for (auto i : ROOT::TSeqU(fNSlots)) {
288 start = end;
289 end += chunkSize;
290 ranges.emplace_back(start, end);
291 (void)i;
292 }
293 ranges.back().second += reminder;
294 fHasSeenAllRanges = true;
295 return ranges;
296}
297
298std::string RNTupleDS::GetTypeName(std::string_view colName) const
299{
300 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), colName));
301 return fColumnTypes[index];
302}
303
305{
306 return std::find(fColumnNames.begin(), fColumnNames.end(), colName) != fColumnNames.end();
307}
308
310{
311 fHasSeenAllRanges = false;
312}
313
315
316void RNTupleDS::SetNSlots(unsigned int nSlots)
317{
318 R__ASSERT(fNSlots == 0);
319 R__ASSERT(nSlots > 0);
320 fNSlots = nSlots;
321
322 for (unsigned int i = 1; i < fNSlots; ++i) {
323 fSources.emplace_back(fSources[0]->Clone());
324 R__ASSERT(i == (fSources.size() - 1));
325 fSources[i]->Attach();
326 }
327}
328} // namespace Experimental
329} // namespace ROOT
330
332{
333 auto pageSource = ROOT::Experimental::Detail::RPageSource::Create(ntupleName, fileName);
334 ROOT::RDataFrame rdf(std::make_unique<RNTupleDS>(std::move(pageSource)));
335 return rdf;
336}
typedef void(GLAPIENTRYP _GLUfuncptr)(void)
#define f(i)
Definition: RSha256.hxx:104
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition: RtypesCore.h:80
unsigned long long ULong64_t
Definition: RtypesCore.h:81
#define R__ASSERT(e)
Definition: TError.h:118
char name[80]
Definition: TGX11.cxx:110
@ kCollection
Definition: TStructNode.h:21
void GetCollectionInfo(const NTupleSize_t globalIndex, RClusterIndex *collectionStart, ClusterSize_t *collectionSize)
For offset columns only, look at the two adjacent values that define a collection's coordinates.
Definition: RColumn.hxx:213
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName)
Factory method to resurrect a field from the stored on-disk type information.
Definition: RField.cxx:142
std::vector< std::unique_ptr< RColumn > > fColumns
The columns are connected either to a sink or to a source (not to both); they are owned by the field.
Definition: RField.hxx:103
RColumn * fPrincipalColumn
Points into fColumns.
Definition: RField.hxx:101
Abstract interface to read data from an ntuple.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Every RDF column is represented by exactly one RNTuple field.
Definition: RNTupleDS.cxx:102
std::unique_ptr< RFieldBase > fField
The field backing the RDF column.
Definition: RNTupleDS.cxx:107
Long64_t fLastEntry
Last entry number that was read.
Definition: RNTupleDS.cxx:109
std::unique_ptr< RNTupleColumnReader > Clone()
Column readers are created as prototype and then cloned for every slot.
Definition: RNTupleDS.cxx:119
RNTupleColumnReader(std::unique_ptr< RFieldBase > f)
Definition: RNTupleDS.cxx:112
RFieldValue fValue
The memory location used to read from fField.
Definition: RNTupleDS.cxx:108
void Connect(RPageSource &source)
Connect the field and its subfields to the page source.
Definition: RNTupleDS.cxx:125
An artificial field that transforms an RNTuple column that contains the offset of collections into co...
Definition: RNTupleDS.cxx:44
ROOT::Experimental::Detail::RFieldValue CaptureValue(void *where) final
Creates a value from a memory location with an already constructed object.
Definition: RNTupleDS.cxx:74
void GenerateColumnsImpl(const RNTupleDescriptor &) final
Creates the backing columns corresponding to the field type for reading.
Definition: RNTupleDS.cxx:62
RRDFCardinalityField(RRDFCardinalityField &&other)=default
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
Definition: RNTupleDS.cxx:60
void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by globalIndex.
Definition: RNTupleDS.cxx:82
ROOT::Experimental::Detail::RFieldValue GenerateValue(void *where) final
Generates a tree value in a given location of size at least GetValueSize().
Definition: RNTupleDS.cxx:70
RRDFCardinalityField & operator=(RRDFCardinalityField &&other)=default
size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
Definition: RNTupleDS.cxx:78
void ReadInClusterImpl(const ROOT::Experimental::RClusterIndex &clusterIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by clusterIndex.
Definition: RNTupleDS.cxx:91
std::unique_ptr< ROOT::Experimental::Detail::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
Definition: RNTupleDS.cxx:46
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Holds the static meta-data of a column in a tree.
A field translates read and write calls from/to underlying columns to/from tree values.
Definition: RField.hxx:58
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
Definition: RNTupleDS.cxx:260
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< DescriptorId_t > skeinIDs)
Provides the RDF column "colName" given the field identified by fieldID.
Definition: RNTupleDS.cxx:146
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition: RNTupleDS.cxx:316
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RNTupleDS.cxx:270
std::vector< std::unique_ptr< ROOT::Experimental::Internal::RNTupleColumnReader > > fColumnReaderPrototypes
We prepare a column reader prototype for every column.
Definition: RNTupleDS.hxx:52
std::vector< std::unique_ptr< ROOT::Experimental::Detail::RPageSource > > fSources
Clones of the first source, one for each slot.
Definition: RNTupleDS.hxx:47
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
Definition: RNTupleDS.cxx:253
std::vector< std::string > fColumnNames
Definition: RNTupleDS.hxx:53
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition: RNTupleDS.cxx:298
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition: RNTupleDS.cxx:275
RNTupleDS(std::unique_ptr< ROOT::Experimental::Detail::RPageSource > pageSource)
Definition: RNTupleDS.cxx:244
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition: RNTupleDS.cxx:304
void Finalise() final
Convenience method called after concluding an event-loop.
Definition: RNTupleDS.cxx:314
std::vector< std::string > fColumnTypes
Definition: RNTupleDS.hxx:54
void Initialise() final
Convenience method called before starting an event-loop.
Definition: RNTupleDS.cxx:309
The on-storage meta-data of an ntuple.
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTree,...
Definition: RDataFrame.hxx:40
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
basic_string_view< char > string_view
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
Definition: RNTupleUtil.hxx:77
RDataFrame MakeNTupleDataFrame(std::string_view ntupleName, std::string_view fileName)
Definition: RNTupleDS.cxx:331
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
Definition: RNTupleUtil.hxx:67
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Wrap the 32bit integer in a struct in order to avoid template specialization clash with std::uint32_t...
Definition: RNTupleUtil.hxx:80