Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleDS.cxx
Go to the documentation of this file.
1/// \file RNTupleDS.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
18#include <ROOT/RField.hxx>
19#include <ROOT/RFieldValue.hxx>
20#include <ROOT/RNTuple.hxx>
22#include <ROOT/RNTupleDS.hxx>
23#include <ROOT/RNTupleUtil.hxx>
24#include <ROOT/RPageStorage.hxx>
25#include <ROOT/RStringView.hxx>
26
27#include <TError.h>
28
29#include <string>
30#include <vector>
31#include <typeinfo>
32#include <utility>
33
34// clang-format off
35/**
36* \class ROOT::Experimental::RNTupleDS
37* \ingroup dataframe
38* \brief The RDataSource implementation for RNTuple. It lets RDataFrame read RNTuple data.
39*
40* An RDataFrame that reads RNTuple data can be constructed using FromRNTuple().
41*
42* For each column containing an array or a collection, a corresponding column `#colname` (registered by this data source as
43* `R_rdf_sizeof_colname`) is available to access `colname.size()` without reading and deserializing the collection values.
44*
45**/
46// clang-format on
47
48namespace ROOT {
49namespace Experimental {
50namespace Internal {
51
52/// An artificial field that transforms an RNTuple column that contains the offset of collections into
53/// collection sizes. It is used to provide the "number of" RDF columns for collections, e.g.
54/// `R_rdf_sizeof_jets` for a collection named `jets`.
55///
56/// This field owns the collection offset field but instead of exposing the collection offsets it exposes
57/// the collection sizes (offset(N+1) - offset(N)). For the time being, we offer this functionality only in RDataFrame.
58/// TODO(jblomer): consider providing a general set of useful virtual fields as part of RNTuple.
60protected:
61 std::unique_ptr<ROOT::Experimental::Detail::RFieldBase> CloneImpl(std::string_view /* newName */) const final
62 {
63 return std::make_unique<RRDFCardinalityField>();
64 }
65
66public:
/// Name of the C++ type reported for the cardinality column.
static std::string TypeName()
{
   return std::string("std::size_t");
}
69 : ROOT::Experimental::Detail::RFieldBase("", TypeName(), ENTupleStructure::kLeaf, false /* isSimple */) {}
73
75 {
76 static RColumnRepresentations representations(
78 {});
79 return representations;
80 }
81 // Field is only used for reading
82 void GenerateColumnsImpl() final { assert(false && "Cardinality fields must only be used for reading"); }
83 void GenerateColumnsImpl(const RNTupleDescriptor &desc) final
84 {
85 auto onDiskTypes = EnsureCompatibleColumnTypes(desc);
86 fColumns.emplace_back(
87 ROOT::Experimental::Detail::RColumn::Create<ClusterSize_t>(RColumnModel(onDiskTypes[0]), 0));
88 }
89
91 {
92 return ROOT::Experimental::Detail::RFieldValue(this, static_cast<std::size_t *>(where));
93 }
95 {
96 return ROOT::Experimental::Detail::RFieldValue(true /* captureFlag */, this, where);
97 }
98 size_t GetValueSize() const final { return sizeof(std::size_t); }
99 size_t GetAlignment() const final { return alignof(std::size_t); }
100
101 /// Get the number of elements of the collection identified by globalIndex
102 void
104 {
105 RClusterIndex collectionStart;
107 fPrincipalColumn->GetCollectionInfo(globalIndex, &collectionStart, &size);
108 *value->Get<std::size_t>() = size;
109 }
110
111 /// Get the number of elements of the collection identified by clusterIndex
114 {
115 RClusterIndex collectionStart;
117 fPrincipalColumn->GetCollectionInfo(clusterIndex, &collectionStart, &size);
118 *value->Get<std::size_t>() = size;
119 }
120};
121
122/// Every RDF column is represented by exactly one RNTuple field
127
128 std::unique_ptr<RFieldBase> fField; ///< The field backing the RDF column
129 RFieldValue fValue; ///< The memory location used to read from fField
130 Long64_t fLastEntry; ///< Last entry number that was read
131
132public:
133 RNTupleColumnReader(std::unique_ptr<RFieldBase> f)
134 : fField(std::move(f)), fValue(fField->GenerateValue()), fLastEntry(-1)
135 {
136 }
137 ~RNTupleColumnReader() { fField->DestroyValue(fValue); }
138
139 /// Column readers are created as prototype and then cloned for every slot
140 std::unique_ptr<RNTupleColumnReader> Clone()
141 {
142 return std::make_unique<RNTupleColumnReader>(fField->Clone(fField->GetName()));
143 }
144
145 /// Connect the field and its subfields to the page source
146 void Connect(RPageSource &source)
147 {
148 fField->ConnectPageSource(source);
149 for (auto &f : *fField)
150 f.ConnectPageSource(source);
151 }
152
153 void *GetImpl(Long64_t entry) final
154 {
155 if (entry != fLastEntry) {
156 fField->Read(entry, &fValue);
157 fLastEntry = entry;
158 }
159 return fValue.GetRawPtr();
160 }
161};
162
163} // namespace Internal
164
165RNTupleDS::~RNTupleDS() = default;
166
167void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
168 std::vector<DescriptorId_t> skeinIDs)
169{
170 // As an example for the mapping of RNTuple fields to RDF columns, let's consider an RNTuple
171 // using the following types and with a top-level field named "event" of type Event:
172 //
173 // struct Event {
174 // int id;
175 // std::vector<Track> tracks;
176 // };
177 // struct Track {
178 // std::vector<Hit> hits;
179 // };
180 // struct Hit {
181 // float x;
182 // float y;
183 // };
184 //
185 // AddField() will be called from the constructor with the RNTuple root field (ENTupleStructure::kRecord).
186 // From there, we recurse into the "event" sub field (also ENTupleStructure::kRecord) and further down the
187 // tree of sub fields and expose the following RDF columns:
188 //
189 // "event" [Event]
190 // "event.id" [int]
191 // "event.tracks" [RVec<Track>]
192 // "R_rdf_sizeof_event.tracks" [unsigned int]
193 // "event.tracks.hits" [RVec<RVec<Hit>>]
194 // "R_rdf_sizeof_event.tracks.hits" [RVec<unsigned int>]
195 // "event.tracks.hits.x" [RVec<RVec<float>>]
196 // "R_rdf_sizeof_event.tracks.hits.x" [RVec<unsigned int>]
197 // "event.tracks.hits.y" [RVec<RVec<float>>]
198 // "R_rdf_sizeof_event.tracks.hits.y" [RVec<unsigned int>]
199
200 const auto &fieldDesc = desc.GetFieldDescriptor(fieldId);
201 if (fieldDesc.GetStructure() == ENTupleStructure::kCollection) {
202 // Inner fields of collections are provided as projected collections of only that inner field,
203 // E.g. we provide a projected collection RVec<RVec<float>> for "event.tracks.hits.x" in the example
204 // above.
205
206 // We open a new collection scope with fieldID being the inner most collection. E.g. for "event.tracks.hits",
207 // skeinIDs would already contain the fieldID of "event.tracks"
208 skeinIDs.emplace_back(fieldId);
209
210 if (fieldDesc.GetTypeName().empty()) {
211 // Anonymous collection with one or several sub fields
212 auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
213 cardinalityField->SetOnDiskId(fieldId);
214 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
215 fColumnTypes.emplace_back(cardinalityField->GetType());
216 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
217 std::move(cardinalityField));
218 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
219
220 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
221 AddField(desc, std::string(colName) + "." + f.GetFieldName(), f.GetId(), skeinIDs);
222 }
223 } else {
224 // ROOT::RVec with exactly one sub field
225 const auto &f = *desc.GetFieldIterable(fieldDesc.GetId()).begin();
226 AddField(desc, colName, f.GetId(), skeinIDs);
227 }
228 // Note that at the end of the recursion, we handled the inner sub collections as well as the
229 // collection as whole, so we are done.
230 return;
231 } else if (fieldDesc.GetStructure() == ENTupleStructure::kRecord) {
232 // Inner fields of records are provided as individual RDF columns, e.g. "event.id"
233 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
234 auto innerName = colName.empty() ? f.GetFieldName() : (std::string(colName) + "." + f.GetFieldName());
235 AddField(desc, innerName, f.GetId(), skeinIDs);
236 }
237 }
238
239 // The fieldID could be the root field or the class of fieldId might not be loaded.
240 // In these cases, only the inner fields are exposed as RDF columns.
241 auto fieldOrException = Detail::RFieldBase::Create("", fieldDesc.GetTypeName());
242 if (!fieldOrException)
243 return;
244 auto valueField = fieldOrException.Unwrap();
245 valueField->SetOnDiskId(fieldId);
246 std::unique_ptr<Detail::RFieldBase> cardinalityField;
247 // Collections get the additional "number of" RDF column (e.g. "R_rdf_sizeof_tracks")
248 if (!skeinIDs.empty()) {
249 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
250 cardinalityField->SetOnDiskId(skeinIDs.back());
251 }
252
253 for (auto i = skeinIDs.rbegin(); i != skeinIDs.rend(); ++i) {
254 valueField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(valueField));
255 valueField->SetOnDiskId(*i);
256 // Skip the inner-most collection level to construct the cardinality column
257 if (i != skeinIDs.rbegin()) {
258 cardinalityField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(cardinalityField));
259 cardinalityField->SetOnDiskId(*i);
260 }
261 }
262
263 if (cardinalityField) {
264 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
265 fColumnTypes.emplace_back(cardinalityField->GetType());
266 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
267 std::move(cardinalityField));
268 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
269 }
270
271 skeinIDs.emplace_back(fieldId);
272 fColumnNames.emplace_back(colName);
273 fColumnTypes.emplace_back(valueField->GetType());
274 auto valColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(std::move(valueField));
275 fColumnReaderPrototypes.emplace_back(std::move(valColReader));
276}
277
278RNTupleDS::RNTupleDS(std::unique_ptr<Detail::RPageSource> pageSource)
279{
280 pageSource->Attach();
281 auto descriptorGuard = pageSource->GetSharedDescriptorGuard();
282 fSources.emplace_back(std::move(pageSource));
283
284 AddField(descriptorGuard.GetRef(), "", descriptorGuard->GetFieldZeroId(), std::vector<DescriptorId_t>());
285}
286
287RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* name */, const std::type_info & /* ti */)
288{
289 // This datasource uses the GetColumnReaders2 API instead (better name in the works)
290 return {};
291}
292
293std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
294RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
295{
296 // at this point we can assume that `name` will be found in fColumnNames, RDF is in charge validation
297 // TODO(jblomer): check incoming type
298 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
299 auto clone = fColumnReaderPrototypes[index]->Clone();
300 clone->Connect(*fSources[slot]);
301 return clone;
302}
303
305{
306 return true;
307}
308
309std::vector<std::pair<ULong64_t, ULong64_t>> RNTupleDS::GetEntryRanges()
310{
311 // TODO(jblomer): use cluster boundaries for the entry ranges
312 std::vector<std::pair<ULong64_t, ULong64_t>> ranges;
314 return ranges;
315
316 auto nEntries = fSources[0]->GetNEntries();
317 const auto chunkSize = nEntries / fNSlots;
318 const auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
319 auto start = 0UL;
320 auto end = 0UL;
321 for (auto i : ROOT::TSeqU(fNSlots)) {
322 start = end;
323 end += chunkSize;
324 ranges.emplace_back(start, end);
325 (void)i;
326 }
327 ranges.back().second += reminder;
328 fHasSeenAllRanges = true;
329 return ranges;
330}
331
332std::string RNTupleDS::GetTypeName(std::string_view colName) const
333{
334 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), colName));
335 return fColumnTypes[index];
336}
337
338bool RNTupleDS::HasColumn(std::string_view colName) const
339{
340 return std::find(fColumnNames.begin(), fColumnNames.end(), colName) != fColumnNames.end();
341}
342
344{
345 fHasSeenAllRanges = false;
346}
347
349
350void RNTupleDS::SetNSlots(unsigned int nSlots)
351{
352 R__ASSERT(fNSlots == 0);
353 R__ASSERT(nSlots > 0);
354 fNSlots = nSlots;
355
356 for (unsigned int i = 1; i < fNSlots; ++i) {
357 fSources.emplace_back(fSources[0]->Clone());
358 assert(i == (fSources.size() - 1));
359 fSources[i]->Attach();
360 }
361}
362} // namespace Experimental
363} // namespace ROOT
364
365ROOT::RDataFrame ROOT::RDF::Experimental::FromRNTuple(std::string_view ntupleName, std::string_view fileName)
366{
367 auto pageSource = ROOT::Experimental::Detail::RPageSource::Create(ntupleName, fileName);
368 ROOT::RDataFrame rdf(std::make_unique<ROOT::Experimental::RNTupleDS>(std::move(pageSource)));
369 return rdf;
370}
371
373{
374 ROOT::RDataFrame rdf(std::make_unique<ROOT::Experimental::RNTupleDS>(ntuple->MakePageSource()));
375 return rdf;
376}
#define f(i)
Definition RSha256.hxx:104
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
#define R__ASSERT(e)
Definition TError.h:118
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
char name[80]
Definition TGX11.cxx:110
void GetCollectionInfo(const NTupleSize_t globalIndex, RClusterIndex *collectionStart, ClusterSize_t *collectionSize)
For offset columns only, look at the two adjacent values that define a collection's coordinates.
Definition RColumn.hxx:267
Some fields have multiple possible column representations, e.g.
Definition RField.hxx:107
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:243
std::vector< std::unique_ptr< RColumn > > fColumns
The columns are connected either to a sink or to a source (not to both); they are owned by the field.
Definition RField.hxx:159
const ColumnRepresentation_t & EnsureCompatibleColumnTypes(const RNTupleDescriptor &desc) const
Returns the on-disk column types found in the provided descriptor for fOnDiskId.
Definition RField.cxx:478
RColumn * fPrincipalColumn
Points into fColumns.
Definition RField.hxx:157
Abstract interface to read data from an ntuple.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Every RDF column is represented by exactly one RNTuple field.
std::unique_ptr< RFieldBase > fField
The field backing the RDF column.
Long64_t fLastEntry
Last entry number that was read.
std::unique_ptr< RNTupleColumnReader > Clone()
Column readers are created as prototype and then cloned for every slot.
RNTupleColumnReader(std::unique_ptr< RFieldBase > f)
RFieldValue fValue
The memory location used to read from fField.
void Connect(RPageSource &source)
Connect the field and its subfields to the page source.
An artificial field that transforms an RNTuple column that contains the offset of collections into co...
Definition RNTupleDS.cxx:59
ROOT::Experimental::Detail::RFieldValue CaptureValue(void *where) final
Creates a value from a memory location with an already constructed object.
Definition RNTupleDS.cxx:94
RRDFCardinalityField(RRDFCardinalityField &&other)=default
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
Definition RNTupleDS.cxx:82
void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by globalIndex.
const RColumnRepresentations & GetColumnRepresentations() const final
Implementations in derived classes should return a static RColumnRepresentations object.
Definition RNTupleDS.cxx:74
ROOT::Experimental::Detail::RFieldValue GenerateValue(void *where) final
Generates a tree value in a given location of size at least GetValueSize().
Definition RNTupleDS.cxx:90
RRDFCardinalityField & operator=(RRDFCardinalityField &&other)=default
size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
Definition RNTupleDS.cxx:98
size_t GetAlignment() const final
As a rule of thumb, the alignment is equal to the size of the type.
Definition RNTupleDS.cxx:99
void ReadInClusterImpl(const ROOT::Experimental::RClusterIndex &clusterIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by clusterIndex.
std::unique_ptr< ROOT::Experimental::Detail::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
Definition RNTupleDS.cxx:61
void GenerateColumnsImpl(const RNTupleDescriptor &desc) final
Creates the backing columns corresponding to the field type for reading.
Definition RNTupleDS.cxx:83
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Holds the static meta-data of an RNTuple column.
A field translates read and write calls from/to underlying columns to/from tree values.
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< DescriptorId_t > skeinIDs)
Provides the RDF column "colName" given the field identified by fieldID.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< std::unique_ptr< ROOT::Experimental::Internal::RNTupleColumnReader > > fColumnReaderPrototypes
We prepare a column reader prototype for every column.
Definition RNTupleDS.hxx:53
std::vector< std::unique_ptr< ROOT::Experimental::Detail::RPageSource > > fSources
Clones of the first source, one for each slot.
Definition RNTupleDS.hxx:48
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
void Initialize() final
Convenience method called before starting an event-loop.
std::vector< std::string > fColumnNames
Definition RNTupleDS.hxx:54
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
RNTupleDS(std::unique_ptr< ROOT::Experimental::Detail::RPageSource > pageSource)
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
std::vector< std::string > fColumnTypes
Definition RNTupleDS.hxx:55
The on-storage meta-data of an ntuple.
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:512
std::unique_ptr< Detail::RPageSource > MakePageSource(const RNTupleReadOptions &options=RNTupleReadOptions())
Create a page source from the RNTuple object.
Definition RNTuple.cxx:384
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName)
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Wrap the integer in a struct in order to avoid template specialization clash with std::uint32_t.