Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleDS.hxx
Go to the documentation of this file.
1/// \file RNTupleDS.hxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
17#ifndef ROOT_RNTupleDS
18#define ROOT_RNTupleDS
19
20#include <ROOT/RDataFrame.hxx>
21#include <ROOT/RDataSource.hxx>
22#include <ROOT/RNTupleUtil.hxx>
23#include <string_view>
24
25#include <cstdint>
26#include <memory>
27#include <string>
28#include <vector>
29#include <unordered_map>
30
31namespace ROOT {
32namespace Experimental {
33
34class RFieldBase;
35class RNTuple;
36class RNTupleDescriptor;
37
38namespace Internal {
39class RNTupleColumnReader;
40class RPageSource;
41}
42
/// \brief The RDataSource implementation for RNTuple.
///
/// Every RDF column is represented by exactly one RNTuple field. The class can be built from an ntuple name
/// plus one file name or a list of file names (a chain); entry ranges are prepared by PrepareNextRanges()
/// and handed out by GetEntryRanges().
// NOTE(review): this listing carries its rendered line number as a prefix on every line and skips the
// numbers 44, 49, 53, 94, 122 and 124. Each gap is marked below; confirm against the original header.
43class RNTupleDS final : public ROOT::RDF::RDataSource {
 // NOTE(review): listing line 44 is absent here; its content cannot be determined from this view.
45
46 /// The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
47 /// The GetEntryRanges() method swaps fNextRanges and fCurrentRanges and uses the list of
48 /// REntryRangeDS records to return the list of ranges ready to use by the RDF loop manager.
 // NOTE(review): listing line 49 is absent here. The closing `};` a few lines below and the later uses of
 // REntryRangeDS suggest the `struct REntryRangeDS {` opener belongs at this position — confirm.
50 std::unique_ptr<ROOT::Experimental::Internal::RPageSource> fSource;
51 ULong64_t fFirstEntry = 0; ///< First entry index in fSource
52 /// End entry index in fSource, i.e. the number of entries in the range is fLastEntry - fFirstEntry
 // NOTE(review): listing line 53 is absent here. The reference index at the end of this page lists
 // `ULong64_t fLastEntry` as defined on that line; the initializer (if any) is not visible — confirm.
54 };
55
56 /// The first source is used to extract the schema and build the prototype fields. The page source
57 /// is used to extract a clone of the descriptor to fPrincipalDescriptor. Afterwards it is moved
58 /// into the first REntryRangeDS.
59 std::unique_ptr<Internal::RPageSource> fPrincipalSource;
60 /// A clone of the first page source's descriptor.
61 std::unique_ptr<RNTupleDescriptor> fPrincipalDescriptor;
62
63 /// The data source may be constructed with an ntuple name and a list of files
64 std::string fNTupleName;
65 std::vector<std::string> fFileNames;
66 std::size_t fNextFileIndex = 0; ///< Index into fFileNames to the next file to process
67
68 /// We prepare a prototype field for every column. If a column reader is actually requested
69 /// in GetColumnReaders(), we move a clone of the field into a new column reader for RDataFrame.
70 /// Only the clone connects to the backing page store and acquires I/O resources.
71 /// The field IDs are set in the context of the first source and used as keys in fFieldId2QualifiedName.
72 std::vector<std::unique_ptr<ROOT::Experimental::RFieldBase>> fProtoFields;
73 /// Connects the IDs of active proto fields and their subfields to their fully qualified name (a.b.c.d).
74 /// This enables the column reader to rewire the field IDs when the file changes (chain),
75 /// using the fully qualified name as a search key in the descriptor of the other page sources.
76 std::unordered_map<ROOT::Experimental::DescriptorId_t, std::string> fFieldId2QualifiedName;
 /// Names of the RDF columns; this is the vector returned by GetColumnNames().
77 std::vector<std::string> fColumnNames;
 /// Column type names. Presumably parallel to fColumnNames and consulted by GetTypeName() — TODO confirm
 /// in the implementation file.
78 std::vector<std::string> fColumnTypes;
79 /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers
80 /// to new page sources when the files in the chain change.
81 std::vector<std::vector<Internal::RNTupleColumnReader *>> fActiveColumnReaders;
82
 /// Number of processing slots, zero until set. Presumably assigned by SetNSlots() — TODO confirm.
83 unsigned int fNSlots = 0;
84 ULong64_t fSeenEntries = 0; ///< The number of entries so far returned by GetEntryRanges()
85 std::vector<REntryRangeDS> fCurrentRanges; ///< Basis for the ranges returned by the last GetEntryRanges() call
86 std::vector<REntryRangeDS> fNextRanges; ///< Basis for the ranges populated by the PrepareNextRanges() call
87 /// Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index in
88 /// the fCurrentRanges vectors. This is necessary because the returned ranges get distributed arbitrarily
89 /// onto slots. In the InitSlot method, the column readers use this map to find the correct range to connect to.
90 std::unordered_map<ULong64_t, std::size_t> fFirstEntry2RangeIdx;
91
92 /// \brief Holds useful information about fields added to the RNTupleDS
93 struct RFieldInfo {
 // NOTE(review): listing line 94 is absent here. The constructor below initializes a member named
 // fFieldId from a DescriptorId_t argument, so a `DescriptorId_t fFieldId;` declaration presumably
 // belongs at this position — confirm.
95 std::size_t fNRepetitions;
96 // Enable `std::vector::emplace_back` for this type
97 RFieldInfo(DescriptorId_t fieldId, std::size_t nRepetitions) : fFieldId(fieldId), fNRepetitions(nRepetitions) {}
98 };
99
100 /// Provides the RDF column "colName" given the field identified by fieldID. For records and collections,
101 /// AddField recurses into the sub fields. The fieldInfos argument is a list of objects holding info
102 /// about the fields of the outer collection(s) (w.r.t. fieldId). For instance, if fieldId refers to an
103 /// `std::vector<Jet>`, with
104 /// struct Jet {
105 /// float pt;
106 /// float eta;
107 /// };
108 /// AddField will recurse into Jet.pt and Jet.eta and provide the two inner fields as std::vector<float> each.
109 void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
110 std::vector<RFieldInfo> fieldInfos);
111
112 /// Populates fNextRanges with the next set of entry ranges. Opens files from the chain as necessary
113 /// and aligns ranges with cluster boundaries for scheduling the tail of files.
114 /// Upon return, the fNextRanges list is ordered. It usually has fNSlots elements; fewer if there
115 /// is not enough work to give at least one cluster to every slot.
116 void PrepareNextRanges();
117
 /// Private constructor taking ownership of an already created page source.
 /// Presumably this source becomes fPrincipalSource — TODO confirm in the implementation file.
118 explicit RNTupleDS(std::unique_ptr<ROOT::Experimental::Internal::RPageSource> pageSource);
119
120public:
 /// Constructs the data source from an ntuple name and a single file name.
121 RNTupleDS(std::string_view ntupleName, std::string_view fileName);
 // NOTE(review): listing line 122 is absent here; its content cannot be determined from this view.
 /// Constructs the data source from an ntuple name and a list of file names (see fFileNames).
123 RNTupleDS(std::string_view ntupleName, const std::vector<std::string> &fileNames);
 // NOTE(review): listing line 124 is absent here; its content cannot be determined from this view.
125
 /// Informs the data source of the number of processing slots (RDataSource interface).
126 void SetNSlots(unsigned int nSlots) final;
 /// Returns a reference to the collection of the dataset's column names (fColumnNames).
127 const std::vector<std::string> &GetColumnNames() const final { return fColumnNames; }
 /// Checks if the dataset has a certain column.
128 bool HasColumn(std::string_view colName) const final;
 /// Returns the type of the column `colName` as a string.
129 std::string GetTypeName(std::string_view colName) const final;
 /// Returns ranges of entries to distribute to tasks, as pairs of ULong64_t.
 /// NOTE(review): whether the second element of each pair is inclusive is not shown here — confirm.
130 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
 /// Returns the string representation of the data source type, the literal "RNTupleDS".
131 std::string GetLabel() final { return "RNTupleDS"; }
132
 /// Advances the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
133 bool SetEntry(unsigned int slot, ULong64_t entry) final;
134
 /// Convenience method called before starting an event loop.
135 void Initialize() final;
 /// Convenience method called at the start of the data processing associated to a slot.
136 void InitSlot(unsigned int slot, ULong64_t firstEntry) final;
 /// Convenience method called at the end of the data processing associated to a slot.
137 void FinalizeSlot(unsigned int slot) final;
 /// Convenience method called after concluding an event loop.
138 void Finalize() final;
139
 /// Returns a column reader for the given slot and column name. Per the RDataSource interface, this overload
 /// is called when the other GetColumnReaders overload returns an empty vector.
140 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
141 GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &) final;
142
143protected:
 /// Returns a Record_t, i.e. a type-erased vector of pointers to pointers to column values, one per slot
 /// (RDataSource interface).
144 Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final;
145};
146
147} // ns Experimental
148
149namespace RDF {
150namespace Experimental {
/// Factory functions returning an RDataFrame for RNTuple data.
/// The first two overloads take the same arguments as the public RNTupleDS constructors: an ntuple name
/// plus either a single file name or a list of file names. The third overload takes a pointer to an
/// RNTuple object.
/// NOTE(review): presumably each overload wraps an RNTupleDS in the returned data frame; ownership and
/// nullability of `ntuple` are not visible in this header — confirm in the implementation file.
151RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName);
152RDataFrame FromRNTuple(std::string_view ntupleName, const std::vector<std::string> &fileNames);
153RDataFrame FromRNTuple(ROOT::Experimental::RNTuple *ntuple);
154} // namespace Experimental
155} // namespace RDF
156
157} // ns ROOT
158
159#endif
unsigned long long ULong64_t
Definition RtypesCore.h:81
char name[80]
Definition TGX11.cxx:110
Every RDF column is represented by exactly one RNTuple field.
The RDataSource implementation for RNTuple.
Definition RNTupleDS.hxx:43
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
void Initialize() final
Convenience method called before starting an event-loop.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< RFieldInfo > fieldInfos)
Provides the RDF column "colName" given the field identified by fieldID.
std::size_t fNextFileIndex
Index into fFileNames to the next file to process.
Definition RNTupleDS.hxx:66
std::vector< std::string > fColumnNames
Definition RNTupleDS.hxx:77
std::unordered_map< ULong64_t, std::size_t > fFirstEntry2RangeIdx
Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index...
Definition RNTupleDS.hxx:90
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
void InitSlot(unsigned int slot, ULong64_t firstEntry) final
Convenience method called at the start of the data processing associated to a slot.
std::unordered_map< ROOT::Experimental::DescriptorId_t, std::string > fFieldId2QualifiedName
Connects the IDs of active proto fields and their subfields to their fully qualified name (a....
Definition RNTupleDS.hxx:76
void FinalizeSlot(unsigned int slot) final
Convenience method called at the end of the data processing associated to a slot.
std::vector< std::vector< Internal::RNTupleColumnReader * > > fActiveColumnReaders
List of column readers returned by GetColumnReaders() organized by slot.
Definition RNTupleDS.hxx:81
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
std::unique_ptr< Internal::RPageSource > fPrincipalSource
The first source is used to extract the schema and build the prototype fields.
Definition RNTupleDS.hxx:59
std::vector< REntryRangeDS > fCurrentRanges
Basis for the ranges returned by the last GetEntryRanges() call.
Definition RNTupleDS.hxx:85
std::unique_ptr< RNTupleDescriptor > fPrincipalDescriptor
A clone of the first page source's descriptor.
Definition RNTupleDS.hxx:61
std::vector< REntryRangeDS > fNextRanges
Basis for the ranges populated by the PrepareNextRanges() call.
Definition RNTupleDS.hxx:86
std::vector< std::unique_ptr< ROOT::Experimental::RFieldBase > > fProtoFields
We prepare a prototype field for every column.
Definition RNTupleDS.hxx:72
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void PrepareNextRanges()
Populates fNextRanges with the next set of entry ranges.
std::vector< std::string > fFileNames
Definition RNTupleDS.hxx:65
std::string fNTupleName
The data source may be constructed with an ntuple name and a list of files.
Definition RNTupleDS.hxx:64
std::vector< std::string > fColumnTypes
Definition RNTupleDS.hxx:78
std::string GetLabel() final
Return a string representation of the datasource type.
ULong64_t fSeenEntries
The number of entries so far returned by GetEntryRanges()
Definition RNTupleDS.hxx:84
The on-storage meta-data of an ntuple.
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:61
Pure virtual base class for all column reader types.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
Definition RNTupleDS.hxx:49
std::unique_ptr< ROOT::Experimental::Internal::RPageSource > fSource
Definition RNTupleDS.hxx:50
ULong64_t fLastEntry
End entry index in fSource, i.e. the number of entries in the range is fLastEntry - fFirstEntry.
Definition RNTupleDS.hxx:53
ULong64_t fFirstEntry
First entry index in fSource.
Definition RNTupleDS.hxx:51
Holds useful information about fields added to the RNTupleDS.
Definition RNTupleDS.hxx:93
RFieldInfo(DescriptorId_t fieldId, std::size_t nRepetitions)
Definition RNTupleDS.hxx:97