Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDatasetLoader.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026
2
3/*************************************************************************
4 * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef TMVA_RDATASETLOADER
12#define TMVA_RDATASETLOADER
13
14#include <vector>
15#include <random>
16
17#include "TMVA/RTensor.hxx"
18#include "ROOT/RDataFrame.hxx"
21#include "ROOT/RDF/Utils.hxx"
22#include "ROOT/RVec.hxx"
23
24#include "ROOT/RLogger.hxx"
25
26namespace TMVA {
27namespace Experimental {
28namespace Internal {
29
30// clang-format off
31/**
32\class TMVA::Experimental::Internal::RDatasetLoaderFunctor
33\ingroup tmva
34\brief Loads chunks made in RDatasetLoader into tensors, using data from an RDataFrame.
35*/
36
37template <typename... ColTypes>
39 // clang-format on
40 std::size_t fOffset{};
41 std::size_t fVecSizeIdx{};
42 float fVecPadding{};
43 std::vector<std::size_t> fMaxVecSizes{};
45
46 std::size_t fNumDatasetCols;
47
48 int fI;
50
51 //////////////////////////////////////////////////////////////////////////
52 /// \brief Copy the content of a column into RTensor when the column consists of vectors
54 void AssignToTensor(const T &vec, int i, int numColumns)
55 {
56 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
57 std::size_t vec_size = vec.size();
58 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
59 {
60 std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]);
61 std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size],
63 } else // Copy only max_vec_size length from vector column
64 {
65 std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]);
66 }
68 }
69
70 //////////////////////////////////////////////////////////////////////////
71 /// \brief Copy the content of a column into RTensor when the column consists of single values
73 void AssignToTensor(const T &val, int i, int numColumns)
74 {
76 fOffset++;
77 }
78
79public:
85
86 void operator()(const ColTypes &...cols)
87 {
88 fVecSizeIdx = 0;
90 }
91};
92
93// clang-format off
94/**
95\class TMVA::Experimental::Internal::RDatasetLoader
96\ingroup tmva
97\brief Load the whole dataset into memory.
98
99In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and validation sets with the user-defined validation split fraction.
100*/
101
102template <typename... Args>
104private:
105 // clang-format on
106 std::size_t fNumEntries;
108
109 std::vector<std::size_t> fVecSizes;
110 std::size_t fSumVecSizes;
111 std::size_t fVecPadding;
112 std::size_t fNumDatasetCols;
113
114 std::vector<RFlat2DMatrix> fTrainingDatasets;
115 std::vector<RFlat2DMatrix> fValidationDatasets;
116
119
122 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
123
124 std::vector<ROOT::RDF::RNode> f_rdfs;
125 std::vector<std::string> fCols;
126 std::size_t fNumCols;
127 std::size_t fSetSeed;
128
131
133
134public:
135 RDatasetLoader(const std::vector<ROOT::RDF::RNode> &rdfs, const float validationSplit,
136 const std::vector<std::string> &cols, const std::vector<std::size_t> &vecSizes = {},
137 const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0)
138 : f_rdfs(rdfs),
139 fCols(cols),
145 {
146 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
147 fNumCols = fCols.size();
148 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
149
151 }
152
153 //////////////////////////////////////////////////////////////////////////
154 /// \brief Split an individual dataframe into a training and validation dataset
155 /// \param[in] rdf Dataframe that will be split into training and validation
156 /// \param[in] TrainingDataset Tensor for the training dataset
157 /// \param[in] ValidationDataset Tensor for the validation dataset
159 {
160 std::size_t NumEntries = rdf.Count().GetValue();
162
163 // add the last element in entries to not go out of range when filling chunks
164 Entries->push_back((*Entries)[NumEntries - 1] + 1);
165
166 // number of training and validation entries after the split
167 std::size_t NumValidationEntries = static_cast<std::size_t>(fValidationSplit * NumEntries);
168 std::size_t NumTrainingEntries = NumEntries - NumValidationEntries;
169
171
172 bool NotFiltered = rdf.GetFilterNames().empty();
173 if (NotFiltered) {
175 rdf.Foreach(func, fCols);
176 }
177
178 else {
179 std::size_t datasetEntry = 0;
180 for (std::size_t j = 0; j < NumEntries; j++) {
183 rdf.Foreach(func, fCols);
184 datasetEntry++;
185 }
186 }
187
188 // reset dataframe
190
195 }
196
197 //////////////////////////////////////////////////////////////////////////
198 /// \brief Split the dataframes into a training and a validation dataset
218
219 //////////////////////////////////////////////////////////////////////////
220 /// \brief Concatenate the datasets into a single dataset
226
227 std::vector<RFlat2DMatrix> GetTrainingDatasets() {return fTrainingDatasets;}
228 std::vector<RFlat2DMatrix> GetValidationDatasets() {return fValidationDatasets;}
229
232
235};
236
237} // namespace Internal
238} // namespace Experimental
239} // namespace TMVA
240#endif // TMVA_RDATASETLOADER
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
The public interface to the RDataFrame federation of classes.
Smart pointer for the return type of actions.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
std::vector< RFlat2DMatrix > GetValidationDatasets()
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< RFlat2DMatrix > GetTrainingDatasets()
void ConcatenateDatasets()
Concatenate the datasets into a single dataset.
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
void SplitDatasets()
Split the dataframes into a training and a validation dataset.
std::vector< RFlat2DMatrix > fValidationDatasets
std::vector< RFlat2DMatrix > fTrainingDatasets
void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset)
Split an individual dataframe into a training and validation dataset.
RDatasetLoader(const std::vector< ROOT::RDF::RNode > &rdfs, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
create variable transformations
Wrapper around ROOT::RVec<float> representing a 2D matrix.