Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef TMVA_RCHUNKLOADER
16#define TMVA_RCHUNKLOADER
17
18#include <vector>
19#include <random>
20
21#include "TMVA/RTensor.hxx"
23#include "ROOT/RDataFrame.hxx"
24#include "ROOT/RDF/Utils.hxx"
25#include "ROOT/RVec.hxx"
26
27#include "ROOT/RLogger.hxx"
28
29namespace TMVA {
30namespace Experimental {
31namespace Internal {
32
33// clang-format off
34/**
35\class TMVA::Experimental::Internal::RChunkLoaderFunctor
36\ingroup tmva
37\brief Loads the data of the chunks made in RChunkLoader from an RDataFrame into tensors.
38*/
39
40template <typename... ColTypes>
42 // clang-format on
43 std::size_t fOffset{};
44 std::size_t fVecSizeIdx{};
45 float fVecPadding{};
46 std::vector<std::size_t> fMaxVecSizes{};
48
49 std::size_t fNumChunkCols;
50
51 int fI;
53
54 //////////////////////////////////////////////////////////////////////////
55 /// \brief Copy the content of a column into RTensor when the column consists of vectors
57 void AssignToTensor(const T &vec, int i, int numColumns)
58 {
59 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
60 std::size_t vec_size = vec.size();
61 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
62 {
63 std::copy(vec.begin(), vec.end(), &fChunkTensor.GetData()[fOffset + numColumns * i]);
64 std::fill(&fChunkTensor.GetData()[fOffset + numColumns * i + vec_size],
66 } else // Copy only max_vec_size length from vector column
67 {
68 std::copy(vec.begin(), vec.begin() + max_vec_size, &fChunkTensor.GetData()[fOffset + numColumns * i]);
69 }
71 }
72
73 //////////////////////////////////////////////////////////////////////////
74 /// \brief Copy the content of a column into RTensor when the column consists of single values
76 void AssignToTensor(const T &val, int i, int numColumns)
77 {
79 fOffset++;
80 // fChunkTensor.GetData()[numColumns * i] = val;
81 }
82
83public:
89
90 void operator()(const ColTypes &...cols)
91 {
92 fVecSizeIdx = 0;
94 }
95};
96
97// clang-format off
98/**
99\class TMVA::Experimental::Internal::RChunkLoader
100\ingroup tmva
101\brief Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor
102
103In this class the blocks are stitched together to form chunks that are loaded into memory. The blocks used to create each chunk come from different parts of the dataset. This is achieved by shuffling the blocks before distributing them into chunks. The purpose of this process is to reduce bias during machine learning training by ensuring that the data is well mixed. The dataset is also split into training and validation sets with the user-defined validation split fraction.
104*/
105
106template <typename... Args>
108private:
109 // clang-format on
110 std::size_t fNumEntries;
111 std::size_t fChunkSize;
112 std::size_t fBlockSize;
114
115 std::vector<std::size_t> fVecSizes;
116 std::size_t fSumVecSizes;
117 std::size_t fVecPadding;
118 std::size_t fNumChunkCols;
119
120 std::size_t fNumTrainEntries;
122
124 std::vector<std::string> fCols;
125 std::size_t fNumCols;
126 std::size_t fSetSeed;
127
130
132
133 std::unique_ptr<RChunkConstructor> fTraining;
134 std::unique_ptr<RChunkConstructor> fValidation;
135
136public:
137 RChunkLoader(ROOT::RDF::RNode &rdf, std::size_t numEntries,
138 ROOT::RDF::RResultPtr<std::vector<ULong64_t>> rdf_entries, const std::size_t chunkSize,
139 const std::size_t blockSize, const float validationSplit, const std::vector<std::string> &cols,
140 const std::vector<std::size_t> &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true,
141 const std::size_t setSeed = 0)
142 : f_rdf(rdf),
143 fNumEntries(numEntries),
145 fCols(cols),
149 fBlockSize(blockSize),
151 fNotFiltered(f_rdf.GetFilterNames().empty()),
154 {
155 fNumCols = fCols.size();
156 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
157
159
160 // number of training and validation entries after the split
161 fNumValidationEntries = static_cast<std::size_t>(fValidationSplit * fNumEntries);
163
164 fTraining = std::make_unique<RChunkConstructor>(fNumTrainEntries, fChunkSize, fBlockSize);
165 fValidation = std::make_unique<RChunkConstructor>(fNumValidationEntries, fChunkSize, fBlockSize);
166 }
167
168 //////////////////////////////////////////////////////////////////////////
169 /// \brief Distribute the blocks into training and validation datasets
171 {
172 std::random_device rd;
173 std::mt19937 g;
174
175 if (fSetSeed == 0) {
176 g.seed(rd());
177 } else {
178 g.seed(fSetSeed);
179 }
180
181 std::vector<Long_t> BlockSizes = {};
182
183 // fill the training and validation block sizes
184 for (size_t i = 0; i < fTraining->NumberOfDifferentBlocks.size(); i++) {
185 BlockSizes.insert(BlockSizes.end(), fTraining->NumberOfDifferentBlocks[i], fTraining->SizeOfBlocks[i]);
186 }
187
188 for (size_t i = 0; i < fValidation->NumberOfDifferentBlocks.size(); i++) {
189 BlockSizes.insert(BlockSizes.end(), fValidation->NumberOfDifferentBlocks[i], fValidation->SizeOfBlocks[i]);
190 }
191
192 // make an identity permutation map
193 std::vector<Long_t> indices(BlockSizes.size());
194
195 for (int i = 0; i < indices.size(); ++i) {
196 indices[i] = i;
197 }
198
199 // shuffle the identity permutation to create a new permutation
200 if (fShuffle) {
201 std::shuffle(indices.begin(), indices.end(), g);
202 }
203
204 // use the permutation to shuffle the vector of block sizes
205 std::vector<Long_t> PermutedBlockSizes(BlockSizes.size());
206 for (int i = 0; i < BlockSizes.size(); ++i) {
207 PermutedBlockSizes[i] = BlockSizes[indices[i]];
208 }
209
210 // create a vector for storing the boundaries of the blocks
211 std::vector<Long_t> BlockBoundaries(BlockSizes.size());
212
213 // get the boundaries of the blocks with the partial sum of the block sizes
214 // insert 0 at the beginning for the lower boundary of the first block
217
218 // distribute the neighbouring block boundaries into pairs to get the intervals for the blocks
219 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
220 for (size_t i = 0; i < BlockBoundaries.size() - 1; ++i) {
221 BlockIntervals.emplace_back(BlockBoundaries[i], BlockBoundaries[i + 1]);
222 }
223
224 // use the inverse of the permutation above to order the block intervals in the same order as
225 // the original vector of block sizes
226 std::vector<std::pair<Long_t, Long_t>> UnpermutedBlockIntervals(BlockIntervals.size());
227 for (int i = 0; i < BlockIntervals.size(); ++i) {
228 UnpermutedBlockIntervals[indices[i]] = BlockIntervals[i];
229 }
230
231 // distribute the block intervals between training and validation
232 fTraining->BlockIntervals.insert(fTraining->BlockIntervals.begin(), UnpermutedBlockIntervals.begin(),
233 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks);
234 fValidation->BlockIntervals.insert(fValidation->BlockIntervals.begin(),
235 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks,
237
238 // distribute the different block intervals types for training and validation
239 fTraining->DistributeBlockIntervals();
240 fValidation->DistributeBlockIntervals();
241 }
242
243 //////////////////////////////////////////////////////////////////////////
244 /// \brief Create training chunks consisting of block intervals of different types
246 {
247
248 std::random_device rd;
249 std::mt19937 g;
250
251 if (fSetSeed == 0) {
252 g.seed(rd());
253 } else {
254 g.seed(fSetSeed);
255 }
256
257 // shuffle the block intervals within each type of block
258 if (fShuffle) {
259 std::shuffle(fTraining->FullBlockIntervalsInFullChunks.begin(),
260 fTraining->FullBlockIntervalsInFullChunks.end(), g);
261 std::shuffle(fTraining->LeftoverBlockIntervalsInFullChunks.begin(),
262 fTraining->LeftoverBlockIntervalsInFullChunks.end(), g);
263 std::shuffle(fTraining->FullBlockIntervalsInLeftoverChunks.begin(),
264 fTraining->FullBlockIntervalsInLeftoverChunks.end(), g);
265 std::shuffle(fTraining->LeftoverBlockIntervalsInLeftoverChunks.begin(),
266 fTraining->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
267 }
268
269 // reset the chunk intervals and sizes before each epoch
270 fTraining->ChunksIntervals = {};
271 fTraining->ChunksSizes = {};
272
273 // create the chunks, each consisting of block intervals
274 fTraining->CreateChunksIntervals();
275
276 if (fShuffle) {
277 std::shuffle(fTraining->ChunksIntervals.begin(), fTraining->ChunksIntervals.end(), g);
278 }
279
280 fTraining->SizeOfChunks();
281 }
282
283 //////////////////////////////////////////////////////////////////////////
284 /// \brief Create validation chunks consisting of block intervals of different types
286 {
287 std::random_device rd;
288 std::mt19937 g;
289
290 if (fSetSeed == 0) {
291 g.seed(rd());
292 } else {
293 g.seed(fSetSeed);
294 }
295
296 if (fShuffle) {
297 std::shuffle(fValidation->FullBlockIntervalsInFullChunks.begin(),
298 fValidation->FullBlockIntervalsInFullChunks.end(), g);
299 std::shuffle(fValidation->LeftoverBlockIntervalsInFullChunks.begin(),
300 fValidation->LeftoverBlockIntervalsInFullChunks.end(), g);
301 std::shuffle(fValidation->FullBlockIntervalsInLeftoverChunks.begin(),
302 fValidation->FullBlockIntervalsInLeftoverChunks.end(), g);
303 std::shuffle(fValidation->LeftoverBlockIntervalsInLeftoverChunks.begin(),
304 fValidation->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
305 }
306
307 fValidation->ChunksIntervals = {};
308 fValidation->ChunksSizes = {};
309
310 fValidation->CreateChunksIntervals();
311
312 if (fShuffle) {
313 std::shuffle(fValidation->ChunksIntervals.begin(), fValidation->ChunksIntervals.end(), g);
314 }
315
316 fValidation->SizeOfChunks();
317 }
318
319 //////////////////////////////////////////////////////////////////////////
320 /// \brief Load the nth chunk from the training dataset into a tensor
321 /// \param[out] TrainChunkTensor RTensor that is filled with the training chunk
322 /// \param[in] chunk Index of the chunk in the dataset
324 {
325
326 std::random_device rd;
327 std::mt19937 g;
328
329 if (fSetSeed == 0) {
330 g.seed(rd());
331 } else {
332 g.seed(fSetSeed);
333 }
334
335 std::size_t chunkSize = fTraining->ChunksSizes[chunk];
336
337 if (chunk < fTraining->Chunks) {
340
341 // make an identity permutation map
342 std::vector<int> indices(chunkSize);
343 std::iota(indices.begin(), indices.end(), 0);
344
345 // shuffle the identity permutation to create a new permutation
346 if (fShuffle) {
347 std::shuffle(indices.begin(), indices.end(), g);
348 }
349
350 // fill a chunk by looping over the blocks in a chunk (see RChunkConstructor)
351 std::size_t chunkEntry = 0;
352 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fTraining->ChunksIntervals[chunk];
353
354 std::sort(BlocksInChunk.begin(), BlocksInChunk.end(),
355 [](const std::pair<Long_t, Long_t>& a, const std::pair<Long_t, Long_t>& b) {
356 return a.first < b.first;
357 });
358
359 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
360
361 // Use the block start and end entry to load into the chunk if the dataframe is not filtered
362 if (fNotFiltered) {
365
366 f_rdf.Foreach(func, fCols);
367 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
368 }
369
370 // use the entry column of the dataframe as a map to load the entries that passed the filters
371 else {
372 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
373 for (std::size_t j = 0; j < blockSize; j++) {
376 (*fEntries)[BlocksInChunk[i].first + j + 1]);
377 f_rdf.Foreach(func, fCols);
378 chunkEntry++;
379 }
380 }
381 }
382
383 // shuffle data in RTensor with the permutation map defined above
384 for (std::size_t i = 0; i < chunkSize; i++) {
385 std::copy(Tensor.GetData() + indices[i] * fNumChunkCols,
386 Tensor.GetData() + (indices[i] + 1) * fNumChunkCols,
387 TrainChunkTensor.GetData() + i * fNumChunkCols);
388 }
389 }
390 }
391
392 //////////////////////////////////////////////////////////////////////////
393 /// \brief Load the nth chunk from the validation dataset into a tensor
394 /// \param[out] ValidationChunkTensor RTensor that is filled with the validation chunk
395 /// \param[in] chunk Index of the chunk in the dataset
397 {
398
399 std::random_device rd;
400 std::mt19937 g;
401
402 if (fSetSeed == 0) {
403 g.seed(rd());
404 } else {
405 g.seed(fSetSeed);
406 }
407
408 std::size_t chunkSize = fValidation->ChunksSizes[chunk];
409
410 if (chunk < fValidation->Chunks) {
413
414 // make an identity permutation map
415 std::vector<int> indices(chunkSize);
416 std::iota(indices.begin(), indices.end(), 0);
417
418 // shuffle the identity permutation to create a new permutation
419 if (fShuffle) {
420 std::shuffle(indices.begin(), indices.end(), g);
421 }
422
423 std::size_t chunkEntry = 0;
424 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fValidation->ChunksIntervals[chunk];
425
426 std::sort(BlocksInChunk.begin(), BlocksInChunk.end(),
427 [](const std::pair<Long_t, Long_t>& a, const std::pair<Long_t, Long_t>& b) {
428 return a.first < b.first;
429 });
430
431 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
432
433 // use the block start and end entry to load into the chunk if the dataframe is not filtered
434 if (fNotFiltered) {
437 f_rdf.Foreach(func, fCols);
438 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
439 }
440
441 // use the entry column of the dataframe as a map to load the entries that passed the filters
442 else {
443 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
444 for (std::size_t j = 0; j < blockSize; j++) {
447 (*fEntries)[BlocksInChunk[i].first + j + 1]);
448
449 f_rdf.Foreach(func, fCols);
450 chunkEntry++;
451 }
452 }
453 }
454
455 // shuffle data in RTensor with the permutation map defined above
456 for (std::size_t i = 0; i < chunkSize; i++) {
457 std::copy(Tensor.GetData() + indices[i] * fNumChunkCols,
458 Tensor.GetData() + (indices[i] + 1) * fNumChunkCols,
459 ValidationChunkTensor.GetData() + i * fNumChunkCols);
460 }
461 }
462 }
463
464 std::vector<std::size_t> GetTrainingChunkSizes() { return fTraining->ChunksSizes; }
465 std::vector<std::size_t> GetValidationChunkSizes() { return fValidation->ChunksSizes; }
466
467 std::size_t GetNumTrainingEntries() { return fNumTrainEntries; }
469
471 {
472 auto tensorSize = Tensor.GetSize();
474
475 std::list<int> allEntries;
476 for (int i = 0; i < tensorSize; i++) {
477 allEntries.push_back(SqueezeTensor(0, i));
478 }
479 allEntries.sort();
480 allEntries.unique();
481 if (allEntries.size() == tensorSize) {
482 std::cout << "Tensor consists of only unique elements" << std::endl;
483 }
484 };
485
487 {
488 auto tensorSize1 = Tensor1.GetSize();
490
491 std::list<int> allEntries1;
492 for (int i = 0; i < tensorSize1; i++) {
493 allEntries1.push_back(SqueezeTensor1(0, i));
494 }
495
496 auto tensorSize2 = Tensor2.GetSize();
498
499 std::list<int> allEntries2;
500 for (int i = 0; i < tensorSize2; i++) {
501 allEntries2.push_back(SqueezeTensor2(0, i));
502 }
503
504 std::set<int> result;
505
506 // Call the set_intersection(), which computes the
507 // intersection of set1 and set2 and
508 // inserts the result into the 'result' set
509 std::set<int> set1(allEntries1.begin(), allEntries1.end());
510 std::set<int> set2(allEntries2.begin(), allEntries2.end());
511 std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), inserter(result, result.begin()));
512 // std::list<int> result = intersection(allEntries1, allEntries2);
513
514 if (result.size() == 0) {
515 std::cout << "No overlap between the tensors" << std::endl;
516 } else {
517 std::cout << "Intersection between tensors: ";
518 for (int num : result) {
519 std::cout << num << " ";
520 }
521 std::cout << std::endl;
522 }
523 };
524
525 std::size_t GetNumTrainingChunks() { return fTraining->Chunks; }
526
527 std::size_t GetNumValidationChunks() { return fValidation->Chunks; }
528};
529
530} // namespace Internal
531} // namespace Experimental
532} // namespace TMVA
533#endif // TMVA_RCHUNKLOADER
#define b(i)
Definition RSha256.hxx:100
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
The public interface to the RDataFrame federation of classes.
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
Smart pointer for the return type of actions.
const_iterator begin() const
const_iterator end() const
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
RChunkLoaderFunctor(TMVA::Experimental::RTensor< float > &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
TMVA::Experimental::RTensor< float > & fChunkTensor
RChunkLoader(ROOT::RDF::RNode &rdf, std::size_t numEntries, ROOT::RDF::RResultPtr< std::vector< ULong64_t > > rdf_entries, const std::size_t chunkSize, const std::size_t blockSize, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void LoadTrainingChunk(TMVA::Experimental::RTensor< float > &TrainChunkTensor, std::size_t chunk)
Load the nth chunk from the training dataset into a tensor.
std::unique_ptr< RChunkConstructor > fValidation
std::vector< std::size_t > GetTrainingChunkSizes()
void CheckIfOverlap(TMVA::Experimental::RTensor< float > &Tensor1, TMVA::Experimental::RTensor< float > &Tensor2)
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
void CheckIfUnique(TMVA::Experimental::RTensor< float > &Tensor)
std::vector< std::size_t > GetValidationChunkSizes()
void SplitDataset()
Distribute the blocks into training and validation datasets.
void CreateValidationChunksIntervals()
Create validation chunks consisting of block intervals of different types.
void CreateTrainingChunksIntervals()
Create training chunks consisting of block intervals of different types.
std::unique_ptr< RChunkConstructor > fTraining
void LoadValidationChunk(TMVA::Experimental::RTensor< float > &ValidationChunkTensor, std::size_t chunk)
Load the nth chunk from the validation dataset into a tensor.
RTensor is a container with contiguous memory and shape information.
Definition RTensor.hxx:163
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
create variable transformations