Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkConstructor.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
2
3/*************************************************************************
4 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef TMVA_RCHUNKCONSTRUCTOR
12#define TMVA_RCHUNKCONSTRUCTOR
13
14#include <vector>
15
16#include "TMVA/RTensor.hxx"
17#include "ROOT/RDataFrame.hxx"
18#include "ROOT/RDF/Utils.hxx"
19#include "ROOT/RVec.hxx"
20
21#include "ROOT/RLogger.hxx"
22
23namespace TMVA {
24namespace Experimental {
25namespace Internal {
26
27// clang-format off
28/**
29\class TMVA::Experimental::Internal::RChunkConstructor
30\ingroup tmva
31\brief The logic for constructing chunks from a dataset.
32
33This struct handles the logic for splitting a dataset into smaller subsets
34known as chunks, which are constructed from blocks.
35
36A chunk is the largest portion of the dataset loaded into memory at once,
37and each chunk is further divided into batches for machine learning training.
38
39The dataset is split into disjoint chunks based on a user-defined chunk size.
40There are two types of chunks:
41 - Full chunks: contain exactly the number of entries specified by the chunk size.
42 - Leftover chunk: contains any remaining entries that don't make up a full chunk.
43
44Each chunk is constructed from blocks based on a user-defined block size.
45There are two types of blocks:
46 - Full blocks: contain exactly the number of entries specified by the block size.
47 - Leftover block: contains any remaining entries that don't make up a full block.
48
49The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number of entries.
50*/
51
53 // clang-format on
54 std::size_t fNumEntries{};
55 std::size_t fChunkSize{};
56 std::size_t fBlockSize{};
57
58 // size of full and leftover chunks
59 std::size_t SizeOfFullChunk;
61
62 // size of full and leftover blocks in a full and leftover chunk
67
68 // number of full, leftover and total chunks
69 std::size_t FullChunks;
70 std::size_t LeftoverChunks;
71 std::size_t Chunks;
72
73 // number of full, leftover and total blocks in a full chunk
76 std::size_t BlockPerFullChunk;
77
78 // number of full, leftover and total blocks in the leftover chunk
82
83 // total number of full and leftover blocks in the full chunks
86
87 // total number of full and leftover blocks in the leftover chunks
90
91 // vector of the different block sizes
92 std::vector<std::size_t> SizeOfBlocks;
93
94 // vector with the number of blocks of each type
95 std::vector<std::size_t> NumberOfDifferentBlocks;
96
97 // total number of blocks
98 std::size_t NumberOfBlocks;
99
100 // pair of start and end entries in the different block types
101 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
102
103 std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInFullChunks;
104 std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInFullChunks;
105
106 std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInLeftoverChunks;
107 std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInLeftoverChunks;
108
109 std::vector<std::vector<std::pair<Long_t, Long_t>>> ChunksIntervals;
110
111 std::vector<std::size_t> ChunksSizes;
112
113 RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)
114 : fNumEntries(numEntries), fChunkSize(chunkSize), fBlockSize(blockSize)
115 {
116 // size of full and leftover chunks
119
120 // size of full and leftover blocks in a full and leftover chunk
121 SizeOfFullBlockInFullChunk = blockSize;
125
126 // number of full, leftover and total chunks
127 FullChunks = numEntries / SizeOfFullChunk;
128 LeftoverChunks = SizeOfLeftoverChunk == 0 ? 0 : 1;
130
131 // number of full, leftover and total blocks in a full chunk
135
136 // number of full, leftover and total blocks in the leftover chunk
140
141 // total number of full and leftover blocks in the full chunks
144
145 // total number of full and leftover blocks in the leftover chunks
148
149 // vector of the different block sizes
152
153 // vector with the number of blocks of each type
156
157 // total number of blocks
158 NumberOfBlocks = std::accumulate(NumberOfDifferentBlocks.begin(), NumberOfDifferentBlocks.end(), 0);
159 };
160
161 //////////////////////////////////////////////////////////////////////////
162 /// \brief Group the blocks by block type (full or leftover), which is determined by the size of the block.
164 {
165
166 std::vector<std::vector<std::pair<Long_t, Long_t>> *> TypesOfBlockIntervals = {
169
170 std::vector<std::size_t> IndexOfDifferentBlocks(NumberOfDifferentBlocks.size());
173
174 for (size_t i = 0; i < TypesOfBlockIntervals.size(); ++i) {
175 size_t start = IndexOfDifferentBlocks[i];
176 size_t end = IndexOfDifferentBlocks[i + 1];
177
178 TypesOfBlockIntervals[i]->insert(TypesOfBlockIntervals[i]->begin(), BlockIntervals.begin() + start,
179 BlockIntervals.begin() + end);
180 }
181 }
182
183 //////////////////////////////////////////////////////////////////////////
184 /// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry.
186 {
187
188 ChunksIntervals.resize(Chunks);
189 for (size_t i = 0; i < FullChunks; i++) {
190
192 size_t end_FullBlock = FullBlocksPerFullChunk * (i + 1);
193
196
199 ChunksIntervals[i].insert(ChunksIntervals[i].end(),
202 }
203
204 for (size_t i = 0; i < LeftoverChunks; i++) {
205
206 size_t j = i + FullChunks;
208 size_t end_FullBlock = FullBlocksPerLeftoverChunk * (i + 1);
209
212
213 ChunksIntervals[j].insert(ChunksIntervals[j].end(),
216 ChunksIntervals[j].insert(ChunksIntervals[j].end(),
219 }
220 }
221
222 //////////////////////////////////////////////////////////////////////////
223 /// \brief Fills a vector with the size of every chunk from the dataset.
225 {
226
227 for (size_t i = 0; i < Chunks; i++) {
228 std::size_t chunkSize = 0;
229 for (size_t j = 0; j < ChunksIntervals[i].size(); j++) {
230 std::size_t start = ChunksIntervals[i][j].first;
231 std::size_t end = ChunksIntervals[i][j].second;
232
233 std::size_t intervalSize = end - start;
235 }
236
237 ChunksSizes.insert(ChunksSizes.end(), chunkSize);
238 }
239 }
240};
241} // namespace Internal
242} // namespace Experimental
243} // namespace TMVA
244
245#endif // TMVA_RCHUNKCONSTRUCTOR
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
const_iterator begin() const
create variable transformations
void CreateChunksIntervals()
Creates chunks from the dataset consisting of blocks with the begin and end entry.
std::vector< std::vector< std::pair< Long_t, Long_t > > > ChunksIntervals
std::vector< std::pair< Long_t, Long_t > > LeftoverBlockIntervalsInLeftoverChunks
RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)
std::vector< std::pair< Long_t, Long_t > > BlockIntervals
void DistributeBlockIntervals()
Group the blocks by block type (full or leftover), which is determined by the size of the block.
std::vector< std::pair< Long_t, Long_t > > FullBlockIntervalsInLeftoverChunks
std::vector< std::pair< Long_t, Long_t > > LeftoverBlockIntervalsInFullChunks
std::vector< std::pair< Long_t, Long_t > > FullBlockIntervalsInFullChunks
void SizeOfChunks()
Fills a vector with the size of every chunk from the dataset.