ROOT Reference Guide
 
RBatchLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef TMVA_RBATCHLOADER
16#define TMVA_RBATCHLOADER
17
18#include <vector>
19#include <memory>
20#include <numeric>
21
22// Imports for threading
23#include <queue>
24#include <mutex>
25#include <condition_variable>
26
27#include "TMVA/RTensor.hxx"
28#include "TMVA/Tools.h"
29
30namespace TMVA::Experimental::Internal {
31
32/**
33\class ROOT::TMVA::Experimental::Internal::RBatchLoader
34\ingroup tmva
35\brief Build and load training and validation batches from the chunks provided by RChunkLoader
36
37This class splits the chunks that RChunkLoader has loaded into memory into the batches used in the ML training and
38pushes them into a queue. Training and validation chunks are handled separately.
39*/
40
41class RBatchLoader {
42private:
43 std::size_t fBatchSize;
44 std::size_t fNumColumns;
45
46 bool fIsActive = false;
47
48 std::mutex fBatchLock;
49 std::condition_variable fBatchCondition;
50
51 // queues of tensors holding the training and validation batches
52 std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fTrainingBatchQueue;
53 std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fValidationBatchQueue;
54
55 // number of training and validation batches in the queue
58
59 // current batch that is loaded into memory
60 std::unique_ptr<TMVA::Experimental::RTensor<float>> fCurrentBatch;
61
62 // primary and secondary leftover buffers used to assemble full batches across chunk boundaries
63 std::unique_ptr<TMVA::Experimental::RTensor<float>> fPrimaryLeftoverTrainingBatch;
64 std::unique_ptr<TMVA::Experimental::RTensor<float>> fSecondaryLeftoverTrainingBatch;
65
66 std::unique_ptr<TMVA::Experimental::RTensor<float>> fPrimaryLeftoverValidationBatch;
67 std::unique_ptr<TMVA::Experimental::RTensor<float>> fSecondaryLeftoverValidationBatch;
68
69public:
70 RBatchLoader(std::size_t batchSize, std::size_t numColumns) : fBatchSize(batchSize), fNumColumns(numColumns)
71 {
72
73 fPrimaryLeftoverTrainingBatch =
74 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
75 fSecondaryLeftoverTrainingBatch =
76 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
77
78 fPrimaryLeftoverValidationBatch =
79 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
80 fSecondaryLeftoverValidationBatch =
81 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
82
85 }
86
87public:
88 void Activate()
89 {
90 {
91 std::lock_guard<std::mutex> lock(fBatchLock);
92 fIsActive = true;
93 }
94 fBatchCondition.notify_all();
95 }
96
97 /// \brief DeActivate the batch loader, i.e. stop creating new batches.
98 /// Batches that are already loaded can still be returned.
99 void DeActivate()
100 {
101 {
102 std::lock_guard<std::mutex> lock(fBatchLock);
103 fIsActive = false;
104 }
105 fBatchCondition.notify_all();
106 }
107
108 /// \brief Return a batch of data as a unique pointer.
109 /// After the batch has been processed, it should be destroyed.
110 /// \param[in] chunkTensor RTensor with the data from the chunk
111 /// \param[in] idxs Index of the batch within the chunk
112 /// \return The requested batch
113 std::unique_ptr<TMVA::Experimental::RTensor<float>>
114 CreateBatch(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t idxs)
115 {
116 auto batch =
117 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>({fBatchSize, fNumColumns}));
118 std::copy(chunkTensor.GetData() + (idxs * fBatchSize * fNumColumns),
119 chunkTensor.GetData() + ((idxs + 1) * fBatchSize * fNumColumns), batch->GetData());
120
121 return batch;
122 }
123
124 /// \brief Load the next training batch from the queue
125 /// \return Training batch
126 TMVA::Experimental::RTensor<float> GetTrainBatch()
127 {
128
129 if (fTrainingBatchQueue.empty()) {
130 fCurrentBatch = std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>({0}));
131 return *fCurrentBatch;
132 }
133
134 fCurrentBatch = std::move(fTrainingBatchQueue.front());
135 fTrainingBatchQueue.pop();
136
137 return *fCurrentBatch;
138 }
139
140 /// \brief Load the next validation batch from the queue
141 /// \return Validation batch
142 TMVA::Experimental::RTensor<float> GetValidationBatch()
143 {
144
145 if (fValidationBatchQueue.empty()) {
146 fCurrentBatch = std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>({0}));
147 return *fCurrentBatch;
148 }
149
150 fCurrentBatch = std::move(fValidationBatchQueue.front());
151 fValidationBatchQueue.pop();
152
153 return *fCurrentBatch;
154 }
155
156 /// \brief Create the training batches from a chunk and add them to the queue.
157 /// \param[in] chunkTensor RTensor with the data from the chunk
158 /// \param[in] lastbatch Flag indicating whether this chunk contains the last batch of the dataset
159 /// \param[in] leftoverBatchSize Size of the leftover batch in the training dataset
160 /// \param[in] dropRemainder Whether to drop the remainder (leftover) batch
161 void CreateTrainingBatches(TMVA::Experimental::RTensor<float> &chunkTensor, int lastbatch,
162 std::size_t leftoverBatchSize, bool dropRemainder)
163 {
164 std::size_t ChunkSize = chunkTensor.GetShape()[0];
165 std::size_t Batches = ChunkSize / fBatchSize;
166 std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
167
168 // create a vector of batches
169 std::vector<std::unique_ptr<TMVA::Experimental::RTensor<float>>> batches;
170
171 // fill the full batches from the chunk into a vector
172 for (std::size_t i = 0; i < Batches; i++) {
173 // Fill a batch
174 batches.emplace_back(CreateBatch(chunkTensor, i));
175 }
176
177 // copy the remaining entries from the chunk into a leftover batch
178 TMVA::Experimental::RTensor<float> LeftoverBatch({LeftoverBatchSize, fNumColumns});
179 std::copy(chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns),
180 chunkTensor.GetData() + (ChunkSize * fNumColumns),
181 LeftoverBatch.GetData());
182
183 // calculate how many empty slots are left in fPrimaryLeftoverTrainingBatch
184 std::size_t PrimaryLeftoverSize = (*fPrimaryLeftoverTrainingBatch).GetShape()[0];
185 std::size_t emptySlots = fBatchSize - PrimaryLeftoverSize;
186
187 // copy LeftoverBatch to the end of fPrimaryLeftoverTrainingBatch if it fits
188 if (emptySlots >= LeftoverBatchSize) {
189 (*fPrimaryLeftoverTrainingBatch) =
190 (*fPrimaryLeftoverTrainingBatch).Resize({PrimaryLeftoverSize + LeftoverBatchSize, fNumColumns});
191 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
192 fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
193
194 // if fPrimaryLeftoverTrainingBatch is now a full batch, add a copy of it to the batch vector
195 if (PrimaryLeftoverSize + LeftoverBatchSize == fBatchSize) {
196 auto copy =
197 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
198 std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
199 fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
200 batches.emplace_back(std::move(copy));
201
202 // reset fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
203 fPrimaryLeftoverTrainingBatch = std::move(fSecondaryLeftoverTrainingBatch);
204 fSecondaryLeftoverTrainingBatch =
205 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
206 }
207 }
208
209 // copy LeftoverBatch to both fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
210 else if (emptySlots < LeftoverBatchSize) {
211 // copy the first part of LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
212 (*fPrimaryLeftoverTrainingBatch) = (*fPrimaryLeftoverTrainingBatch).Resize({fBatchSize, fNumColumns});
213 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * fNumColumns),
214 fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
215
216 // copy the last part of LeftoverBatch to the end of fSecondaryLeftoverTrainingBatch
217 (*fSecondaryLeftoverTrainingBatch) =
218 (*fSecondaryLeftoverTrainingBatch).Resize({LeftoverBatchSize - emptySlots, fNumColumns});
219 std::copy(LeftoverBatch.GetData() + (emptySlots * fNumColumns),
220 LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
221 fSecondaryLeftoverTrainingBatch->GetData());
222
223 // add fPrimaryLeftoverTrainingBatch to the batch vector
224 auto copy =
225 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
226 std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
227 fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
228 batches.emplace_back(std::move(copy));
229
230 // exchange fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
231 fPrimaryLeftoverTrainingBatch = std::move(fSecondaryLeftoverTrainingBatch);
232
233 // reset fSecondaryLeftoverTrainingBatch
234 fSecondaryLeftoverTrainingBatch =
235 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
236 }
237
238 // after the last chunk, flush the remaining rows of fPrimaryLeftoverTrainingBatch as a final, smaller batch
239 if (lastbatch == 1) {
240
241 if (dropRemainder == false && leftoverBatchSize > 0) {
242 auto copy = std::make_unique<TMVA::Experimental::RTensor<float>>(
243 std::vector<std::size_t>{leftoverBatchSize, fNumColumns});
244 std::copy((*fPrimaryLeftoverTrainingBatch).GetData(),
245 (*fPrimaryLeftoverTrainingBatch).GetData() + (leftoverBatchSize * fNumColumns), copy->GetData());
246 batches.emplace_back(std::move(copy));
247 }
248
249 fPrimaryLeftoverTrainingBatch =
250 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
251 fSecondaryLeftoverTrainingBatch =
252 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
253 }
254
255 // append the batches from the batch vector from the chunk to the training batch queue
256 for (std::size_t i = 0; i < batches.size(); i++) {
257 fTrainingBatchQueue.push(std::move(batches[i]));
258 }
259 }
260
261 /// \brief Create the validation batches from a chunk and add them to the queue.
262 /// \param[in] chunkTensor RTensor with the data from the chunk
263 /// \param[in] lastbatch Flag indicating whether this chunk contains the last batch of the dataset
264 /// \param[in] leftoverBatchSize Size of the leftover batch in the validation dataset
265 /// \param[in] dropRemainder Whether to drop the remainder (leftover) batch
266 void CreateValidationBatches(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t lastbatch,
267 std::size_t leftoverBatchSize, bool dropRemainder)
268 {
269 std::size_t ChunkSize = chunkTensor.GetShape()[0];
270 std::size_t NumCols = chunkTensor.GetShape()[1];
271 std::size_t Batches = ChunkSize / fBatchSize;
272 std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
273
274 std::vector<std::unique_ptr<TMVA::Experimental::RTensor<float>>> batches;
275
276 for (std::size_t i = 0; i < Batches; i++) {
277 // Fill a batch
278 batches.emplace_back(CreateBatch(chunkTensor, i));
279 }
280
281 TMVA::Experimental::RTensor<float> LeftoverBatch({LeftoverBatchSize, NumCols});
282 std::copy(chunkTensor.GetData() + (Batches * fBatchSize * NumCols),
283 chunkTensor.GetData() + (ChunkSize * NumCols),
284 LeftoverBatch.GetData());
285
286 std::size_t PrimaryLeftoverSize = (*fPrimaryLeftoverValidationBatch).GetShape()[0];
287 std::size_t emptySlots = fBatchSize - PrimaryLeftoverSize;
288
289 if (emptySlots >= LeftoverBatchSize) {
290 (*fPrimaryLeftoverValidationBatch) =
291 (*fPrimaryLeftoverValidationBatch).Resize({PrimaryLeftoverSize + LeftoverBatchSize, NumCols});
292 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * NumCols),
293 fPrimaryLeftoverValidationBatch->GetData() + (PrimaryLeftoverSize * NumCols));
294
295 if (PrimaryLeftoverSize + LeftoverBatchSize == fBatchSize) {
296 auto copy =
297 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
298 std::copy(fPrimaryLeftoverValidationBatch->GetData(),
299 fPrimaryLeftoverValidationBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
300 batches.emplace_back(std::move(copy));
301 fPrimaryLeftoverValidationBatch = std::move(fSecondaryLeftoverValidationBatch);
302 fSecondaryLeftoverValidationBatch =
303 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
304 }
305 }
306
307 else if (emptySlots < LeftoverBatchSize) {
308 (*fPrimaryLeftoverValidationBatch) = (*fPrimaryLeftoverValidationBatch).Resize({fBatchSize, NumCols});
309 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * NumCols),
310 fPrimaryLeftoverValidationBatch->GetData() + (PrimaryLeftoverSize * NumCols));
311 (*fSecondaryLeftoverValidationBatch) =
312 (*fSecondaryLeftoverValidationBatch).Resize({LeftoverBatchSize - emptySlots, NumCols});
313 std::copy(LeftoverBatch.GetData() + (emptySlots * NumCols),
314 LeftoverBatch.GetData() + (LeftoverBatchSize * NumCols),
315 fSecondaryLeftoverValidationBatch->GetData());
316 auto copy =
317 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
318 std::copy(fPrimaryLeftoverValidationBatch->GetData(),
319 fPrimaryLeftoverValidationBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
320 batches.emplace_back(std::move(copy));
321 fPrimaryLeftoverValidationBatch = std::move(fSecondaryLeftoverValidationBatch);
322 fSecondaryLeftoverValidationBatch =
323 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
324 }
325
326 if (lastbatch == 1) {
327
328 if (dropRemainder == false && leftoverBatchSize > 0) {
329 auto copy = std::make_unique<TMVA::Experimental::RTensor<float>>(
330 std::vector<std::size_t>{leftoverBatchSize, fNumColumns});
331 std::copy((*fPrimaryLeftoverValidationBatch).GetData(),
332 (*fPrimaryLeftoverValidationBatch).GetData() + (leftoverBatchSize * fNumColumns),
333 copy->GetData());
334 batches.emplace_back(std::move(copy));
335 }
336 fPrimaryLeftoverValidationBatch =
337 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
338 fSecondaryLeftoverValidationBatch =
339 std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
340 }
341
342 for (std::size_t i = 0; i < batches.size(); i++) {
343 fValidationBatchQueue.push(std::move(batches[i]));
344 }
345 }
346 std::size_t GetNumTrainingBatchQueue() { return fTrainingBatchQueue.size(); }
347 std::size_t GetNumValidationBatchQueue() { return fValidationBatchQueue.size(); }
348};
349
350} // namespace TMVA::Experimental::Internal
351
352#endif // TMVA_RBATCHLOADER
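A minimal usage sketch of the class above (not part of RBatchLoader.hxx): it feeds RBatchLoader a single chunk so that the leftover-batch handling becomes visible. The batch size, the chunk shape, the include path "TMVA/RBatchLoader.hxx" and the namespace spelling TMVA::Experimental::Internal are assumptions taken from this listing; RBatchLoader is an internal TMVA class, so treat this as an illustration rather than documented public API.

// Sketch only: drive RBatchLoader with one chunk of 8 rows x 2 columns and a batch
// size of 3. Two full batches are created; the remaining 2 rows are carried in
// fPrimaryLeftoverTrainingBatch and flushed as a smaller final batch because
// dropRemainder is false and the chunk is flagged as the last one.
#include <cstddef>
#include <iostream>
#include <numeric>

#include "TMVA/RBatchLoader.hxx" // assumed include path
#include "TMVA/RTensor.hxx"

int main()
{
   const std::size_t batchSize = 3, numColumns = 2, chunkRows = 8;

   // Fill a chunk tensor with running numbers so the batch boundaries are easy to see.
   TMVA::Experimental::RTensor<float> chunk({chunkRows, numColumns});
   std::iota(chunk.GetData(), chunk.GetData() + chunkRows * numColumns, 0.f);

   TMVA::Experimental::Internal::RBatchLoader loader(batchSize, numColumns);

   // lastbatch = 1: this is the only (hence last) chunk of the dataset;
   // leftoverBatchSize = chunkRows % batchSize = 2; dropRemainder = false.
   loader.CreateTrainingBatches(chunk, /*lastbatch=*/1, chunkRows % batchSize, /*dropRemainder=*/false);

   // Expected: two batches with 3 rows each, then one leftover batch with 2 rows.
   while (loader.GetNumTrainingBatchQueue() > 0) {
      const auto &batch = loader.GetTrainBatch();
      std::cout << "batch with " << batch.GetShape()[0] << " rows\n";
   }
   return 0;
}

CreateValidationBatches and GetValidationBatch follow the same pattern for the validation queue; Activate and DeActivate toggle fIsActive and notify fBatchCondition, which matters when the loader is filled from a separate thread.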