15#ifndef ROOT_INTERNAL_ML_RCLUSTERLOADER
16#define ROOT_INTERNAL_ML_RCLUSTERLOADER
47 static_cast<std::size_t
>(
end -
start)};
59template <
typename... ColTypes>
74 template <typename T, std::enable_if_t<ROOT::Internal::RDF::IsDataContainer<T>::value,
int> = 0>
78 std::size_t vec_size =
vec.size();
81 if (vec_size < max_vec_size)
83 std::copy(
vec.begin(),
vec.end(), dst);
84 std::fill(dst + vec_size, dst + max_vec_size,
fVecPadding);
87 std::copy(
vec.begin(),
vec.begin() + max_vec_size, dst);
94 template <typename T, std::enable_if_t<!ROOT::Internal::RDF::IsDataContainer<T>::value,
int> = 0>
103 const std::vector<std::size_t> &maxVecSizes,
float vecPadding,
int i,
104 std::size_t rowOffset = 0)
110 fOffset(rowOffset * numColumns)
148template <
typename... Args>
151 std::vector<ROOT::RDF::RNode> &
fRdfs;
177 RClusterLoader(std::vector<ROOT::RDF::RNode> &rdfs,
const std::vector<std::string> &cols,
178 const std::vector<std::size_t> &vecSizes,
float vecPadding,
float validationSplit,
bool shuffle,
192 for (
auto &rdf :
fRdfs) {
194 if (!rdf.GetFilterNames().empty()) {
204 for (std::size_t rdfIdx = 0; rdfIdx <
fRdfs.size(); ++rdfIdx) {
207 auto numEntries =
r.second -
r.first;
220 throw std::runtime_error(
"RClusterLoader::SplitDataset: no clusters found.");
233 std::uniform_int_distribution<int> coin(0, 1);
236 const std::size_t sz =
c.GetNumEntries();
237 const std::size_t trainSz =
static_cast<std::size_t
>((1.0f -
fValidationSplit) * sz);
238 const std::size_t valSz = sz - trainSz;
241 bool trainIsPrefix = coin(
g);
242 const uint64_t trainStart = trainIsPrefix ?
c.start :
c.start +
static_cast<std::uint64_t
>(valSz);
243 const uint64_t valStart = trainIsPrefix ?
c.start +
static_cast<std::uint64_t
>(trainSz) :
c.start;
246 fTrainingClusters.push_back({
c.rdfIdx, trainStart, trainStart +
static_cast<std::uint64_t
>(trainSz)});
250 fValidationClusters.push_back({
c.rdfIdx, valStart, valStart +
static_cast<std::uint64_t
>(valSz)});
261 std::size_t accumulated = 0;
262 std::size_t splitIdx = 0;
264 const std::size_t sz =
fAllClusters[splitIdx].GetNumEntries();
265 if (accumulated + sz > targetTraining) {
275 if (splitIdx <
fAllClusters.size() && accumulated < targetTraining) {
278 const std::uint64_t splitPoint = boundary.
start +
static_cast<std::uint64_t
>(targetTraining - accumulated);
294 throw std::runtime_error(
"RClusterLoader::SplitDataset: no entries for training after split. "
295 "Reduce validation_split.");
298 throw std::runtime_error(
"RClusterLoader::SplitDataset: no entries for validation after split. "
299 "Increase validation_split.");
326 std::size_t rowOffset = 0)
331 rdf.Foreach(func,
fCols);
350 std::uint64_t endRow, std::size_t rowOffset = 0)
355 std::vector<ROOT::RDF::RResultPtr<ULong64_t>> counts;
356 counts.reserve(
fRdfs.size());
357 for (
auto &rdf :
fRdfs) {
358 counts.push_back(rdf.Count());
362 std::size_t totalFiltered = 0;
363 for (
auto &
c : counts) {
364 totalFiltered +=
c.GetValue();
373 std::vector<ULong64_t> rdfEntries;
374 rdfEntries.reserve(endRow - startRow);
379 std::vector<std::string> colsWithEntry;
380 colsWithEntry.reserve(
fCols.size() + 1);
381 colsWithEntry.push_back(
"rdfentry_");
382 colsWithEntry.insert(colsWithEntry.end(),
fCols.begin(),
fCols.end());
385 [&](
ULong64_t entry,
const Args &...cols) {
386 rdfEntries.push_back(entry);
393 const std::size_t totalFiltered = rdfEntries.size();
394 if (totalFiltered == 0) {
397 std::sort(rdfEntries.begin(), rdfEntries.end());
400 const std::size_t trainCount =
401 std::min(
static_cast<std::size_t
>(totalFiltered * (1.0f -
fValidationSplit)), trainRemaining);
402 const std::size_t valCount = totalFiltered - trainCount;
404 bool trainIsPrefix =
true;
409 std::uniform_int_distribution<int> coin(0, 1);
410 trainIsPrefix = coin(
g);
415 const std::uint64_t trainBoundaryEntry = trainIsPrefix ? rdfEntries[trainCount] : rdfEntries[valCount];
416 const std::uint64_t boundary = (valCount > 0) ? trainBoundaryEntry : endRow;
418 const std::uint64_t trainStart = trainIsPrefix ? startRow : boundary;
419 const std::uint64_t trainEnd = trainIsPrefix ? boundary : endRow;
420 const std::uint64_t valStart = trainIsPrefix ? boundary : startRow;
421 const std::uint64_t valEnd = trainIsPrefix ? endRow : boundary;
433 return endRow - startRow;
439 std::size_t rowOffset = 0)
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t dest
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Functor invoked by RDataFrame::Foreach to fill one row of an RFlat2DMatrix.
std::size_t fNumChunkCols
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into the current tensor when the column consists of vectors.
RFlat2DMatrix & fChunkTensor
RClusterLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i, std::size_t rowOffset=0)
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into the current tensor when the column consists of scalar values.
void operator()(const ColTypes &...cols)
std::vector< std::size_t > fMaxVecSizes
std::size_t GetNumValidationClusters() const
std::size_t fNumValidationEntries
void ShuffleTrainingClusters(std::size_t epochIdx)
Re-order training clusters for the upcoming epoch.
void FinaliseSplitDiscovery()
Mark the train/val split as finalised after the first epoch.
std::size_t fAccumulatedFilteredForTrain
std::size_t fNumTrainingEntries
void LoadClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
std::vector< std::size_t > fRdfSizes
void ShuffleValidationClusters(std::size_t epochIdx)
Re-order validation clusters for the upcoming epoch.
std::size_t fNumChunkCols
std::size_t LoadTrainingClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
Load one training cluster and return the number of rows written.
std::size_t GetNumTrainingClusters() const
std::vector< RClusterRange > fAllClusters
void SplitDataset()
Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames,...
void LoadValidationClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
Load one validation cluster into dest starting at rowOffset.
const std::vector< RClusterRange > & GetTrainingClusters() const
std::size_t fTotalEntries
std::size_t GetNmTotalClusters() const
RClusterLoader(std::vector< ROOT::RDF::RNode > &rdfs, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes, float vecPadding, float validationSplit, bool shuffle, std::size_t setSeed)
const std::vector< RClusterRange > & GetValidationClusters() const
bool IsSplitDiscovered() const
std::size_t GetNumValidationEntries() const
std::size_t GetNumTrainingEntries() const
std::vector< std::size_t > fVecSizes
std::vector< RClusterRange > fValidationClusters
std::vector< ROOT::RDF::RNode > & fRdfs
std::vector< std::string > fCols
std::size_t GetNumChunkCols() const
std::vector< RClusterRange > fTrainingClusters
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
unsigned int RunGraphs(std::vector< RResultHandle > handles)
Run the event loops of multiple RDataFrames concurrently.
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode
Describes a contiguous range of entries within a single RDataFrame, corresponding to one TTree/RNTupl...
std::size_t GetNumEntries() const
void SetNumEntries(std::size_t num)
Wrapper around ROOT::RVec<float> representing a 2D matrix.