template<typename... Args>
class ROOT::Experimental::Internal::ML::RClusterLoader< Args >

Loads TTree/RNTuple clusters from one or more RDataFrames into RFlat2DMatrix buffers for ML training and validation.

Overview

At construction the loader scans the cluster boundaries of every provided RDataFrame and stores them as a flat list of RClusterRange objects. SplitDataset() then partitions those ranges into training and validation sets according to validationSplit.

The split strategy depends on whether shuffling is enabled or not

Unshuffled: one cut is made so that the first (1 - validationSplit) fraction of entries goes to training. At most one cluster is split at the boundary.
Shuffled: each cluster is split proportionally (according to validationSplit) so both sets draw entries from every part of the dataset. ShuffleTrainingClusters() and ShuffleValidationClusters() re-order the cluster lists at the start of each epoch. A second shuffling step, at the entry level, happens inside LoadTrainingClusterInto() and LoadValidationClusterInto() when loading the data into the tensors.

Filtered RDataFrames

When any RDataFrame carries a filter, the true entry count is not known until the computation graph is executed. In this case SplitDataset() is a no-op and the split is discovered lazily inside LoadTrainingClusterInto() during the first epoch. After the first epoch FinaliseSplitDiscovery() marks the split as stable and all subsequent epochs use the same pre-computed ranges.

Definition at line 149 of file RClusterLoader.hxx.

Public Member Functions
	RClusterLoader (std::vector< ROOT::RDF::RNode > &rdfs, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes, float vecPadding, float validationSplit, bool shuffle, std::size_t setSeed)
void	FinaliseSplitDiscovery ()
	Mark the train/val split as finalised after the first epoch.
std::size_t	GetNmTotalClusters () const
std::size_t	GetNumChunkCols () const
std::size_t	GetNumTrainingClusters () const
std::size_t	GetNumTrainingEntries () const
std::size_t	GetNumValidationClusters () const
std::size_t	GetNumValidationEntries () const
const std::vector< RClusterRange > &	GetTrainingClusters () const
const std::vector< RClusterRange > &	GetValidationClusters () const
bool	IsSplitDiscovered () const
void	LoadClusterInto (RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
std::size_t	LoadTrainingClusterInto (RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
	Load one training cluster and return the number of rows written.
void	LoadValidationClusterInto (RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
	Load one validation cluster into `dest` starting at `rowOffset`.
void	ShuffleTrainingClusters (std::size_t epochIdx)
	Re-order training clusters for the upcoming epoch.
void	ShuffleValidationClusters (std::size_t epochIdx)
	Re-order validation clusters for the upcoming epoch.
void	SplitDataset ()
	Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames: the split is discovered lazily during the first epoch.

Private Attributes
std::size_t	fAccumulatedFilteredForTrain {0}
std::vector< RClusterRange >	fAllClusters
std::vector< std::string >	fCols
bool	fIsFiltered {false}
std::size_t	fNumChunkCols
std::size_t	fNumCols
std::size_t	fNumTrainingEntries {0}
std::size_t	fNumValidationEntries {0}
std::vector< ROOT::RDF::RNode > &	fRdfs
std::vector< std::size_t >	fRdfSizes
std::size_t	fSetSeed
bool	fShuffle
bool	fSplitDiscovered {false}
std::size_t	fSumVecSizes
std::size_t	fTotalEntries {0}
std::vector< RClusterRange >	fTrainingClusters
std::vector< RClusterRange >	fValidationClusters
float	fValidationSplit
float	fVecPadding
std::vector< std::size_t >	fVecSizes

Constructor & Destructor Documentation

◆ RClusterLoader()

template<typename... Args>

ROOT::Experimental::Internal::ML::RClusterLoader< Args >::RClusterLoader	(	std::vector< ROOT::RDF::RNode > &	rdfs,
		const std::vector< std::string > &	cols,
		const std::vector< std::size_t > &	vecSizes,
		float	vecPadding,
		float	validationSplit,
		bool	shuffle,
		std::size_t	setSeed )

inline

Definition at line 177 of file RClusterLoader.hxx.

Member Function Documentation

◆ FinaliseSplitDiscovery()

template<typename... Args>

void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::FinaliseSplitDiscovery ( )

inline

Mark the train/val split as finalised after the first epoch.

Definition at line 422 of file RClusterLoader.hxx.

◆ GetNmTotalClusters()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNmTotalClusters ( ) const

inline

Definition at line 447 of file RClusterLoader.hxx.

◆ GetNumChunkCols()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumChunkCols ( ) const

inline

Definition at line 434 of file RClusterLoader.hxx.

◆ GetNumTrainingClusters()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumTrainingClusters ( ) const

inline

Definition at line 442 of file RClusterLoader.hxx.

◆ GetNumTrainingEntries()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumTrainingEntries ( ) const

inline

Definition at line 432 of file RClusterLoader.hxx.

◆ GetNumValidationClusters()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumValidationClusters ( ) const

inline

Definition at line 446 of file RClusterLoader.hxx.

◆ GetNumValidationEntries()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumValidationEntries ( ) const

inline

Definition at line 433 of file RClusterLoader.hxx.

◆ GetTrainingClusters()

template<typename... Args>

const std::vector< RClusterRange > & ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetTrainingClusters ( ) const

inline

Definition at line 436 of file RClusterLoader.hxx.

◆ GetValidationClusters()

template<typename... Args>

const std::vector< RClusterRange > & ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetValidationClusters ( ) const

inline

Definition at line 440 of file RClusterLoader.hxx.

◆ IsSplitDiscovered()

template<typename... Args>

bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::IsSplitDiscovered ( ) const

inline

Definition at line 428 of file RClusterLoader.hxx.

◆ LoadClusterInto()

template<typename... Args>

void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::LoadClusterInto	(	RFlat2DMatrix &	dest,
		std::size_t	rdfIdx,
		std::uint64_t	startRow,
		std::uint64_t	endRow,
		std::size_t	rowOffset = 0 )

inline

Definition at line 316 of file RClusterLoader.hxx.

◆ LoadTrainingClusterInto()

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::LoadTrainingClusterInto	(	RFlat2DMatrix &	dest,
		std::size_t	rdfIdx,
		std::uint64_t	startRow,
		std::uint64_t	endRow,
		std::size_t	rowOffset = 0 )

inline

Load one training cluster and return the number of rows written.

Unfiltered: delegates directly to LoadClusterInto().

Filtered, epoch 1 (!fSplitDiscovered):

On the first call, Count() is called across all RDFs to obtain the total filtered entry count, and fNumTrainingEntries and fNumValidationEntries are set as targets.
A single Foreach on the full raw cluster range loads data and captures rdfentry_ simultaneously. The real train/val boundary is computed from the accumulated filtered count vs the target, then the train sub-range is pushed to fTrainingClusters and the val sub-range to fValidationClusters.
Only the train rows are written into dest.

All subsequent epochs: delegates directly to LoadClusterInto().

Definition at line 340 of file RClusterLoader.hxx.

◆ LoadValidationClusterInto()

template<typename... Args>

void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::LoadValidationClusterInto	(	RFlat2DMatrix &	dest,
		std::size_t	rdfIdx,
		std::uint64_t	startRow,
		std::uint64_t	endRow,
		std::size_t	rowOffset = 0 )

inline

Load one validation cluster into dest starting at rowOffset.

Definition at line 414 of file RClusterLoader.hxx.

◆ ShuffleTrainingClusters()

template<typename... Args>

void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::ShuffleTrainingClusters ( std::size_t epochIdx )

inline

Re-order training clusters for the upcoming epoch.

Definition at line 295 of file RClusterLoader.hxx.

◆ ShuffleValidationClusters()

template<typename... Args>

void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::ShuffleValidationClusters ( std::size_t epochIdx )

inline

Re-order validation clusters for the upcoming epoch.

Definition at line 307 of file RClusterLoader.hxx.

◆ SplitDataset()

template<typename... Args>

void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::SplitDataset ( )

inline

Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames: the split is discovered lazily during the first epoch.

Definition at line 217 of file RClusterLoader.hxx.

Member Data Documentation

◆ fAccumulatedFilteredForTrain

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fAccumulatedFilteredForTrain {0}

private

Definition at line 174 of file RClusterLoader.hxx.

◆ fAllClusters

template<typename... Args>

std::vector<RClusterRange> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fAllClusters

private

Definition at line 164 of file RClusterLoader.hxx.

◆ fCols

template<typename... Args>

std::vector<std::string> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fCols

private

Definition at line 153 of file RClusterLoader.hxx.

◆ fIsFiltered

template<typename... Args>

bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fIsFiltered {false}

private

Definition at line 172 of file RClusterLoader.hxx.

◆ fNumChunkCols

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumChunkCols

private

Definition at line 162 of file RClusterLoader.hxx.

◆ fNumCols

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumCols

private

Definition at line 160 of file RClusterLoader.hxx.

◆ fNumTrainingEntries

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumTrainingEntries {0}

private

Definition at line 169 of file RClusterLoader.hxx.

◆ fNumValidationEntries

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumValidationEntries {0}

private

Definition at line 170 of file RClusterLoader.hxx.

◆ fRdfs

template<typename... Args>

std::vector<ROOT::RDF::RNode>& ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fRdfs

private

Definition at line 151 of file RClusterLoader.hxx.

◆ fRdfSizes

template<typename... Args>

std::vector<std::size_t> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fRdfSizes

private

Definition at line 152 of file RClusterLoader.hxx.

◆ fSetSeed

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fSetSeed

private

Definition at line 158 of file RClusterLoader.hxx.

◆ fShuffle

template<typename... Args>

bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fShuffle

private

Definition at line 157 of file RClusterLoader.hxx.

◆ fSplitDiscovered

template<typename... Args>

bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fSplitDiscovered {false}

private

Definition at line 173 of file RClusterLoader.hxx.

◆ fSumVecSizes

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fSumVecSizes

private

Definition at line 161 of file RClusterLoader.hxx.

◆ fTotalEntries

template<typename... Args>

std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fTotalEntries {0}

private

Definition at line 168 of file RClusterLoader.hxx.

◆ fTrainingClusters

template<typename... Args>

std::vector<RClusterRange> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fTrainingClusters

private

Definition at line 165 of file RClusterLoader.hxx.

◆ fValidationClusters

template<typename... Args>

std::vector<RClusterRange> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fValidationClusters

private

Definition at line 166 of file RClusterLoader.hxx.

◆ fValidationSplit

template<typename... Args>

float ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fValidationSplit

private

Definition at line 156 of file RClusterLoader.hxx.

◆ fVecPadding

template<typename... Args>

float ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fVecPadding

private

Definition at line 155 of file RClusterLoader.hxx.

◆ fVecSizes

template<typename... Args>

std::vector<std::size_t> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fVecSizes

private

Definition at line 154 of file RClusterLoader.hxx.

Overview

The split strategy depends on whether shuffling is enabled or not

Filtered RDataFrames

Public Member Functions

Private Attributes

Constructor & Destructor Documentation

◆ RClusterLoader()

Member Function Documentation

◆ FinaliseSplitDiscovery()

◆ GetNmTotalClusters()

◆ GetNumChunkCols()

◆ GetNumTrainingClusters()

◆ GetNumTrainingEntries()

◆ GetNumValidationClusters()

◆ GetNumValidationEntries()

◆ GetTrainingClusters()

◆ GetValidationClusters()

◆ IsSplitDiscovered()

◆ LoadClusterInto()

◆ LoadTrainingClusterInto()

◆ LoadValidationClusterInto()

◆ ShuffleTrainingClusters()

◆ ShuffleValidationClusters()

◆ SplitDataset()

Member Data Documentation

◆ fAccumulatedFilteredForTrain

◆ fAllClusters

◆ fCols

◆ fIsFiltered

◆ fNumChunkCols

◆ fNumCols

◆ fNumTrainingEntries

◆ fNumValidationEntries

◆ fRdfs

◆ fRdfSizes

◆ fSetSeed

◆ fShuffle

◆ fSplitDiscovered

◆ fSumVecSizes

◆ fTotalEntries

◆ fTrainingClusters

◆ fValidationClusters

◆ fValidationSplit

◆ fVecPadding

◆ fVecSizes