60 Log() << kFATAL <<
"DataSet prepared for \"" <<
fNumFolds <<
"\" folds, requested fold \"" << foldNumber
61 <<
"\" is outside of range." <<
Endl;
65 auto prepareDataSetInternal = [
this, &dsi, foldNumber](std::vector<std::vector<Event *>> vec) {
69 UInt_t nTotal = std::accumulate(vec.begin(), vec.end(), 0,
70 [&](
UInt_t sum, std::vector<TMVA::Event *>
v) {
return sum +
v.size(); });
72 UInt_t nTrain = nTotal - vec.at(foldNumber).size();
73 UInt_t nTest = vec.at(foldNumber).size();
75 std::vector<Event *> tempTrain;
76 std::vector<Event *> tempTest;
78 tempTrain.reserve(nTrain);
79 tempTest.reserve(nTest);
82 for (
UInt_t i = 0; i < numFolds; ++i) {
83 if (i == foldNumber) {
87 tempTrain.insert(tempTrain.end(), vec.at(i).begin(), vec.at(i).end());
91 tempTest.insert(tempTest.end(), vec.at(foldNumber).begin(), vec.at(foldNumber).end());
93 Log() << kDEBUG <<
"Fold prepared, num events in training set: " << tempTrain.size() <<
Endl;
94 Log() << kDEBUG <<
"Fold prepared, num events in test set: " << tempTest.size() <<
Endl;
106 Log() << kFATAL <<
"PrepareFoldDataSet can only work with training and testing data sets." << std::endl;
117 Log() << kFATAL <<
"Only kTraining is supported for CvSplit::RecombineKFoldDataSet currently." << std::endl;
120 std::vector<Event *> *tempVec =
new std::vector<Event *>;
140 : fDsi(dsi), fIdxFormulaParNumFolds(
std::numeric_limits<
UInt_t>::max()), fSplitFormula(
"", expr),
141 fParValues(fSplitFormula.GetNpar())
144 throw std::runtime_error(
"Split expression \"" + std::string(
fSplitExpr.
Data()) +
"\" is not a valid TFormula.");
152 if (name ==
"NumFolds" or name ==
"numFolds") {
167 auto iFormulaPar = p.first;
168 auto iSpectator = p.second;
179 if (
fabs(iFold - (
double)((
UInt_t)iFold)) > 1
e-5) {
180 throw std::runtime_error(
181 "Output of splitExpr should be a non-negative integer between 0 and numFolds-1 inclusive.");
202 for (
UInt_t iSpectator = 0; iSpectator < spectatorInfos.size(); ++iSpectator) {
213 throw std::runtime_error(
"Spectator \"" + std::string(name.
Data()) +
"\" not found.");
236 :
CvSplit(numFolds), fSeed(seed), fSplitExprString(splitExpr), fStratified(stratified)
243 Log() << kFATAL <<
"Stratified KFolds not currently implemented." << std::endl;
261 Log() << kINFO <<
"Splitting in k-folds has been already done" <<
Endl;
290 std::vector<UInt_t> fOrigToFoldMapping;
291 fOrigToFoldMapping.reserve(nEntries);
292 for (
UInt_t iEvent = 0; iEvent < nEntries; ++iEvent) {
293 fOrigToFoldMapping.push_back(iEvent % numFolds);
298 std::shuffle(fOrigToFoldMapping.begin(), fOrigToFoldMapping.end(), rng);
300 return fOrigToFoldMapping;
309 std::vector<std::vector<TMVA::Event *>>
312 const ULong64_t nEntries = oldSet.size();
313 const ULong64_t foldSize = nEntries / numFolds;
315 std::vector<std::vector<Event *>> tempSets;
317 for (
UInt_t iFold = 0; iFold < numFolds; ++iFold) {
318 tempSets.emplace_back();
319 tempSets.at(iFold).reserve(foldSize);
326 for (
ULong64_t i = 0; i < nEntries; i++) {
329 tempSets.at((
UInt_t)iFold).push_back(ev);
335 for (
UInt_t iEvent = 0; iEvent < nEntries; ++iEvent) {
336 UInt_t iFold = fOrigToFoldMapping[iEvent];
338 tempSets.at(iFold).push_back(ev);
std::unique_ptr< CvSplitKFoldsExpr > fSplitExpr
Expression used to split data into folds. Should output integer values between 0 and numFolds-1 inclusive.
Int_t fIdxFormulaParNumFolds
Maps parameter indices in splitExpr to their spectator index in the datasetinfo. ...
CvSplitKFolds(UInt_t numFolds, TString splitExpr="", Bool_t stratified=kTRUE, UInt_t seed=100)
Splits a dataset into k folds, ready for use in cross validation.
virtual const char * GetName() const
Returns name of object.
static long int sum(long int i)
std::vector< Double_t > fParValues
TFormula for splitExpr.
MsgLogger & Endl(MsgLogger &ml)
std::vector< UInt_t > GetEventIndexToFoldMapping(UInt_t nEntries, UInt_t numFolds, UInt_t seed=100)
Generates a vector of fold assignments.
std::vector< VariableInfo > & GetSpectatorInfos()
UInt_t GetSpectatorIndexForName(DataSetInfo &dsi, TString name)
void MakeKFoldDataSet(DataSetInfo &dsi) override
Prepares a DataSet for cross validation.
const std::vector< Event * > & GetEventCollection(Types::ETreeType type=Types::kMaxTreeType) const
const TString & GetLabel() const
std::vector< std::pair< Int_t, Int_t > > fFormulaParIdxToDsiSpecIdx
const TString & GetExpression() const
virtual void PrepareFoldDataSet(DataSetInfo &dsi, UInt_t foldNumber, Types::ETreeType tt)
Set training and test set vectors of dataset described by dsi.
std::vector< std::vector< TMVA::Event * > > fTestEvents
TString fSplitExpr
Keeps track of the index of reserved par "NumFolds" in splitExpr.
Class that contains all the data information.
virtual void RecombineKFoldDataSet(DataSetInfo &dsi, Types::ETreeType tt=Types::kTraining)
VecExpr< UnaryOp< Fabs< T >, VecExpr< A, T, D >, T >, T, D > fabs(const VecExpr< A, T, D > &rhs)
CvSplitKFoldsExpr(DataSetInfo &dsi, TString expr)
std::vector< std::vector< Event * > > SplitSets(std::vector< TMVA::Event *> &oldSet, UInt_t numFolds)
Split sets into k-folds.
static Bool_t Validate(TString expr)
unsigned long long ULong64_t
std::vector< std::vector< TMVA::Event * > > fTrainEvents
void SetEventCollection(std::vector< Event *> *, Types::ETreeType, Bool_t deleteEvents=true)
Sets the event collection (by DataSetFactory)
you should not use this method at all Int_t Int_t Double_t Double_t Double_t e
std::map< const TMVA::Event *, UInt_t > fEventToFoldMapping
TFormula fSplitFormula
Expression used to split data into folds. Should output integer values between 0 and numFolds-1 inclusive.
UInt_t Eval(UInt_t numFolds, const Event *ev)
TString()
TString default ctor.
Class for type info of MVA input variable.
Float_t GetSpectator(UInt_t ivar) const
return spectator content
DataSet * GetDataSet() const
returns data set
const char * Data() const