#ifndef ROOT_TMVA_MsgLogger
#include "TMVA/MsgLogger.h"
#endif
#ifndef ROOT_TMVA_Configurable
#include "TMVA/Configurable.h"
#endif
#ifndef ROOT_TMVA_VariableIdentityTransform
#include "TMVA/VariableIdentityTransform.h"
#endif
#ifndef ROOT_TMVA_VariableDecorrTransform
#include "TMVA/VariableDecorrTransform.h"
#endif
#ifndef ROOT_TMVA_VariablePCATransform
#include "TMVA/VariablePCATransform.h"
#endif
#ifndef ROOT_TMVA_DataSet
#include "TMVA/DataSet.h"
#endif
#ifndef ROOT_TMVA_DataSetInfo
#include "TMVA/DataSetInfo.h"
#endif
#ifndef ROOT_TMVA_DataInputHandler
#include "TMVA/DataInputHandler.h"
#endif
#ifndef ROOT_TMVA_Event
#include "TMVA/Event.h"
#endif
   if (a<b) { Int_t tmp = a; a=b; b=tmp; }
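   // The swap above orders the operands (a >= b) for a Euclidean GCD step in
   // LargestCommonDivider. A minimal sketch of how such a routine typically
   // continues (assumed, since the rest of the body is not in this excerpt):
   //
   //    if (b == 0) return a;
   //    return LargestCommonDivider(b, a % b);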
   fVerboseLevel(TString("Info")),
   fScaleWithPreselEff(kFALSE),
   std::vector<TTreeFormula*>::const_iterator formIt;

   for (formIt = fInputFormulas.begin();     formIt != fInputFormulas.end();     ++formIt) if (*formIt) delete *formIt;
   for (formIt = fTargetFormulas.begin();    formIt != fTargetFormulas.end();    ++formIt) if (*formIt) delete *formIt;
   for (formIt = fCutFormulas.begin();       formIt != fCutFormulas.end();       ++formIt) if (*formIt) delete *formIt;
   for (formIt = fWeightFormula.begin();     formIt != fWeightFormula.end();     ++formIt) if (*formIt) delete *formIt;
   for (formIt = fSpectatorFormulas.begin(); formIt != fSpectatorFormulas.end(); ++formIt) if (*formIt) delete *formIt;
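   // The factory owns every TTreeFormula it builds in ChangeToNewTree, so the
   // destructor releases them all here. The null checks guard against the
   // deliberate null placeholders (see fWeightFormula.push_back(0) below).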
   DataSet* ds = BuildInitialDataSet( dsi, dataInput );

   Log() << kDEBUG << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
   std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);
   if (varinfos.empty())
      Log() << kFATAL << "Dynamic data set cannot be built, since no variable information is present. "
            << "Apparently no variables have been set. This should not happen; please contact the TMVA authors." << Endl;
   std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd = varinfos.end();
   for (; it != itEnd; ++it) {
         Log() << kDEBUG << "The link to the external variable is NULL while building a dynamic data set. "
               << "In this case fTmpEvent from MethodBase HAS TO BE USED in the method to obtain useful variable values." << Endl;
      evdyn->push_back(external);
   it = spectatorinfos.begin();
   for (; it != spectatorinfos.end(); ++it) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );

   TMVA::Event* ev = new Event( (const std::vector<Float_t*>*&)evdyn, varinfos.size() );
   std::vector<Event*>* newEventVector = new std::vector<Event*>;
   newEventVector->push_back(ev);
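   // A "dynamic" data set contains exactly one Event whose values are read
   // through external pointers (the links stored in the VariableInfo objects),
   // so the Event always reflects the current content of the user's variables
   // rather than a copied snapshot.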
   if (dataInput.GetEntries() == 0) return BuildDynamicDataSet( dsi );

   std::vector<TString>* classList = dataInput.GetClassList();
   for (std::vector<TString>::iterator it = classList->begin(); it != classList->end(); ++it) {
   InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode, mixMode );

   BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );

   DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
                            splitMode, mixMode, normMode, splitSeed );
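   // CreateDataSet proceeds in three stages: InitOptions parses the splitting
   // options, BuildEventVector reads (and, for array branches, flattens) all
   // input trees into per-class event vectors, and MixEvents distributes the
   // events between the training and testing sets.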
   if (showCollectedOutput) {
            << " could not be resolved to a valid formula. " << Endl;
            << " does not provide data for this event. "
            << "This event is not taken into account. --> please check whether you use as a variable "
            << "an entry of an array that is not filled for some events "
            << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
      Log() << kWARNING << "If you want to take the event into account you can do something like "
            << "\"Alt$(arr[4],0)\", where 0 is used as an alternative whenever arr has no 4th element." << Endl;
   for (int i = 0, iEnd = ttf->GetNcodes(); i < iEnd; ++i)
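   // Each "code" of a TTreeFormula corresponds to one leaf referenced by the
   // expression; iterating up to GetNcodes() lets CheckTTreeFormula inspect
   // every branch/leaf the formula actually touches.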
   std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;

   for (formIt = fInputFormulas.begin(), formItEnd = fInputFormulas.end(); formIt != formItEnd; ++formIt)
      if (*formIt) delete *formIt;
   fInputFormulas.clear();

      fInputFormulas.push_back( ttf );

   for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt != formItEnd; ++formIt)
      if (*formIt) delete *formIt;
   fTargetFormulas.clear();

      fTargetFormulas.push_back( ttf );

   for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt != formItEnd; ++formIt)
      if (*formIt) delete *formIt;
   fSpectatorFormulas.clear();

      fSpectatorFormulas.push_back( ttf );
   for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt != formItEnd; ++formIt)
      if (*formIt) delete *formIt;
   fCutFormulas.clear();

      Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );

      fCutFormulas.push_back( ttf );
   for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt != formItEnd; ++formIt)
      if (*formIt) delete *formIt;
   fWeightFormula.clear();

      fWeightFormula.push_back( 0 );

      ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
      Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );

      fWeightFormula.push_back( ttf );
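   // fWeightFormula is indexed by class: classes without a weight expression
   // get a null placeholder (push_back(0)) so that fWeightFormula[cl] stays
   // aligned with the class index.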
   Log() << kDEBUG << "enable branches: input variables" << Endl;
   for (formIt = fInputFormulas.begin(); formIt != fInputFormulas.end(); ++formIt) {

   for (formIt = fTargetFormulas.begin(); formIt != fTargetFormulas.end(); ++formIt) {

   for (formIt = fSpectatorFormulas.begin(); formIt != fSpectatorFormulas.end(); ++formIt) {

   for (formIt = fCutFormulas.begin(); formIt != fCutFormulas.end(); ++formIt) {

   for (formIt = fWeightFormula.begin(); formIt != fWeightFormula.end(); ++formIt) {
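   // Re-enabling only the branches that some formula references (instead of
   // leaving the whole tree active) keeps TTree::GetEntry from reading unused
   // branches, which matters for wide ntuples.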
   for (UInt_t ivar=0; ivar<nvar ; ivar++) { min  [ivar] = FLT_MAX; max  [ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<nvis ; ivar++) { vmin [ivar] = FLT_MAX; vmax [ivar] = -FLT_MAX; }
      for (UInt_t ivar=0; ivar<nvar; ivar++) {
         if (v < min[ivar]) min[ivar] = v;
         if (v > max[ivar]) max[ivar] = v;
      for (UInt_t itgt=0; itgt<ntgts; itgt++) {
         if (v < tgmin[itgt]) tgmin[itgt] = v;
         if (v > tgmax[itgt]) tgmax[itgt] = v;
      for (UInt_t ivis=0; ivis<nvis; ivis++) {
         if (v < vmin[ivis]) vmin[ivis] = v;
         if (v > vmax[ivis]) vmax[ivis] = v;
   for (UInt_t ivar=0; ivar<nvar; ivar++) {
      if (TMath::Abs(max[ivar] - min[ivar]) <= FLT_MIN)
   for (UInt_t ivar=0; ivar<ntgts; ivar++) {
      if (TMath::Abs(tgmax[ivar] - tgmin[ivar]) <= FLT_MIN)
   for (UInt_t ivar=0; ivar<nvis; ivar++) {
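   // CalcMinMax records the observed range of every variable, target and
   // spectator; a vanishing width (|max - min| <= FLT_MIN) identifies a
   // variable that is constant over the data set.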
   TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         Double_t d = (*mat)(ivar, ivar) * (*mat)(jvar, jvar);
         if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
            Log() << kWARNING << "<GetCorrelationMatrix> Zero variances for variables "
                  << "(" << ivar << ", " << jvar << ") = " << d
            (*mat)(ivar, jvar) = 0;

   for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;
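   // Correlation from covariance: rho(i,j) = C(i,j) / sqrt( C(i,i)*C(j,j) ).
   // Entries with a zero variance are forced to 0, and the diagonal is set to
   // exactly 1 afterwards to absorb rounding errors.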
   UInt_t ivar = 0, jvar = 0;

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;

      if (ev->GetClass() != classNumber) continue;

      for (ivar=0; ivar<nvar; ivar++) {
         vec(ivar)        += xi*weight;
         mat2(ivar, ivar) += (xi*xi*weight);
         for (jvar=ivar+1; jvar<nvar; jvar++) {
            mat2(ivar, jvar) += (xi*xj*weight);

   for (ivar=0; ivar<nvar; ivar++)
      for (jvar=ivar+1; jvar<nvar; jvar++)
         mat2(jvar, ivar) = mat2(ivar, jvar);

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
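   // Weighted covariance: with S the sum of event weights (accumulated in
   // 'ic', not shown in this excerpt),
   //   C(i,j) = sum(w*x_i*x_j)/S - ( sum(w*x_i)/S )*( sum(w*x_j)/S ),
   // i.e. E[x_i*x_j] - E[x_i]*E[x_j]. Only the upper triangle is accumulated
   // and then mirrored, since the matrix is symmetric.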
   splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; "
                                    "these options define the creation of the data sets used for training and expert validation by TMVA" );

   splitMode = "Random";
   splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
                                "Method of picking training and testing events (default: random)" );
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));
   mixMode = "SameAsSplitMode";
   splitSpecs.DeclareOptionRef( mixMode, "MixMode",
                                "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
   splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));
   splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
                                "Seed for random event shuffling" );

   normMode = "EqualNumEvents";
   splitSpecs.DeclareOptionRef( normMode, "NormMode",
                                "Overall renormalisation of event-by-event weights used in the training "
                                "(NumEvents: average weight of 1 per event, independently for signal and background; "
                                "EqualNumEvents: average weight of 1 per event for signal, and sum of weights for "
                                "background equal to sum of weights for signal)" );
   splitSpecs.AddPreDefVal(TString("None"));
   splitSpecs.AddPreDefVal(TString("NumEvents"));
   splitSpecs.AddPreDefVal(TString("EqualNumEvents"));

   splitSpecs.DeclareOptionRef( fScaleWithPreselEff=kFALSE, "ScaleWithPreselEff",
                                "Scale the number of requested events by the eff. of the preselection cuts (or not)" );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest  );

   splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: true)" );

   splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
   splitSpecs.AddPreDefVal(TString("Debug"));
   splitSpecs.AddPreDefVal(TString("Verbose"));
   splitSpecs.AddPreDefVal(TString("Info"));
   splitSpecs.ParseOptions();
   splitSpecs.CheckForUnusedOptions();
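   // These options arrive as a colon-separated string in the user's call to
   // Factory::PrepareTrainingAndTestTree, e.g. (illustrative values):
   //
   //    factory->PrepareTrainingAndTestTree( mycut,
   //       "nTrain_Signal=1000:nTrain_Background=1000:SplitMode=Random:NormMode=NumEvents:!V" );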
   if (fVerboseLevel.CompareTo("Debug")   == 0) fLogger->SetMinType( kDEBUG );
   if (fVerboseLevel.CompareTo("Verbose") == 0) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Info")    == 0) fLogger->SetMinType( kINFO );
   Log() << kINFO << "Splitmode is: \"" << splitMode << "\", the mixmode is: \"" << mixMode << "\"" << Endl;
   if (mixMode == "SAMEASSPLITMODE") mixMode = splitMode;
   else if (mixMode != splitMode)
      Log() << kINFO << "DataSet splitmode=" << splitMode
            << " differs from mixmode=" << mixMode << Endl;
   for (size_t i=0; i<nclasses; i++) {
      eventCounts[i].varAvLength = new Float_t[nvars];
      for (UInt_t ivar=0; ivar<nvars; ivar++)
         eventCounts[i].varAvLength[ivar] = 0;
   for (UInt_t cl=0; cl<nclasses; cl++) {

      Log() << kINFO << "Create training and testing trees -- looping over class \""

      EventStats& classEventCounts = eventCounts[cl];

      std::vector<Float_t> vars(nvars);
      std::vector<Float_t> tgts(ntgts);
      std::vector<Float_t> vis (nvis);
      ChangeToNewTree( currentInfo, dsi );

      for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {

            ChangeToNewTree( currentInfo, dsi );

         Int_t sizeOfArrays = 1;
         Int_t prevArrExpr  = 0;
         for (UInt_t ivar=0; ivar<nvars; ivar++) {
            Int_t ndata = fInputFormulas[ivar]->GetNdata();

            if (ndata == 1) continue;

            varIsArray[ivar] = kTRUE;
            if (sizeOfArrays == 1) {
               sizeOfArrays = ndata;
            else if (sizeOfArrays != ndata) {
               Log() << kERROR << "ERROR while preparing training and testing trees:" << Endl;
               Log() << "   multiple array-type expressions of different length were encountered" << Endl;
               Log() << "   location of error: event " << evtIdx
               Log() << "   expression " << fInputFormulas[ivar]->GetTitle() << " has "
                     << ndata << " entries, while" << Endl;
               Log() << "   expression " << fInputFormulas[prevArrExpr]->GetTitle() << " has "
                     << fInputFormulas[prevArrExpr]->GetNdata() << " entries" << Endl;
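         // When array-type expressions are present, each tree entry is
         // flattened into sizeOfArrays separate events (one per array index),
         // so every array-type expression must have the same length within an
         // entry -- hence the consistency check above.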
         for (Int_t idata = 0; idata < sizeOfArrays; idata++) {

               formula = fCutFormulas[cl];

                  Log() << kWARNING << "Cut expression resolves to infinite value (NaN): "
            for (UInt_t ivar=0; ivar<nvars; ivar++) {
               formula = fInputFormulas[ivar];
               vars[ivar] = (ndata == 1 ? formula->EvalInstance(0) : formula->EvalInstance(idata));
                  Log() << kWARNING << "Input expression resolves to infinite value (NaN): "

            for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
               formula = fTargetFormulas[itrgt];
               tgts[itrgt] = (ndata == 1 ? formula->EvalInstance(0) : formula->EvalInstance(idata));
                  Log() << kWARNING << "Target expression resolves to infinite value (NaN): "

            for (UInt_t itVis=0; itVis<nvis; itVis++) {
               formula = fSpectatorFormulas[itVis];
               vis[itVis] = (ndata == 1 ? formula->EvalInstance(0) : formula->EvalInstance(idata));
                  Log() << kWARNING << "Spectator expression resolves to infinite value (NaN): "

               formula = fWeightFormula[cl];
               weight *= (ndata == 1 ? formula->EvalInstance() : formula->EvalInstance(idata));
                  Log() << kWARNING << "Weight expression resolves to infinite value (NaN): "

            if (cutVal < 0.5) continue;
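            // The cut formula evaluates to 1 (passed) or 0 (failed) for a
            // boolean expression, so comparing against 0.5 rejects events
            // failing the preselection cut.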
            event_v.push_back( new Event(vars, tgts, vis, cl, weight) );
   Log() << kINFO << "Number of events in input trees (after possible flattening of arrays):" << Endl;

            << "    -- number of events       : " << std::setw(5) << eventCounts[cl].nEvBeforeCut
            << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;

            << " tree -- total number of entries: "

   if (fScaleWithPreselEff)
      Log() << kINFO << "Preselection: (will affect number of requested training and testing events)" << Endl;
      Log() << kINFO << "Preselection: (will NOT affect number of requested training and testing events)" << Endl;
            << "    -- number of events passed: " << std::setw(5) << eventCounts[cl].nEvAfterCut
            << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;

            << "    -- efficiency             : "
            << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;

   else Log() << kINFO << " No preselection cuts applied on event classes" << Endl;
   if (splitMode.Contains("RANDOM")) {

      if (!unspecifiedEvents.empty()) {

               << unspecifiedEvents.size()
               << " events of class " << cls
               << " which are not yet associated to testing or training" << Endl;
         std::random_shuffle( unspecifiedEvents.begin(),
                              unspecifiedEvents.end(),

   Log() << kDEBUG << "checking the number of requested and available training/testing events for class " << cls << Endl;
      Int_t availableTraining  = eventVectorTraining.size();
      Int_t availableTesting   = eventVectorTesting.size();
      Int_t availableUndefined = eventVectorUndefined.size();

      if (fScaleWithPreselEff) {
         presel_scale = eventCounts[cls].cutScaling();
         if (presel_scale < 1)
            Log() << kINFO << " you have opted to scale the number of requested training/testing events\n"
                  << " by the preselection efficiency" << Endl;

         if (eventCounts[cls].cutScaling() < 1)
            Log() << kINFO << " you have opted for interpreting the requested number of training/testing events\n"
                  << " to be the number of events AFTER your preselection cuts" << Endl;
      Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
      Int_t requestedTesting  = Int_t(eventCounts[cls].nTestingEventsRequested  * presel_scale);

      Log() << kDEBUG << "events in training trees    : " << availableTraining  << Endl;
      Log() << kDEBUG << "events in testing trees     : " << availableTesting   << Endl;
      Log() << kDEBUG << "events in unspecified trees : " << availableUndefined << Endl;
      Log() << kDEBUG << "requested for training      : " << requestedTraining;

         Log() << " ( " << eventCounts[cls].nTrainingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;

      Log() << kDEBUG << "requested for testing       : " << requestedTesting;

         Log() << " ( " << eventCounts[cls].nTestingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      Int_t useForTesting(0), useForTraining(0);
      Int_t allAvailable(availableUndefined + availableTraining + availableTesting);

      if ((requestedTraining == 0) && (requestedTesting == 0)) {

         if (availableUndefined >= TMath::Abs(availableTraining - availableTesting)) {
            useForTraining = useForTesting = allAvailable/2;

            useForTraining = availableTraining;
            useForTesting  = availableTesting;
            if (availableTraining < availableTesting)
               useForTraining += availableUndefined;
               useForTesting  += availableUndefined;

         requestedTraining = useForTraining;
         requestedTesting  = useForTesting;
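      // No explicit request for either sample: if the undefined events can
      // compensate the imbalance between the pre-assigned training and
      // testing events, everything is split 50/50; otherwise all undefined
      // events are given to the smaller of the two samples.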
      else if (requestedTesting == 0) {
         useForTraining = TMath::Max(requestedTraining, availableTraining);
         if (allAvailable < useForTraining) {
            Log() << kFATAL << "More events requested for training ("
                  << requestedTraining << ") than available ("
                  << allAvailable << ")!" << Endl;
         useForTesting    = allAvailable - useForTraining;
         requestedTesting = useForTesting;

      else if (requestedTraining == 0) {
         useForTesting = TMath::Max(requestedTesting, availableTesting);
         if (allAvailable < useForTesting) {
            Log() << kFATAL << "More events requested for testing ("
                  << requestedTesting << ") than available ("
                  << allAvailable << ")!" << Endl;
         useForTraining    = allAvailable - useForTesting;
         requestedTraining = useForTraining;
         Int_t stillNeedForTraining = TMath::Max(requestedTraining - availableTraining, 0);
         Int_t stillNeedForTesting  = TMath::Max(requestedTesting  - availableTesting,  0);

         int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
         if (NFree < 0) NFree = 0;
         useForTraining = TMath::Max(requestedTraining, availableTraining) + NFree/2;
         useForTesting  = allAvailable - useForTraining;
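         // Undefined events left over once both requests are satisfied
         // ('NFree') are shared evenly: half go on top of the training
         // sample, the remainder ends up in the testing sample.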
      Log() << kDEBUG << "determined event sample size to select training sample from=" << useForTraining << Endl;
      Log() << kDEBUG << "determined event sample size to select test sample from="     << useForTesting  << Endl;
      if (splitMode == "ALTERNATE") {

         Int_t nTraining = availableTraining;
         Int_t nTesting  = availableTesting;
         for (EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ) {

            if (nTraining <= requestedTraining) {
               eventVectorTraining.insert( eventVectorTraining.end(), (*it) );

               eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
      Log() << kDEBUG << "availableUndefined : " << availableUndefined << Endl;
      Log() << kDEBUG << "useForTraining     : " << useForTraining     << Endl;
      Log() << kDEBUG << "useForTesting      : " << useForTesting      << Endl;
      Log() << kDEBUG << "availableTraining  : " << availableTraining  << Endl;
      Log() << kDEBUG << "availableTesting   : " << availableTesting   << Endl;

      if (availableUndefined < (useForTraining - availableTraining) ||
          availableUndefined < (useForTesting  - availableTesting ) ||
          availableUndefined < (useForTraining + useForTesting - availableTraining - availableTesting)) {
         Log() << kFATAL << "More events requested than available!" << Endl;
      if (useForTraining > availableTraining) {
         eventVectorTraining.insert( eventVectorTraining.end(),
                                     eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining - availableTraining );
         eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining - availableTraining );
      if (useForTesting > availableTesting) {
         eventVectorTesting.insert( eventVectorTesting.end(),
                                    eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTesting - availableTesting );
      eventVectorUndefined.clear();
      if (splitMode.Contains("RANDOM")) {
         UInt_t sizeTraining = eventVectorTraining.size();
         if (sizeTraining > UInt_t(requestedTraining)) {
            std::vector<UInt_t> indicesTraining( sizeTraining );

            std::random_shuffle( indicesTraining.begin(), indicesTraining.end(), rndm );

            indicesTraining.erase( indicesTraining.begin() + sizeTraining - UInt_t(requestedTraining), indicesTraining.end() );

            for (std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it) {
               delete eventVectorTraining.at(*it);
               eventVectorTraining.at(*it) = NULL;

            eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ),
                                       eventVectorTraining.end() );

         UInt_t sizeTesting = eventVectorTesting.size();
         if (sizeTesting > UInt_t(requestedTesting)) {
            std::vector<UInt_t> indicesTesting( sizeTesting );

            std::random_shuffle( indicesTesting.begin(), indicesTesting.end(), rndm );

            indicesTesting.erase( indicesTesting.begin() + sizeTesting - UInt_t(requestedTesting), indicesTesting.end() );

            for (std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it) {
               delete eventVectorTesting.at(*it);
               eventVectorTesting.at(*it) = NULL;

            eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ),
                                      eventVectorTesting.end() );
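      // Trimming idiom used above: shuffle an index vector, keep the first
      // (size - requested) indices, delete the events at those random
      // positions and null their slots, then compact the vector with the
      // erase(remove(...)) idiom -- a uniformly random subsample remains.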
         if (eventVectorTraining.size() < UInt_t(requestedTraining))
            Log() << kWARNING << "DataSetFactory: requested number of training samples larger than size of eventVectorTraining.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
         eventVectorTraining.erase( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end() );

         if (eventVectorTesting.size() < UInt_t(requestedTesting))
            Log() << kWARNING << "DataSetFactory: requested number of testing samples larger than size of eventVectorTesting.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
         eventVectorTesting.erase( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end() );
   Int_t trainingSize = 0;
   Int_t testingSize  = 0;

   trainingEventVector->reserve( trainingSize );
   testingEventVector->reserve( testingSize );
   if (mixMode == "ALTERNATE") {

         Log() << kINFO << "Training sample: You are trying to mix events in alternate mode although the classes have "
               << "different event numbers. This works, but the alternation stops at the last event of the smaller class." << Endl;

         Log() << kINFO << "Testing sample: You are trying to mix events in alternate mode although the classes have "
               << "different event numbers. This works, but the alternation stops at the last event of the smaller class." << Endl;
      typedef EventVector::iterator EvtVecIt;
      EvtVecIt itEvent, itEventEnd;

      Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;

      testingEventVector->insert( testingEventVector->end(),
                                  tmpEventVector[Types::kTesting].at(0).begin(),
                                  tmpEventVector[Types::kTesting].at(0).end() );

         itTarget = trainingEventVector->begin() - 1;

            if ((trainingEventVector->end() - itTarget) < Int_t(cls+1)) {
               itTarget = trainingEventVector->end();
               trainingEventVector->insert( itTarget, itEvent, itEventEnd );

               trainingEventVector->insert( itTarget, (*itEvent) );

         itTarget = testingEventVector->begin() - 1;

            if ((testingEventVector->end() - itTarget) < Int_t(cls+1)) {
               itTarget = testingEventVector->end();
               testingEventVector->insert( itTarget, itEvent, itEventEnd );

               testingEventVector->insert( itTarget, (*itEvent) );

      trainingEventVector->insert( trainingEventVector->end(),
                                   tmpEventVector[Types::kTraining].at(cls).begin(),
                                   tmpEventVector[Types::kTraining].at(cls).end() );
      testingEventVector->insert ( testingEventVector->end(),
                                   tmpEventVector[Types::kTesting].at(cls).begin(),
                                   tmpEventVector[Types::kTesting].at(cls).end() );
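   // For the non-alternate mix modes the per-class vectors are simply
   // concatenated; the class ordering this introduces is removed again by the
   // random shuffle below when MixMode is "Random".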
   if (mixMode == "RANDOM") {

      std::random_shuffle( trainingEventVector->begin(), trainingEventVector->end(), rndm );
      std::random_shuffle( testingEventVector->begin(),  testingEventVector->end(),  rndm );

   Log() << kDEBUG << "trainingEventVector " << trainingEventVector->size() << Endl;
   Log() << kDEBUG << "testingEventVector  " << testingEventVector->size()  << Endl;
   Log() << kINFO << "Create internal training tree" << Endl;

   Log() << kINFO << "Create internal testing tree" << Endl;
      Log() << kFATAL << "Dataset " << std::string(dsi.GetName())
            << " does not have any training events; stopping here so this can be fixed first" << Endl;

      Log() << kERROR << "Dataset " << std::string(dsi.GetName())
            << " does not have any testing events; this will likely cause problems later, but continuing for now" << Endl;
   Int_t trainingSize = 0;
   Int_t testingSize  = 0;

   Double_t trainingSumSignalWeights = 0;
   Double_t trainingSumBackgrWeights = 0;
   Double_t testingSumSignalWeights  = 0;
   Double_t testingSumBackgrWeights  = 0;

      trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
      testingSizePerClass.at(cls)  = tmpEventVector[Types::kTesting].at(cls).size();

      trainingSize += trainingSizePerClass.at(cls);
      testingSize  += testingSizePerClass.at(cls);

      trainingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(),

      testingSumWeightsPerClass.at(cls)  = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),

         trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
         testingSumSignalWeights  += testingSumWeightsPerClass.at(cls);

         trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
         testingSumBackgrWeights  += testingSumWeightsPerClass.at(cls);
   if (normMode == "NONE") {
      Log() << kINFO << "No weight renormalisation applied: use original global and event weights" << Endl;

   else if (normMode == "NUMEVENTS") {
      Log() << kINFO << "Weight renormalisation mode: \"NumEvents\": renormalises all event classes" << Endl;
      Log() << kINFO << " such that the effective (weighted) number of events in each class equals the respective" << Endl;
      Log() << kINFO << " number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
      Log() << kINFO << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
      Log() << kINFO << " ... (note that N_j is the sum of TRAINING events (nTrain_j...with j=Signal,Background.." << Endl;
      Log() << kINFO << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;

         renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls)) /
                                (trainingSumWeightsPerClass.at(cls));
   else if (normMode == "EQUALNUMEVENTS") {

      Log() << kINFO << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
      Log() << kINFO << " such that the effective (weighted) number of events in each class is the same" << Endl;
      Log() << kINFO << " (and equals the number of events (entries) given for class=0)" << Endl;
      Log() << kINFO << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
      Log() << kINFO << "... (note that N_j is the sum of TRAINING events" << Endl;
      Log() << kINFO << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;

      UInt_t referenceClass = 0;
         renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass)) /
                                (trainingSumWeightsPerClass.at(cls));
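   // Renormalisation factors, applied below to TRAINING weights only:
   //   NumEvents:       f_j = N_j   / Sum_i{w_i}   (per class j)
   //   EqualNumEvents:  f_j = N_ref / Sum_i{w_i}   (N_ref = training size of class 0)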
      Log() << kFATAL << "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
      Log() << kINFO << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)

         (*it)->SetWeight( (*it)->GetWeight() * renormFactor.at(cls) );
   Log() << kINFO << "Number of training and testing events after rescaling:" << Endl;
   Log() << kINFO << "------------------------------------------------------" << Endl;

   trainingSumSignalWeights = 0;
   trainingSumBackgrWeights = 0;
   testingSumSignalWeights  = 0;
   testingSumBackgrWeights  = 0;

      trainingSumWeightsPerClass.at(cls) = (std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(),

      testingSumWeightsPerClass.at(cls)  = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),

         trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
         testingSumSignalWeights  += testingSumWeightsPerClass.at(cls);

         trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
         testingSumBackgrWeights  += testingSumWeightsPerClass.at(cls);
      Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
            << "training events            : " << trainingSizePerClass.at(cls)
            << " (sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
            << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
      Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
            << "testing events             : " << testingSizePerClass.at(cls)
            << " (sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
            << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
      Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
            << "training and testing events: "
            << (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))
            << " (sum of weights: "
            << (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) << ")" << Endl;
      if (eventCounts[cls].nEvAfterCut < eventCounts[cls].nEvBeforeCut) {
         Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
               << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
               << eventCounts[cls].cutScaling() << Endl;