#include <unordered_map>

using std::make_pair;
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize",
                    "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");

   DeclareOptionRef(fNCuts, "nCuts",
                    "Number of grid points in variable range used in finding optimal cut in node splitting");

   DeclareOptionRef(fRandomisedTrees, "UseRandomisedTrees",
                    "Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");

   DeclareOptionRef(fUsePoissonNvars, "UsePoissonNvars",
                    "Interpret \"UseNvars\" not as fixed number but as mean of a Poisson distribution in each split with RandomisedTree option");

   DeclareOptionRef(fBaggedSampleFraction=.6, "BaggedSampleFraction",
                    "Relative size of bagged event sample to original size of the data sample (used whenever bagging is used, i.e. UseBaggedBoost, Bagging)");

   DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
                    "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");

   DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights", "NegWeightTreatment",
                    "How to treat events with negative weights in the BDT training (particularly the boosting): IgnoreInTraining; Boost with inverse boost weight; Pair events with negative and positive weights in the training sample and *annihilate* them (experimental!)");

   DeclareOptionRef(fHuberQuantile = 0.7, "HuberQuantile",
                    "In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");

   DeclareOptionRef(fPruneMethodS, "PruneMethod",
                    "Note: for BDTs use small trees (e.g. MaxDepth=3) and NoPruning. Pruning: method used for pruning (removal) of statistically insignificant branches");

   DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad",
                    "deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");

   DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction",
                    "deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE.");

   DeclareOptionRef(fUseNTrainEvents, "UseNTrainEvents",
                    "deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");

   DeclareOptionRef(fUseWeightedTrees=kTRUE, "UseWeightedTrees",
                    "Use weighted trees or simple average in classification from the forest");
   Log() << kFATAL << "<ProcessOptions> unknown Separation Index option " << fSepTypeS << " called" << Endl;

   Log() << kFATAL << "<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, "
         << fHuberQuantile << ", does not satisfy this criterion" << Endl;

   Log() << kFATAL << "<ProcessOptions> unknown PruneMethod " << fPruneMethodS << " option called" << Endl;

   Log() << kFATAL
         << "Sorry automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;

   Log() << kWARNING << "You have explicitly set ** nEventsMin = " << fMinNodeEvents << " ** the min absolute number \n"
         << "of events in a leaf node. This is DEPRECATED, please use the option \n"
         << "*MinNodeSize* giving the relative number as percentage of training \n"
         << "events instead. \n" << Endl;
   Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the option recommended \n"
         << "*MinNodeSize*" << Endl;

   Log() << kINFO << "the option *InverseBoostNegWeights* does not exist for BoostType=Grad --> change" << Endl;
   Log() << kINFO << "to new default for GradBoost *Pray*" << Endl;
   Log() << kDEBUG << "i.e. simply keep them as is, which should work fine for Grad Boost" << Endl;

   Log() << kWARNING << "You have chosen to use more than half of your training sample "
         << "to optimize the automatic pruning algorithm. This is probably wasteful "
         << "and your overall results will be degraded. Are you sure you want this?" << Endl;

   if (this->Data()->HasNegativeEventWeights()){
      Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
            << "That should in principle be fine as long as on average you end up with "
            << "something positive. For this you have to make sure that the minimal number "
            << "of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize=" << fMinNodeSizeS
            << ", (or the deprecated equivalent nEventsMin) you can set this via the "
            << "BDT option string when booking the "
            << "classifier) is large enough to allow for reasonable averaging!!! "
            << " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
            << "which ignores events with negative weight in the training. " << Endl
            << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
   }

   Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;

   Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;

   Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;

   Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
   Log() << kWARNING << "is not implemented for regression analysis ! " << Endl;
   Log() << kWARNING << "--> I switch to default nCuts = 20 and use standard node splitting" << Endl;

   Log() << kINFO << " Randomised trees use no pruning" << Endl;

   Log() << kWARNING << "When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
   Log() << " a more elaborate node splitting algorithm) is not implemented. " << Endl;

   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash" << Endl;

   Log() << kFATAL << "<ProcessOptions> unknown option for treating negative event weights during training "
         << fNegWeightTreatment << " requested" << Endl;

   Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " << Endl;

   Log() << kWARNING << "You have specified a deprecated option *NNodesMax=" << fNNodesMax
         << "* \n this has been translated to MaxDepth=" << fMaxDepth << Endl;

   Log() << kWARNING << "You have specified a deprecated option *UseNTrainEvents=" << fUseNTrainEvents
         << "* \n this has been translated to BaggedSampleFraction=" << fBaggedSampleFraction << "(%)" << Endl;

   Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;

   if (sizeInPercent > 0 && sizeInPercent < 50){
      fMinNodeSize = sizeInPercent;
   }
   else {
      Log() << kFATAL << "you have demanded a minimal node size of " << sizeInPercent
            << "% of the training events.. \n"
            << " that somehow does not make sense " << Endl;
   }

   Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
         << "after removing a possible % sign now reads " << sizeInPercent << Endl;

   Log() << kDEBUG << " successfully(?) reset the method " << Endl;
   std::vector<const TMVA::Event*> tmpEventSample;
   for (Long64_t ievt=0; ievt<nevents; ievt++) {
      tmpEventSample.push_back(event);
   }
   for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];

   for (Long64_t ievt=0; ievt<nevents; ievt++) {
      if (firstNegWeight) {
         Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
      }
      else if (event->GetWeight()==0){
         if (firstZeroWeight) {
            Log() << "Events with weight == 0 are going to be simply ignored " << Endl;
         }
      }
      if (event->GetWeight() < 0) {
         Log() << kWARNING << "Events with negative event weights are found and "
               << " will be removed prior to the actual BDT training by global "
               << " pairing (and subsequent annihilation) with positive weight events" << Endl;
         Log() << kWARNING << "Events with negative event weights are USED during "
               << "the BDT training. This might cause problems with small node sizes "
               << "or with the boosting. Please remove negative events from training "
               << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
               << "observe problems with the boosting" << Endl;
      }
   }

   Int_t imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );

   Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
         // ...
         << "% of training used for validation)" << Endl;

   Log() << kDEBUG << "\t<InitEventSample> For classification trees, " << Endl;
   Log() << kDEBUG << " \tthe effective number of backgrounds is scaled to match " << Endl;
   Log() << kDEBUG << " \tthe signal. Otherwise the first boosting step would do 'just that'!" << Endl;

   Int_t sumSig=0, sumBkg=0;

   if (sumSigW && sumBkgW){
      Log() << kDEBUG << "\tre-normalise events such that Sig and Bkg have respective sum of weights = "
            // ...
            << Endl;
      Log() << kDEBUG << " \tsig->sig*" << normSig << "ev. bkg->bkg*" << normBkg << "ev." << Endl;
      Log() << kHEADER << "#events: (reweighted) sig: " << sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
      Log() << kINFO << "#events: (unweighted) sig: " << sumSig << " bkg: " << sumBkg << Endl;
      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         // ...
      }
   }
   else {
      Log() << kINFO << "--> could not determine scaling factors as either there are " << Endl;
      Log() << kINFO << " no signal events (sumSigW=" << sumSigW << ") or no bkg ev. (sumBkgW=" << sumBkgW << ")" << Endl;
   }
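// Note on the renormalisation block above: only the relative sums of signal and background
// weights used as the starting point of the boosting are rescaled (normSig/normBkg); the shape
// of the individual event weights is untouched, and the unweighted counts (sumSig, sumBkg) are
// reported separately. The fSigToBkgFraction option, if set, is presumably what steers the
// target ratio of the two sums.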
   std::vector<const Event*> negEvents;

   if (totalNegWeights == 0 ) {
      Log() << kINFO << "no negative event weights found .. no preprocessing necessary" << Endl;
   }

   Log() << kINFO << "found a total of " << totalNegWeights
         << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
   Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
   Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;

   for (Int_t i=0; i<2; i++){
      invCov = ((*cov)[i]);
      std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
                // ...
                << " did you use the variables that are linear combinations or highly correlated?"
                << std::endl;
      std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
                // ...
                << " did you use the variables that are linear combinations?"
                << std::endl;
   }

   Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of "
         << fEventSample.size() << " training events " << Endl;
   Timer timer(negEvents.size(),"Negative Event paired");
   for (UInt_t nev = 0; nev < negEvents.size(); nev++){
      Double_t weight = negEvents[nev]->GetWeight();
      UInt_t iClassID = negEvents[nev]->GetClass();
      invCov = ((*cov)[iClassID]);

      dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
              (*invCov)[ivar][jvar]*
              (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));

      if (dist < minDist) { iMin=iev; minDist=dist;}

      negEvents[nev]->SetBoostWeight( 0 );
      negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
      }
      else Log() << kFATAL << "preprocessing didn't find event to pair with the negative weight ... probably a bug" << Endl;
      weight = negEvents[nev]->GetWeight();
   }
   Log() << kINFO << "<Negative Event Pairing> took: " << timer.GetElapsedTime() << Endl;

   totalNegWeights = 0;
   totalPosWeights = 0;

   std::vector<const Event*> newEventSample;

   if (totalNegWeights < 0) Log() << kFATAL
      << " compensation of negative event weights with positive ones did not work " << totalNegWeights << Endl;

   Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size()
         << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
   Log() << kINFO << " nSig=" << nSig << " sigWeight=" << sigWeight
         << " nBkg=" << nBkg << " bkgWeight=" << bkgWeight << Endl;
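// The pairing loop above measures the similarity between a negative-weight event and positive-
// weight candidates with a Mahalanobis distance built from the inverted class covariance matrix.
// A self-contained sketch of that metric (the helper name and the plain std::vector interface
// are illustrative assumptions, not part of this class):
//
//    #include <vector>
//
//    // d^2 = sum_ij (x_i - y_i) * invCov_ij * (x_j - y_j)
//    double MahalanobisDist2(const std::vector<double>& x, const std::vector<double>& y,
//                            const std::vector<std::vector<double>>& invCov)
//    {
//       double dist = 0;
//       for (size_t i = 0; i < x.size(); ++i)
//          for (size_t j = 0; j < x.size(); ++j)
//             dist += (x[i] - y[i]) * invCov[i][j] * (x[j] - y[j]);
//       return dist;
//    }
//
// The negative-weight event is then "annihilated" against its closest positive-weight neighbour
// (SetBoostWeight(0) for the negative event, a compensating new weight for its partner).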
   std::map<TString,TMVA::Interval*> tuneParameters;
   std::map<TString,Double_t> tunedParameters;

   tuneParameters.insert(std::pair<TString,Interval*>("NTrees",       new Interval(10,1000,5)));
   tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth",     new Interval(2,4,3)));
   tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize",  new LogInterval(1,30,30)));

   tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));

   tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage",    new Interval(0.05,0.50,5)));

   tuneParameters.insert(std::pair<TString,Interval*>("UseNvars",     new Interval(min_var,max_var,4)));

   Log() << kINFO << " the following BDT parameters will be tuned on the respective *grid*\n" << Endl;
   std::map<TString,TMVA::Interval*>::iterator it;
   for (it=tuneParameters.begin(); it!=tuneParameters.end(); ++it) {
      Log() << kWARNING << it->first << Endl;
      std::ostringstream oss;
      (it->second)->Print(oss);
   }

   tunedParameters = optimize.optimize();

   return tunedParameters;

   std::map<TString,Double_t>::iterator it;
   for (it=tuneParameters.begin(); it!=tuneParameters.end(); ++it) {
      Log() << kWARNING << it->first << " = " << it->second << Endl;
      // ...
      else if (it->first == "MinNodeSize") SetMinNodeSize (it->second);
      // ...
      else if (it->first == "Shrinkage"  ) SetShrinkage   (it->second);
      else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " << Endl;
   }
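// A hedged usage sketch for the tuning entry point above (the factory call around it is an
// illustrative assumption; the argument defaults follow the declared signature
// OptimizeTuningParameters(fomType="ROCIntegral", fitType="FitGA")):
//
//    TMVA::MethodBDT* bdt = dynamic_cast<TMVA::MethodBDT*>(
//       factory.BookMethod(dataloader, TMVA::Types::kBDT, "BDT", "NTrees=500"));
//    std::map<TString, Double_t> tuned = bdt->OptimizeTuningParameters("ROCIntegral", "FitGA");
//    bdt->SetTuneParameters(tuned);   // apply the best grid point before training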
   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash" << Endl;

   std::vector<TString> titles = {"Boost weight", "Error Fraction"};

   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\"" << Endl;

   Log() << kINFO << "Training " << fNTrees << " Decision Trees ... patience please" << Endl;

   Log() << kDEBUG << "Training with maximal depth = " << fMaxDepth
         // ...
         << Endl;

   TString hname = "AdaBoost weight distribution";

   hname = "Boost event weights distribution";

   results->Store(h, "BoostWeights");

   TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
   boostMonitor->SetYTitle("ROC Integral");
   results->Store(boostMonitor, "BoostMonitor");

   boostMonitorGraph->SetName("BoostMonitorGraph");
   boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
   results->Store(boostMonitorGraph, "BoostMonitorGraph");

   results->Store(h, "BoostWeightsVsTree");

   results->Store(h, "ErrorFrac");

   nodesBeforePruningVsTree->SetXTitle("#tree");
   nodesBeforePruningVsTree->SetYTitle("#tree nodes");
   results->Store(nodesBeforePruningVsTree);

   nodesAfterPruningVsTree->SetXTitle("#tree");
   nodesAfterPruningVsTree->SetYTitle("#tree nodes");
   results->Store(nodesAfterPruningVsTree);

   Int_t nNodesBeforePruningCount = 0;
   Int_t nNodesAfterPruningCount  = 0;

   Int_t nNodesBeforePruning = 0;
   Int_t nNodesAfterPruning  = 0;
   while (itree < fNTrees && continueBoost){

      Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
            << "Please change boost option accordingly (GradBoost)." << Endl;

      for (UInt_t i=0;i<nClasses;i++){
         fForest.back()->SetUseFisherCuts();
      }
      Log() << kWARNING << "stopped boosting at itree=" << itree << Endl;

      fForest.back()->SetUseFisherCuts();
      nNodesBeforePruning = fForest.back()->CleanTree();
      nNodesBeforePruningCount += nNodesBeforePruning;
      nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);

      std::vector<const Event*> * validationSample = NULL;

      Log() << kWARNING << "stopped boosting at itree=" << itree << Endl;

      nNodesAfterPruning = fForest.back()->GetNNodes();
      nNodesAfterPruningCount += nNodesAfterPruning;
      nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);

      if ( itree==fNTrees-1 || (!(itree%500)) ||
           (!(itree%250) && itree < 1000) ||
           (!(itree%100) && itree < 500)  ||
           (!(itree%50)  && itree < 250)  ||
           (!(itree%25)  && itree < 150)  ||
           (!(itree%10)  && itree < 50)   ||
           (!(itree%5)   && itree < 20)
           ) BoostMonitor(itree);
   }

   Log() << kDEBUG << "\t<Train> average number of nodes (w/o pruning) : "
         // ...
         << Endl;
   Log() << kDEBUG << "\t<Train> average number of nodes before/after pruning : "
         << nNodesBeforePruningCount/GetNTrees() << " / "
         // ...
         << Endl;

   Log() << kDEBUG << "Now I delete the private data sample" << Endl;

   for (UInt_t itree=0; itree<nTrees; itree++) {
      // ...
   }
   return 2.0/(1.0+exp(-2.0*sum))-1;
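// GetGradBoostMVA() maps the raw additive score F(x) of the gradient-boosted forest onto [-1,1]
// via 2/(1+exp(-2F)) - 1, which is algebraically identical to tanh(F). A standalone sketch of
// that squashing (hypothetical helper, not part of this class):
//
//    #include <cmath>
//    inline double SquashGradBoostScore(double rawSum) {
//       return 2.0 / (1.0 + std::exp(-2.0 * rawSum)) - 1.0;   // == std::tanh(rawSum)
//    }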
   std::vector<Double_t> expCache;
   if (cls == nClasses - 1) {
      expCache.resize(nClasses);
   }
   for (auto e : eventSample) {
      if (cls == nClasses - 1) {
         std::transform(residualsThisEvent.begin(),
                        residualsThisEvent.begin() + nClasses,
                        expCache.begin(), [](Double_t d) { return exp(d); });
         for (UInt_t i = 0; i < nClasses; i++) {
            for (UInt_t j = 0; j < nClasses; j++) {
               norm += expCache[j] / expCache[i];
            }
            Double_t p_cls = 1.0 / (1.0 + norm);
            Double_t res = (e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
         }
      }
   }

   for (auto e : eventSample) {
      Double_t p_sig = 1.0 / (1.0 + exp(-2.0 * residualAt0));
   }

#ifdef R__USE_IMT // multithreaded version if ROOT was compiled with multithreading
   auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0) -> Int_t{
      Int_t start = 1.0*partition/nPartitions*eventSample.size();
      Int_t end   = (partition+1.0)/nPartitions*eventSample.size();
      for(Int_t i=start; i<end; ++i)
         // ...
   };
#else // ROOT was not compiled with multithreading, use standard version
#endif

   std::unordered_map<TMVA::DecisionTreeNode*, LeafInfo> leaves;
   for (auto e : eventSample) {
      auto& v = leaves[node];
      auto target = e->GetTarget(cls);
      v.sumWeightTarget += target * weight;
      v.sum2 += fabs(target) * (1.0 - fabs(target)) * weight;
   }
   for (auto& iLeave : leaves) {
      constexpr auto minValue = 1e-30;
      if (iLeave.second.sum2 < minValue) {
         iLeave.second.sum2 = minValue;
      }
      iLeave.first->SetResponse(fShrinkage * (K - 1) / K * iLeave.second.sumWeightTarget / iLeave.second.sum2);
   }
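// The leaf response set above is the (shrunk) Newton step of Friedman's multiclass gradient
// boost: for K classes the leaf value is
//
//    gamma = fShrinkage * (K-1)/K * sum_i w_i r_i / sum_i w_i |r_i| (1 - |r_i|)
//
// where the per-event residuals r_i (the current targets) are accumulated in sumWeightTarget,
// the denominator in sum2, and sum2 is floored at 1e-30 to avoid division by zero.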
   std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      // ...
   }

   for (std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
        iLeave!=leaves.end(); ++iLeave) {
      (iLeave->first)->SetResponse(fShrinkage*fit);
   }

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      // ...
   }

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      for (UInt_t i=0;i<nClasses;i++){
         Double_t r = (*e)->GetClass()==i ? (1-1.0/nClasses) : (-1.0/nClasses);
      }
   }

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      // ...
   }

   return ncorrect / (ncorrect + nfalse);

   returnVal = this->GradBoost (eventSample, dt, cls);

   returnVal = this->GradBoost (eventSample, dt);

   Log() << kFATAL << "<Boost> unknown boost option " << fBoostType << " called" << Endl;
   TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
   TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );

   for (UInt_t iev=0; iev < nevents; iev++){
      if (event->GetClass() == signalClassNr) {tmp=tmpS;}
   }

   std::vector<TH1F*> hS;
   std::vector<TH1F*> hB;
   hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,
                         DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
   hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,
                         DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
   results->Store(hS.back(),hS.back()->GetTitle());
   results->Store(hB.back(),hB.back()->GetTitle());

   TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
   TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
   results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
   results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());

   TH1F *tmpBoostWeights;
   std::vector<TH1F*> *h;

   tmpBoostWeights=tmpBoostWeightsS;

   tmpBoostWeights=tmpBoostWeightsB;
   Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      UInt_t iclass=(*e)->GetClass();

      sumGlobalwfalse  += w * tmpDev;
      sumGlobalwfalse2 += w * tmpDev*tmpDev;
      if (tmpDev > maxDev) maxDev = tmpDev;

      if (!(isSignalType == DataInfo().IsSignal(*e))) {
         sumGlobalwfalse += w;
      }

      sumGlobalwfalse += w*trueType*dtoutput;
   }

   err = sumGlobalwfalse/sumGlobalw ;

   err = sumGlobalwfalse/maxDev/sumGlobalw ;

   err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;
   }

   Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
         << "and this is not implemented... a typo in the options ??" << Endl;

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;

   std::vector<Double_t> newSumw(sumw.size(),0);

   Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
         << "boost such a thing... if after 1 step the error rate is == 0.5"
         // ...
         << "please check why this happens, maybe too many events per node requested ?" << Endl;
   Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
         << ") That should not happen, please check your code (i.e... the BDT code), I "
         << " stop boosting here" << Endl;
   }
   else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
   }

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw
         << " 1-err/err=" << boostWeight << " log.." << TMath::Log(boostWeight) << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {

      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
      }
      else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
      }
      else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      newSumGlobalw += (*e)->GetWeight();
      newSumw[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Log() << kDEBUG << "new Nsig=" << newSumw[0]*globalNormWeight
         << " new Nbkg=" << newSumw[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      // ...
      else (*e)->ScaleBoostWeight( globalNormWeight );
   }
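// Summary of the AdaBoost update above: with the weighted misclassification rate err, the tree
// is assigned the weight log((1-err)/err) (the "boostWeight" printed in the debug line), and
// event boost weights are rescaled by a boostfactor derived from (1-err)/err (raised to
// AdaBoostBeta) before the whole sample is renormalised back to its previous sum of weights.
// As a rough worked example, err = 0.25 gives (1-err)/err = 3, so with AdaBoostBeta = 1 a
// misclassified event becomes about three times heavier for the next tree (before the global
// renormalisation).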
   Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      sumGlobalWeights += w;
      UInt_t iclass=(*e)->GetClass();

      Log() << kFATAL << " AdaCost not implemented for regression" << Endl;

      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if      (isTrueSignal  && isSelectedSignal)  cost=Css;
      else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
      else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      sumGlobalCost += w*trueType*dtoutput*cost;
   }

   Log() << kFATAL << " AdaCost not implemented for regression" << Endl;

   sumGlobalCost /= sumGlobalWeights;

   vector<Double_t> newSumClassWeights(sumw.size(),0);

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if      (isTrueSignal  && isSelectedSignal)  cost=Css;
      else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
      else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
      }

      newSumGlobalWeights += (*e)->GetWeight();
      newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Double_t globalNormWeight = Double_t(eventSample.size())/newSumGlobalWeights;
   Log() << kDEBUG << "new Nsig=" << newSumClassWeights[0]*globalNormWeight
         << " new Nbkg=" << newSumClassWeights[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      // ...
      else (*e)->ScaleBoostWeight( globalNormWeight );
   }
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
   if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;

   Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      sumwfalse  += w * tmpDev;
      sumwfalse2 += w * tmpDev*tmpDev;
      if (tmpDev > maxDev) maxDev = tmpDev;
   }

   err = sumwfalse/maxDev/sumw ;

   err = sumwfalse2/maxDev/maxDev/sumw ;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
   }

   Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
         << "and this is not implemented... a typo in the options ??" << Endl;

   Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
         << "boost such a thing... if after 1 step the error rate is == 0.5"
         // ...
         << "please check why this happens, maybe too many events per node requested ?" << Endl;
   Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
         << ") That should not happen, but is possible for regression trees, and"
         << " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
         << " stop boosting " << Endl;
   }
   else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
   }

   Double_t boostWeight = err / (1.-err);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      results->GetHist("BoostWeights")->Fill(boostfactor);

      if ( (*e)->GetWeight() > 0 ){
         Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
         Float_t newWeight      = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
         if (newWeight == 0) {
            Log() << kINFO << "Weight=        " << (*e)->GetWeight() << Endl;
            Log() << kINFO << "BoostWeight=   " << (*e)->GetBoostWeight() << Endl;
            Log() << kINFO << "boostweight="   << boostWeight << " err= " << err << Endl;
            Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
            Log() << kINFO << "boostfactor= " << boostfactor << Endl;
            Log() << kINFO << "maxDev     = " << maxDev << Endl;
            Log() << kINFO << "target     = " << (*e)->GetTarget(0) << Endl;
         }
         (*e)->SetBoostWeight( newBoostWeight );
      }
      else {
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
      }
      newSumw += (*e)->GetWeight();
   }

   Double_t normWeight = sumw / newSumw;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); ++e) {
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
   }
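// AdaBoostR2 (Drucker 1997) summary for the block above: the per-tree quantity
// boostWeight = err/(1-err) is formed from the deviation-based weighted error err, each event's
// boost weight is rescaled by a boostfactor derived from boostWeight and the event's own
// deviation, and the sample is then renormalised with normWeight = sumw/newSumw so that the
// total training weight stays constant between boosting steps.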
   void* trxml = fForest[i]->AddXMLTo(wght);

   if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {
      // ...
   }

   if (gTools().HasAttr(parent, "TreeType")) {
      // ...
   }

   fForest.back()->SetTreeID(i++);

   Int_t analysisType(0);

   Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;

   istr >> dummy >> iTree >> dummy >> boostWeight;
   fForest.back()->Print( std::cout );
   Log() << kFATAL << "Error while reading weight file; mismatch iTree="
         << iTree << " i=" << i
         << " dummy " << dummy
         << " boostweight " << boostWeight
         << Endl;
   if (useNTrees > 0 ) nTrees = useNTrees;

   for (UInt_t itree=0; itree<nTrees; itree++) {
      // ...
   }

   std::vector<Double_t> temp(nClasses);
   auto forestSize = fForest.size();

   for (UInt_t itree = 0; itree < forestSize; ++itree) {
      if (++classOfTree == nClasses) classOfTree = 0;
   }

   std::transform(temp.begin(), temp.end(), temp.begin(), [](Double_t d){return exp(d);});

   for (UInt_t iClass=0; iClass<nClasses; iClass++){
      for (UInt_t j=0; j<nClasses; j++){
         if (iClass != j) norm += temp[j] / temp[iClass];
      }
      (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
   }
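// The normalisation above is an algebraically equivalent form of the softmax: with
// norm = sum_{j != c} exp(F_j)/exp(F_c), the stored value 1/(1+norm) equals
// exp(F_c) / sum_j exp(F_j), i.e. the per-class probability assigned by the multiclass
// gradient-boosted forest.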
   vector< Double_t > response( fForest.size() );
   vector< Double_t > weight  ( fForest.size() );

   std::vector< std::vector<Double_t> > vtemp;
   vtemp.push_back( response );
   vtemp.push_back( weight );

   while (sumOfWeights <= totalSumOfWeights/2.) {
      sumOfWeights += vtemp[1][t];
   }

   for (UInt_t i=0; i< relativeImportance.size(); i++) {
      // ...
   }

   if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
   else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
   Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
   Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
   Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
   Log() << "trained using the original training data set with re-weighted " << Endl;
   Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
   Log() << "events that were misclassified in the previous tree a larger " << Endl;
   Log() << "weight in the training of the following tree." << Endl;
   Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
   Log() << "using a single discriminant variable at a time. A test event " << Endl;
   Log() << "ending up after the sequence of left-right splits in a final " << Endl;
   Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
   Log() << "depending on the majority type of training events in that node." << Endl;
   Log() << "By the nature of the binary splits performed on the individual" << Endl;
   Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
   Log() << "between variables (they need to approximate the linear split in" << Endl;
   Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
   Log() << "variables individually). Hence decorrelation could be useful " << Endl;
   Log() << "to optimise the BDT performance." << Endl;
   Log() << "The two most important parameters in the configuration are the " << Endl;
   Log() << "minimal number of events requested by a leaf node as percentage of the " << Endl;
   Log() << "   number of training events (option \"MinNodeSize\", replacing the actual number " << Endl;
   Log() << "   of events \"nEventsMin\" as given in earlier versions)." << Endl;
   Log() << "If this number is too large, detailed features " << Endl;
   Log() << "in the parameter space are hard to model. If it is too small, " << Endl;
   Log() << "the risk of overtraining rises and boosting seems to be less effective." << Endl;
   Log() << "   Typical values from our current experience for best performance " << Endl;
   Log() << "   are between 0.5(%) and 10(%). " << Endl;
   Log() << "The default minimal number is currently set to " << Endl;
   Log() << "   max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
   Log() << "and can be changed by the user." << Endl;
   Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
   Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
   Log() << "that is used when determining after the training which splits " << Endl;
   Log() << "are considered statistically insignificant and are removed. The" << Endl;
   Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
   Log() << "the comparison between efficiencies obtained on the training and" << Endl;
   Log() << "the independent test sample. They should be equal within statistical" << Endl;
   Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
   fout << "   std::vector<"<<nodeName<<"*> fForest;       // i.e. root nodes of decision trees" << std::endl;
   fout << "   std::vector<double>  fBoostWeights; // the weights applied in the individual boosts" << std::endl;
   fout << "};" << std::endl << std::endl;
   fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
   fout << "{" << std::endl;
   fout << "   double myMVA = 0;" << std::endl;
   fout << "   if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar]  << ") return -1;  // is background preselection cut" << std::endl;
   fout << "   if (inputValues["<<ivar<<"] < " << fLowSigCut[ivar]  << ") return  1;  // is signal preselection cut" << std::endl;
   fout << "   if (inputValues["<<ivar<<"] > " << fHighBkgCut[ivar] << ") return -1;  // is background preselection cut" << std::endl;
   fout << "   if (inputValues["<<ivar<<"] > " << fHighSigCut[ivar] << ") return  1;  // is signal preselection cut" << std::endl;
   fout << "   double norm  = 0;" << std::endl;
   fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
   fout << "      "<<nodeName<<" *current = fForest[itree];" << std::endl;
   fout << "      while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
   fout << "         if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
   fout << "         else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
   fout << "      }" << std::endl;
   fout << "      myMVA += current->GetResponse();" << std::endl;
   if (fUseYesNoLeaf) fout << "      myMVA += fBoostWeights[itree] *  current->GetNodeType();" << std::endl;
   else               fout << "      myMVA += fBoostWeights[itree] *  current->GetPurity();" << std::endl;
   fout << "      norm  += fBoostWeights[itree];" << std::endl;
   fout << "   }" << std::endl;
   fout << "   return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
   else fout << "   return myMVA /= norm;" << std::endl;
   fout << "};" << std::endl << std::endl;
   fout << "void " << className << "::Initialize()" << std::endl;
   fout << "{" << std::endl;
   fout << "  // itree = " << itree << std::endl;
   fout << "  fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
   fout << "  fForest.push_back( " << std::endl;
   fout << "   );" << std::endl;
   fout << "   return;" << std::endl;
   fout << "};" << std::endl;
   fout << " " << std::endl;
   fout << "// Clean up" << std::endl;
   fout << "inline void " << className << "::Clear() " << std::endl;
   fout << "{" << std::endl;
   fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
   fout << "      delete fForest[itree]; " << std::endl;
   fout << "   }" << std::endl;
   fout << "}" << std::endl;
   fout << "#define NN new "<<nodeName << std::endl;
   fout << "   " << std::endl;
   fout << "#ifndef "<<nodeName<<"__def" << std::endl;
   fout << "#define "<<nodeName<<"__def" << std::endl;
   fout << "   " << std::endl;
   fout << "class "<<nodeName<<" {" << std::endl;
   fout << "   " << std::endl;
   fout << "public:" << std::endl;
   fout << "   " << std::endl;
   fout << "   // constructor of an essentially \"empty\" node floating in space" << std::endl;
   fout << "   "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;
   fout << "                          int nFisherCoeff," << std::endl;
   fout << "                          double fisherCoeff"<<i<<"," << std::endl;
   fout << "                          int selector, double cutValue, bool cutType, " << std::endl;
   fout << "                          int nodeType, double purity, double response ) :" << std::endl;
   fout << "   fLeft         ( left         )," << std::endl;
   fout << "   fRight        ( right        )," << std::endl;
   if (fUseFisherCuts) fout << "   fNFisherCoeff ( nFisherCoeff )," << std::endl;
   fout << "   fSelector     ( selector     )," << std::endl;
   fout << "   fCutValue     ( cutValue     )," << std::endl;
   fout << "   fCutType      ( cutType      )," << std::endl;
   fout << "   fNodeType     ( nodeType     )," << std::endl;
   fout << "   fPurity       ( purity       )," << std::endl;
   fout << "   fResponse     ( response     ){" << std::endl;
   fout << "     fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;
   fout << "   }" << std::endl << std::endl;
   fout << "   virtual ~"<<nodeName<<"();" << std::endl << std::endl;
   fout << "   // test event if it descends the tree at this node to the right" << std::endl;
   fout << "   virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
   fout << "   "<<nodeName<<"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
   fout << "   // test event if it descends the tree at this node to the left " << std::endl;
   fout << "   virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
   fout << "   "<<nodeName<<"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
   fout << "   // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
   fout << "   double GetPurity( void ) const { return fPurity; } " << std::endl;
   fout << "   // return the node type" << std::endl;
   fout << "   int    GetNodeType( void ) const { return fNodeType; }" << std::endl;
   fout << "   double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
   fout << "private:" << std::endl << std::endl;
   fout << "   "<<nodeName<<"*   fLeft;     // pointer to the left daughter node" << std::endl;
   fout << "   "<<nodeName<<"*   fRight;    // pointer to the right daughter node" << std::endl;
   fout << "   int                     fNFisherCoeff; // =0 if this node doesn't use fisher, else =nvar+1 " << std::endl;
   fout << "   std::vector<double>     fFisherCoeff;  // the fisher coeff (offset at the last element)" << std::endl;
   fout << "   int                     fSelector;     // index of variable used in node selection (decision tree) " << std::endl;
   fout << "   double                  fCutValue;     // cut value applied on this node to discriminate bkg against sig" << std::endl;
   fout << "   bool                    fCutType;      // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
   fout << "   int                     fNodeType;     // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
   fout << "   double                  fPurity;       // Purity of node from training" << std::endl;
   fout << "   double                  fResponse;     // Regression response value of node" << std::endl;
   fout << "}; " << std::endl;
   fout << "   " << std::endl;
   fout << "//_______________________________________________________________________" << std::endl;
   fout << "   "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
   fout << "{" << std::endl;
   fout << "   if (fLeft  != NULL) delete fLeft;" << std::endl;
   fout << "   if (fRight != NULL) delete fRight;" << std::endl;
   fout << "}; " << std::endl;
   fout << "   " << std::endl;
   fout << "//_______________________________________________________________________" << std::endl;
   fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
   fout << "{" << std::endl;
   fout << "   // test event if it descends the tree at this node to the right" << std::endl;
   fout << "   bool result;" << std::endl;
   fout << "     if (fNFisherCoeff == 0){" << std::endl;
   fout << "       result = (inputValues[fSelector] > fCutValue );" << std::endl;
   fout << "     }else{" << std::endl;
   fout << "       double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
   fout << "       for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
   fout << "         fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
   fout << "       result = fisher > fCutValue;" << std::endl;
   fout << "     }" << std::endl;
   fout << "     result = (inputValues[fSelector] > fCutValue );" << std::endl;
   fout << "   if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
   fout << "   else return !result;" << std::endl;
   fout << "}" << std::endl;
   fout << "   " << std::endl;
   fout << "//_______________________________________________________________________" << std::endl;
   fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
   fout << "{" << std::endl;
   fout << "   // test event if it descends the tree at this node to the left" << std::endl;
   fout << "   if (!this->GoesRight(inputValues)) return true;" << std::endl;
   fout << "   else return false;" << std::endl;
   fout << "}" << std::endl;
   fout << "   " << std::endl;
   fout << "#endif" << std::endl;
   fout << "   " << std::endl;
   Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;

   fout << "NN(" << std::endl;

   fout << ", " << std::endl;

   fout << ", " << std::endl
        << std::setprecision(6);
   Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;

   std::vector<TMVA::BDTEventWrapper> bdtEventSample;

   for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
      nTotS += (*it)->GetWeight();
      nTotB += (*it)->GetWeight();
   }

   std::sort( bdtEventSample.begin(),bdtEventSample.end() );

   Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
   std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
   for( ; it != it_end; ++it ) {
      sigWeightCtr += (**it)->GetWeight();
      bkgWeightCtr += (**it)->GetWeight();
      it->SetCumulativeWeight(false,bkgWeightCtr);
      it->SetCumulativeWeight(true,sigWeightCtr);
   }

   Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
   Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;

   for(UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
      nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
      nSelB = bdtEventSample[iev].GetCumulativeWeight(false);
      tmpEffS=nSelS/nTotS;
      tmpEffB=nSelB/nTotB;
      // ...
      else if (nSelB==0 && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE;}
   }

   Log() << kDEBUG << " \tfound and suggest the following possible pre-selection cuts " << Endl;
   if (fDoPreselection) Log() << kDEBUG << "\tthe training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" << Endl;
   else Log() << kDEBUG << "\tas option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample" << Endl;

   Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " < " << fLowBkgCut[ivar]  << Endl;
   Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " < " << fLowSigCut[ivar]  << Endl;
   Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " > " << fHighBkgCut[ivar] << Endl;
   Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " > " << fHighSigCut[ivar] << Endl;
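// When the DoPreselection option is enabled, ApplyPreselectionCuts() later short-circuits the
// MVA evaluation: events falling beyond fLowSigCut/fHighSigCut are returned as +1 and events
// beyond fLowBkgCut/fHighBkgCut as -1 before any decision tree is consulted, matching the log
// messages above.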