using std::make_pair;
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize",
                    "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");
   DeclareOptionRef(fNCuts, "nCuts",
                    "Number of grid points in variable range used in finding optimal cut in node splitting");
   DeclareOptionRef(fRandomisedTrees, "UseRandomisedTrees",
                    "Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
   DeclareOptionRef(fUsePoissonNvars, "UsePoissonNvars",
                    "Interpret \"UseNvars\" not as a fixed number but as the mean of a Poisson distribution in each split with the RandomisedTree option");
   DeclareOptionRef(fBaggedSampleFraction=.6, "BaggedSampleFraction",
                    "Relative size of the bagged event sample to the original size of the data sample (used whenever bagging is used, i.e. UseBaggedBoost, Bagging)");

                    "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");

   DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights", "NegWeightTreatment",
                    "How to treat events with negative weights in the BDT training (in particular the boosting): IgnoreInTraining; Boost with inverse boost weight; Pair events with negative and positive weights in the training sample and *annihilate* them (experimental!)");
   DeclareOptionRef(fHuberQuantile = 0.7, "HuberQuantile",
                    "In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");
   DeclareOptionRef(fPruneMethodS, "PruneMethod",
                    "Note: for BDTs use small trees (e.g. MaxDepth=3) and NoPruning.  Pruning: method used for pruning (removal) of statistically insignificant branches");
   DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad",
                    "deprecated: use *UseBaggedBoost* instead: use only a random subsample of all events for growing the trees in each iteration.");
   DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction",
                    "deprecated: use *BaggedSampleFraction* instead: defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE.");
   DeclareOptionRef(fUseNTrainEvents, "UseNTrainEvents",
                    "deprecated: use *BaggedSampleFraction* instead: number of randomly picked training events used in randomised (and bagged) trees");

                    "Use weighted trees or simple average in classification from the forest");
   Log() << kFATAL << "<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, "
         << fHuberQuantile << ", does not match this criterion" << Endl;

         << "Sorry, automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;

         << "of events in a leaf node. This is DEPRECATED, please use the option \n"
         << "*MinNodeSize* giving the relative number as percentage of training \n"
         << "events instead. \n"
   Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the recommended option \n"
   Log() << kINFO << "the option *InverseBoostNegWeights* does not exist for BoostType=Grad --> change" << Endl;
   Log() << kINFO << "to new default for GradBoost *Pray*" << Endl;
   Log() << kDEBUG << "i.e. simply keep them as is, which should work fine for Grad Boost" << Endl;
   Log() << kWARNING << "You have chosen to use more than half of your training sample "
         << "to optimize the automatic pruning algorithm. This is probably wasteful "
         << "and your overall results will be degraded. Are you sure you want this?"

   if (this->Data()->HasNegativeEventWeights()){
      Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
            << "That should in principle be fine as long as on average you end up with "
            << "something positive. For this you have to make sure that the minimal number "
            << "of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="
            << ", (or the deprecated equivalent nEventsMin) you can set this via the "
            << "BDT option string when booking the "
            << "classifier) is large enough to allow for reasonable averaging!!! "
            << " If this does not help.. maybe you want to try the option IgnoreNegWeightsInTraining, "
            << "which ignores events with negative weight in the training. " << Endl
            << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
   Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;

   Log() << kWARNING << "Regression Trees do not work with a Separation type other than <RegressionVariance> --> I will use it instead" << Endl;

   Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;

   Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
   Log() << kWARNING << "is not implemented for regression analysis ! " << Endl;
   Log() << kWARNING << "--> I switch to the default nCuts = 20 and use standard node splitting" << Endl;

   Log() << kINFO << " Randomised trees use no pruning" << Endl;

   Log() << kWARNING << "When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
   Log() << " a more elaborate node splitting algorithm) is not implemented. " << Endl;
   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash"

   Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " << Endl;

         << "* \n this has been translated to MaxDepth=" << fMaxDepth << Endl;

   Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;
   if (sizeInPercent > 0 && sizeInPercent < 50){

      Log() << kFATAL << "you have demanded a minimal node size of "
            << sizeInPercent << "% of the training events.. \n"
            << " that somehow does not make sense " << Endl;

      Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
            << "after removing a possible % sign now reads " << sizeInPercent << Endl;

   Log() << kDEBUG << " successfully(?) reset the method " << Endl;
   std::vector<const TMVA::Event*> tmpEventSample;
   for (Long64_t ievt=0; ievt<nevents; ievt++) {
      tmpEventSample.push_back(event);

   for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];

   for (Long64_t ievt=0; ievt<nevents; ievt++) {
      if (firstNegWeight) {
         Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
      } else if (event->GetWeight()==0){
         if (firstZeroWeight) {
            Log() << "Events with weight == 0 are going to be simply ignored " << Endl;

      if (event->GetWeight() < 0) {
         Log() << kWARNING << "Events with negative event weights are found and "
               << " will be removed prior to the actual BDT training by global "
               << " pairing (and subsequent annihilation) with positive weight events"
         Log() << kWARNING << "Events with negative event weights are USED during "
               << "the BDT training. This might cause problems with small node sizes "
               << "or with the boosting. Please remove negative events from the training "
               << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
               << "observe problems with the boosting"

   Int_t imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );

         << "% of training used for validation)" << Endl;

   Log() << kDEBUG << "\t<InitEventSample> For classification trees, " << Endl;
   Log() << kDEBUG << " \tthe effective number of backgrounds is scaled to match " << Endl;
   Log() << kDEBUG << " \tthe signal. Otherwise the first boosting step would do 'just that'!" << Endl;
   Int_t sumSig=0, sumBkg=0;

   if (sumSigW && sumBkgW){
      Log() << kDEBUG << "\tre-normalise events such that Sig and Bkg have respective sum of weights = "
      Log() << kDEBUG << " \tsig->sig*" << normSig << "ev. bkg->bkg*" << normBkg << "ev." << Endl;
      Log() << kHEADER << "#events: (reweighted) sig: " << sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
      Log() << kINFO << "#events: (unweighted) sig: " << sumSig << " bkg: " << sumBkg << Endl;
      for (Long64_t ievt=0; ievt<nevents; ievt++) {

      Log() << kINFO << "--> could not determine scaling factors as either there are " << Endl;
      Log() << kINFO << " no signal events (sumSigW=" << sumSigW << ") or no bkg ev. (sumBkgW=" << sumBkgW << ")" << Endl;
   std::vector<const Event*> negEvents;

   if (totalNegWeights == 0 ) {
      Log() << kINFO << "no negative event weights found .. no preprocessing necessary" << Endl;

   Log() << kINFO << "found a total of " << totalNegWeights << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
   Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
   Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;

   for (Int_t i=0; i<2; i++){
      invCov = ((*cov)[i]);
         std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
                   << " did you use variables that are linear combinations or highly correlated?"
         std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
                   << " did you use variables that are linear combinations?"

   Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of " << fEventSample.size() << " training events " << Endl;
   Timer timer(negEvents.size(),"Negative Event paired");
   for (UInt_t nev = 0; nev < negEvents.size(); nev++){
      timer.DrawProgressBar( nev );
      Double_t weight = negEvents[nev]->GetWeight();
      UInt_t iClassID = negEvents[nev]->GetClass();
      invCov = ((*cov)[iClassID]);

            dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
                    (*invCov)[ivar][jvar]*
                    (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));

         if (dist < minDist) { iMin=iev; minDist=dist;}

         negEvents[nev]->SetBoostWeight( 0 );

         negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
      } else Log() << kFATAL << "preprocessing didn't find an event to pair with the negative weight ... probably a bug" << Endl;
      weight = negEvents[nev]->GetWeight();

   Log() << kINFO << "<Negative Event Pairing> took: " << timer.GetElapsedTime()

   totalNegWeights = 0;
   totalPosWeights = 0;

   std::vector<const Event*> newEventSample;

   if (totalNegWeights < 0) Log() << kFATAL << " compensation of negative event weights with positive ones did not work " << totalNegWeights << Endl;

   Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size() << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
   Log() << kINFO << " nSig=" << nSig << " sigWeight=" << sigWeight << " nBkg=" << nBkg << " bkgWeight=" << bkgWeight << Endl;
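   // The pairing loop above ranks candidate positive-weight partners for each negative-weight
   // event by a Mahalanobis-type distance, dist = (x-y)^T * C^{-1} * (x-y), evaluated with the
   // inverse covariance matrix of the corresponding class. The helper below is an illustrative,
   // self-contained sketch of that distance only; the function name and the plain std::vector
   // interface are assumptions for the example and are not part of this class.
   namespace {
      double IllustrativeMahalanobisDist2(const std::vector<double>& x, const std::vector<double>& y,
                                          const std::vector<std::vector<double> >& invCov)
      {
         // accumulate (x-y)^T * invCov * (x-y) over all pairs of input variables
         double dist = 0.;
         for (size_t i = 0; i < x.size(); ++i)
            for (size_t j = 0; j < x.size(); ++j)
               dist += (x[i] - y[i]) * invCov[i][j] * (x[j] - y[j]);
         return dist;
      }
   }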
   std::map<TString,TMVA::Interval*> tuneParameters;
   std::map<TString,Double_t> tunedParameters;

   tuneParameters.insert(std::pair<TString,Interval*>("NTrees",       new Interval(10,1000,5)));
   tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth",     new Interval(2,4,3)));
   tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize",  new LogInterval(1,30,30)));

   tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));

   tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage",    new Interval(0.05,0.50,5)));

   tuneParameters.insert(std::pair<TString,Interval*>("UseNvars",     new Interval(min_var,max_var,4)));

   Log() << kINFO << " the following BDT parameters will be tuned on the respective *grid*\n" << Endl;
   std::map<TString,TMVA::Interval*>::iterator it;
   for (it=tuneParameters.begin(); it!=tuneParameters.end(); it++){
      std::ostringstream oss;
      (it->second)->Print(oss);

   tunedParameters = optimize.optimize();

   return tunedParameters;

   std::map<TString,Double_t>::iterator it;
   for (it=tuneParameters.begin(); it!=tuneParameters.end(); it++){
      else if (it->first == "MinNodeSize") SetMinNodeSize (it->second);
      else if (it->first == "Shrinkage"  ) SetShrinkage   (it->second);
      else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " << Endl;
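   // Usage sketch (illustrative only): the tuning grid assembled above is exercised through the
   // public interface, e.g. from a training macro, assuming a pointer bdtMethod to this method
   // instance has been obtained (how it is obtained is not shown here). The default arguments
   // match the declaration OptimizeTuningParameters(fomType="ROCIntegral", fitType="FitGA").
   //
   //    std::map<TString, Double_t> tuned = bdtMethod->OptimizeTuningParameters("ROCIntegral", "FitGA");
   //    for (auto const& par : tuned) std::cout << par.first << " = " << par.second << std::endl;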
   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash"

   std::vector<TString> titles = {"Boost weight", "Error Fraction"};

         << "please remove the option from the configuration string, or "
         << "use \"!Normalise\""

   TString hname = "AdaBoost weight distribution";

   hname = "Boost event weights distribution";

   results->Store(h, "BoostWeights");

   TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
   boostMonitor->SetYTitle("ROC Integral");
   results->Store(boostMonitor, "BoostMonitor");

   boostMonitorGraph->SetName("BoostMonitorGraph");
   boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
   results->Store(boostMonitorGraph, "BoostMonitorGraph");

   results->Store(h, "BoostWeightsVsTree");

   results->Store(h, "ErrorFrac");

   nodesBeforePruningVsTree->SetXTitle("#tree");
   nodesBeforePruningVsTree->SetYTitle("#tree nodes");
   results->Store(nodesBeforePruningVsTree);

   nodesAfterPruningVsTree->SetXTitle("#tree");
   nodesAfterPruningVsTree->SetYTitle("#tree nodes");
   results->Store(nodesAfterPruningVsTree);

   Int_t nNodesBeforePruningCount = 0;
   Int_t nNodesAfterPruningCount = 0;

   Int_t nNodesBeforePruning = 0;
   Int_t nNodesAfterPruning = 0;

   while (itree < fNTrees && continueBoost){

      Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
            << "Please change boost option accordingly (GradBoost)."

      for (UInt_t i=0;i<nClasses;i++){
         fForest.back()->SetUseFisherCuts();

      fForest.back()->SetUseFisherCuts();

      nNodesBeforePruning = fForest.back()->CleanTree();

      nNodesBeforePruningCount += nNodesBeforePruning;
      nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);

      std::vector<const Event*> * validationSample = NULL;

      nNodesAfterPruning = fForest.back()->GetNNodes();
      nNodesAfterPruningCount += nNodesAfterPruning;
      nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);

      if ( itree==fNTrees-1 || (!(itree%500)) ||
           (!(itree%250) && itree <1000)||
           (!(itree%100) && itree < 500)||
           (!(itree%50)  && itree < 250)||
           (!(itree%25)  && itree < 150)||
           (!(itree%10)  && itree < 50)||
           (!(itree%5)   && itree < 20)

   Log() << kDEBUG << "\t<Train> average number of nodes (w/o pruning) : "
   Log() << kDEBUG << "\t<Train> average number of nodes before/after pruning : "
         << nNodesBeforePruningCount/GetNTrees() << " / "

   Log() << kDEBUG << "Now I delete the private data sample" << Endl;
   for (UInt_t itree=0; itree<nTrees; itree++) {

   return 2.0/(1.0+exp(-2.0*sum))-1;
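   // The return expression above maps the accumulated sum of tree responses onto (-1,1):
   // algebraically 2/(1+exp(-2*sum)) - 1 = (1-exp(-2*sum))/(1+exp(-2*sum)) = tanh(sum),
   // i.e. the gradient-boost MVA value is a tanh squashing of the summed responses.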
   for (std::vector<const TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      if (cls == nClasses-1){
         for (UInt_t i=0;i<nClasses;i++){
            for (UInt_t j=0;j<nClasses;j++){

            Double_t res = ((*e)->GetClass()==i)?(1.0-p_cls):(-p_cls);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

   std::map<TMVA::DecisionTreeNode*,std::vector<Double_t> > leaves;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Double_t weight = (*e)->GetWeight();
      if ((leaves[node]).empty()){
         (leaves[node]).push_back((*e)->GetTarget(cls)* weight);
         (leaves[node]).push_back(fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight);

         (leaves[node])[0] += ((*e)->GetTarget(cls)* weight);
         (leaves[node])[1] += fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight;

        iLeave!=leaves.end();++iLeave){
      if ((iLeave->second)[1] < 1e-30) (iLeave->second)[1] = 1e-30;
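   // The two per-leaf sums accumulated above (weighted residuals, and |t|(1-|t|) terms) are
   // presumably the numerator and denominator of the Newton-step leaf value used in two-class
   // gradient boosting with a binomial log-likelihood loss (Friedman 1999),
   //    response = sum_i t_i / sum_i |t_i| (1 - |t_i|),
   // where t_i is the event's current residual stored as its target; the 1e-30 floor only
   // protects against division by zero, and the shrinkage factor is applied when the response
   // is written to the node.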
   std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

   for (std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
        iLeave!=leaves.end();++iLeave){
      (iLeave->first)->SetResponse(fShrinkage*fit);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      for (UInt_t i=0;i<nClasses;i++){
         Double_t r = (*e)->GetClass()==i ? (1-1.0/nClasses) : (-1.0/nClasses);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

   return ncorrect / (ncorrect + nfalse);
   returnVal = this->GradBoost (eventSample, dt, cls);

   returnVal = this->GradBoost (eventSample, dt);

   TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
   TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );

   for (UInt_t iev=0; iev < nevents; iev++){
      if (event->GetClass() == signalClassNr) {tmp=tmpS;}

   std::vector<TH1F*> hS;
   std::vector<TH1F*> hB;

   hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,
                         DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
   hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,
                         DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
   results->Store(hS.back(),hS.back()->GetTitle());
   results->Store(hB.back(),hB.back()->GetTitle());

   TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
   TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
   results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
   results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());

   TH1F *tmpBoostWeights;
   std::vector<TH1F*> *h;

   tmpBoostWeights=tmpBoostWeightsS;

   tmpBoostWeights=tmpBoostWeightsB;
   Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
   std::map<Node*,Int_t> sigEventsInNode;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      UInt_t iclass=(*e)->GetClass();

         sumGlobalwfalse  += w * tmpDev;
         sumGlobalwfalse2 += w * tmpDev*tmpDev;
         if (tmpDev > maxDev) maxDev = tmpDev;

         if (!(isSignalType == DataInfo().IsSignal(*e))) {
            sumGlobalwfalse += w;

         sumGlobalwfalse += w*trueType*dtoutput;

      err = sumGlobalwfalse/sumGlobalw ;

      err = sumGlobalwfalse/maxDev/sumGlobalw ;

      err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;

      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
         err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;

      Log() << kFATAL << " you've chosen a Loss type for AdaBoost other than linear, quadratic or exponential "
            << "and this is not implemented... a typo in the options ??" << Endl;

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;

   std::vector<Double_t> newSumw(sumw.size(),0);

      Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
            << "boost such a thing... if after 1 step the error rate is == 0.5"
            << "please check why this happens, maybe too many events per node requested ?"

      Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
            << ") That should not happen, please check your code (i.e... the BDT code), I "
            << " stop boosting here" << Endl;

   } else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw
         << " 1-err/err=" << boostWeight << " log.." << TMath::Log(boostWeight) << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      newSumGlobalw += (*e)->GetWeight();
      newSumw[(*e)->GetClass()] += (*e)->GetWeight();

   Log() << kDEBUG << "new Nsig=" << newSumw[0]*globalNormWeight << " new Nbkg=" << newSumw[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

      else (*e)->ScaleBoostWeight( globalNormWeight );
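   // Illustrative sketch of the core AdaBoost weight update applied event by event in the loop
   // above: misclassified events are up-weighted by ((1-err)/err)^beta and the sample is then
   // renormalised. The helper name and the plain arguments are assumptions for the example; the
   // actual code above additionally handles negative weights and the per-class normalisation.
   namespace {
      double IllustrativeAdaBoostFactor(double err, double beta)
      {
         const double boostWeight = (1.0 - err) / err;   // > 1 as long as err < 0.5
         return TMath::Power(boostWeight, beta);         // factor applied to misclassified events
      }
   }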
   Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
   std::map<Node*,Int_t> sigEventsInNode;

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      sumGlobalWeights += w;
      UInt_t iclass=(*e)->GetClass();

      Log() << kFATAL << " AdaCost not implemented for regression" << Endl;

      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if      (isTrueSignal  &&  isSelectedSignal) cost=Css;
      else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if (!isTrueSignal &&  isSelectedSignal) cost=Ctb_ss;
      else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      sumGlobalCost += w*trueType*dtoutput*cost;

      Log() << kFATAL << " AdaCost not implemented for regression" << Endl;

   sumGlobalCost /= sumGlobalWeights;

   vector<Double_t> newSumClassWeights(sumw.size(),0);

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if      (isTrueSignal  &&  isSelectedSignal) cost=Css;
      else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if (!isTrueSignal &&  isSelectedSignal) cost=Ctb_ss;
      else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);

      newSumGlobalWeights += (*e)->GetWeight();
      newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();

   Double_t globalNormWeight = Double_t(eventSample.size())/newSumGlobalWeights;
   Log() << kDEBUG << "new Nsig=" << newSumClassWeights[0]*globalNormWeight << " new Nbkg=" << newSumClassWeights[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {

      else (*e)->ScaleBoostWeight( globalNormWeight );
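   // For readability, the cost assignments above correspond to a simple 2x2 cost matrix
   // (names as used in the code):
   //
   //                        selected as signal    selected as background
   //    true signal               Css                   Cts_sb
   //    true background           Ctb_ss                Cbb
   //
   // The per-event cost enters the boost through the weighted sum
   // sumGlobalCost += w * trueType * dtoutput * cost, normalised by the total weight.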
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
   if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;

   Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      sumwfalse  += w * tmpDev;
      sumwfalse2 += w * tmpDev*tmpDev;
      if (tmpDev > maxDev) maxDev = tmpDev;

      err = sumwfalse/maxDev/sumw ;

      err = sumwfalse2/maxDev/maxDev/sumw ;

      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
         err += w * (1 - exp (-tmpDev/maxDev)) / sumw;

      Log() << kFATAL << " you've chosen a Loss type for AdaBoost other than linear, quadratic or exponential "
            << "and this is not implemented... a typo in the options ??" << Endl;

      Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
            << "boost such a thing... if after 1 step the error rate is == 0.5"
            << "please check why this happens, maybe too many events per node requested ?"

      Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
            << ") That should not happen, but is possible for regression trees, and"
            << " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
            << " stop boosting " << Endl;

   } else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;

   Double_t boostWeight = err / (1.-err);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      results->GetHist("BoostWeights")->Fill(boostfactor);

      if ( (*e)->GetWeight() > 0 ){
         Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
         Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
         if (newWeight == 0) {
            Log() << kINFO << "Weight= " << (*e)->GetWeight() << Endl;
            Log() << kINFO << "BoostWeight= " << (*e)->GetBoostWeight() << Endl;
            Log() << kINFO << "boostweight=" << boostWeight << " err= " << err << Endl;
            Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
            Log() << kINFO << "boostfactor= " << boostfactor << Endl;
            Log() << kINFO << "target = " << (*e)->GetTarget(0) << Endl;

         (*e)->SetBoostWeight( newBoostWeight );

         (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);

      newSumw += (*e)->GetWeight();

   Double_t normWeight = sumw / newSumw;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
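   // The boost weight above, err/(1-err), follows the AdaBoost.R2 prescription (Drucker 1997):
   // events that the tree already models well keep a factor close to err/(1-err) < 1 and are
   // therefore down-weighted relative to poorly modelled events. Below is an illustrative
   // per-event factor, assuming a linear loss; the helper name is an assumption and not part
   // of this class.
   namespace {
      double IllustrativeAdaBoostR2Factor(double boostWeight, double dev, double maxDev)
      {
         // boostWeight = err/(1-err); dev/maxDev in [0,1] is the normalised deviation of the event
         return TMath::Power(boostWeight, 1.0 - dev/maxDev);
      }
   }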
   void* trxml = fForest[i]->AddXMLTo(wght);

   if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {

   if (gTools().HasAttr(parent, "TreeType")) {

   fForest.back()->SetTreeID(i++);

   Int_t analysisType(0);

   Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;

   istr >> dummy >> iTree >> dummy >> boostWeight;

      fForest.back()->Print( std::cout );
      Log() << kFATAL << "Error while reading weight file; mismatch iTree="
            << iTree << " i=" << i
            << " dummy " << dummy
            << " boostweight " << boostWeight
   if (useNTrees > 0 ) nTrees = useNTrees;

   for (UInt_t itree=0; itree<nTrees; itree++) {

   std::vector<double> temp;

   for (UInt_t iClass=0; iClass<nClasses; iClass++){
      temp.push_back(0.0);
      for (UInt_t itree = iClass; itree<fForest.size(); itree+=nClasses){

   for (UInt_t iClass=0; iClass<nClasses; iClass++){
      for (UInt_t j=0;j<nClasses;j++){
         norm += exp(temp[j]-temp[iClass]);

      (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
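   // The loop above evaluates the multiclass response as a softmax written in terms of
   // differences of the per-class sums: p_k = exp(F_k)/sum_j exp(F_j) = 1/(1 + sum_{j!=k} exp(F_j - F_k)).
   // An equivalent, illustrative standalone sketch (the helper name is not part of this class):
   namespace {
      std::vector<double> IllustrativeSoftmax(const std::vector<double>& f)
      {
         std::vector<double> p(f.size());
         for (size_t k = 0; k < f.size(); ++k) {
            double norm = 0.;
            for (size_t j = 0; j < f.size(); ++j)
               if (j != k) norm += std::exp(f[j] - f[k]);   // only differences enter the exponential
            p[k] = 1.0 / (1.0 + norm);
         }
         return p;
      }
   }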
   vector< Double_t > response( fForest.size() );
   vector< Double_t > weight( fForest.size() );

   std::vector< std::vector<Double_t> > vtemp;
   vtemp.push_back( response );
   vtemp.push_back( weight );

   while (sumOfWeights <= totalSumOfWeights/2.) {
      sumOfWeights += vtemp[1][t];
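   // The loop above accumulates the boost weights of the response-sorted trees until half of the
   // total weight is passed, i.e. the regression value is the weighted median of the individual
   // tree responses. An illustrative standalone sketch (the helper name is not part of this class):
   namespace {
      double IllustrativeWeightedMedian(std::vector<std::pair<double,double> > respAndWeight)
      {
         // each entry is (response, weight); sort by response, then walk up the cumulative weight
         std::sort(respAndWeight.begin(), respAndWeight.end());
         double total = 0.;
         for (size_t i = 0; i < respAndWeight.size(); ++i) total += respAndWeight[i].second;
         double running = 0.;
         for (size_t i = 0; i < respAndWeight.size(); ++i) {
            running += respAndWeight[i].second;
            if (running > total/2.) return respAndWeight[i].first;
         }
         return respAndWeight.empty() ? 0. : respAndWeight.back().first;
      }
   }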
   for (UInt_t i=0; i< relativeImportance.size(); i++) {

   if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
   else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
   Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
   Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
   Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
   Log() << "trained using the original training data set with re-weighted " << Endl;
   Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
   Log() << "events that were misclassified in the previous tree a larger " << Endl;
   Log() << "weight in the training of the following tree." << Endl;

   Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
   Log() << "using a single discriminant variable at a time. A test event " << Endl;
   Log() << "ending up after the sequence of left-right splits in a final " << Endl;
   Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
   Log() << "depending on the majority type of training events in that node." << Endl;

   Log() << "By the nature of the binary splits performed on the individual" << Endl;
   Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
   Log() << "between variables (they need to approximate the linear split in" << Endl;
   Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
   Log() << "variables individually). Hence decorrelation could be useful " << Endl;
   Log() << "to optimise the BDT performance." << Endl;

   Log() << "The two most important parameters in the configuration are the " << Endl;
   Log() << "minimal number of events requested by a leaf node as percentage of the " << Endl;
   Log() << "   number of training events (option \"MinNodeSize\", replacing the actual number " << Endl;
   Log() << "   of events \"nEventsMin\" as given in earlier versions)." << Endl;
   Log() << "If this number is too large, detailed features " << Endl;
   Log() << "in the parameter space are hard to model. If it is too small, " << Endl;
   Log() << "the risk of overtraining rises and boosting seems to be less effective." << Endl;
   Log() << "   Typical values from our current experience for best performance " << Endl;
   Log() << "   are between 0.5(%) and 10(%) " << Endl;

   Log() << "The default minimal number is currently set to " << Endl;
   Log() << "   max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
   Log() << "and can be changed by the user." << Endl;

   Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
   Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
   Log() << "that is used when determining after the training which splits " << Endl;
   Log() << "are considered statistically insignificant and are removed. The" << Endl;
   Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
   Log() << "the comparison between efficiencies obtained on the training and" << Endl;
   Log() << "the independent test sample. They should be equal within statistical" << Endl;
   Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
   fout << "   std::vector<"<<nodeName<<"*> fForest;       // i.e. root nodes of decision trees" << std::endl;
   fout << "   std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << std::endl;
   fout << "};" << std::endl << std::endl;
   fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
   fout << "{" << std::endl;
   fout << "   double myMVA = 0;" << std::endl;

   fout << "   if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar]  << ") return -1;  // is background preselection cut" << std::endl;

   fout << "   if (inputValues["<<ivar<<"] < " << fLowSigCut[ivar]  << ") return  1;  // is signal preselection cut" << std::endl;

   fout << "   if (inputValues["<<ivar<<"] > " << fHighBkgCut[ivar] << ") return -1;  // is background preselection cut" << std::endl;

   fout << "   if (inputValues["<<ivar<<"] > " << fHighSigCut[ivar] << ") return  1;  // is signal preselection cut" << std::endl;

   fout << "   double norm = 0;" << std::endl;

   fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
   fout << "      "<<nodeName<<" *current = fForest[itree];" << std::endl;
   fout << "      while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
   fout << "         if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
   fout << "         else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
   fout << "      }" << std::endl;

   fout << "      myMVA += current->GetResponse();" << std::endl;

   if (fUseYesNoLeaf) fout << "      myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
   else               fout << "      myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
   fout << "      norm += fBoostWeights[itree];" << std::endl;

   fout << "   }" << std::endl;

   fout << "   return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;

   else fout << "   return myMVA /= norm;" << std::endl;
   fout << "};" << std::endl << std::endl;
   fout << "void " << className << "::Initialize()" << std::endl;
   fout << "{" << std::endl;

   fout << "  // itree = " << itree << std::endl;
   fout << "  fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
   fout << "  fForest.push_back( " << std::endl;

   fout << "   );" << std::endl;

   fout << "   return;" << std::endl;
   fout << "};" << std::endl;
   fout << " " << std::endl;
   fout << "// Clean up" << std::endl;
   fout << "inline void " << className << "::Clear() " << std::endl;
   fout << "{" << std::endl;
   fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
   fout << "      delete fForest[itree]; " << std::endl;
   fout << "   }" << std::endl;
   fout << "}" << std::endl;
   fout << "#define NN new "<<nodeName << std::endl;
   fout << "   " << std::endl;
   fout << "#ifndef "<<nodeName<<"__def" << std::endl;
   fout << "#define "<<nodeName<<"__def" << std::endl;
   fout << "   " << std::endl;
   fout << "class "<<nodeName<<" {" << std::endl;
   fout << "   " << std::endl;
   fout << "public:" << std::endl;
   fout << "   " << std::endl;
   fout << "   // constructor of an essentially \"empty\" node floating in space" << std::endl;
   fout << "   "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;

   fout << "                     int nFisherCoeff," << std::endl;

   fout << "                     double fisherCoeff"<<i<<"," << std::endl;

   fout << "                     int selector, double cutValue, bool cutType, " << std::endl;
   fout << "                     int nodeType, double purity, double response ) :" << std::endl;
   fout << "   fLeft         ( left         )," << std::endl;
   fout << "   fRight        ( right        )," << std::endl;
   if (fUseFisherCuts) fout << "   fNFisherCoeff ( nFisherCoeff )," << std::endl;
   fout << "   fSelector     ( selector     )," << std::endl;
   fout << "   fCutValue     ( cutValue     )," << std::endl;
   fout << "   fCutType      ( cutType      )," << std::endl;
   fout << "   fNodeType     ( nodeType     )," << std::endl;
   fout << "   fPurity       ( purity       )," << std::endl;
   fout << "   fResponse     ( response     ){" << std::endl;

   fout << "      fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;

   fout << "   }" << std::endl << std::endl;
   fout << "   virtual ~"<<nodeName<<"();" << std::endl << std::endl;
   fout << "   // test event if it descends the tree at this node to the right" << std::endl;
   fout << "   virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
   fout << "   "<<nodeName<<"* GetRight( void ) { return fRight; };" << std::endl << std::endl;
   fout << "   // test event if it descends the tree at this node to the left" << std::endl;
   fout << "   virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
   fout << "   "<<nodeName<<"* GetLeft( void ) { return fLeft; };" << std::endl << std::endl;
   fout << "   // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
   fout << "   double GetPurity( void ) const { return fPurity; }" << std::endl;
   fout << "   // return the node type" << std::endl;
   fout << "   int    GetNodeType( void ) const { return fNodeType; }" << std::endl;
   fout << "   double GetResponse( void ) const { return fResponse; }" << std::endl << std::endl;
   fout << "private:" << std::endl << std::endl;
   fout << "   "<<nodeName<<"*   fLeft;     // pointer to the left daughter node" << std::endl;
   fout << "   "<<nodeName<<"*   fRight;    // pointer to the right daughter node" << std::endl;

   fout << "   int                  fNFisherCoeff; // =0 if this node doesn't use Fisher, else =nvar+1" << std::endl;
   fout << "   std::vector<double>  fFisherCoeff;  // the Fisher coefficients (offset at the last element)" << std::endl;

   fout << "   int                  fSelector;     // index of variable used in node selection (decision tree)" << std::endl;
   fout << "   double               fCutValue;     // cut value applied on this node to discriminate bkg against sig" << std::endl;
   fout << "   bool                 fCutType;      // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
   fout << "   int                  fNodeType;     // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal" << std::endl;
   fout << "   double               fPurity;       // Purity of node from training" << std::endl;
   fout << "   double               fResponse;     // Regression response value of node" << std::endl;
   fout << "}; " << std::endl;
   fout << "   " << std::endl;
   fout << "//_______________________________________________________________________" << std::endl;
   fout << "   "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
   fout << "{" << std::endl;
   fout << "   if (fLeft  != NULL) delete fLeft;" << std::endl;
   fout << "   if (fRight != NULL) delete fRight;" << std::endl;
   fout << "}; " << std::endl;
   fout << "   " << std::endl;
   fout << "//_______________________________________________________________________" << std::endl;
   fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
   fout << "{" << std::endl;
   fout << "   // test event if it descends the tree at this node to the right" << std::endl;
   fout << "   bool result;" << std::endl;

   fout << "   if (fNFisherCoeff == 0){" << std::endl;
   fout << "      result = (inputValues[fSelector] > fCutValue );" << std::endl;
   fout << "   }else{" << std::endl;
   fout << "      double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
   fout << "      for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
   fout << "         fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
   fout << "      result = fisher > fCutValue;" << std::endl;
   fout << "   }" << std::endl;

   fout << "   result = (inputValues[fSelector] > fCutValue );" << std::endl;

   fout << "   if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
   fout << "   else return !result;" << std::endl;
   fout << "}" << std::endl;
   fout << "   " << std::endl;
   fout << "//_______________________________________________________________________" << std::endl;
   fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
   fout << "{" << std::endl;
   fout << "   // test event if it descends the tree at this node to the left" << std::endl;
   fout << "   if (!this->GoesRight(inputValues)) return true;" << std::endl;
   fout << "   else return false;" << std::endl;
   fout << "}" << std::endl;
   fout << "   " << std::endl;
   fout << "#endif" << std::endl;
   fout << "   " << std::endl;
      Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;

   fout << "NN(" << std::endl;

   fout << ", " << std::endl;

   fout << ", " << std::endl
        << std::setprecision(6);
   Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;

   std::vector<TMVA::BDTEventWrapper> bdtEventSample;

   for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
      nTotS += (*it)->GetWeight();

      nTotB += (*it)->GetWeight();

   std::sort( bdtEventSample.begin(),bdtEventSample.end() );

   Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
   std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
   for( ; it != it_end; ++it ) {
      sigWeightCtr += (**it)->GetWeight();

      bkgWeightCtr += (**it)->GetWeight();

      it->SetCumulativeWeight(false,bkgWeightCtr);
      it->SetCumulativeWeight(true,sigWeightCtr);

   Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
   Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;

   for(UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
      nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
      nSelB = bdtEventSample[iev].GetCumulativeWeight(false);

      tmpEffS = nSelS/nTotS;
      tmpEffB = nSelB/nTotB;

      else if (nSelB==0 && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE;}

   Log() << kDEBUG << " \tfound and suggest the following possible pre-selection cuts " << Endl;
   if (fDoPreselection) Log() << kDEBUG << "\tthe training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" << Endl;
   else Log() << kDEBUG << "\tas option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample" << Endl;
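   // In the scan above a candidate preselection cut is recorded whenever one side of a cut value
   // contains no background (or no signal) events at all while covering more than the initial 5%
   // efficiency threshold (effS/effB start at 0.05); e.g. fLowSigCut marks a lower bound below
   // which only signal events are found, so events below it can be classified as signal without
   // evaluating any tree.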