#include <algorithm>
#include <math.h>
#include <fstream>
#include "Riostream.h"
#include "TRandom3.h"
#include "TRandom3.h"
#include "TMath.h"
#include "TObjString.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/MethodBDT.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
#include "TMVA/Ranking.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/BinarySearchTree.h"
#include "TMVA/SeparationBase.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/GiniIndexWithLaplace.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/Results.h"
using std::vector;
// Register the BDT method with the TMVA classifier factory (makes it
// bookable by name) and generate the ROOT dictionary entry.
REGISTER_METHOD(BDT)
ClassImp(TMVA::MethodBDT)
// File-local debug verbosity; 0 = no extra debug output.
const Int_t TMVA::MethodBDT::fgDebugLevel = 0;
// Constructor used for training: the full option string is handed to
// MethodBase, which drives DeclareOptions()/ProcessOptions() later on.
// All member initialisation happens in Init(), called by the base class.
TMVA::MethodBDT::MethodBDT( const TString& jobName,
const TString& methodTitle,
DataSetInfo& theData,
const TString& theOption,
TDirectory* theTargetDir ) :
TMVA::MethodBase( jobName, Types::kBDT, methodTitle, theData, theOption, theTargetDir )
{
}
// Constructor used in the application phase: the method is reconstructed
// from a previously written weight file instead of being trained.
TMVA::MethodBDT::MethodBDT( DataSetInfo& theData,
const TString& theWeightFile,
TDirectory* theTargetDir )
: TMVA::MethodBase( Types::kBDT, theData, theWeightFile, theTargetDir )
{
}
// BDT can handle exactly two analysis configurations: binary (two-class)
// classification and single-target regression.
Bool_t TMVA::MethodBDT::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets )
{
   const Bool_t twoClassClassification = (type == Types::kClassification && numberClasses == 2);
   const Bool_t singleTargetRegression = (type == Types::kRegression     && numberTargets == 1);
   return (twoClassClassification || singleTargetRegression) ? kTRUE : kFALSE;
}
// Register all user-configurable BDT options with the option parser.
// Defaults are chosen depending on whether this is a regression or a
// classification analysis (DoRegression()).
void TMVA::MethodBDT::DeclareOptions()
{
DeclareOptionRef(fNTrees, "NTrees", "Number of trees in the forest");
// Boosting algorithm; allowed values are registered via AddPreDefVal below.
DeclareOptionRef(fBoostType, "BoostType", "Boosting type for the trees in the forest");
AddPreDefVal(TString("AdaBoost"));
AddPreDefVal(TString("Bagging"));
AddPreDefVal(TString("RegBoost"));
AddPreDefVal(TString("AdaBoostR2"));
AddPreDefVal(TString("Grad"));
// Regression defaults to AdaBoost.R2; classification to plain AdaBoost.
if (DoRegression()) {
fBoostType = "AdaBoostR2";
}else{
fBoostType = "AdaBoost";
}
DeclareOptionRef(fAdaBoostR2Loss="Quadratic", "AdaBoostR2Loss", "Type of Loss function in AdaBoostR2t (Linear,Quadratic or Exponential)");
AddPreDefVal(TString("Linear"));
AddPreDefVal(TString("Quadratic"));
AddPreDefVal(TString("Exponential"));
// Options specific to gradient boosting (BoostType=Grad).
DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad","Use only a random subsample of all events for growing the trees in each iteration. (Only valid for GradBoost)");
DeclareOptionRef(fSampleFraction=0.6, "GradBaggingFraction","Defines the fraction of events to be used in each iteration when UseBaggedGrad=kTRUE. (Only valid for GradBoost)");
DeclareOptionRef(fShrinkage=1.0, "Shrinkage", "Learning rate for GradBoost algorithm");
DeclareOptionRef(fAdaBoostBeta=1.0, "AdaBoostBeta", "Parameter for AdaBoost algorithm");
// Randomised-tree (random-forest-like) options.
DeclareOptionRef(fRandomisedTrees,"UseRandomisedTrees","Choose at each node splitting a random set of variables");
DeclareOptionRef(fUseNvars,"UseNvars","Number of variables used if randomised tree option is chosen");
DeclareOptionRef(fUseNTrainEvents,"UseNTrainEvents","number of randomly picked training events used in randomised (and bagged) trees");
DeclareOptionRef(fUseWeightedTrees=kTRUE, "UseWeightedTrees",
"Use weighted trees or simple average in classification from the forest");
DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
"Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node");
// Yes/no leaves are meaningless for regression trees.
if (DoRegression()) {
fUseYesNoLeaf = kFALSE;
}
DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit", "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");
// Node-splitting separation criterion.
DeclareOptionRef(fSepTypeS, "SeparationType", "Separation criterion for node splitting");
AddPreDefVal(TString("CrossEntropy"));
AddPreDefVal(TString("GiniIndex"));
AddPreDefVal(TString("GiniIndexWithLaplace"));
AddPreDefVal(TString("MisClassificationError"));
AddPreDefVal(TString("SDivSqrtSPlusB"));
AddPreDefVal(TString("RegressionVariance"));
if (DoRegression()) {
fSepTypeS = "RegressionVariance";
}else{
fSepTypeS = "GiniIndex";
}
DeclareOptionRef(fNodeMinEvents, "nEventsMin", "Minimum number of events required in a leaf node (default: max(20, N_train/(Nvar^2)/10) ) ");
DeclareOptionRef(fNCuts, "nCuts", "Number of steps during node cut optimisation");
// Pruning configuration.
DeclareOptionRef(fPruneStrength, "PruneStrength", "Pruning strength");
DeclareOptionRef(fPruneMethodS, "PruneMethod", "Method used for pruning (removal) of statistically insignificant branches");
AddPreDefVal(TString("NoPruning"));
AddPreDefVal(TString("ExpectedError"));
AddPreDefVal(TString("CostComplexity"));
DeclareOptionRef(fPruneBeforeBoost=kFALSE, "PruneBeforeBoost", "Flag to prune the tree before applying boosting algorithm");
DeclareOptionRef(fFValidationEvents=0.5, "PruningValFraction", "Fraction of events to use for optimizing automatic pruning.");
DeclareOptionRef(fNNodesMax=100000,"NNodesMax","Max number of nodes in tree");
// Regression trees are allowed to grow much deeper than classification trees.
if (DoRegression()) {
DeclareOptionRef(fMaxDepth=50,"MaxDepth","Max depth of the decision tree allowed");
}else{
DeclareOptionRef(fMaxDepth=3,"MaxDepth","Max depth of the decision tree allowed");
}
}
// Register options kept only for backward compatibility with older weight
// files; they are parsed but no longer drive new training features.
void TMVA::MethodBDT::DeclareCompatibilityOptions() {
MethodBase::DeclareCompatibilityOptions();
DeclareOptionRef(fSampleSizeFraction=1.0,"SampleSizeFraction","Relative size of bagged event sample to original size of the data sample" );
DeclareOptionRef(fNoNegWeightsInTraining,"NoNegWeightsInTraining","Ignore negative event weights in the training process" );
}
// Translate the parsed option strings into concrete objects/enums and
// enforce consistency between mutually dependent options. Statement order
// matters here: e.g. fAutomatic is derived before the Grad-boost override.
void TMVA::MethodBDT::ProcessOptions()
{
// Map the separation-type string onto a SeparationBase instance.
fSepTypeS.ToLower();
if (fSepTypeS == "misclassificationerror") fSepType = new MisClassificationError();
else if (fSepTypeS == "giniindex") fSepType = new GiniIndex();
else if (fSepTypeS == "giniindexwithlaplace") fSepType = new GiniIndexWithLaplace();
else if (fSepTypeS == "crossentropy") fSepType = new CrossEntropy();
else if (fSepTypeS == "sdivsqrtsplusb") fSepType = new SdivSqrtSplusB();
else if (fSepTypeS == "regressionvariance") fSepType = NULL;
else {
Log() << kINFO << GetOptions() << Endl;
Log() << kFATAL << "<ProcessOptions> unknown Separation Index option called" << Endl;
}
// Map the pruning-method string onto the DecisionTree enum.
fPruneMethodS.ToLower();
if (fPruneMethodS == "expectederror") fPruneMethod = DecisionTree::kExpectedErrorPruning;
else if (fPruneMethodS == "costcomplexity") fPruneMethod = DecisionTree::kCostComplexityPruning;
else if (fPruneMethodS == "nopruning") fPruneMethod = DecisionTree::kNoPruning;
else {
Log() << kINFO << GetOptions() << Endl;
Log() << kFATAL << "<ProcessOptions> unknown PruneMethod option called" << Endl;
}
// A negative prune strength requests automatic strength determination
// (not available for gradient boosting).
if (fPruneStrength < 0 && (fPruneMethod != DecisionTree::kNoPruning) && fBoostType!="Grad") fAutomatic = kTRUE;
else fAutomatic = kFALSE;
if (fAutomatic && fPruneMethod==DecisionTree::kExpectedErrorPruning){
Log() << kFATAL
<< "Sorry autmoatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
}
fAdaBoostR2Loss.ToLower();
// Bagged gradient boosting only makes sense for BoostType=Grad; Grad
// itself never prunes.
if (fBoostType!="Grad") fBaggedGradBoost=kFALSE;
else fPruneMethod = DecisionTree::kNoPruning;
if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
if (fAutomatic && fFValidationEvents > 0.5) {
Log() << kWARNING << "You have chosen to use more than half of your training sample "
<< "to optimize the automatic pruning algorithm. This is probably wasteful "
<< "and your overall results will be degraded. Are you sure you want this?"
<< Endl;
}
// Inform the user about the implications of negative MC event weights.
if (this->Data()->HasNegativeEventWeights()){
Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
<< "That should in principle be fine as long as on average you end up with "
<< "something positive. For this you have to make sure that the minimal number "
<< "of (un-weighted) events demanded for a tree node (currently you use: nEventsMin="
<<fNodeMinEvents<<", you can set this via the BDT option string when booking the "
<< "classifier) is large enough to allow for reasonable averaging!!! "
<< " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
<< "which ignores events with negative weight in the training. " << Endl
<< Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
}
// Regression forbids yes/no leaves and any separation other than variance.
if (DoRegression()) {
if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
fUseYesNoLeaf = kFALSE;
}
if (fSepType != NULL){
Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;
fSepType = NULL;
}
}
// Randomised trees force bagging as boost method and disable pruning.
if (fRandomisedTrees){
Log() << kINFO << " Randomised trees use *bagging* as *boost* method and no pruning" << Endl;
fPruneMethod = DecisionTree::kNoPruning;
fBoostType = "Bagging";
}
}
// Set the hard-coded defaults for all tunable members; called by the base
// class before option parsing, so DeclareOptions()/ProcessOptions() may
// later override any of these values.
void TMVA::MethodBDT::Init( void )
{
fNTrees = 400;
if (fAnalysisType == Types::kClassification || fAnalysisType == Types::kMulticlass ) {
fMaxDepth = 3;
fBoostType = "AdaBoost";
}else {
// Regression: deep trees with AdaBoost.R2 and quadratic loss.
fMaxDepth = 100;
fBoostType = "AdaBoostR2";
fAdaBoostR2Loss = "Quadratic";
}
// NOTE(review): default floor here is 40, while the nEventsMin help text in
// DeclareOptions() advertises "max(20, ...)" — confirm which is intended.
fNodeMinEvents = TMath::Max( Int_t(40), Int_t( Data()->GetNTrainingEvents() / (10*GetNvar()*GetNvar())) );
fNCuts = 20;
fPruneMethodS = "CostComplexity";
fPruneMethod = DecisionTree::kCostComplexityPruning;
// Negative strength requests automatic pruning-strength determination.
fPruneStrength = -1.0;
fFValidationEvents = 0.5;
fRandomisedTrees = kFALSE;
// Heuristic for the number of variables tried per split in randomised mode.
fUseNvars = (GetNvar()>12) ? UInt_t(GetNvar()/8) : TMath::Max(UInt_t(2),UInt_t(GetNvar()/3));
fUseNTrainEvents = Data()->GetNTrainingEvents();
fNNodesMax = 1000000;
fShrinkage = 1.0;
SetSignalReferenceCut( 0 );
}
// Destructor: release every heap object owned by this method instance —
// the cloned training/validation events and all trees of the forest.
TMVA::MethodBDT::~MethodBDT( void )
{
   for (std::vector<Event*>::iterator ev = fEventSample.begin(); ev != fEventSample.end(); ++ev)
      delete *ev;
   for (std::vector<Event*>::iterator ev = fValidationSample.begin(); ev != fValidationSample.end(); ++ev)
      delete *ev;
   for (std::vector<DecisionTree*>::iterator tree = fForest.begin(); tree != fForest.end(); ++tree)
      delete *tree;
}
// Copy the training events into fEventSample (and, when automatic pruning
// is enabled, split off every imodulo-th event into fValidationSample).
void TMVA::MethodBDT::InitEventSample( void )
{
if (!HasTrainingTree()) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
UInt_t nevents = Data()->GetNTrainingEvents();
Bool_t first=kTRUE;
for (UInt_t ievt=0; ievt<nevents; ievt++) {
// Each event is cloned; ownership is released in the destructor.
Event* event = new Event( *GetTrainingEvent(ievt) );
if (!IgnoreEventsWithNegWeightsInTraining() || event->GetWeight() > 0) {
// NOTE(review): this branch is only reachable for the FIRST negative-weight
// event (first is cleared below), subsequent negative events are kept; the
// skipped event is also never deleted (leak). The log message suggests all
// negative events should be skipped — verify the intended behaviour.
if (first && event->GetWeight() < 0) {
first = kFALSE;
Log() << kINFO << "Events with negative event weights are ignored during "
<< "the BDT training (option IgnoreNegWeightsInTraining is now enabled)"
<< Endl;
continue;
}
if (fAutomatic) {
// Route roughly a fraction fFValidationEvents of events to validation.
Double_t modulo = 1.0/(fFValidationEvents);
Int_t imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
if (ievt % imodulo == 0) fValidationSample.push_back( event );
else fEventSample.push_back( event );
}
else {
fEventSample.push_back(event);
}
}
}
if (fAutomatic) {
Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
<< " for Training and " << fValidationSample.size()
<< " for Pruning Validation (" << ((Float_t)fValidationSample.size())/((Float_t)fEventSample.size()+fValidationSample.size())*100.0
<< "% of training used for validation)" << Endl;
}
}
// Main training loop: grow fNTrees decision trees, boosting (and optionally
// pruning) after each iteration, and fill the monitoring histograms/ntuple.
// Fix vs. original: the boost-weight histogram title read "AdaBooost".
void TMVA::MethodBDT::Train()
{
   InitEventSample();
   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\""
                             << Endl;
   Log() << kINFO << "Training "<< fNTrees << " Decision Trees ... patience please" << Endl;
   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());

   // Book monitoring histograms (stored in the Results object).
   Int_t nBins;
   Double_t xMin,xMax;
   TString hname = "AdaBoost weight distribution";   // typo fixed ("AdaBooost")
   nBins= 100;
   xMin = 0;
   xMax = 30;
   if (DoRegression()) {
      nBins= 100;
      xMin = 0;
      xMax = 1;
      hname="Boost event weights distribution";
   }
   TH1* h = new TH1F("BoostWeight",hname,nBins,xMin,xMax);
   h->SetXTitle("boost weight");
   results->Store(h, "BoostWeights");
   h = new TH1F("BoostWeightVsTree","Boost weights vs tree",fNTrees,0,fNTrees);
   h->SetXTitle("#tree");
   h->SetYTitle("boost weight");
   results->Store(h, "BoostWeightsVsTree");
   h = new TH1F("ErrFractHist","error fraction vs tree number",fNTrees,0,fNTrees);
   h->SetXTitle("#tree");
   h->SetYTitle("error fraction");
   results->Store(h, "ErrorFrac");
   TH1* nodesBeforePruningVsTree = new TH1I("NodesBeforePruning","nodes before pruning",fNTrees,0,fNTrees);
   nodesBeforePruningVsTree->SetXTitle("#tree");
   nodesBeforePruningVsTree->SetYTitle("#tree nodes");
   results->Store(nodesBeforePruningVsTree);
   TH1* nodesAfterPruningVsTree = new TH1I("NodesAfterPruning","nodes after pruning",fNTrees,0,fNTrees);
   nodesAfterPruningVsTree->SetXTitle("#tree");
   nodesAfterPruningVsTree->SetYTitle("#tree nodes");
   results->Store(nodesAfterPruningVsTree);

   // Per-tree monitoring ntuple (iTree, boost weight, error fraction).
   fMonitorNtuple= new TTree("MonitorNtuple","BDT variables");
   fMonitorNtuple->Branch("iTree",&fITree,"iTree/I");
   fMonitorNtuple->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
   fMonitorNtuple->Branch("errorFraction",&fErrorFraction,"errorFraction/D");

   Timer timer( fNTrees, GetName() );
   Int_t nNodesBeforePruningCount = 0;
   Int_t nNodesAfterPruningCount = 0;
   Int_t nNodesBeforePruning = 0;
   Int_t nNodesAfterPruning = 0;
   TH1D *alpha = new TH1D("alpha","PruneStrengths",fNTrees,0,fNTrees);
   alpha->SetXTitle("#tree");
   alpha->SetYTitle("PruneStrength");

   // Gradient boosting needs its residual/target bookkeeping initialised.
   if(fBoostType=="Grad"){
      InitGradBoost(fEventSample);
   }

   for (int itree=0; itree<fNTrees; itree++) {
      timer.DrawProgressBar( itree );
      // NOTE(review): itree is passed twice (as seed/class index and tree ID)
      // — matches the original call; verify against the DecisionTree ctor.
      fForest.push_back( new DecisionTree( fSepType, fNodeMinEvents, fNCuts,
                                           fRandomisedTrees, fUseNvars, fNNodesMax, fMaxDepth,
                                           itree, fNodePurityLimit, itree));
      if (fBaggedGradBoost) nNodesBeforePruning = fForest.back()->BuildTree(fSubSample);
      else                  nNodesBeforePruning = fForest.back()->BuildTree(fEventSample);
      // Non-gradient classification trees are cleaned of redundant leaves.
      if (fBoostType!="Grad" && fUseYesNoLeaf && !DoRegression() ){
         nNodesBeforePruning = fForest.back()->CleanTree();
      }
      nNodesBeforePruningCount += nNodesBeforePruning;
      nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);
      fForest.back()->SetPruneMethod(fPruneMethod);
      fForest.back()->SetPruneStrength(fPruneStrength);
      std::vector<Event*> * validationSample = NULL;
      if(fAutomatic) validationSample = &fValidationSample;
      if(fBoostType=="Grad"){
         this->Boost(fEventSample, fForest.back(), itree);
      }
      else {
         // Boost either before or after pruning, per PruneBeforeBoost.
         if(!fPruneBeforeBoost) {
            fBoostWeights.push_back( this->Boost(fEventSample, fForest.back(), itree) );
            Double_t pruneStrength = fForest.back()->PruneTree(validationSample);
            alpha->SetBinContent(itree+1,pruneStrength);
         }
         else {
            Double_t pruneStrength = fForest.back()->PruneTree(validationSample);
            alpha->SetBinContent(itree+1,pruneStrength);
            fBoostWeights.push_back( this->Boost(fEventSample, fForest.back(), itree) );
         }
         if (fUseYesNoLeaf && !DoRegression() ){
            fForest.back()->CleanTree();
         }
      }
      nNodesAfterPruning = fForest.back()->GetNNodes();
      nNodesAfterPruningCount += nNodesAfterPruning;
      nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);
      fITree = itree;
      fMonitorNtuple->Fill();
   }
   alpha->Write();

   Log() << kINFO << "<Train> elapsed time: " << timer.GetElapsedTime()
         << " " << Endl;
   if (fPruneMethod == DecisionTree::kNoPruning) {
      Log() << kINFO << "<Train> average number of nodes (w/o pruning) : "
            << nNodesBeforePruningCount/fNTrees << Endl;
   }
   else {
      Log() << kINFO << "<Train> average number of nodes before/after pruning : "
            << nNodesBeforePruningCount/fNTrees << " / "
            << nNodesAfterPruningCount/fNTrees
            << Endl;
   }
}
// Fill fSubSample with a random (with-replacement) subsample of the training
// events, sized by GradBaggingFraction; seeded by the current forest size so
// each boosting iteration draws a different sample.
// Fixes vs. original: the TRandom3 instance was leaked every call, and the
// index `Uniform(nevents)-1` could underflow below 0 (huge UInt_t after the
// cast) besides biasing the draw; TRandom3::Integer(n) yields uniform 0..n-1.
void TMVA::MethodBDT::GetRandomSubSample()
{
   UInt_t nevents = fEventSample.size();
   UInt_t nfraction = static_cast<UInt_t>(fSampleFraction*Data()->GetNTrainingEvents());
   fSubSample.clear();
   TRandom3 trandom(fForest.size());
   for (UInt_t ievt=0; ievt<nfraction; ievt++) {
      fSubSample.push_back(fEventSample[trandom.Integer(nevents)]);
   }
}
// Evaluate the gradient-boosted forest for one event: sum the responses of
// the first nTrees trees and map the sum F into (-1,1) via 2/(1+e^{-2F})-1.
Double_t TMVA::MethodBDT::GetGradBoostMVA(TMVA::Event& e, UInt_t nTrees)
{
   Double_t forestSum = 0;
   for (UInt_t t = 0; t < nTrees; ++t)
      forestSum += fForest[t]->CheckEvent(e, kFALSE);
   const Double_t damping = exp(-2.0*forestSum);
   return 2.0/(1.0+damping) - 1;
}
// Gradient-boost (classification) target update: accumulate the latest
// tree's response into the per-event score fBoostWeights[i], then set each
// event's regression target to the pseudo-residual (label - sigmoid(score)).
// Note: eventSample is passed by value (matches the declaration), but the
// elements are pointers, so the events themselves are updated in place.
void TMVA::MethodBDT::UpdateTargets(vector<TMVA::Event*> eventSample)
{
UInt_t iValue=0;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
fBoostWeights[iValue]+=fForest.back()->CheckEvent(*(*e),kFALSE);
// p_sig = 1/(1+e^{-2F}) is the signal probability for the current score F.
Double_t p_sig=1.0/(1.0+exp(-2.0*fBoostWeights[iValue]));
Double_t res = ((*e)->IsSignal()?1:0)-p_sig;
(*e)->SetTarget(0,res);
iValue++;
}
}
// Gradient-boost (regression) target update with a Huber-style loss:
// refresh the residuals (or initialise them on the first call), find the
// 90% weighted quantile of |residual| as the transition point, and set each
// event's target to the residual clipped at that transition point.
void TMVA::MethodBDT::UpdateTargetsRegression(vector<TMVA::Event*> eventSample, Bool_t first)
{
vector<Double_t> absResiduals;
vector< vector<Double_t> > temp;
UInt_t i=0;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
if(first){
// First iteration: residual relative to the initial (median) prediction.
fRegResiduals.push_back((*e)->GetTarget(0)-fBoostWeights[i]);
}
else{
// Subtract the newest tree's response from the running residual.
fRegResiduals[i]-=fForest.back()->CheckEvent(*(*e),kFALSE);
}
absResiduals.push_back(fabs(fRegResiduals[i]));
i++;
}
// temp[0] = |residuals|, temp[1] = event weights -> weighted 90% quantile.
temp.push_back(absResiduals);
temp.push_back(fInitialWeights);
fTransitionPoint = GetWeightedQuantile(temp,0.9);
i=0;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
// Huber clipping: keep small residuals, saturate large ones at the
// transition point (preserving sign).
if(absResiduals[i]<=fTransitionPoint)
(*e)->SetTarget(0,fRegResiduals[i]);
else
(*e)->SetTarget(0,fTransitionPoint*(fRegResiduals[i]<0?-1.0:1.0));
i++;
}
}
// Return the weighted quantile of vec[0] (values) with weights vec[1].
// Side effect: vec is sorted ascending in place by value. If SumOfWeights
// is 0 (the default), the member fSumOfWeights is used as normalisation.
// NOTE(review): the scan increments i past the element that crosses the
// threshold and does no bounds check — if quantile*norm reaches or exceeds
// the total weight, vec[0][i] reads past the end. Callers appear to pass
// quantile < 1 with consistent weights; confirm this invariant.
Double_t TMVA::MethodBDT::GetWeightedQuantile(vector< vector<Double_t> > &vec, const Double_t quantile, const Double_t SumOfWeights){
gTools().UsefulSortAscending( vec );
Double_t norm = fSumOfWeights;
if(SumOfWeights!=0.0) norm = SumOfWeights;
Double_t temp = 0.0;
Int_t i = 0;
while(temp <= norm*quantile){
temp += vec[1][i];
i++;
}
return vec[0][i];
}
// One gradient-boost step for classification: compute, per leaf node of the
// newly grown tree, the Newton-step response sum(target*w) / sum(|t|(1-|t|)w^2),
// scaled by the shrinkage (learning rate), then refresh the event targets.
// Returns a constant boost weight of 1 (trees are equally weighted in Grad).
Double_t TMVA::MethodBDT::GradBoost( vector<TMVA::Event*> eventSample, DecisionTree *dt )
{
// leaves[node][0] = weighted target sum, leaves[node][1] = weighted curvature.
std::map<TMVA::DecisionTreeNode*,vector<Double_t> > leaves;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
if ((leaves[node]).size()==0){
(leaves[node]).push_back((*e)->GetTarget(0) * (*e)->GetWeight());
(leaves[node]).push_back(fabs((*e)->GetTarget(0))*(1.0-fabs((*e)->GetTarget(0))) * (*e)->GetWeight() * (*e)->GetWeight());
}
else {
(leaves[node])[0]+=((*e)->GetTarget(0) * (*e)->GetWeight());
(leaves[node])[1]+=fabs((*e)->GetTarget(0))*(1.0-fabs((*e)->GetTarget(0))) *
((*e)->GetWeight()) * ((*e)->GetWeight());
}
}
for (std::map<TMVA::DecisionTreeNode*,vector<Double_t> >::iterator iLeave=leaves.begin();
iLeave!=leaves.end();++iLeave){
// Floor the denominator to avoid division by (near) zero.
if ((iLeave->second)[1]<1e-30) (iLeave->second)[1]=1e-30;
(iLeave->first)->SetResponse(fShrinkage*0.5*(iLeave->second)[0]/((iLeave->second)[1]));
}
UpdateTargets(eventSample);
// Draw a fresh bagging subsample for the next iteration if requested.
if (fBaggedGradBoost) GetRandomSubSample();
return 1;
}
// One gradient-boost step for regression (Huber loss): for each leaf,
// compute the weighted median of the residuals plus a mean correction of
// the clipped deviations from that median, scale by the shrinkage, and set
// it as the leaf response; then refresh the residual-based targets.
// Returns a constant boost weight of 1 (trees are equally weighted in Grad).
Double_t TMVA::MethodBDT::GradBoostRegression( vector<TMVA::Event*> eventSample, DecisionTree *dt )
{
// leaves[node][0] = residuals of events in that leaf, [1] = their weights.
std::map<TMVA::DecisionTreeNode*,vector< vector<Double_t> > > leaves;
UInt_t i =0;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
if(leaves[node].size()==0){
(leaves[node]).push_back(vector<Double_t>());
(leaves[node]).push_back(vector<Double_t>());
}
(leaves[node])[0].push_back(fRegResiduals[i]);
(leaves[node])[1].push_back((*e)->GetWeight());
i++;
}
for (std::map<TMVA::DecisionTreeNode*,vector<vector<Double_t> > >::iterator iLeave=leaves.begin();
iLeave!=leaves.end();++iLeave){
Double_t LeaveWeight = 0;
for(UInt_t j=0;j<((iLeave->second)[0].size());j++){
LeaveWeight+=((iLeave->second)[1][j]);
}
Double_t shift=0,diff= 0;
Double_t ResidualMedian = GetWeightedQuantile(iLeave->second,0.5,LeaveWeight);
for(UInt_t j=0;j<((iLeave->second)[0].size());j++){
// Mean of sign(diff)*min(transitionPoint,|diff|): Huber correction term.
diff = (iLeave->second)[0][j]-ResidualMedian;
shift+=1.0/((iLeave->second)[0].size())*((diff<0)?-1.0:1.0)*TMath::Min(fTransitionPoint,fabs(diff));
}
(iLeave->first)->SetResponse(fShrinkage*(ResidualMedian+shift));
}
UpdateTargetsRegression(eventSample);
return 1;
}
// Initialise the gradient-boosting state before the first tree is grown:
// for regression, start from the weighted median of the targets; for
// classification, start from the centred label (±0.5) with zero scores.
void TMVA::MethodBDT::InitGradBoost( vector<TMVA::Event*> eventSample)
{
// Gradient-boosted trees use the regression-variance split criterion.
fSepType=NULL;
if(DoRegression()){
vector< vector<Double_t> > weightedTargetValues;
vector<Double_t> targets;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
targets.push_back((*e)->GetTarget(0));
fInitialWeights.push_back((*e)->GetWeight());
fSumOfWeights+=(*e)->GetWeight();
}
weightedTargetValues.push_back(targets);
weightedTargetValues.push_back(fInitialWeights);
// The initial model is the weighted median of all target values.
Double_t weightedMedian = GetWeightedQuantile(weightedTargetValues,0.5);
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
fBoostWeights.push_back(weightedMedian);
}
UpdateTargetsRegression(eventSample,kTRUE);
}
else{
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
// Pseudo-residual for score F=0: label(1/0) minus p_sig(0)=0.5.
Double_t r = ((*e)->IsSignal()?1:0)-0.5;
(*e)->SetTarget(0,r);
fBoostWeights.push_back(0);
}
}
if (fBaggedGradBoost) GetRandomSubSample();
}
// Measure the quality of one tree as the weighted fraction of validation
// events it classifies correctly (purity above fNodePurityLimit = signal).
// Fix vs. original: an empty (or zero-weight) validation sample produced
// 0/0 = NaN; that degenerate case now returns 0.
Double_t TMVA::MethodBDT::TestTreeQuality( DecisionTree *dt )
{
   Double_t ncorrect=0, nfalse=0;
   for (UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
      Bool_t isSignalType= (dt->CheckEvent(*(fValidationSample[ievt])) > fNodePurityLimit ) ? 1 : 0;
      if (isSignalType == ((fValidationSample[ievt])->IsSignal()) ) {
         ncorrect += fValidationSample[ievt]->GetWeight();
      }
      else{
         nfalse += fValidationSample[ievt]->GetWeight();
      }
   }
   const Double_t total = ncorrect + nfalse;
   return (total > 0) ? ncorrect / total : 0.;
}
// Dispatch one boosting step to the algorithm selected via BoostType.
// Returns the boost weight of the tree (log-odds for AdaBoost variants,
// constant 1 for gradient boosting/bagging); aborts on an unknown type.
Double_t TMVA::MethodBDT::Boost( vector<TMVA::Event*> eventSample, DecisionTree *dt, Int_t iTree )
{
   if (fBoostType=="AdaBoost")   return this->AdaBoost  (eventSample, dt);
   if (fBoostType=="Bagging")    return this->Bagging   (eventSample, iTree);
   if (fBoostType=="RegBoost")   return this->RegBoost  (eventSample, dt);
   if (fBoostType=="AdaBoostR2") return this->AdaBoostR2(eventSample, dt);
   if (fBoostType=="Grad") {
      return DoRegression() ? this->GradBoostRegression(eventSample, dt)
                            : this->GradBoost          (eventSample, dt);
   }
   Log() << kINFO << GetOptions() << Endl;
   Log() << kFATAL << "<Boost> unknown boost option called" << Endl;
   return -1;
}
// One AdaBoost step: compute the weighted misclassification rate of the new
// tree (or, for regression, a deviation-based error per AdaBoostR2Loss),
// derive the boost weight ((1-err)/err)^beta, reweight misclassified events,
// renormalise the sample, and return log(boostWeight) as the tree weight.
Double_t TMVA::MethodBDT::AdaBoost( vector<TMVA::Event*> eventSample, DecisionTree *dt )
{
Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
Double_t maxDev=0;
// First pass: accumulate the (weighted) error of the tree on the sample.
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
sumw += w;
if ( DoRegression() ) {
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
sumwfalse += w * tmpDev;
sumwfalse2 += w * tmpDev*tmpDev;
if (tmpDev > maxDev) maxDev = tmpDev;
}else{
Bool_t isSignalType = (dt->CheckEvent(*(*e),fUseYesNoLeaf) > fNodePurityLimit );
if (!(isSignalType == (*e)->IsSignal())) {
sumwfalse+= w;
}
}
}
err = sumwfalse/sumw ;
// Regression: re-express the error per the chosen AdaBoost.R2 loss,
// normalised by the maximum deviation so err stays in [0,1].
if ( DoRegression() ) {
if (fAdaBoostR2Loss=="linear"){
err = sumwfalse/maxDev/sumw ;
}
else if (fAdaBoostR2Loss=="quadratic"){
err = sumwfalse2/maxDev/maxDev/sumw ;
}
else if (fAdaBoostR2Loss=="exponential"){
err = 0;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
}
}
else {
Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
<< " namely " << fAdaBoostR2Loss << "\n"
<< "and this is not implemented... a typo in the options ??" <<Endl;
}
}
Double_t newSumw=0;
Double_t boostWeight=1.;
// Clamp pathological error rates so the boost weight stays finite/positive.
if (err >= 0.5) {
Log() << kWARNING << " The error rate in the BDT boosting is > 0.5. ("<< err
<< ") That should not happen, please check your code (i.e... the BDT code), I "
<< " set it to 0.5.. just to continue.." << Endl;
err = 0.5;
} else if (err < 0) {
Log() << kWARNING << " The error rate in the BDT boosting is < 0. That can happen"
<< " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
<< " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
<< " for the time being I set it to its absolute value.. just to continue.." << Endl;
err = TMath::Abs(err);
}
// boostWeight = ((1-err)/err)^beta; beta=1 is classic AdaBoost.
if (fAdaBoostBeta == 1) {
boostWeight = (1.-err)/err;
}
else {
boostWeight = TMath::Power((1.0 - err)/err, fAdaBoostBeta);
}
Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);
// Second pass: up-weight misclassified events (all events for regression);
// events with negative weight are moved in the opposite direction.
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
if ((!( (dt->CheckEvent(*(*e),fUseYesNoLeaf) > fNodePurityLimit ) == (*e)->IsSignal())) || DoRegression()) {
Double_t boostfactor = boostWeight;
if (DoRegression()) boostfactor = TMath::Power(1/boostWeight,(1.-TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
if ( (*e)->GetWeight() > 0 ){
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
} else {
(*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
}
}
newSumw+=(*e)->GetWeight();
}
// Renormalise so the total sample weight is unchanged by the boosting.
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * sumw / newSumw );
}
if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);
results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),boostWeight);
results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
fBoostWeight = boostWeight;
fErrorFraction = err;
return TMath::Log(boostWeight);
}
// "Bagging" boosting step: give every event a Poisson-distributed boost
// weight (seeded by the tree index for reproducibility) and renormalise so
// the total weight equals the sample size. Returns a constant tree weight 1.
// Fixes vs. original: fUseNTrainEvents/GetNTrainingEvents() was evaluated
// with integer division (truncating the fraction to 0 or 1 — NOTE(review):
// verify both operands are integral in the headers), and the heap-allocated
// TRandom3 was never deleted.
Double_t TMVA::MethodBDT::Bagging( vector<TMVA::Event*> eventSample, Int_t iTree )
{
   Double_t newSumw=0;
   Double_t newWeight;
   TRandom3 trandom(iTree);
   Double_t eventFraction = static_cast<Double_t>(fUseNTrainEvents)/Data()->GetNTrainingEvents();
   for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
      newWeight = trandom.PoissonD(eventFraction);
      (*e)->SetBoostWeight(newWeight);
      newSumw+=(*e)->GetBoostWeight();
   }
   // Renormalise: total boost weight equals the number of events.
   for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * eventSample.size() / newSumw );
   }
   return 1.;
}
// Placeholder for a dedicated regression boosting algorithm: currently a
// no-op that leaves the event weights untouched and returns tree weight 1.
Double_t TMVA::MethodBDT::RegBoost( vector<TMVA::Event*> , DecisionTree* )
{
return 1;
}
// One AdaBoost.R2 step (Drucker's regression boosting): compute the loss-
// weighted error of the tree, derive boostWeight = err/(1-err), reweight
// each event by boostWeight^(1-|dev|/maxDev), renormalise, and return
// log(1/boostWeight) as the tree weight.
Double_t TMVA::MethodBDT::AdaBoostR2( vector<TMVA::Event*> eventSample, DecisionTree *dt )
{
if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;
Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
Double_t maxDev=0;
// First pass: weighted absolute/squared deviations and maximum deviation.
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
sumw += w;
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
sumwfalse += w * tmpDev;
sumwfalse2 += w * tmpDev*tmpDev;
if (tmpDev > maxDev) maxDev = tmpDev;
}
// Error per the configured loss, normalised by maxDev to stay in [0,1].
if (fAdaBoostR2Loss=="linear"){
err = sumwfalse/maxDev/sumw ;
}
else if (fAdaBoostR2Loss=="quadratic"){
err = sumwfalse2/maxDev/maxDev/sumw ;
}
else if (fAdaBoostR2Loss=="exponential"){
err = 0;
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
}
}
else {
Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
<< " namely " << fAdaBoostR2Loss << "\n"
<< "and this is not implemented... a typo in the options ??" <<Endl;
}
// err >= 0.5 is a hard stop condition in AdaBoost.R2 (kFATAL aborts here).
if (err >= 0.5) {
Log() << kFATAL << " The error rate in the BDT boosting is > 0.5. "
<< " i.e. " << err
<< " That should induce a stop condition of the boosting " << Endl;
} else if (err < 0) {
Log() << kWARNING << " The error rate in the BDT boosting is < 0. That can happen"
<< " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
<< " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
<< " for the time being I set it to its absolute value.. just to continue.." << Endl;
err = TMath::Abs(err);
}
// Note: boostWeight < 1 here; events with SMALL deviation get the SMALLER
// factor boostWeight^(1-dev/maxDev), i.e. are down-weighted.
Double_t boostWeight = err / (1.-err);
Double_t newSumw=0;
Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, Types::kMaxAnalysisType);
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t boostfactor = TMath::Power(boostWeight,(1.-TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
results->GetHist("BoostWeights")->Fill(boostfactor);
if ( (*e)->GetWeight() > 0 ){
Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
// Debug dump for the pathological underflow-to-zero case.
// NOTE(review): exact float equality — only catches true underflow to 0.
if (newWeight == 0) {
std::cout << "Weight= " << (*e)->GetWeight() << std::endl;
std::cout << "BoostWeight= " << (*e)->GetBoostWeight() << std::endl;
std::cout << "boostweight="<<boostWeight << " err= " <<err << std::endl;
std::cout << "NewBoostWeight= " << newBoostWeight << std::endl;
std::cout << "boostfactor= " << boostfactor << std::endl;
std::cout << "maxDev = " << maxDev << std::endl;
std::cout << "tmpDev = " << TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) ) << std::endl;
std::cout << "target = " << (*e)->GetTarget(0) << std::endl;
std::cout << "estimate = " << dt->CheckEvent(*(*e),kFALSE) << std::endl;
}
(*e)->SetBoostWeight( newBoostWeight );
} else {
(*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
}
newSumw+=(*e)->GetWeight();
}
// Renormalise so the total sample weight is unchanged.
for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * sumw / newSumw );
}
results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),1./boostWeight);
results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
fBoostWeight = boostWeight;
fErrorFraction = err;
return TMath::Log(1./boostWeight);
}
// Serialise the trained forest (trees + per-tree boost weights) as XML
// children of `parent` for the weight file.
// NOTE(review): fForest.back() assumes a non-empty forest — calling this
// before training (or after a failed read) would be undefined behaviour.
void TMVA::MethodBDT::AddWeightsXMLTo( void* parent ) const
{
void* wght = gTools().xmlengine().NewChild(parent, 0, "Weights");
gTools().AddAttr( wght, "NTrees", fForest.size() );
gTools().AddAttr( wght, "TreeType", fForest.back()->GetAnalysisType() );
for (UInt_t i=0; i< fForest.size(); i++) {
void* trxml = fForest[i]->AddXMLTo(wght);
gTools().AddAttr( trxml, "boostWeight", fBoostWeights[i] );
gTools().AddAttr( trxml, "itree", i );
}
}
// Rebuild the forest from an XML weight file: clear any existing trees,
// then create one DecisionTree per XML child and restore its boost weight.
void TMVA::MethodBDT::ReadWeightsFromXML(void* parent) {
UInt_t i;
// Drop any previously loaded forest before reading.
for (i=0; i<fForest.size(); i++) delete fForest[i];
fForest.clear();
fBoostWeights.clear();
UInt_t ntrees;
UInt_t analysisType;
Float_t boostWeight;
gTools().ReadAttr( parent, "NTrees", ntrees );
gTools().ReadAttr( parent, "TreeType", analysisType );
void* ch = gTools().xmlengine().GetChild(parent);
i=0;
while(ch) {
fForest.push_back( dynamic_cast<DecisionTree*>( BinaryTree::CreateFromXML(ch, GetTrainingTMVAVersionCode()) ) );
fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
fForest.back()->SetTreeID(i++);
gTools().ReadAttr(ch,"boostWeight",boostWeight);
fBoostWeights.push_back(boostWeight);
ch = gTools().xmlengine().GetNext(ch);
}
}
// Rebuild the forest from a legacy text-format weight stream: read the tree
// count, then for each tree its index and boost weight followed by the tree
// itself. Aborts with kFATAL on an index mismatch.
// Fix vs. original: on a mismatch of the very first tree, fForest is empty
// and fForest.back()->Print(cout) dereferenced an invalid element; the
// diagnostic print is now guarded.
void TMVA::MethodBDT::ReadWeightsFromStream( istream& istr )
{
   TString var, dummy;
   Int_t analysisType(0);
   istr >> dummy >> fNTrees;
   Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;
   // Drop any previously loaded forest before reading.
   for (UInt_t i=0;i<fForest.size();i++) delete fForest[i];
   fForest.clear();
   fBoostWeights.clear();
   Int_t iTree;
   Double_t boostWeight;
   for (int i=0;i<fNTrees;i++) {
      istr >> dummy >> iTree >> dummy >> boostWeight;
      if (iTree != i) {
         // Dump the previously read tree (if any) to help diagnose the file.
         if (!fForest.empty()) fForest.back()->Print( cout );
         Log() << kFATAL << "Error while reading weight file; mismatch iTree="
               << iTree << " i=" << i
               << " dummy " << dummy
               << " boostweight " << boostWeight
               << Endl;
      }
      fForest.push_back( new DecisionTree() );
      fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
      fForest.back()->SetTreeID(i);
      fForest.back()->Read(istr, GetTrainingTMVAVersionCode());
      fBoostWeights.push_back(boostWeight);
   }
}
// Convenience overload: evaluate with the full forest (useNTrees = 0).
Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err ){
return this->GetMvaValue( err, 0 );
}
// Classification response for the current event: the (boost-weighted or
// plain) average of the tree responses, or the gradient-boost mapping when
// BoostType=Grad. `useNTrees` > 0 restricts the evaluation to the first
// useNTrees trees. No MVA error estimate is provided (err set to -1).
// Fix vs. original: an empty forest (norm == 0) produced 0/0 = NaN via the
// obscure `return myMVA /= norm;`; that case now returns 0.
Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, UInt_t useNTrees )
{
   if (err != 0) *err = -1;
   UInt_t nTrees = fForest.size();
   if (useNTrees > 0 ) nTrees = useNTrees;
   if (fBoostType=="Grad") return GetGradBoostMVA(const_cast<TMVA::Event&>(*GetEvent()),nTrees);
   Double_t myMVA = 0;
   Double_t norm = 0;
   for (UInt_t itree=0; itree<nTrees; itree++) {
      if (fUseWeightedTrees) {
         myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(*GetEvent(),fUseYesNoLeaf);
         norm += fBoostWeights[itree];
      }
      else {
         myMVA += fForest[itree]->CheckEvent(*GetEvent(),fUseYesNoLeaf);
         norm += 1;
      }
   }
   return (norm > 0) ? myMVA / norm : 0.;
}
const std::vector<Float_t> & TMVA::MethodBDT::GetRegressionValues()
{
// Evaluate the forest for regression and return the (single-target)
// response, cached in fRegressionReturnVal.
if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
fRegressionReturnVal->clear();
Double_t myMVA = 0;
Double_t norm = 0;
if (fBoostType=="AdaBoostR2") {
// AdaBoostR2: the response is a truncated mean around the weighted
// median of the individual tree responses.
vector< Double_t > response(fForest.size());
vector< Double_t > weight(fForest.size());
Double_t totalSumOfWeights = 0;
// collect each tree's response together with its boost weight
for (UInt_t itree=0; itree<fForest.size(); itree++) {
response[itree] = fForest[itree]->CheckEvent(*GetEvent(),kFALSE);
weight[itree] = fBoostWeights[itree];
totalSumOfWeights += fBoostWeights[itree];
}
// sort responses ascending, carrying the weights along in parallel
vector< vector<Double_t> > vtemp;
vtemp.push_back( response );
vtemp.push_back( weight );
gTools().UsefulSortAscending( vtemp );
// walk up to the index t where the cumulative weight passes half the total
Int_t t=0;
Double_t sumOfWeights = 0;
while (sumOfWeights <= totalSumOfWeights/2.) {
sumOfWeights += vtemp[1][t];
t++;
}
// average the sorted responses in a window of roughly one third of the
// forest centred on the weighted median.
// NOTE(review): the UInt_t(t - fForest.size()/6 - 0.5) conversion can wrap
// for small t before TMath::Max is applied -- confirm intended behaviour.
Double_t rVal=0;
Int_t count=0;
for (UInt_t i= TMath::Max(UInt_t(0),UInt_t(t-(fForest.size()/6)-0.5));
i< TMath::Min(UInt_t(fForest.size()),UInt_t(t+(fForest.size()/6)+0.5)); i++) {
count++;
rVal+=vtemp[0][i];
}
fRegressionReturnVal->push_back( rVal/Double_t(count));
}
else if(fBoostType=="Grad"){
// gradient boost: tree responses are additive corrections on top of the
// initial offset stored in fBoostWeights[0]
for (UInt_t itree=0; itree<fForest.size(); itree++) {
myMVA += fForest[itree]->CheckEvent(*GetEvent(),kFALSE);
}
fRegressionReturnVal->push_back( myMVA+fBoostWeights[0]);
}
else{
// default: (boost-weight weighted) mean of the individual tree responses
for (UInt_t itree=0; itree<fForest.size(); itree++) {
if (fUseWeightedTrees) {
myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(*GetEvent(),kFALSE);
norm += fBoostWeights[itree];
}
else {
myMVA += fForest[itree]->CheckEvent(*GetEvent(),kFALSE);
norm += 1;
}
}
fRegressionReturnVal->push_back( myMVA/norm );
}
return *fRegressionReturnVal;
}
void TMVA::MethodBDT::WriteMonitoringHistosToFile( void ) const
{
Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
fMonitorNtuple->Write();
}
vector< Double_t > TMVA::MethodBDT::GetVariableImportance()
{
   // Compute the relative importance of each input variable, summed over
   // all trees in the forest and normalised to unity.
   // bug fix: resize() preserves existing contents, so a second call would
   // keep accumulating on top of the previous result -- assign() resets
   // every entry to zero first
   fVariableImportance.assign(GetNvar(), 0.0);
   Double_t sum=0;
   for (int itree = 0; itree < fNTrees; itree++) {
      vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
      for (UInt_t i=0; i< relativeImportance.size(); i++) {
         fVariableImportance[i] += relativeImportance[i];
      }
   }
   for (UInt_t i=0; i< fVariableImportance.size(); i++) sum += fVariableImportance[i];
   // guard against division by zero for an empty or untrained forest
   if (sum > 0) {
      for (UInt_t i=0; i< fVariableImportance.size(); i++) fVariableImportance[i] /= sum;
   }
   return fVariableImportance;
}
Double_t TMVA::MethodBDT::GetVariableImportance( UInt_t ivar )
{
   // Return the importance of a single input variable; an out-of-range
   // index is reported via a fatal log message.
   vector<Double_t> importance = this->GetVariableImportance();
   if (ivar >= (UInt_t)importance.size()) {
      Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
      return -1;
   }
   return importance[ivar];
}
const TMVA::Ranking* TMVA::MethodBDT::CreateRanking()
{
   // Build (and cache in fRanking) the variable-importance ranking.
   // bug fix: repeated calls reassigned fRanking without freeing the
   // previous object, leaking it -- delete any earlier ranking first
   if (fRanking != 0) delete fRanking;
   fRanking = new Ranking( GetName(), "Variable Importance" );
   vector< Double_t> importance(this->GetVariableImportance());
   for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
      fRanking->AddRank( Rank( GetInputLabel(ivar), importance[ivar] ) );
   }
   return fRanking;
}
void TMVA::MethodBDT::GetHelpMessage() const
{
// Print the standard TMVA help text for the BDT method (description,
// performance optimisation and tuning advice) to the logger.
// The strings below are user-facing output and must stay verbatim.
Log() << Endl;
Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
Log() << Endl;
Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
Log() << "trained using the original training data set with re-weighted " << Endl;
Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
Log() << "events that were misclassified in the previous tree a larger " << Endl;
Log() << "weight in the training of the following tree." << Endl;
Log() << Endl;
Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
Log() << "using a single descriminant variable at a time. A test event " << Endl;
Log() << "ending up after the sequence of left-right splits in a final " << Endl;
Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
Log() << "depending on the majority type of training events in that node." << Endl;
Log() << Endl;
Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
Log() << Endl;
Log() << "By the nature of the binary splits performed on the individual" << Endl;
Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
Log() << "between variables (they need to approximate the linear split in" << Endl;
Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
Log() << "variables individually). Hence decorrelation could be useful " << Endl;
Log() << "to optimise the BDT performance." << Endl;
Log() << Endl;
Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
Log() << Endl;
Log() << "The two most important parameters in the configuration are the " << Endl;
Log() << "minimal number of events requested by a leaf node (option " << Endl;
Log() << "\"nEventsMin\"). If this number is too large, detailed features " << Endl;
Log() << "in the parameter space cannot be modelled. If it is too small, " << Endl;
Log() << "the risk to overtrain rises." << Endl;
Log() << " (Imagine the decision tree is split until the leaf node contains" << Endl;
Log() << " only a single event. In such a case, no training event is " << Endl;
Log() << " misclassified, while the situation will look very different" << Endl;
Log() << " for the test sample.)" << Endl;
Log() << Endl;
Log() << "The default minimal number is currently set to " << Endl;
Log() << " max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
Log() << "and can be changed by the user." << Endl;
Log() << Endl;
Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
Log() << "that is used when determining after the training which splits " << Endl;
Log() << "are considered statistically insignificant and are removed. The" << Endl;
Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
Log() << "the comparison between efficiencies obtained on the training and" << Endl;
Log() << "the independent test sample. They should be equal within statistical" << Endl;
Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
}
void TMVA::MethodBDT::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
// Write the method-specific part of the standalone C++ response class:
// the forest data members, GetMvaValue__ (tree traversal plus the
// boost-type-specific combination of leaf outputs), Initialize()
// (instantiating every tree), and Clear(). All quoted strings are
// generated source code and must stay verbatim.
fout << " std::vector<BDT_DecisionTreeNode*> fForest; // i.e. root nodes of decision trees" << endl;
fout << " std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << endl;
fout << "};" << endl << endl;
fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << endl;
fout << "{" << endl;
fout << " double myMVA = 0;" << endl;
fout << " double norm = 0;" << endl;
fout << " for (unsigned int itree=0; itree<fForest.size(); itree++){" << endl;
fout << " BDT_DecisionTreeNode *current = fForest[itree];" << endl;
fout << " while (current->GetNodeType() == 0) { //intermediate node" << endl;
fout << " if (current->GoesRight(inputValues)) current=(BDT_DecisionTreeNode*)current->GetRight();" << endl;
fout << " else current=(BDT_DecisionTreeNode*)current->GetLeft();" << endl;
fout << " }" << endl;
// the per-leaf contribution depends on the boost type configured at training
if (fBoostType=="Grad"){
fout << " myMVA += current->GetResponse();" << endl;
}
else if (fUseWeightedTrees) {
if (fUseYesNoLeaf) fout << " myMVA += fBoostWeights[itree] * current->GetNodeType();" << endl;
else fout << " myMVA += fBoostWeights[itree] * current->GetPurity();" << endl;
fout << " norm += fBoostWeights[itree];" << endl;
}
else {
if (fUseYesNoLeaf) fout << " myMVA += current->GetNodeType();" << endl;
else fout << " myMVA += current->GetPurity();" << endl;
fout << " norm += 1.;" << endl;
}
fout << " }" << endl;
// gradient boost maps the summed response through 2/(1+exp(-2x))-1
if (fBoostType=="Grad"){
fout << " return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << endl;
}
else fout << " return myMVA /= norm;" << endl;
fout << "};" << endl << endl;
fout << "void " << className << "::Initialize()" << endl;
fout << "{" << endl;
// emit one push_back per tree; the tree itself is written recursively
for (int itree=0; itree<fNTrees; itree++) {
fout << " // itree = " << itree << endl;
fout << " fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << endl;
fout << " fForest.push_back( " << endl;
this->MakeClassInstantiateNode((DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
fout <<" );" << endl;
}
fout << " return;" << endl;
fout << "};" << endl;
fout << " " << endl;
fout << "// Clean up" << endl;
fout << "inline void " << className << "::Clear() " << endl;
fout << "{" << endl;
fout << " for (unsigned int itree=0; itree<fForest.size(); itree++) { " << endl;
fout << " delete fForest[itree]; " << endl;
fout << " }" << endl;
fout << "}" << endl;
}
void TMVA::MethodBDT::MakeClassSpecificHeader( std::ostream& fout, const TString& ) const
{
// Write the header part of the standalone response class: the
// BDT_DecisionTreeNode class definition (plus the NN shorthand macro
// used by MakeClassInstantiateNode) and its out-of-line methods.
// All quoted strings are generated source code and must stay verbatim.
fout << "#ifndef NN" << endl;
fout << "#define NN new BDT_DecisionTreeNode" << endl;
fout << "#endif" << endl;
fout << " " << endl;
fout << "#ifndef BDT_DecisionTreeNode__def" << endl;
fout << "#define BDT_DecisionTreeNode__def" << endl;
fout << " " << endl;
fout << "class BDT_DecisionTreeNode {" << endl;
fout << " " << endl;
fout << "public:" << endl;
fout << " " << endl;
fout << " // constructor of an essentially \"empty\" node floating in space" << endl;
fout << " BDT_DecisionTreeNode ( BDT_DecisionTreeNode* left," << endl;
fout << " BDT_DecisionTreeNode* right," << endl;
fout << " double cutValue, Bool_t cutType, int selector," << endl;
fout << " int nodeType, double purity, double response ) :" << endl;
fout << " fLeft ( left )," << endl;
fout << " fRight ( right )," << endl;
fout << " fCutValue( cutValue )," << endl;
fout << " fCutType ( cutType )," << endl;
fout << " fSelector( selector )," << endl;
fout << " fNodeType( nodeType )," << endl;
fout << " fPurity ( purity )," << endl;
fout << " fResponse( response ){}" << endl << endl;
fout << " virtual ~BDT_DecisionTreeNode();" << endl << endl;
fout << " // test event if it decends the tree at this node to the right" << endl;
fout << " virtual Bool_t GoesRight( const std::vector<double>& inputValues ) const;" << endl;
fout << " BDT_DecisionTreeNode* GetRight( void ) {return fRight; };" << endl << endl;
fout << " // test event if it decends the tree at this node to the left " << endl;
fout << " virtual Bool_t GoesLeft ( const std::vector<double>& inputValues ) const;" << endl;
fout << " BDT_DecisionTreeNode* GetLeft( void ) { return fLeft; }; " << endl << endl;
fout << " // return S/(S+B) (purity) at this node (from training)" << endl << endl;
fout << " double GetPurity( void ) const { return fPurity; } " << endl;
fout << " // return the node type" << endl;
fout << " int GetNodeType( void ) const { return fNodeType; }" << endl;
fout << " double GetResponse(void) const {return fResponse;}" << endl << endl;
fout << "private:" << endl << endl;
fout << " BDT_DecisionTreeNode* fLeft; // pointer to the left daughter node" << endl;
fout << " BDT_DecisionTreeNode* fRight; // pointer to the right daughter node" << endl;
fout << " double fCutValue; // cut value appplied on this node to discriminate bkg against sig" << endl;
fout << " Bool_t fCutType; // true: if event variable > cutValue ==> signal , false otherwise" << endl;
fout << " int fSelector; // index of variable used in node selection (decision tree) " << endl;
fout << " int fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << endl;
fout << " double fPurity; // Purity of node from training"<< endl;
fout << " double fResponse; // Regression response value of node" << endl;
fout << "}; " << endl;
fout << " " << endl;
fout << "//_______________________________________________________________________" << endl;
fout << "BDT_DecisionTreeNode::~BDT_DecisionTreeNode()" << endl;
fout << "{" << endl;
fout << " if (fLeft != NULL) delete fLeft;" << endl;
fout << " if (fRight != NULL) delete fRight;" << endl;
fout << "}; " << endl;
fout << " " << endl;
fout << "//_______________________________________________________________________" << endl;
fout << "Bool_t BDT_DecisionTreeNode::GoesRight( const std::vector<double>& inputValues ) const" << endl;
fout << "{" << endl;
fout << " // test event if it decends the tree at this node to the right" << endl;
fout << " Bool_t result = (inputValues[fSelector] > fCutValue );" << endl;
fout << " if (fCutType == true) return result; //the cuts are selecting Signal ;" << endl;
fout << " else return !result;" << endl;
fout << "}" << endl;
fout << " " << endl;
fout << "//_______________________________________________________________________" << endl;
fout << "Bool_t BDT_DecisionTreeNode::GoesLeft( const std::vector<double>& inputValues ) const" << endl;
fout << "{" << endl;
fout << " // test event if it decends the tree at this node to the left" << endl;
fout << " if (!this->GoesRight(inputValues)) return true;" << endl;
fout << " else return false;" << endl;
fout << "}" << endl;
fout << " " << endl;
fout << "#endif" << endl;
fout << " " << endl;
}
void TMVA::MethodBDT::MakeClassInstantiateNode( DecisionTreeNode *n, std::ostream& fout, const TString& className ) const
{
   // Recursively emit a nested "NN( left, right, cut... )" constructor
   // expression for the standalone-class representation of the subtree
   // rooted at n (the NN macro is defined in MakeClassSpecificHeader).
   if (n == NULL) {
      Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" <<Endl;
      return ;
   }
   fout << "NN("<<endl;

   // left daughter first, or a null-pointer literal for a leaf side
   DecisionTreeNode* leftDaughter = (DecisionTreeNode*)n->GetLeft();
   if (leftDaughter != NULL) this->MakeClassInstantiateNode( leftDaughter, fout, className);
   else fout << "0";
   fout << ", " <<endl;

   // then the right daughter
   DecisionTreeNode* rightDaughter = (DecisionTreeNode*)n->GetRight();
   if (rightDaughter != NULL) this->MakeClassInstantiateNode( rightDaughter, fout, className );
   else fout << "0";

   // finally this node's own payload
   fout << ", " << endl
        << setprecision(6)
        << n->GetCutValue() << ", "
        << n->GetCutType() << ", "
        << n->GetSelector() << ", "
        << n->GetNodeType() << ", "
        << n->GetPurity() << ","
        << n->GetResponse() << ") ";
}