#include <unordered_map>
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   DeclareOptionRef(fNTrees, "NTrees", "Number of trees in the forest");
   if (DoRegression()) {
      DeclareOptionRef(fMaxDepth=50,"MaxDepth","Max depth of the decision tree allowed");
   } else {
      DeclareOptionRef(fMaxDepth=3,"MaxDepth","Max depth of the decision tree allowed");
   }

   TString tmp="5%"; if (DoRegression()) tmp="0.2%";
   DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize", "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");

   DeclareOptionRef(fNCuts, "nCuts", "Number of grid points in variable range used in finding optimal cut in node splitting");

   DeclareOptionRef(fBoostType, "BoostType", "Boosting type for the trees in the forest (note: AdaCost is still experimental)");

   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("RealAdaBoost"));
   AddPreDefVal(TString("AdaCost"));
   AddPreDefVal(TString("Bagging"));
   AddPreDefVal(TString("AdaBoostR2"));

   if (DoRegression()) {
      fBoostType = "AdaBoostR2";
   } else {
      fBoostType = "AdaBoost";
   }
   DeclareOptionRef(fAdaBoostR2Loss="Quadratic", "AdaBoostR2Loss", "Type of Loss function in AdaBoostR2");
   AddPreDefVal(TString("Linear"));
   AddPreDefVal(TString("Quadratic"));
   AddPreDefVal(TString("Exponential"));

   DeclareOptionRef(fBaggedBoost=kFALSE, "UseBaggedBoost","Use only a random subsample of all events for growing the trees in each boost iteration.");
   DeclareOptionRef(fShrinkage = 1.0, "Shrinkage", "Learning rate for BoostType=Grad algorithm");
   DeclareOptionRef(fAdaBoostBeta=.5, "AdaBoostBeta", "Learning rate for AdaBoost algorithm");
   DeclareOptionRef(fRandomisedTrees,"UseRandomisedTrees","Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
   DeclareOptionRef(fUseNvars,"UseNvars","Size of the subset of variables used with RandomisedTree option");
   DeclareOptionRef(fUsePoissonNvars,"UsePoissonNvars", "Interpret \"UseNvars\" not as fixed number but as mean of a Poisson distribution in each split with RandomisedTree option");
   DeclareOptionRef(fBaggedSampleFraction=.6,"BaggedSampleFraction","Relative size of bagged event sample to original size of the data sample (used whenever bagging is used, i.e. UseBaggedBoost or Bagging)");

   DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
                    "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");
   if (DoRegression()) {
      fUseYesNoLeaf = kFALSE;
   }

   DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights","NegWeightTreatment","How to treat events with negative weights in the BDT training (particularly the boosting): IgnoreInTraining; Boost with inverse boostweight; Pair events with negative and positive weights in training sample and *annihilate* them (experimental!)");
   AddPreDefVal(TString("InverseBoostNegWeights"));
   AddPreDefVal(TString("IgnoreNegWeightsInTraining"));
   AddPreDefVal(TString("NoNegWeightsInTraining"));
   AddPreDefVal(TString("PairNegWeightsGlobal"));

   DeclareOptionRef(fCss=1.,   "Css",    "AdaCost: cost of true signal selected signal");
   DeclareOptionRef(fCts_sb=1.,"Cts_sb", "AdaCost: cost of true signal selected bkg");
   DeclareOptionRef(fCtb_ss=1.,"Ctb_ss", "AdaCost: cost of true bkg selected signal");
   DeclareOptionRef(fCbb=1.,   "Cbb",    "AdaCost: cost of true bkg selected bkg");

   DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit", "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");
   DeclareOptionRef(fSepTypeS, "SeparationType", "Separation criterion for node splitting");
   AddPreDefVal(TString("CrossEntropy"));
   AddPreDefVal(TString("GiniIndex"));
   AddPreDefVal(TString("GiniIndexWithLaplace"));
   AddPreDefVal(TString("MisClassificationError"));
   AddPreDefVal(TString("SDivSqrtSPlusB"));
   AddPreDefVal(TString("RegressionVariance"));
   if (DoRegression()) {
      fSepTypeS = "RegressionVariance";
   } else {
      fSepTypeS = "GiniIndex";
   }

   DeclareOptionRef(fRegressionLossFunctionBDTGS = "Huber", "RegressionLossFunctionBDTG", "Loss function for BDTG regression.");
   AddPreDefVal(TString("Huber"));
   AddPreDefVal(TString("AbsoluteDeviation"));
   AddPreDefVal(TString("LeastSquares"));

   DeclareOptionRef(fHuberQuantile = 0.7, "HuberQuantile", "In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");
   DeclareOptionRef(fDoBoostMonitor=kFALSE,"DoBoostMonitor","Create control plot with ROC integral vs tree number");

   DeclareOptionRef(fUseFisherCuts=kFALSE, "UseFisherCuts", "Use multivariate splits using the Fisher criterion");
   DeclareOptionRef(fMinLinCorrForFisher=.8,"MinLinCorrForFisher", "The minimum linear correlation between two variables demanded for use in Fisher criterion in node splitting");
   DeclareOptionRef(fUseExclusiveVars=kFALSE,"UseExclusiveVars","Variables already used in the Fisher criterion are not analysed individually again for node splitting");

   DeclareOptionRef(fDoPreselection=kFALSE,"DoPreselection","Apply automatic pre-selection for 100% efficient signal (bkg) cuts prior to training");

   DeclareOptionRef(fSigToBkgFraction=1,"SigToBkgFraction","Sig to Bkg ratio used in Training (similar to NodePurityLimit, which cannot be used in real AdaBoost)");
   DeclareOptionRef(fPruneMethodS, "PruneMethod", "Note: for BDTs use small trees (e.g. MaxDepth=3) and NoPruning.  Pruning: method used for pruning (removal) of statistically insignificant branches");
   AddPreDefVal(TString("NoPruning"));
   AddPreDefVal(TString("ExpectedError"));
   AddPreDefVal(TString("CostComplexity"));

   DeclareOptionRef(fPruneStrength, "PruneStrength", "Pruning strength");

   DeclareOptionRef(fFValidationEvents=0.5, "PruningValFraction", "Fraction of events to use for optimizing automatic pruning.");

   DeclareOptionRef(fSkipNormalization=kFALSE, "SkipNormalization", "Skip normalization at initialization, to keep expectation value of BDT output according to the fraction of events");

   DeclareOptionRef(fMinNodeEvents=0, "nEventsMin", "deprecated: Use MinNodeSize (in % of training events) instead");

   DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad","deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");
   DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction","deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE.");
   DeclareOptionRef(fUseNTrainEvents,"UseNTrainEvents","deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");
   DeclareOptionRef(fNNodesMax,"NNodesMax","deprecated: Use MaxDepth instead to limit the tree size");
   DeclareOptionRef(fHistoricBool=kTRUE, "UseWeightedTrees",
                    "Use weighted trees or simple average in classification from the forest");
   DeclareOptionRef(fHistoricBool=kFALSE, "PruneBeforeBoost", "Flag to prune the tree before applying boosting algorithm");
   DeclareOptionRef(fHistoricBool=kFALSE, "RenormByClass", "Individually re-normalize each event class to the original size after boosting");

   AddPreDefVal(TString("NegWeightTreatment"),TString("IgnoreNegWeights"));
   else if (fSepTypeS == "giniindex")          fSepType = new GiniIndex();
   else if (fSepTypeS == "crossentropy")       fSepType = new CrossEntropy();
   else if (fSepTypeS == "sdivsqrtsplusb")     fSepType = new SdivSqrtSplusB();
   else if (fSepTypeS == "regressionvariance") fSepType = NULL;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Separation Index option " << fSepTypeS << " called" << Endl;
   }

   if (!(fHuberQuantile >= 0.0 && fHuberQuantile <= 1.0)) {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, " << fHuberQuantile
            << ", does not satisfy this criterion" << Endl;
   }

   fRegressionLossFunctionBDTGS.ToLower();
   if (fRegressionLossFunctionBDTGS == "huber") fRegressionLossFunctionBDTG = new HuberLossFunctionBDT(fHuberQuantile);
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Regression Loss Function BDT option " << fRegressionLossFunctionBDTGS << " called" << Endl;
   }

   fPruneMethodS.ToLower();

      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown PruneMethod " << fPruneMethodS << " option called" << Endl;

         << "Sorry automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
   if (fMinNodeEvents > 0) {
      fMinNodeSize = Double_t(fMinNodeEvents*100.) / Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have explicitly set ** nEventsMin = " << fMinNodeEvents << " ** the min absolute number \n"
            << "of events in a leaf node. This is DEPRECATED, please use the option \n"
            << "*MinNodeSize* giving the relative number as percentage of training \n"
            << "events instead. \n"
            << "nEventsMin=" << fMinNodeEvents << "--> MinNodeSize=" << fMinNodeSize << "%" << Endl;
      Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the option recommended \n"
            << " *MinNodeSize* = " << fMinNodeSizeS << " option !!" << Endl;
      fMinNodeSizeS = Form("%3.2f", fMinNodeSize);
   } else {
      SetMinNodeSize(fMinNodeSizeS);
   }
   fAdaBoostR2Loss.ToLower();

   if (fBoostType=="Grad") {
      if (fNegWeightTreatment=="InverseBoostNegWeights"){
         Log() << kINFO << "the option NegWeightTreatment=InverseBoostNegWeights does"
               << " not exist for BoostType=Grad" << Endl;
         Log() << kINFO << "--> change to new default NegWeightTreatment=Pray" << Endl;
         Log() << kDEBUG << "i.e. simply keep them as is, which should work fine for Grad Boost" << Endl;
         fNegWeightTreatment="Pray";
         fNoNegWeightsInTraining=kFALSE;
      }
   }
   else if (fBoostType=="RealAdaBoost"){
      fBoostType = "AdaBoost";
   }
   else if (fBoostType=="AdaCost"){
   }

   if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
   if (fAutomatic && fFValidationEvents > 0.5) {
      Log() << kWARNING << "You have chosen to use more than half of your training sample "
            << "to optimize the automatic pruning algorithm. This is probably wasteful "
            << "and your overall results will be degraded. Are you sure you want this?" << Endl;
   }

   if (this->Data()->HasNegativeEventWeights()){
      Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
            << "That should in principle be fine as long as on average you end up with "
            << "something positive. For this you have to make sure that the minimal number "
            << "of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize="
            << fMinNodeSizeS << "  (" << fMinNodeSize << "%)"
            << ", (or the deprecated equivalent nEventsMin) you can set this via the "
            << "BDT option string when booking the "
            << "classifier) is large enough to allow for reasonable averaging!!! "
            << " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
            << "which ignores events with negative weight in the training. " << Endl
            << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
   }
   if (DoRegression()) {
      if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
         Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
      }
      if (fSepType != NULL){
         Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;
      }

      Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;

      Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
      Log() << kWARNING << "is not implemented for regression analysis ! " << Endl;
      Log() << kWARNING << "--> I switch to the default nCuts = 20 and use standard node splitting" << Endl;
   }

   if (fRandomisedTrees){
      Log() << kINFO << " Randomised trees use no pruning" << Endl;
   }

   if (fUseFisherCuts) {
      Log() << kWARNING << "When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
      Log() << " a more elaborate node splitting algorithm) is not implemented. " << Endl;
   }

   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash" << Endl;
   fNegWeightTreatment.ToLower();
   if      (fNegWeightTreatment == "ignorenegweightsintraining") fNoNegWeightsInTraining = kTRUE;
   else if (fNegWeightTreatment == "nonegweightsintraining")     fNoNegWeightsInTraining = kTRUE;
   else if (fNegWeightTreatment == "inverseboostnegweights")     fInverseBoostNegWeights = kTRUE;
   else if (fNegWeightTreatment == "pairnegweightsglobal")       fPairNegWeightsGlobal   = kTRUE;
   else if (fNegWeightTreatment == "pray")  Log() << kDEBUG << "Yes, good luck with praying " << Endl;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown option for treating negative event weights during training " << fNegWeightTreatment << " requested" << Endl;
   }

   if (fNegWeightTreatment == "pairnegweightsglobal")
      Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " << Endl;
   while (tmp < fNNodesMax){

   Log() << kWARNING << "You have specified a deprecated option *NNodesMax=" << fNNodesMax
         << "* \n this has been translated to MaxDepth=" << fMaxDepth << Endl;

   if (fUseNTrainEvents>0){
      fBaggedSampleFraction = (Double_t) fUseNTrainEvents/Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have specified a deprecated option *UseNTrainEvents=" << fUseNTrainEvents
            << "* \n this has been translated to BaggedSampleFraction=" << fBaggedSampleFraction << "(%)" << Endl;
   }

   if (fBoostType=="Bagging") fBaggedBoost = kTRUE;
   if (fBaggedGradBoost){
      fBaggedBoost = kTRUE;
      Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;
   }
   if (sizeInPercent > 0 && sizeInPercent < 50){
      fMinNodeSize = sizeInPercent;
   } else {
      Log() << kFATAL << "you have demanded a minimal node size of "
            << sizeInPercent << "% of the training events.. \n"
            << " that somehow does not make sense " << Endl;
   }

   if (sizeInPercent.IsFloat()) SetMinNodeSize(sizeInPercent.Atof());
   else {
      Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
            << "after removing a possible % sign now reads " << sizeInPercent << Endl;
   }
   fBoostType      = "AdaBoost";
   if(DataInfo().GetNClasses()!=0)

   fBoostType      = "AdaBoostR2";
   fAdaBoostR2Loss = "Quadratic";
   if(DataInfo().GetNClasses()!=0)

   fPruneMethodS      = "NoPruning";
   fFValidationEvents = 0.5;
   fRandomisedTrees   = kFALSE;
   fUsePoissonNvars   = kTRUE;

   SetSignalReferenceCut( 0 );
   for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];
   fBoostWeights.clear();
   if (fMonitorNtuple) { fMonitorNtuple->Delete(); fMonitorNtuple=NULL; }
   fVariableImportance.clear();
   fLossFunctionEventInfo.clear();

   Log() << kDEBUG << " successfully(?) reset the method " << Endl;
   for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];

   if (!HasTrainingTree()) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
   if (fEventSample.size() > 0) {
      for (UInt_t iev=0; iev<fEventSample.size(); iev++) fEventSample[iev]->SetBoostWeight(1.);
   } else {
      UInt_t nevents = Data()->GetNTrainingEvents();

      std::vector<const TMVA::Event*> tmpEventSample;
      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         Event* event = new Event( *GetTrainingEvent(ievt) );
         tmpEventSample.push_back(event);
      }

      if (!DoRegression()) DeterminePreselectionCuts(tmpEventSample);
      else fDoPreselection = kFALSE;

      for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];

      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         Event* event = new Event( *GetTrainingEvent(ievt) );
         if (fDoPreselection){
            if (TMath::Abs(ApplyPreselectionCuts(event)) > 0.05) {

         if (event->GetWeight() < 0 && (IgnoreEventsWithNegWeightsInTraining() || fNoNegWeightsInTraining)){
            if (firstNegWeight) {
               Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
            }
         }
         else if (event->GetWeight()==0){
            if (firstZeroWeight) {
               Log() << "Events with weight == 0 are going to be simply ignored " << Endl;
            }
         }

         if (event->GetWeight() < 0) {
            fTrainWithNegWeights=kTRUE;
            if (fPairNegWeightsGlobal){
               Log() << kWARNING << "Events with negative event weights are found and "
                     << " will be removed prior to the actual BDT training by global "
                     << " pairing (and subsequent annihilation) with positive weight events" << Endl;
            } else {
               Log() << kWARNING << "Events with negative event weights are USED during "
                     << "the BDT training. This might cause problems with small node sizes "
                     << "or with the boosting. Please remove negative events from training "
                     << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
                     << "observe problems with the boosting" << Endl;
            }
         }

         if (fAutomatic) {
            Double_t modulo = 1.0/(fFValidationEvents);
            Int_t   imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
            if (ievt % imodulo == 0) fValidationSample.push_back( event );
            else                     fEventSample.push_back( event );
         } else {
            fEventSample.push_back(event);
         }
      }
   }

   Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
         << " for Training  and " << fValidationSample.size()
         << " for Pruning Validation (" << ((Float_t)fValidationSample.size())/((Float_t)fEventSample.size()+fValidationSample.size())*100.0
         << "% of training used for validation)" << Endl;
   if (fPairNegWeightsGlobal) PreProcessNegativeEventWeights();

   if (DoRegression()) {
   }
   else if (DoMulticlass()) {
   }
   else if (!fSkipNormalization) {
      Log() << kDEBUG << "\t<InitEventSample> For classification trees, " << Endl;
      Log() << kDEBUG << " \tthe effective number of backgrounds is scaled to match " << Endl;
      Log() << kDEBUG << " \tthe signal. Otherwise the first boosting step would do 'just that'!" << Endl;

      Double_t nevents = fEventSample.size();
      Int_t sumSig=0, sumBkg=0;
      for (UInt_t ievt=0; ievt<fEventSample.size(); ievt++) {
         if ((DataInfo().IsSignal(fEventSample[ievt])) ) {
            sumSigW += fEventSample[ievt]->GetWeight();
         } else {
            sumBkgW += fEventSample[ievt]->GetWeight();
         }
      }

      if (sumSigW && sumBkgW){
         Double_t normSig = nevents/((1+fSigToBkgFraction)*sumSigW)*fSigToBkgFraction;
         Double_t normBkg = nevents/((1+fSigToBkgFraction)*sumBkgW);
         Log() << kDEBUG << "\tre-normalise events such that Sig and Bkg have respective sum of weights = "
               << fSigToBkgFraction << Endl;
         Log() << kDEBUG << "  \tsig->sig*" << normSig << "ev. bkg->bkg*" << normBkg << "ev." << Endl;
         Log() << kHEADER << "#events: (reweighted) sig: " << sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
         Log() << kINFO << "#events: (unweighted) sig: " << sumSig << " bkg: " << sumBkg << Endl;
         for (Long64_t ievt=0; ievt<nevents; ievt++) {
            if ((DataInfo().IsSignal(fEventSample[ievt])) ) fEventSample[ievt]->SetBoostWeight(normSig);
            else                                            fEventSample[ievt]->SetBoostWeight(normBkg);
         }
      } else {
         Log() << kINFO << "--> could not determine scaling factors as either there are " << Endl;
         Log() << kINFO << " no signal events (sumSigW=" << sumSigW << ") or no bkg ev. (sumBkgW=" << sumBkgW << ")" << Endl;
      }
   }
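      // Note on the renormalisation above: with N = number of training events,
      //    normSig = N * fSigToBkgFraction / ((1+fSigToBkgFraction) * sumSigW)
      //    normBkg = N / ((1+fSigToBkgFraction) * sumBkgW)
      // keeps the total sum of weights at N while forcing the ratio of the signal to the
      // background weight sum to fSigToBkgFraction (1 by default), so that the first boosting
      // step does not have to spend itself on balancing the two classes.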
   fTrainSample = &fEventSample;

   GetBaggedSubSample(fEventSample);
   fTrainSample = &fSubSample;
   std::vector<const Event*> negEvents;
   for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetWeight() < 0) {
         totalNegWeights += fEventSample[iev]->GetWeight();
         negEvents.push_back(fEventSample[iev]);
      } else {
         totalPosWeights += fEventSample[iev]->GetWeight();
      }
      totalWeights += fEventSample[iev]->GetWeight();
   }

   if (totalNegWeights == 0 ) {
      Log() << kINFO << "no negative event weights found .. no preprocessing necessary" << Endl;
   }
   Log() << kINFO << "found a total of " << totalNegWeights << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
   Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
   Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;

   for (Int_t i=0; i<2; i++){
      invCov = ((*cov)[i]);

      std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
                << " did you use the variables that are linear combinations or highly correlated?"
                << std::endl;

      std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
                << " did you use the variables that are linear combinations?"
                << std::endl;
   }

   Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of " << fEventSample.size() << " training events " << Endl;
   Timer timer(negEvents.size(),"Negative Event paired");
   for (UInt_t nev = 0; nev < negEvents.size(); nev++){
      Double_t weight   = negEvents[nev]->GetWeight();
      UInt_t   iClassID = negEvents[nev]->GetClass();
      invCov = ((*cov)[iClassID]);

      for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
         if (iClassID==fEventSample[iev]->GetClass() && fEventSample[iev]->GetWeight() > 0){
            for (UInt_t ivar=0; ivar < GetNvar(); ivar++){
               for (UInt_t jvar=0; jvar<GetNvar(); jvar++){
                  dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
                          (*invCov)[ivar][jvar]*
                          (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));
               }
            }
            if (dist < minDist) { iMin=iev; minDist=dist;}
         }
      }

      Double_t newWeight = (negEvents[nev]->GetWeight() + fEventSample[iMin]->GetWeight());
      negEvents[nev]->SetBoostWeight( 0 );
      fEventSample[iMin]->SetBoostWeight( newWeight/fEventSample[iMin]->GetOriginalWeight() );

      negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
      fEventSample[iMin]->SetBoostWeight( 0 );
      }
      else Log() << kFATAL << "preprocessing didn't find event to pair with the negative weight ... probably a bug" << Endl;
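      // Pairing strategy used above: for every negative-weight event the closest
      // positive-weight event of the same class is searched, with "closest" defined by the
      // Mahalanobis-like distance dist = (x_neg - x_pos)^T * invCov * (x_neg - x_pos), where
      // invCov is the inverted covariance matrix of that class. The two weights are then summed
      // and assigned to one partner of the pair while the other is set to zero, effectively
      // annihilating the negative weight.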
      weight = negEvents[nev]->GetWeight();
   }

   totalNegWeights = 0;
   totalPosWeights = 0;

   std::vector<const Event*> newEventSample;

   for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetWeight() < 0) {
         totalNegWeights += fEventSample[iev]->GetWeight();
         totalWeights    += fEventSample[iev]->GetWeight();
      } else {
         totalPosWeights += fEventSample[iev]->GetWeight();
         totalWeights    += fEventSample[iev]->GetWeight();
      }
      if (fEventSample[iev]->GetWeight() > 0) {
         newEventSample.push_back(new Event(*fEventSample[iev]));
         if (fEventSample[iev]->GetClass() == fSignalClass){
            sigWeight += fEventSample[iev]->GetWeight();
         } else {
            bkgWeight += fEventSample[iev]->GetWeight();
         }
      }
   }
   if (totalNegWeights < 0) Log() << kFATAL << " compensation of negative event weights with positive ones did not work " << totalNegWeights << Endl;

   for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
   fEventSample = newEventSample;

   Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size() << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
   Log() << kINFO << " nSig=" << nSig << " sigWeight=" << sigWeight << " nBkg=" << nBkg << " bkgWeight=" << bkgWeight << Endl;
   std::map<TString,TMVA::Interval*> tuneParameters;
   std::map<TString,Double_t> tunedParameters;

   tuneParameters.insert(std::pair<TString,Interval*>("NTrees",      new Interval(10,1000,5)));
   tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth",    new Interval(2,4,3)));
   tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize", new LogInterval(1,30,30)));

   if      (fBoostType=="AdaBoost"){
      tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));
   }
   else if (fBoostType=="Grad"){
      tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage", new Interval(0.05,0.50,5)));
   }
   else if (fBoostType=="Bagging" && fRandomisedTrees){
      tuneParameters.insert(std::pair<TString,Interval*>("UseNvars", new Interval(min_var,max_var,4)));
   }

   Log()<<kINFO << " the following BDT parameters will be tuned on the respective *grid*\n" << Endl;
   std::map<TString,TMVA::Interval*>::iterator it;
   for(it=tuneParameters.begin(); it!= tuneParameters.end(); ++it){
      Log() << kWARNING << it->first << Endl;
      std::ostringstream oss;
      (it->second)->Print(oss);
   }

   tunedParameters=optimize.optimize();

   return tunedParameters;

   std::map<TString,Double_t>::iterator it;
   for(it=tuneParameters.begin(); it!= tuneParameters.end(); ++it){
      Log() << kWARNING << it->first << " = " << it->second << Endl;
      if      (it->first ==  "MaxDepth"       ) SetMaxDepth        ((Int_t)it->second);
      else if (it->first ==  "MinNodeSize"    ) SetMinNodeSize     (it->second);
      else if (it->first ==  "NTrees"         ) SetNTrees          ((Int_t)it->second);
      else if (it->first ==  "NodePurityLimit") SetNodePurityLimit (it->second);
      else if (it->first ==  "AdaBoostBeta"   ) SetAdaBoostBeta    (it->second);
      else if (it->first ==  "Shrinkage"      ) SetShrinkage       (it->second);
      else if (it->first ==  "UseNvars"       ) SetUseNvars        ((Int_t)it->second);
      else if (it->first ==  "BaggedSampleFraction" ) SetBaggedSampleFraction (it->second);
      else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " << Endl;
   }
   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash" << Endl;

   if (fInteractive && fInteractive->NotInitialized()){
      std::vector<TString> titles = {"Boost weight", "Error Fraction"};
      fInteractive->Init(titles);
   }
   fIPyMaxIter = fNTrees;
   fExitFromTraining = false;

   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\"" << Endl;

   Log() << kINFO << "Regression Loss Function: " << fRegressionLossFunctionBDTG->Name() << Endl;

   Log() << kINFO << "Training " << fNTrees << " Decision Trees ... patience please" << Endl;

   Log() << kDEBUG << "Training with maximal depth = " << fMaxDepth
         << ", MinNodeEvents=" << fMinNodeEvents
         << ", NTrees=" << fNTrees
         << ", NodePurityLimit=" << fNodePurityLimit
         << ", AdaBoostBeta=" << fAdaBoostBeta << Endl;
   TString hname = "AdaBoost weight distribution";

   if (DoRegression()) {
      hname="Boost event weights distribution";
   }

   TH1* h = new TH1F(Form("%s_BoostWeight",DataInfo().GetName()),hname,nBins,xMin,xMax);
   TH1* nodesBeforePruningVsTree = new TH1I(Form("%s_NodesBeforePruning",DataInfo().GetName()),"nodes before pruning",fNTrees,0,fNTrees);
   TH1* nodesAfterPruningVsTree  = new TH1I(Form("%s_NodesAfterPruning",DataInfo().GetName()),"nodes after pruning",fNTrees,0,fNTrees);

   if(!DoMulticlass()){
      h->SetXTitle("boost weight");
      results->Store(h, "BoostWeights");

      if (fDoBoostMonitor){
         TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
         boostMonitor->SetYTitle("ROC Integral");
         results->Store(boostMonitor, "BoostMonitor");
         boostMonitorGraph->SetName("BoostMonitorGraph");
         boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
         results->Store(boostMonitorGraph, "BoostMonitorGraph");
      }

      h = new TH1F("BoostWeightVsTree","Boost weights vs tree",fNTrees,0,fNTrees);
      h->SetXTitle("#tree");
      h->SetYTitle("boost weight");
      results->Store(h, "BoostWeightsVsTree");

      h = new TH1F("ErrFractHist","error fraction vs tree number",fNTrees,0,fNTrees);
      h->SetXTitle("#tree");
      h->SetYTitle("error fraction");
      results->Store(h, "ErrorFrac");

      nodesBeforePruningVsTree->SetXTitle("#tree");
      nodesBeforePruningVsTree->SetYTitle("#tree nodes");
      results->Store(nodesBeforePruningVsTree);

      nodesAfterPruningVsTree->SetXTitle("#tree");
      nodesAfterPruningVsTree->SetYTitle("#tree nodes");
      results->Store(nodesAfterPruningVsTree);
   }

   fMonitorNtuple = new TTree("MonitorNtuple","BDT variables");
   fMonitorNtuple->Branch("iTree",&fITree,"iTree/I");
   fMonitorNtuple->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
   fMonitorNtuple->Branch("errorFraction",&fErrorFraction,"errorFraction/D");

   Timer timer( fNTrees, GetName() );
   Int_t nNodesBeforePruningCount = 0;
   Int_t nNodesAfterPruningCount = 0;

   Int_t nNodesBeforePruning = 0;
   Int_t nNodesAfterPruning = 0;

   if(fBoostType=="Grad"){
      InitGradBoost(fEventSample);
   }
   while (itree < fNTrees && continueBoost){
      if (fExitFromTraining) break;
      fIPyCurrentIter = itree;

      if (fBoostType!="Grad"){
         Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
               << "Please change boost option accordingly (BoostType=Grad)." << Endl;
      }

      UInt_t nClasses = DataInfo().GetNClasses();
      for (UInt_t i=0;i<nClasses;i++){
         fForest.push_back( new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), i,
                                              fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
                                              itree*nClasses+i, fNodePurityLimit, itree*nClasses+1));
         fForest.back()->SetNVars(GetNvar());
         if (fUseFisherCuts) {
            fForest.back()->SetUseFisherCuts();
            fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
            fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
         }

         nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
         Double_t bw = this->Boost(*fTrainSample, fForest.back(),i);
         fBoostWeights.push_back(bw);

         fBoostWeights.push_back(0);
         Log() << kWARNING << "stopped boosting at itree=" << itree << Endl;
      }

                                           fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
                                           itree, fNodePurityLimit, itree);

      fForest.push_back(dt);
      fForest.back()->SetNVars(GetNvar());
      if (fUseFisherCuts) {
         fForest.back()->SetUseFisherCuts();
         fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
         fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
      }

      nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);

      if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad") {
         nNodesBeforePruning = fForest.back()->CleanTree();
      }

      nNodesBeforePruningCount += nNodesBeforePruning;
      nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);

      fForest.back()->SetPruneMethod(fPruneMethod);
      fForest.back()->SetPruneStrength(fPruneStrength);

      std::vector<const Event*> * validationSample = NULL;
      if(fAutomatic) validationSample = &fValidationSample;
      Double_t bw = this->Boost(*fTrainSample, fForest.back());
      fBoostWeights.push_back(bw);

      fBoostWeights.push_back(0);
      Log() << kWARNING << "stopped boosting at itree=" << itree << Endl;

      if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad"){
         fForest.back()->CleanTree();
      }
      nNodesAfterPruning = fForest.back()->GetNNodes();
      nNodesAfterPruningCount += nNodesAfterPruning;
      nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);

      fInteractive->AddPoint(itree, fBoostWeight, fErrorFraction);

      fMonitorNtuple->Fill();
      if (fDoBoostMonitor){
         if (! DoRegression() ){
            if (  itree==fNTrees-1 ||  (!(itree%500)) ||
                  (!(itree%250) && itree <1000)||
                  (!(itree%100) && itree < 500)||
                  (!(itree%50)  && itree < 250)||
                  (!(itree%25)  && itree < 150)||
                  (!(itree%10)  && itree <  50)||
                  (!(itree%5)   && itree <  20)
                  ) BoostMonitor(itree);
         }
      }
   }

   Log() << kDEBUG << "\t<Train> average number of nodes (w/o pruning) : "
         << nNodesBeforePruningCount/GetNTrees() << Endl;

   Log() << kDEBUG << "\t<Train> average number of nodes before/after pruning : "
         << nNodesBeforePruningCount/GetNTrees() << " / "
         << nNodesAfterPruningCount/GetNTrees() << Endl;

   Log() << kDEBUG << "Now I delete the private data sample" << Endl;
   for (UInt_t i=0; i<fEventSample.size();      i++) delete fEventSample[i];
   for (UInt_t i=0; i<fValidationSample.size(); i++) delete fValidationSample[i];
   fEventSample.clear();
   fValidationSample.clear();

   if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
   for (UInt_t itree=0; itree<nTrees; itree++) {
      sum += fForest[itree]->CheckEvent(ev,kFALSE);
   }
   return 2.0/(1.0+exp(-2.0*sum))-1;
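   // The (unweighted) sum of the tree responses is the additive model F(x) of the gradient
   // boost; 2/(1+exp(-2*F)) - 1 = tanh(F) maps it onto (-1,1), so the BDTG output covers the
   // same range as the other boosting methods.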
   if (DoMulticlass()) {
      UInt_t nClasses = DataInfo().GetNClasses();
      Bool_t isLastClass = (cls == nClasses - 1);

      std::map<const TMVA::Event *, std::vector<double>> &residuals = this->fResiduals;

      auto update_residuals = [&residuals, &lastTree, cls](const TMVA::Event *e) {

      auto update_residuals_last = [&residuals, &lastTree, cls, nClasses](const TMVA::Event *e) {

         auto &residualsThisEvent = residuals[e];

         std::vector<Double_t> expCache(nClasses, 0.0);
         std::transform(residualsThisEvent.begin(),
                        residualsThisEvent.begin() + nClasses,
                        expCache.begin(), [](Double_t d) { return exp(d); });

         Double_t exp_sum = std::accumulate(expCache.begin(),
                                            expCache.begin() + nClasses,
                                            0.0);

         for (UInt_t i = 0; i < nClasses; i++) {
            Double_t p_cls = expCache[i] / exp_sum;

            Double_t res = (e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
         }

         .Foreach(update_residuals_last, eventSample);

         .Foreach(update_residuals, eventSample);

      std::vector<Double_t> expCache;
      expCache.resize(nClasses);

      for (auto e : eventSample) {
         fResiduals[e].at(cls) += fForest.back()->CheckEvent(e, kFALSE);

         auto &residualsThisEvent = fResiduals[e];
         std::transform(residualsThisEvent.begin(),
                        residualsThisEvent.begin() + nClasses,
                        expCache.begin(), [](Double_t d) { return exp(d); });

         Double_t exp_sum = std::accumulate(expCache.begin(),
                                            expCache.begin() + nClasses,
                                            0.0);

         for (UInt_t i = 0; i < nClasses; i++) {
            Double_t p_cls = expCache[i] / exp_sum;

            Double_t res = (e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
         }
      }

      std::map<const TMVA::Event *, std::vector<double>> &residuals = this->fResiduals;

      UInt_t signalClass = DataInfo().GetSignalClassIndex();

      auto update_residuals = [&residuals, &lastTree, signalClass](const TMVA::Event *e) {
         double &residualAt0 = residuals[e].at(0);

         Double_t p_sig = 1.0 / (1.0 + exp(-2.0 * residualAt0));
         Double_t res = ((e->GetClass() == signalClass) ? (1.0 - p_sig) : (-p_sig));

         .Foreach(update_residuals, eventSample);

      for (auto e : eventSample) {
         double &residualAt0 = residuals[e].at(0);

         Double_t p_sig = 1.0 / (1.0 + exp(-2.0 * residualAt0));
         Double_t res = ((e->GetClass() == signalClass) ? (1.0 - p_sig) : (-p_sig));
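   // Binary case: residuals[e].at(0) holds the current model value F(x); p_sig = 1/(1+exp(-2F))
   // is the corresponding signal probability and res = y - p_sig (y = 1 for signal, 0 for
   // background) is, up to a constant factor, the negative gradient of the binomial
   // log-likelihood loss to which the next regression tree is fitted.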
   auto f = [this, &nPartitions](UInt_t partition = 0) -> Int_t {
      Int_t start = 1.0 * partition / nPartitions * this->fEventSample.size();
      Int_t end   = (partition + 1.0) / nPartitions * this->fEventSample.size();

      for (Int_t i = start; i < end; ++i) {

   fRegressionLossFunctionBDTG->SetTargets(eventSample, fLossFunctionEventInfo);

   std::unordered_map<TMVA::DecisionTreeNode*, LeafInfo> leaves;
   for (auto e : eventSample) {
      auto &v = leaves[node];
      auto target = e->GetTarget(cls);
      v.sumWeightTarget += target * weight;
      v.sum2 += fabs(target) * (1.0 - fabs(target)) * weight;
   }
   for (auto &iLeave : leaves) {
      constexpr auto minValue = 1e-30;
      if (iLeave.second.sum2 < minValue) {
         iLeave.second.sum2 = minValue;
      }

      const Double_t K = DataInfo().GetNClasses();
      iLeave.first->SetResponse(fShrinkage * (K - 1) / K * iLeave.second.sumWeightTarget / iLeave.second.sum2);
   }
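   // Leaf response: sumWeightTarget is the weighted sum of the residuals (negative gradients)
   // collected in the leaf and sum2 approximates the corresponding second derivative, so
   // sumWeightTarget/sum2 is a one-step Newton estimate of the optimal leaf value. The factor
   // (K-1)/K with K classes follows the usual K-class logistic gradient-boosting prescription,
   // and fShrinkage is the learning rate.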
   DoMulticlass() ? UpdateTargets(fEventSample, cls) : UpdateTargets(fEventSample);

   std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      (leaves[node]).push_back(fLossFunctionEventInfo[*e]);
   }

   for (std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
        iLeave!=leaves.end();++iLeave){
      Double_t fit = fRegressionLossFunctionBDTG->Fit(iLeave->second);
      (iLeave->first)->SetResponse(fShrinkage*fit);
   }

   UpdateTargetsRegression(*fTrainSample);
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {

   fRegressionLossFunctionBDTG->Init(fLossFunctionEventInfo, fBoostWeights);
   UpdateTargetsRegression(*fTrainSample, kTRUE);

   else if(DoMulticlass()){
      UInt_t nClasses = DataInfo().GetNClasses();
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
         for (UInt_t i=0;i<nClasses;i++){
            Double_t r = (*e)->GetClass()==i?(1-1.0/nClasses):(-1.0/nClasses);
            fResiduals[*e].push_back(0);
         }
      }
   }

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      Double_t r = (DataInfo().IsSignal(*e)?1:0)-0.5;
      fResiduals[*e].push_back(0);
   }
   for (UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
      Bool_t isSignalType = (dt->CheckEvent(fValidationSample[ievt]) > fNodePurityLimit ) ? 1 : 0;

      if (isSignalType == (DataInfo().IsSignal(fValidationSample[ievt])) ) {
         ncorrect += fValidationSample[ievt]->GetWeight();
      } else {
         nfalse += fValidationSample[ievt]->GetWeight();
      }
   }
   return ncorrect / (ncorrect + nfalse);
   if      (fBoostType=="AdaBoost")    returnVal = this->AdaBoost  (eventSample, dt);
   else if (fBoostType=="AdaCost")     returnVal = this->AdaCost   (eventSample, dt);
   else if (fBoostType=="Bagging")     returnVal = this->Bagging   ( );
   else if (fBoostType=="RegBoost")    returnVal = this->RegBoost  (eventSample, dt);
   else if (fBoostType=="AdaBoostR2")  returnVal = this->AdaBoostR2(eventSample, dt);
   else if (fBoostType=="Grad"){
      if (DoRegression())
         returnVal = this->GradBoostRegression(eventSample, dt);
      else if(DoMulticlass())
         returnVal = this->GradBoost (eventSample, dt, cls);
      else
         returnVal = this->GradBoost (eventSample, dt);
   }
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<Boost> unknown boost option " << fBoostType << " called" << Endl;
   }

   GetBaggedSubSample(fEventSample);
   TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
   TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );

   UInt_t signalClassNr = DataInfo().GetClassInfo("Signal")->GetNumber();

   UInt_t nevents = Data()->GetNTestEvents();
   for (UInt_t iev=0; iev < nevents; iev++){
      const Event* event = GetTestingEvent(iev);

      if (event->GetClass() == signalClassNr) {tmp=tmpS;}

      tmp->Fill(PrivateGetMvaValue(event),event->GetWeight());
   }

   std::vector<TH1F*> hS;
   std::vector<TH1F*> hB;
   for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
      hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
      hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
      results->Store(hS.back(),hS.back()->GetTitle());
      results->Store(hB.back(),hB.back()->GetTitle());
   }

   for (UInt_t iev=0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetBoostWeight() > max) max = 1.01*fEventSample[iev]->GetBoostWeight();
   }
   TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
   TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
   results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
   results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());

   TH1F *tmpBoostWeights;
   std::vector<TH1F*> *h;

   for (UInt_t iev=0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetClass() == signalClassNr) {
         tmpBoostWeights=tmpBoostWeightsS;
      } else {
         tmpBoostWeights=tmpBoostWeightsB;
      }
      tmpBoostWeights->Fill(fEventSample[iev]->GetBoostWeight());
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         (*h)[ivar]->Fill(fEventSample[iev]->GetValue(ivar),fEventSample[iev]->GetWeight());
      }
   }
   Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      UInt_t iclass=(*e)->GetClass();

      if ( DoRegression() ) {
         sumGlobalwfalse  += w * tmpDev;
         sumGlobalwfalse2 += w * tmpDev*tmpDev;
         if (tmpDev > maxDev) maxDev = tmpDev;
      }

      if (!(isSignalType == DataInfo().IsSignal(*e))) {
         sumGlobalwfalse+= w;
      }

      if (DataInfo().IsSignal(*e)) trueType = 1;
      sumGlobalwfalse+= w*trueType*dtoutput;
   }

   err = sumGlobalwfalse/sumGlobalw ;
   if ( DoRegression() ) {
      if (fAdaBoostR2Loss=="linear"){
         err = sumGlobalwfalse/maxDev/sumGlobalw ;
      }
      else if (fAdaBoostR2Loss=="quadratic"){
         err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;
      }
      else if (fAdaBoostR2Loss=="exponential"){
         for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
            err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;
         }
      }
      else {
         Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
               << " namely " << fAdaBoostR2Loss << "\n"
               << "and this is not implemented... a typo in the options ??" << Endl;
      }
   }

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;

   std::vector<Double_t> newSumw(sumw.size(),0);

   if (err >= 0.5 && fUseYesNoLeaf) {
      Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
            << "boost such a thing... if after 1 step the error rate is == 0.5"
            << "please check why this happens, maybe too many events per node requested ?" << Endl;

      Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
            << ") That should not happen, please check your code (i.e... the BDT code), I "
            << " stop boosting here" << Endl;
   }
   else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
   }

   if (fUseYesNoLeaf)
      boostWeight = TMath::Log((1.-err)/err)*fAdaBoostBeta;
   else
      boostWeight = TMath::Log((1.+err)/(1-err))*fAdaBoostBeta;

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw
         << " 1-err/err=" << boostWeight << " log.." << TMath::Log(boostWeight) << Endl;
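   // AdaBoost tree weight: for a misclassification fraction err the tree enters the forest with
   //    boostWeight = fAdaBoostBeta * ln((1-err)/err)
   // (the ln((1+err)/(1-err)) variant is used when the purity-weighted leaf response replaces
   // the plain yes/no classification). Misclassified events are subsequently scaled up by a
   // boost factor derived from this weight (its exponential), so they gain influence on the
   // next tree.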
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {

      if (fUseYesNoLeaf||DoRegression()){
         if ((!( (dt->CheckEvent(*e,fUseYesNoLeaf) > fNodePurityLimit ) == DataInfo().IsSignal(*e))) || DoRegression()) {

            if ( (*e)->GetWeight() > 0 ){
               (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
               if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
            } else {
               if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
               else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
            }
         }
      } else {
         if (DataInfo().IsSignal(*e)) trueType = 1;

         if ( (*e)->GetWeight() > 0 ){
            (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
            if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
         } else {
            if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
            else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
         }
      }
      newSumGlobalw+=(*e)->GetWeight();
      newSumw[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Log() << kDEBUG << "new Nsig=" << newSumw[0]*globalNormWeight << " new Nbkg=" << newSumw[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      if (DataInfo().IsSignal(*e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
      else                        (*e)->ScaleBoostWeight( globalNormWeight );
   }

   if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);

   fBoostWeight = boostWeight;
   fErrorFraction = err;
   Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      sumGlobalWeights += w;
      UInt_t iclass=(*e)->GetClass();

      if ( DoRegression() ) {
         Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
      }

      Bool_t isTrueSignal = DataInfo().IsSignal(*e);
      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if       (isTrueSignal  && isSelectedSignal)  cost=Css;
      else if  (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if  (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
      else if  (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      sumGlobalCost+= w*trueType*dtoutput*cost;
   }

   if ( DoRegression() ) {
      Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
   }

   sumGlobalCost /= sumGlobalWeights;

   vector<Double_t> newSumClassWeights(sumw.size(),0);

   Double_t boostWeight = TMath::Log((1+sumGlobalCost)/(1-sumGlobalCost)) * fAdaBoostBeta;

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      Bool_t isTrueSignal = DataInfo().IsSignal(*e);
      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if       (isTrueSignal  && isSelectedSignal)  cost=Css;
      else if  (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if  (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
      else if  (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      if (DoRegression()) Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
         if (DoRegression()) Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
      } else {
         if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
      }

      newSumGlobalWeights+=(*e)->GetWeight();
      newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Double_t globalNormWeight=Double_t(eventSample.size())/newSumGlobalWeights;
   Log() << kDEBUG << "new Nsig=" << newSumClassWeights[0]*globalNormWeight << " new Nbkg=" << newSumClassWeights[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      if (DataInfo().IsSignal(*e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
      else                        (*e)->ScaleBoostWeight( globalNormWeight );
   }

   if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);

   fBoostWeight = boostWeight;
   fErrorFraction = err;
   if (!fSubSample.empty()) fSubSample.clear();

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      n = trandom->PoissonD(fBaggedSampleFraction);
      for (Int_t i=0;i<n;i++) fSubSample.push_back(*e);
   }
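   // Bagging via Poisson resampling: each event enters the bagged sub-sample n times, with n
   // drawn from a Poisson distribution whose mean is fBaggedSampleFraction. On average this is
   // equivalent to drawing a sub-sample of that relative size with replacement.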
   if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;

   Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      sumwfalse  += w * tmpDev;
      sumwfalse2 += w * tmpDev*tmpDev;
      if (tmpDev > maxDev) maxDev = tmpDev;
   }

   if (fAdaBoostR2Loss=="linear"){
      err = sumwfalse/maxDev/sumw ;
   }
   else if (fAdaBoostR2Loss=="quadratic"){
      err = sumwfalse2/maxDev/maxDev/sumw ;
   }
   else if (fAdaBoostR2Loss=="exponential"){
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
         err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
      }
   }
   else {
      Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
            << " namely " << fAdaBoostR2Loss << "\n"
            << "and this is not implemented... a typo in the options ??" << Endl;
   }

   if (err >= 0.5) {
      Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
            << "boost such a thing... if after 1 step the error rate is == 0.5"
            << "please check why this happens, maybe too many events per node requested ?" << Endl;

      Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
            << ") That should not happen, but is possible for regression trees, and"
            << " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
            << " stop boosting " << Endl;
   }
   else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
   }

   Double_t boostWeight = err / (1.-err);
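   // AdaBoost.R2-style weight: beta = err/(1-err) is smaller than 1 for err < 0.5. The per-event
   // boost factor applied below is a power of this beta that depends on how well the event was
   // modelled relative to the largest deviation maxDev, so well-described events are
   // down-weighted with respect to the poorly described ones before the weights are renormalised.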
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      results->GetHist("BoostWeights")->Fill(boostfactor);

      if ( (*e)->GetWeight() > 0 ){
         Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
         Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
         if (newWeight == 0) {
            Log() << kINFO << "Weight=         " << (*e)->GetWeight() << Endl;
            Log() << kINFO << "BoostWeight=    " << (*e)->GetBoostWeight() << Endl;
            Log() << kINFO << "boostweight="     << boostWeight << "  err= " << err << Endl;
            Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
            Log() << kINFO << "boostfactor=    " << boostfactor << Endl;
            Log() << kINFO << "maxDev        = " << maxDev << Endl;
            Log() << kINFO << "target        = " << (*e)->GetTarget(0) << Endl;
         }
         (*e)->SetBoostWeight( newBoostWeight );
      } else {
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
      }
      newSumw+=(*e)->GetWeight();
   }

   Double_t normWeight = sumw / newSumw;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
   }

   fBoostWeight = boostWeight;
   fErrorFraction = err;
   if (fDoPreselection){
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         gTools().AddAttr( wght, Form("PreselectionLowBkgVar%d",ivar),       fIsLowBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowBkgVar%dValue",ivar),  fLowBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowSigVar%d",ivar),       fIsLowSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowSigVar%dValue",ivar),  fLowSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighBkgVar%d",ivar),      fIsHighBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighBkgVar%dValue",ivar), fHighBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighSigVar%d",ivar),      fIsHighSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighSigVar%dValue",ivar), fHighSigCut[ivar]);
      }
   }

   gTools().AddAttr( wght, "AnalysisType", fForest.back()->GetAnalysisType() );

   for (UInt_t i=0; i< fForest.size(); i++) {
      void* trxml = fForest[i]->AddXMLTo(wght);
   }
   for (i=0; i<fForest.size(); i++) delete fForest[i];
   fBoostWeights.clear();

   if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {
      fIsLowBkgCut.resize(GetNvar());
      fLowBkgCut.resize(GetNvar());
      fIsLowSigCut.resize(GetNvar());
      fLowSigCut.resize(GetNvar());
      fIsHighBkgCut.resize(GetNvar());
      fHighBkgCut.resize(GetNvar());
      fIsHighSigCut.resize(GetNvar());
      fHighSigCut.resize(GetNvar());

      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         fIsLowBkgCut[ivar]  = tmpBool;
         fLowBkgCut[ivar]    = tmpDouble;
         fIsLowSigCut[ivar]  = tmpBool;
         fLowSigCut[ivar]    = tmpDouble;
         fIsHighBkgCut[ivar] = tmpBool;
         fHighBkgCut[ivar]   = tmpDouble;
         fIsHighSigCut[ivar] = tmpBool;
         fHighSigCut[ivar]   = tmpDouble;
      }
   }

   if(gTools().HasAttr(parent, "TreeType")) {

   fForest.back()->SetTreeID(i++);
   fBoostWeights.push_back(boostWeight);
   Int_t analysisType(0);

   istr >> dummy >> fNTrees;
   Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;

   for (UInt_t i=0;i<fForest.size();i++) delete fForest[i];
   fBoostWeights.clear();

   for (int i=0;i<fNTrees;i++) {
      istr >> dummy >> iTree >> dummy >> boostWeight;

      fForest.back()->Print( std::cout );
      Log() << kFATAL << "Error while reading weight file; mismatch iTree="
            << iTree << " i=" << i
            << " dummy " << dummy
            << " boostweight " << boostWeight
            << Endl;

      fForest.back()->SetTreeID(i);
      fForest.back()->Read(istr, GetTrainingTMVAVersionCode());
      fBoostWeights.push_back(boostWeight);
   }
   return this->GetMvaValue( err, errUpper, 0 );

   const Event* ev = GetEvent();
   if (fDoPreselection) {
      Double_t val = ApplyPreselectionCuts(ev);
   }
   return PrivateGetMvaValue(ev, err, errUpper, useNTrees);

   NoErrorCalc(err, errUpper);

   UInt_t nTrees = fForest.size();

   if (useNTrees > 0 ) nTrees = useNTrees;

   if (fBoostType=="Grad") return GetGradBoostMVA(ev,nTrees);

   for (UInt_t itree=0; itree<nTrees; itree++) {
      myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,fUseYesNoLeaf);
      norm  += fBoostWeights[itree];
   }
   if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>();
   fMulticlassReturnVal->clear();

   UInt_t nClasses = DataInfo().GetNClasses();
   std::vector<Double_t> temp(nClasses);
   auto forestSize = fForest.size();

   std::vector<TMVA::DecisionTree *> forest = fForest;
   auto get_output = [&e, &forest, &temp, forestSize, nClasses](UInt_t iClass) {
      for (UInt_t itree = iClass; itree < forestSize; itree += nClasses) {
         temp[iClass] += forest[itree]->CheckEvent(e, kFALSE);
      }
   };

   for (UInt_t itree = 0; itree < forestSize; ++itree) {
      temp[classOfTree] += fForest[itree]->CheckEvent(e, kFALSE);
      if (++classOfTree == nClasses) classOfTree = 0;
   }

   std::transform(temp.begin(), temp.end(), temp.begin(), [](Double_t d){return exp(d);});

   Double_t exp_sum = std::accumulate(temp.begin(), temp.end(), 0.0);

   for (UInt_t i = 0; i < nClasses; i++) {
      Double_t p_cls = temp[i] / exp_sum;
      (*fMulticlassReturnVal).push_back(p_cls);
   }

   return *fMulticlassReturnVal;
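   // Multiclass output: the per-class sums of tree responses F_k(x) are converted into class
   // probabilities with a softmax, p_k = exp(F_k) / sum_j exp(F_j), which is what the
   // transform/accumulate lines above implement.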
2546 if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
2547 fRegressionReturnVal->clear();
2549 const Event * ev = GetEvent();
2554 if (fBoostType=="AdaBoostR2") {
2565 vector< Double_t > response(fForest.size());
2566 vector< Double_t > weight(fForest.size());
2569 for (UInt_t itree=0; itree<fForest.size(); itree++) {
2570 response[itree] = fForest[itree]->CheckEvent(ev,kFALSE);
2571 weight[itree] = fBoostWeights[itree];
2572 totalSumOfWeights += fBoostWeights[itree];
2575 std::vector< std::vector<Double_t> > vtemp;
2576 vtemp.push_back( response );
2577 vtemp.push_back( weight );
2582 while (sumOfWeights <= totalSumOfWeights/2.) {
2583 sumOfWeights += vtemp[1][t];
2597 else if(fBoostType=="Grad"){
2598 for (UInt_t itree=0; itree<fForest.size(); itree++) {
2599 myMVA += fForest[itree]->CheckEvent(ev,kFALSE);
2602 evT->SetTarget(0, myMVA+fBoostWeights[0] );
2605 for (UInt_t itree=0; itree<fForest.size(); itree++) {
2607 myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,kFALSE);
2608 norm += fBoostWeights[itree];
2616 const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
2617 fRegressionReturnVal->push_back( evT2->GetTarget(0) );
2622 return *fRegressionReturnVal;
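For AdaBoostR2 the regression value is a weighted median of the individual tree responses: responses are sorted together with their boost weights and the response at which the cumulative weight first exceeds half of the total is returned. A minimal sketch of that idea (names are illustrative, not the TMVA member code):

#include <algorithm>
#include <utility>
#include <vector>

// Weighted median: sort by response, walk the cumulative weight until it
// passes half of the total weight, and return the response at that point.
double weightedMedian(std::vector<std::pair<double, double>> respAndWeight) // (response, weight)
{
   if (respAndWeight.empty()) return 0.;
   std::sort(respAndWeight.begin(), respAndWeight.end()); // sort by response
   double total = 0.;
   for (const auto &rw : respAndWeight) total += rw.second;
   double cumulative = 0.;
   for (const auto &rw : respAndWeight) {
      cumulative += rw.second;
      if (cumulative > total / 2.) return rw.first;
   }
   return respAndWeight.back().first;
}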
2631 Log() << kDEBUG << "\tWrite monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
2635 fMonitorNtuple->Write();
2646 fVariableImportance.resize(GetNvar());
2647 for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
2648 fVariableImportance[ivar]=0;
2651 for (UInt_t itree = 0; itree < GetNTrees(); itree++) {
2652 std::vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
2653 for (UInt_t i=0; i< relativeImportance.size(); i++) {
2654 fVariableImportance[i] += fBoostWeights[itree] * relativeImportance[i];
2658 for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++){
2659 fVariableImportance[ivar] = TMath::Sqrt(fVariableImportance[ivar]);
2660 sum += fVariableImportance[ivar];
2662 for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++) fVariableImportance[ivar] /= sum;
2664 return fVariableImportance;
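Each tree reports a per-variable importance; these are summed over the forest weighted by the boost weight of the tree, the square root is taken, and the result is normalised so the importances add up to one. A compact sketch of the same bookkeeping (hypothetical inputs, not the member function itself):

#include <cmath>
#include <vector>

// Combine per-tree importances into one normalised ranking:
// importance[ivar] = sqrt( sum_trees boostWeight * treeImportance[ivar] ), then normalise.
std::vector<double> combineImportance(const std::vector<std::vector<double>> &perTree,
                                      const std::vector<double> &boostWeights)
{
   if (perTree.empty()) return {};
   std::vector<double> importance(perTree.front().size(), 0.);
   for (std::size_t itree = 0; itree < perTree.size(); ++itree)
      for (std::size_t ivar = 0; ivar < importance.size(); ++ivar)
         importance[ivar] += boostWeights[itree] * perTree[itree][ivar];
   double sum = 0.;
   for (double &v : importance) { v = std::sqrt(v); sum += v; }
   if (sum == 0.) return importance;      // nothing to normalise
   for (double &v : importance) v /= sum;
   return importance;
}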
2674 std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2675 if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
2676 else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
2687 fRanking = new Ranking( GetName(), "Variable Importance" );
2688 vector< Double_t> importance(this->GetVariableImportance());
2690 for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
2692 fRanking->AddRank( Rank( GetInputLabel(ivar), importance[ivar] ) );
2706 Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
2707 Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
2708 Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
2709 Log() << "trained using the original training data set with re-weighted " << Endl;
2710 Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
2711 Log() << "events that were misclassified in the previous tree a larger " << Endl;
2712 Log() << "weight in the training of the following tree." << Endl;
2714 Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
2715 Log() << "using a single discriminant variable at a time. A test event " << Endl;
2716 Log() << "ending up after the sequence of left-right splits in a final " << Endl;
2717 Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
2718 Log() << "depending on the majority type of training events in that node." << Endl;
2722 Log() << "By the nature of the binary splits performed on the individual" << Endl;
2723 Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
2724 Log() << "between variables (they need to approximate the linear split in" << Endl;
2725 Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
2726 Log() << "variables individually). Hence decorrelation could be useful " << Endl;
2727 Log() << "to optimise the BDT performance." << Endl;
2731 Log() << "The two most important parameters in the configuration are the " << Endl;
2732 Log() << "minimal number of events requested by a leaf node as percentage of the " << Endl;
2733 Log() << " number of training events (option \"MinNodeSize\", replacing the actual number " << Endl;
2734 Log() << " of events \"nEventsMin\" as given in earlier versions)." << Endl;
2735 Log() << "If this number is too large, detailed features " << Endl;
2736 Log() << "in the parameter space are hard to model. If it is too small, " << Endl;
2737 Log() << "the risk of overtraining rises and boosting becomes less effective." << Endl;
2738 Log() << " Typical values from our current experience for best performance " << Endl;
2739 Log() << " are between 0.5(%) and 10(%) " << Endl;
2741 Log() << "The default minimal number is currently set to " << Endl;
2742 Log() << " max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
2743 Log() << "and can be changed by the user." << Endl;
2745 Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
2746 Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
2747 Log() << "that is used when determining after the training which splits " << Endl;
2748 Log() << "are considered statistically insignificant and are removed. The" << Endl;
2749 Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
2750 Log() << "the comparison between efficiencies obtained on the training and" << Endl;
2751 Log() << "the independent test sample. They should be equal within statistical" << Endl;
2752 Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
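In practice these parameters are passed as part of the option string when the method is booked; a typical configuration using only options declared in DeclareOptions above might look like the following (the values are illustrative examples, not a recommendation):

// Illustrative option string for booking a BDT; option names as declared in DeclareOptions().
TString bdtOptions = "NTrees=800:MinNodeSize=2.5%:MaxDepth=3:"
                     "BoostType=AdaBoost:AdaBoostBeta=0.5:"
                     "UseBaggedBoost:BaggedSampleFraction=0.6:nCuts=20";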
2764 fout << " std::vector<"<<nodeName<<"*> fForest; // i.e. root nodes of decision trees" << std::endl;
2765 fout << " std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << std::endl;
2766 fout << "};" << std::endl << std::endl;
2769 fout << "std::vector<double> ReadBDTG::GetMulticlassValues__( const std::vector<double>& inputValues ) const" << std::endl;
2770 fout << "{" << std::endl;
2771 fout << " uint nClasses = " << DataInfo().GetNClasses() << ";" << std::endl;
2772 fout << " std::vector<double> fMulticlassReturnVal;" << std::endl;
2773 fout << " fMulticlassReturnVal.reserve(nClasses);" << std::endl;
2775 fout << " std::vector<double> temp(nClasses);" << std::endl;
2776 fout << " auto forestSize = fForest.size();" << std::endl;
2777 fout << " // trees 0, nClasses, 2*nClasses, ... belong to class 0" << std::endl;
2778 fout << " // trees 1, nClasses+1, 2*nClasses+1, ... belong to class 1 and so forth" << std::endl;
2779 fout << " uint classOfTree = 0;" << std::endl;
2780 fout << " for (uint itree = 0; itree < forestSize; ++itree) {" << std::endl;
2781 fout << " BDTGNode *current = fForest[itree];" << std::endl;
2782 fout << " while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
2783 fout << " if (current->GoesRight(inputValues)) current=(BDTGNode*)current->GetRight();" << std::endl;
2784 fout << " else current=(BDTGNode*)current->GetLeft();" << std::endl;
2785 fout << " }" << std::endl;
2786 fout << " temp[classOfTree] += current->GetResponse();" << std::endl;
2787 fout << " if (++classOfTree == nClasses) classOfTree = 0; // cheap modulo" << std::endl;
2788 fout << " }" << std::endl;
2790 fout << " // we want to calculate sum of exp(temp[j] - temp[i]) for all i,j (i!=j)" << std::endl;
2791 fout << " // first calculate exp(), then replace minus with division." << std::endl;
2792 fout << " std::transform(temp.begin(), temp.end(), temp.begin(), [](double d){return exp(d);});" << std::endl;
2794 fout << " for(uint iClass=0; iClass<nClasses; iClass++){" << std::endl;
2795 fout << " double norm = 0.0;" << std::endl;
2796 fout << " for(uint j=0;j<nClasses;j++){" << std::endl;
2797 fout << " if(iClass!=j)" << std::endl;
2798 fout << " norm += temp[j] / temp[iClass];" << std::endl;
2799 fout << " }" << std::endl;
2800 fout << " fMulticlassReturnVal.push_back(1.0/(1.0+norm));" << std::endl;
2801 fout << " }" << std::endl;
2803 fout << " return fMulticlassReturnVal;" << std::endl;
2804 fout << "}" << std::endl;
2806 fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
2807 fout << "{" << std::endl;
2808 fout << " double myMVA = 0;" << std::endl;
2809 if (fDoPreselection){
2810 for (UInt_t ivar = 0; ivar< fIsLowBkgCut.size(); ivar++){
2811 if (fIsLowBkgCut[ivar]){
2812 fout << " if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar] << ") return -1; // is background preselection cut" << std::endl;
2814 if (fIsLowSigCut[ivar]){
2815 fout << " if (inputValues["<<ivar<<"] < "<< fLowSigCut[ivar] << ") return 1; // is signal preselection cut" << std::endl;
2817 if (fIsHighBkgCut[ivar]){
2818 fout << " if (inputValues["<<ivar<<"] > "<<fHighBkgCut[ivar] << ") return -1; // is background preselection cut" << std::endl;
2820 if (fIsHighSigCut[ivar]){
2821 fout << " if (inputValues["<<ivar<<"] > "<<fHighSigCut[ivar]<< ") return 1; // is signal preselection cut" << std::endl;
2826 if (fBoostType!="Grad"){
2827 fout << " double norm = 0;" << std::endl;
2829 fout << " for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
2830 fout << " "<<nodeName<<" *current = fForest[itree];" << std::endl;
2831 fout << " while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
2832 fout << " if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
2833 fout << " else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
2834 fout << " }" << std::endl;
2835 if (fBoostType=="Grad"){
2836 fout << " myMVA += current->GetResponse();" << std::endl;
2838 if (fUseYesNoLeaf) fout << " myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
2839 else fout << " myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
2840 fout << " norm += fBoostWeights[itree];" << std::endl;
2842 fout << " }" << std::endl;
2843 if (fBoostType=="Grad"){
2844 fout << " return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
2846 else fout << " return myMVA /= norm;" << std::endl;
2847 fout << "}" << std::endl << std::endl;
2850 fout << "void " << className << "::Initialize()" << std::endl;
2851 fout << "{" << std::endl;
2852 fout << " double inf = std::numeric_limits<double>::infinity();" << std::endl;
2853 fout << " double nan = std::numeric_limits<double>::quiet_NaN();" << std::endl;
2855 for (UInt_t itree=0; itree<GetNTrees(); itree++) {
2856 fout << " // itree = " << itree << std::endl;
2857 fout << " fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
2858 fout << " fForest.push_back( " << std::endl;
2859 this->MakeClassInstantiateNode((DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
2860 fout << " );" << std::endl;
2862 fout << " return;" << std::endl;
2863 fout << "};" << std::endl;
2865 fout << "// Clean up" << std::endl;
2866 fout << "inline void " << className << "::Clear() " << std::endl;
2867 fout << "{" << std::endl;
2868 fout << " for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
2869 fout << " delete fForest[itree]; " << std::endl;
2870 fout << " }" << std::endl;
2871 fout << "}" << std::endl;
2883 fout << "#include <algorithm>" << std::endl;
2884 fout << "#include <limits>" << std::endl;
2887 fout << "#define NN new "<<nodeName << std::endl;
2890 fout << "#ifndef "<<nodeName<<"__def" << std::endl;
2891 fout << "#define "<<nodeName<<"__def" << std::endl;
2893 fout << "class "<<nodeName<<" {" << std::endl;
2895 fout << "public:" << std::endl;
2897 fout << " // constructor of an essentially \"empty\" node floating in space" << std::endl;
2898 fout << " "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;
2899 if (fUseFisherCuts){
2900 fout << " int nFisherCoeff," << std::endl;
2901 for (UInt_t i=0;i<GetNVariables()+1;i++){
2902 fout << " double fisherCoeff"<<i<<"," << std::endl;
2905 fout << " int selector, double cutValue, bool cutType, " << std::endl;
2906 fout << " int nodeType, double purity, double response ) :" << std::endl;
2907 fout << " fLeft ( left )," << std::endl;
2908 fout << " fRight ( right )," << std::endl;
2909 if (fUseFisherCuts) fout << " fNFisherCoeff ( nFisherCoeff )," << std::endl;
2910 fout << " fSelector ( selector )," << std::endl;
2911 fout << " fCutValue ( cutValue )," << std::endl;
2912 fout << " fCutType ( cutType )," << std::endl;
2913 fout << " fNodeType ( nodeType )," << std::endl;
2914 fout << " fPurity ( purity )," << std::endl;
2915 fout << " fResponse ( response ){" << std::endl;
2916 if (fUseFisherCuts){
2917 for (UInt_t i=0;i<GetNVariables()+1;i++){
2918 fout << " fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;
2921 fout << " }" << std::endl << std::endl;
2922 fout << " virtual ~"<<nodeName<<"();" << std::endl << std::endl;
2923 fout << " // test event if it descends the tree at this node to the right" << std::endl;
2924 fout << " virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
2925 fout << " "<<nodeName<<"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
2926 fout << " // test event if it descends the tree at this node to the left " << std::endl;
2927 fout << " virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
2928 fout << " "<<nodeName<<"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
2929 fout << " // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
2930 fout << " double GetPurity( void ) const { return fPurity; } " << std::endl;
2931 fout << " // return the node type" << std::endl;
2932 fout << " int GetNodeType( void ) const { return fNodeType; }" << std::endl;
2933 fout << " double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
2934 fout << "private:" << std::endl << std::endl;
2935 fout << " "<<nodeName<<"* fLeft; // pointer to the left daughter node" << std::endl;
2936 fout << " "<<nodeName<<"* fRight; // pointer to the right daughter node" << std::endl;
2937 if (fUseFisherCuts){
2938 fout << " int fNFisherCoeff; // =0 if this node doesn't use fisher, else =nvar+1 " << std::endl;
2939 fout << " std::vector<double> fFisherCoeff; // the fisher coeff (offset at the last element)" << std::endl;
2941 fout << " int fSelector; // index of variable used in node selection (decision tree) " << std::endl;
2942 fout << " double fCutValue; // cut value applied on this node to discriminate bkg against sig" << std::endl;
2943 fout << " bool fCutType; // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
2944 fout << " int fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
2945 fout << " double fPurity; // Purity of node from training"<< std::endl;
2946 fout << " double fResponse; // Regression response value of node" << std::endl;
2947 fout << "}; " << std::endl;
2949 fout << "//_______________________________________________________________________" << std::endl;
2950 fout << " "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
2951 fout << "{" << std::endl;
2952 fout << " if (fLeft != NULL) delete fLeft;" << std::endl;
2953 fout << " if (fRight != NULL) delete fRight;" << std::endl;
2954 fout << "}; " << std::endl;
2956 fout << "//_______________________________________________________________________" << std::endl;
2957 fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
2958 fout << "{" << std::endl;
2959 fout << " // test event if it descends the tree at this node to the right" << std::endl;
2960 fout << " bool result;" << std::endl;
2961 if (fUseFisherCuts){
2962 fout << " if (fNFisherCoeff == 0){" << std::endl;
2963 fout << " result = (inputValues[fSelector] >= fCutValue );" << std::endl;
2964 fout << " }else{" << std::endl;
2965 fout << " double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
2966 fout << " for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
2967 fout << " fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
2968 fout << " result = fisher > fCutValue;" << std::endl;
2969 fout << " }" << std::endl;
2971 fout << " result = (inputValues[fSelector] >= fCutValue );" << std::endl;
2973 fout << " if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
2974 fout << " else return !result;" << std::endl;
2975 fout << "}" << std::endl;
2977 fout << "//_______________________________________________________________________" << std::endl;
2978 fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
2979 fout << "{" << std::endl;
2980 fout << " // test event if it descends the tree at this node to the left" << std::endl;
2981 fout << " if (!this->GoesRight(inputValues)) return true;" << std::endl;
2982 fout << " else return false;" << std::endl;
2983 fout << "}" << std::endl;
2985 fout << "#endif" << std::endl;
2995 Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;
2998 fout << "NN("<<std::endl;
2999 if (n->GetLeft() != NULL){
3000 this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetLeft() , fout, className);
3005 fout << ", " <<std::endl;
3006 if (n->GetRight() != NULL){
3007 this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetRight(), fout, className );
3012 fout << ", " << std::endl << std::setprecision(6);
3014 if (fUseFisherCuts){
3015 fout << n->GetNFisherCoeff() << ", ";
3016 for (UInt_t i=0; i< GetNVariables()+1; i++) {
3017 if (n->GetNFisherCoeff() == 0 ){
3020 fout << n->GetFisherCoeff(i) << ", ";
3024 fout << n->GetSelector() << ", " << n->GetCutValue() << ", " << n->GetCutType() << ", " << n->GetNodeType() << ", " << n->GetPurity() << "," << n->GetResponse() << ") ";
3040 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
3042 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
3044 fIsLowSigCut.assign(GetNvar(),kFALSE);
3045 fIsLowBkgCut.assign(GetNvar(),kFALSE);
3046 fIsHighSigCut.assign(GetNvar(),kFALSE);
3047 fIsHighBkgCut.assign(GetNvar(),kFALSE);
3049 fLowSigCut.assign(GetNvar(),0.);
3050 fLowBkgCut.assign(GetNvar(),0.);
3051 fHighSigCut.assign(GetNvar(),0.);
3052 fHighBkgCut.assign(GetNvar(),0.);
3057 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
3058 if (DataInfo().IsSignal(*it)){
3059 nTotS += (*it)->GetWeight();
3063 nTotB += (*it)->GetWeight();
3069 for( UInt_t ivar = 0; ivar < GetNvar(); ivar++ ) {
3071 std::sort( bdtEventSample.begin(),bdtEventSample.end() );
3073 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
3074 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
3075 for( ; it != it_end; ++it ) {
3076 if (DataInfo().IsSignal(**it))
3077 sigWeightCtr += (**it)->GetWeight();
3079 bkgWeightCtr += (**it)->GetWeight();
3081 it->SetCumulativeWeight(false,bkgWeightCtr);
3082 it->SetCumulativeWeight(true,sigWeightCtr);
3087 Double_t dVal = (DataInfo().GetVariableInfo(ivar).GetMax() - DataInfo().GetVariableInfo(ivar).GetMin())/100. ;
3088 Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
3089 Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;
3094 for( UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
3097 nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
3098 nSelB = bdtEventSample[iev].GetCumulativeWeight(false);
3100 tmpEffS=nSelS/nTotS;
3101 tmpEffB=nSelB/nTotB;
3104 if (nSelS==0 && tmpEffB>effB) {effB=tmpEffB; fLowBkgCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowBkgCut[ivar]=kTRUE;}
3105 else if (nSelB==0 && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE;}
3106 else if (nSelB==nTotB && tmpRejS>rejS) {rejS=tmpRejS; fHighSigCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighSigCut[ivar]=kTRUE;}
3107 else if (nSelS==nTotS && tmpRejB>rejB) {rejB=tmpRejB; fHighBkgCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighBkgCut[ivar]=kTRUE;}
3112 Log() << kDEBUG << " \tfound and suggest the following possible pre-selection cuts " << Endl;
3113 if (fDoPreselection) Log() << kDEBUG << "\tthe training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" << Endl;
3114 else Log() << kDEBUG << "\tas option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample"<< Endl;
3115 for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
3116 if (fIsLowBkgCut[ivar]){
3117 Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " < " << fLowBkgCut[ivar] << Endl;
3119 if (fIsLowSigCut[ivar]){
3120 Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " < " << fLowSigCut[ivar] << Endl;
3122 if (fIsHighBkgCut[ivar]){
3123 Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " > " << fHighBkgCut[ivar] << Endl;
3125 if (fIsHighSigCut[ivar]){
3126 Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " > " << fHighSigCut[ivar] << Endl;
3141 for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
3142 if (fIsLowBkgCut[ivar]){
3143 if (ev->GetValue(ivar) < fLowBkgCut[ivar]) result = -1;
3145 if (fIsLowSigCut[ivar]){
3146 if (ev->GetValue(ivar) < fLowSigCut[ivar]) result = 1;
3148 if (fIsHighBkgCut[ivar]){
3149 if (ev->GetValue(ivar) > fHighBkgCut[ivar]) result = -1;
3151 if (fIsHighSigCut[ivar]){
3152 if (ev->GetValue(ivar) > fHighSigCut[ivar]) result = 1;
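ApplyPreselectionCuts returns a nonzero value only when one of the stored cuts fires, and GetMvaValue then short-circuits with the saturated +1 or -1 response instead of evaluating the forest (see the fDoPreselection branch around line 2456 above). A minimal sketch of that control flow, with hypothetical helper names standing in for the member functions:

#include <vector>

// Hypothetical helpers standing in for ApplyPreselectionCuts and the forest vote.
double applyCuts(const std::vector<double> &) { return 0.; }       // 0 means: no cut fired
double evaluateForest(const std::vector<double> &) { return 0.3; } // placeholder response

// Sketch of the control flow: a preselection verdict short-circuits the forest.
double mvaValue(const std::vector<double> &event)
{
   const double presel = applyCuts(event); // -1, +1, or 0
   if (presel != 0.) return presel;        // cut fired: return the saturated response
   return evaluateForest(event);           // otherwise: weighted vote of the trees
}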
Analysis of Boosted Decision Trees.
void Init(void)
Common initialisation with defaults for the BDT-Method.
static const Int_t fgDebugLevel
MethodBDT(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
The standard constructor for the "boosted decision trees".
void BoostMonitor(Int_t iTree)
Fills the ROCIntegral vs Itree from the testSample for the monitoring plots during the training.
const std::vector< Float_t > & GetMulticlassValues()
Get the multiclass MVA response for the BDT classifier.
Double_t AdaBoostR2(std::vector< const TMVA::Event * > &, DecisionTree *dt)
Adaption of the AdaBoost to regression problems (see H.Drucker 1997).
void MakeClassSpecific(std::ostream &, const TString &) const
Make ROOT-independent C++ class for classifier response (classifier-specific implementation).
void GetHelpMessage() const
Get help message text.
LossFunctionBDT * fRegressionLossFunctionBDTG
void DeterminePreselectionCuts(const std::vector< const TMVA::Event * > &eventSample)
Find useful preselection cuts that will be applied before the Decision Tree training.
Double_t GradBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt, UInt_t cls=0)
Calculate the desired response value for each region.
const Ranking * CreateRanking()
Compute ranking of input variables.
virtual void SetTuneParameters(std::map< TString, Double_t > tuneParameters)
Set the tuning parameters according to the argument.
Double_t AdaCost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
The AdaCost boosting algorithm takes a simple cost Matrix (currently fixed for all events....
void DeclareOptions()
Define the options (their key words).
virtual std::map< TString, Double_t > OptimizeTuningParameters(TString fomType="ROCIntegral", TString fitType="FitGA")
Call the Optimizer with the set of parameters and ranges that are meant to be tuned.
Double_t Boost(std::vector< const TMVA::Event * > &, DecisionTree *dt, UInt_t cls=0)
Apply the boosting algorithm (the algorithm is selected via the "option" given in the constructor).
Double_t TestTreeQuality(DecisionTree *dt)
Test the tree quality in terms of misclassification.
Double_t Bagging()
Call it boot-strapping, re-sampling or whatever you like, in the end it is nothing else but applying ...
void UpdateTargets(std::vector< const TMVA::Event * > &, UInt_t cls=0)
Calculate residual for all events.
void UpdateTargetsRegression(std::vector< const TMVA::Event * > &, Bool_t first=kFALSE)
Calculate residuals for all events and update targets for next iter.
Double_t GradBoostRegression(std::vector< const TMVA::Event * > &, DecisionTree *dt)
Implementation of M_TreeBoost using any loss function as described by Friedman 1999.
void WriteMonitoringHistosToFile(void) const
Here we could write some histograms created during the processing to the output file.
virtual ~MethodBDT(void)
Destructor.
void AddWeightsXMLTo(void *parent) const
Write weights to XML.
Double_t GetGradBoostMVA(const TMVA::Event *e, UInt_t nTrees)
Returns MVA value: -1 for background, 1 for signal.
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
BDT can handle classification with multiple classes and regression with one regression-target.
Double_t RegBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
A special boosting only for Regression (not implemented).
void InitEventSample()
Initialize the event sample (i.e. reset the boost-weights... etc).
Double_t ApplyPreselectionCuts(const Event *ev)
Apply the preselection cuts before even bothering about any Decision Trees in the GetMVA.
void SetMinNodeSize(Double_t sizeInPercent)
void ProcessOptions()
The option string is decoded, for available options see "DeclareOptions".
void PreProcessNegativeEventWeights()
O.k.
void MakeClassInstantiateNode(DecisionTreeNode *n, std::ostream &fout, const TString &className) const
Recursively descends a tree and writes the node instance to the output stream.
Double_t AdaBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
The AdaBoost implementation.
std::vector< Double_t > GetVariableImportance()
Return the relative variable importance, normalized to all variables together having the importance 1...
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
Double_t PrivateGetMvaValue(const TMVA::Event *ev, Double_t *err=0, Double_t *errUpper=0, UInt_t useNTrees=0)
Return the MVA value (range [-1;1]) that classifies the event according to the majority vote from the...
void InitGradBoost(std::vector< const TMVA::Event * > &)
Initialize targets for first tree.
void Train(void)
BDT training.
void GetBaggedSubSample(std::vector< const TMVA::Event * > &)
Fills fEventSample with fBaggedSampleFraction*NEvents random training events.
const std::vector< Float_t > & GetRegressionValues()
Get the regression value generated by the BDTs.
SeparationBase * fSepType
void ReadWeightsFromXML(void *parent)
Reads the BDT from the xml file.
void ReadWeightsFromStream(std::istream &istr)
Read the weights (BDT coefficients).
void Reset(void)
Reset the method, as if it had just been instantiated (forget all training etc.).
void MakeClassSpecificHeader(std::ostream &, const TString &) const
Specific class header.
void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility.
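The member functions listed above are normally not called directly; the method is booked and trained through the TMVA Factory. A hedged sketch of that typical usage (the data-loading part is omitted, the job name and option values are illustrative only, and this code is not part of MethodBDT itself):

#include "TFile.h"
#include "TMVA/DataLoader.h"
#include "TMVA/Factory.h"
#include "TMVA/Types.h"

// Book and train a BDT via the Factory; assumes the DataLoader already has
// signal/background trees and variables registered.
void trainBDT(TMVA::DataLoader *loader)
{
   TFile *out = TFile::Open("TMVA_BDT.root", "RECREATE");
   TMVA::Factory factory("TMVAClassification", out, "AnalysisType=Classification");
   factory.BookMethod(loader, TMVA::Types::kBDT, "BDT",
                      "NTrees=800:MinNodeSize=2.5%:MaxDepth=3:BoostType=AdaBoost:"
                      "AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.6:nCuts=20");
   factory.TrainAllMethods();
   factory.TestAllMethods();
   factory.EvaluateAllMethods();
   out->Close();
}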