// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TMVA::DecisionTree                                                    *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation of a Decision Tree                                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch>  - CERN, Switzerland           *
 *      Helge Voss        <Helge.Voss@cern.ch>      - MPI-K Heidelberg, Germany   *
 *      Kai Voss          <Kai.Voss@cern.ch>        - U. of Victoria, Canada      *
 *      Eckhard v. Toerne <evt@uni-bonn.de>         - U of Bonn, Germany          *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>    - U of Bonn, Germany          *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/license.txt)                                      *
 *                                                                                *
 **********************************************************************************/

/*! \class TMVA::DecisionTree
\ingroup TMVA

Implementation of a Decision Tree

In a decision tree successive decision nodes are used to categorize the
events out of the sample as either signal or background. Each node
uses only a single discriminating variable to decide if the event is
signal-like ("goes right") or background-like ("goes left"). This
forms a tree-like structure with "baskets" at the end (leaf nodes),
and an event is classified as either signal or background according to
whether the basket where it ends up has been classified as signal or
background during the training. Training of a decision tree is the
process of defining the "cut criteria" for each node. The training
starts with the root node. Here one takes the full training event
sample and selects the variable and corresponding cut value that give
the best separation between signal and background at this stage. Using
this cut criterion, the sample is then divided into two subsamples, a
signal-like (right) and a background-like (left) sample. Two new nodes
are then created for each of the two sub-samples and they are
constructed using the same mechanism as described for the root
node. The division is stopped once a certain node has reached either a
minimum number of events, or a minimum or maximum signal purity. These
leaf nodes are then called "signal" or "background" if they contain
more signal or background events, respectively, from the training sample.

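A minimal, hypothetical usage sketch (in practice this class is driven by
TMVA::MethodBDT; the prepared inputs `events` and `dsi` below are assumptions
of this example, not part of this file):

~~~{.cpp}
std::vector<const TMVA::Event*> events; // filled with the training sample
TMVA::DataSetInfo dsi;                  // describes the input variables

TMVA::DecisionTree tree( new TMVA::GiniIndex(), // separation criterion
                         2.5,   // minimum node size, in % of the sample
                         20,    // NCuts: grid points scanned per variable
                         &dsi );
tree.BuildTree(events);                 // grow the tree recursively
~~~
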
*/

#include <iostream>
#include <algorithm>
#include <vector>
#include <limits>
#include <numeric>
#include <fstream>
#include <cassert>

#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"
#include "TStopwatch.h"

#include "TMVA/MsgLogger.h"
#include "TMVA/DecisionTree.h"
#include "TMVA/DecisionTreeNode.h"
#include "TMVA/BinarySearchTree.h"

#include "TMVA/Tools.h"
#include "TMVA/Config.h"

#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/Event.h"
#include "TMVA/BDTEventWrapper.h"
#include "TMVA/IPruneTool.h"
#include "TMVA/CostComplexityPruneTool.h"
#include "TMVA/ExpectedErrorPruneTool.h"

const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds

using std::vector;

ClassImp(TMVA::DecisionTree);

bool almost_equal_float(float x, float y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<float>::min();
}

bool almost_equal_double(double x, double y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<double>::min();
}
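
// Usage sketch (illustrative only, not part of the original flow): sums that
// differ only by floating point rounding compare equal within 4 ULPs, while a
// plain == comparison fails:
//
//    double a = 0.1 + 0.2;                   // 0.30000000000000004...
//    bool eq  = almost_equal_double(a, 0.3); // true (difference is ~1 ULP)
//    bool raw = (a == 0.3);                  // false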

////////////////////////////////////////////////////////////////////////////////
/// default constructor using the GiniIndex as separation criterion,
/// no restrictions on minimum number of events in a leaf node or the
/// separation gain in the node splitting

TMVA::DecisionTree::DecisionTree():
   BinaryTree(),
   fNvars          (0),
   fNCuts          (-1),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (NULL),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (1),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(0.5),
   fRandomisedTree (kFALSE),
   fUseNvars       (0),
   fUsePoissonNvars(kFALSE),
   fMyTrandom      (NULL),
   fMaxDepth       (999999),
   fSigClass       (0),
   fTreeID         (0),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (NULL)

{
#ifdef R__USE_IMT
   fNumPoolThreads = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
#endif
}

////////////////////////////////////////////////////////////////////////////////
/// constructor specifying the separation type, the min number of
/// events in a node that is still subjected to further splitting, the
/// number of bins in the grid used in applying the cut for the node
/// splitting.

TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts, TMVA::DataSetInfo* dataInfo, UInt_t cls,
                                  Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
                                  UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
   BinaryTree(),
   fNvars          (0),
   fNCuts          (nCuts),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (sepType),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (minSize),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars       (useNvars),
   fUsePoissonNvars(usePoissonNvars),
   fMyTrandom      (new TRandom3(iSeed)),
   fMaxDepth       (nMaxDepth),
   fSigClass       (cls),
   fTreeID         (treeID),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (dataInfo)
{
   if (sepType == NULL) { // it is interpreted as a regression tree, where
      // currently the separation type (simple least square)
      // cannot be chosen freely)
      fAnalysisType = Types::kRegression;
      fRegType = new RegressionVariance();
      if ( nCuts <=0 ) {
         fNCuts = 200;
         Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
               << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
               << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
               << Endl;
      }
   }else{
      fAnalysisType = Types::kClassification;
   }

#ifdef R__USE_IMT
   fNumPoolThreads = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
#endif
}

////////////////////////////////////////////////////////////////////////////////
/// copy constructor that creates a true copy, i.e. a completely independent tree
/// the node copy will recursively copy all the nodes

TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
   BinaryTree(),
   fNvars          (d.fNvars),
   fNCuts          (d.fNCuts),
   fUseFisherCuts  (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType        (d.fSepType),
   fRegType        (d.fRegType),
   fMinSize        (d.fMinSize),
   fMinNodeSize    (d.fMinNodeSize),
   fMinSepGain     (d.fMinSepGain),
   fUseSearchTree  (d.fUseSearchTree),
   fPruneStrength  (d.fPruneStrength),
   fPruneMethod    (d.fPruneMethod),
   fNNodesBeforePruning(d.fNNodesBeforePruning),
   fNodePurityLimit(d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars       (d.fUseNvars),
   fUsePoissonNvars(d.fUsePoissonNvars),
   fMyTrandom      (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
   fMaxDepth       (d.fMaxDepth),
   fSigClass       (d.fSigClass),
   fTreeID         (d.fTreeID),
   fAnalysisType   (d.fAnalysisType),
   fDataSetInfo    (d.fDataSetInfo)
{
   this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
   this->SetParentTreeInNodes();
   fNNodes = d.fNNodes;

}


////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DecisionTree::~DecisionTree()
{
   // destruction of the tree nodes done in the "base class" BinaryTree

   if (fMyTrandom) delete fMyTrandom;
   if (fRegType) delete fRegType;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes, fill max depth reached in the
/// tree at the same time.

void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
{
   if (n == NULL) { //default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetRightDaughter(n) );
      }
   }
   n->SetParentTree(this);
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// re-create a new tree (decision tree or search tree) from XML

TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML(void* node, UInt_t tmva_Version_Code ) {
   std::string type("");
   gTools().ReadAttr(node,"type", type);
   DecisionTree* dt = new DecisionTree();

   dt->ReadXML( node, tmva_Version_Code );
   return dt;
}

// #### Multithreaded DecisionTree::BuildTree
#ifdef R__USE_IMT
//====================================================================================
// Had to define this struct to enable parallelization.
// Using BuildNodeInfo, each thread can return the information needed.
// After each thread returns we can then merge the results, hence the + operator.
//====================================================================================
struct BuildNodeInfo{

   BuildNodeInfo(Int_t fNvars, const TMVA::Event* evt){

      nvars = fNvars;
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);

      // #### the initial min and max for each feature
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         xmin[ivar]=val;
         xmax[ivar]=val;
      }
   };

   BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){

      nvars = fNvars;
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);

      // #### the initial min and max for each feature
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         xmin[ivar]=inxmin[ivar];
         xmax[ivar]=inxmax[ivar];
      }
   };
   BuildNodeInfo(){};

   Int_t nvars = 0;
   Double_t s = 0;
   Double_t suw = 0;
   Double_t sub = 0;
   Double_t b = 0;
   Double_t buw = 0;
   Double_t bub = 0;
   Double_t target = 0;
   Double_t target2 = 0;
   std::vector<Float_t> xmin;
   std::vector<Float_t> xmax;

   // #### Define the addition operator for BuildNodeInfo
   // #### Make sure both BuildNodeInfos have the same nvars if we add them
   BuildNodeInfo operator+(const BuildNodeInfo& other)
   {
      BuildNodeInfo ret(nvars, xmin, xmax);
      if(nvars != other.nvars)
      {
         std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
         return ret;
      }
      ret.s = s + other.s;
      ret.suw = suw + other.suw;
      ret.sub = sub + other.sub;
      ret.b = b + other.b;
      ret.buw = buw + other.buw;
      ret.bub = bub + other.bub;
      ret.target = target + other.target;
      ret.target2 = target2 + other.target2;

      // xmin is the min of both, xmax is the max of both
      for(Int_t i=0; i<nvars; i++)
      {
         ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
         ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
      }
      return ret;
   };

};
//===========================================================================
// Done with BuildNodeInfo declaration
//===========================================================================
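
// Illustrative sketch (not part of the original flow): two per-thread partial
// results merge with operator+ exactly as the MapReduce reduction in BuildTree
// below does; partA and partB are hypothetical names.
//
//    BuildNodeInfo partA(nvars, firstEvent), partB(nvars, firstEvent);
//    // ... each thread accumulates its slice of events ...
//    BuildNodeInfo total = partA + partB;  // sums s/b/target sums, takes the
//                                          // elementwise min of xmin, max of xmax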


////////////////////////////////////////////////////////////////////////////////
/// building the decision tree by recursively calling the splitting of
/// one (root-) node into two daughter nodes (returns the number of nodes)

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
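      // e.g. MinNodeSize = 5 (%) with a training sample of 10000 events
      // translates to fMinSize = 0.05 * 10000 = 500 events per node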
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   // sum up the totals
   // sig and bkg for classification
   // err and err2 for regression

   // #### Set up prerequisite info for multithreading
   UInt_t nPartitions = fNumPoolThreads;
   auto seeds = ROOT::TSeqU(nPartitions);

   // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
   auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){

      Int_t start = 1.0*partition/nPartitions*eventSample.size();
      Int_t end   = (partition+1.0)/nPartitions*eventSample.size();

      BuildNodeInfo nodeInfof(fNvars, eventSample[0]);

      for(Int_t iev=start; iev<end; iev++){
         const TMVA::Event* evt = eventSample[iev];
         const Double_t weight = evt->GetWeight();
         const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
         if (evt->GetClass() == fSigClass) {
            nodeInfof.s += weight;
            nodeInfof.suw += 1;
            nodeInfof.sub += orgWeight;
         }
         else {
            nodeInfof.b += weight;
            nodeInfof.buw += 1;
            nodeInfof.bub += orgWeight;
         }
         if ( DoRegression() ) {
            const Double_t tgt = evt->GetTarget(0);
            nodeInfof.target +=weight*tgt;
            nodeInfof.target2+=weight*tgt*tgt;
         }

         // save the min and max for each feature
         for (UInt_t ivar=0; ivar<fNvars; ivar++) {
            const Double_t val = evt->GetValueFast(ivar);
            if (iev==start){
               nodeInfof.xmin[ivar]=val;
               nodeInfof.xmax[ivar]=val;
            }
            if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
            if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
         }
      }
      return nodeInfof;
   };

   // #### Need an initial struct to pass to std::accumulate
   BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);

   // #### Run the threads in parallel then merge the results
   auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
   BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
   //NodeInfo nodeInfo(fNvars);

   if (nodeInfo.s+nodeInfo.b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
   }

   node->SetNSigEvents(nodeInfo.s);
   node->SetNBkgEvents(nodeInfo.b);
   node->SetNSigEvents_unweighted(nodeInfo.suw);
   node->SetNBkgEvents_unweighted(nodeInfo.buw);
   node->SetNSigEvents_unboosted(nodeInfo.sub);
   node->SetNBkgEvents_unboosted(nodeInfo.bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(nodeInfo.s+nodeInfo.b);
      node->SetNEvents_unweighted(nodeInfo.suw+nodeInfo.buw);
      node->SetNEvents_unboosted(nodeInfo.sub+nodeInfo.bub);
   }

   // save the min and max for each feature
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
      node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
   }

   // I now demand the minimum number of events for both daughter nodes. Hence if the number
   // of events in the parent node is not at least two times as big, I don't even need to try
   // splitting

   // ask here for actual "events" independent of their weight.. OR the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that at the eventSample at the ROOT node has sum_of_weights == sample.size() !
   // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
   if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {

      // Train the node and figure out the separation gain and split points
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      }
      else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }

      // The node has been trained and there is nothing to be gained by splitting
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
            node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
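            // i.e. the leaf response is the weighted mean of the targets,
            //    response = sum(w*t) / sum(w),
            // and the RMS below is the weighted standard deviation,
            //    RMS^2 = sum(w*t^2)/sum(w) - response^2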
            if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
               node->SetRMS(0);
            }else{
               node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
      }
      else {
         // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
         // #### ... would need to avoid the push_back or use some threadsafe mutex locked version...
         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }
         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
         node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
         if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight
         // those "cannot" be boosted normally. Hence, if one of those
         // is misclassified, find randomly as many events with positive weights in this
         // node as needed to get the same absolute number of weight, and mark them as
         // "not to be boosted" in order to make up for not boosting the negative weight event
      }


      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   // if (IsRootNode) this->CleanTree();
   return fNNodes;
}

// Standard DecisionTree::BuildTree (multithreading is not enabled)
#else

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   Double_t s=0, b=0;
   Double_t suw=0, buw=0;
   Double_t sub=0, bub=0; // unboosted!
   Double_t target=0, target2=0;
   Float_t *xmin = new Float_t[fNvars];
   Float_t *xmax = new Float_t[fNvars];

   // initializing xmin and xmax for each variable
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      xmin[ivar]=xmax[ivar]=0;
   }
   // sum up the totals
   // sig and bkg for classification
   // err and err2 for regression
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      const TMVA::Event* evt = eventSample[iev];
      const Double_t weight = evt->GetWeight();
      const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
      if (evt->GetClass() == fSigClass) {
         s += weight;
         suw += 1;
         sub += orgWeight;
      }
      else {
         b += weight;
         buw += 1;
         bub += orgWeight;
      }
      if ( DoRegression() ) {
         const Double_t tgt = evt->GetTarget(0);
         target +=weight*tgt;
         target2+=weight*tgt*tgt;
      }

      // save the min and max for each feature
      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }


   if (s+b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<s<<" Nbkg="<<b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
   }

   node->SetNSigEvents(s);
   node->SetNBkgEvents(b);
   node->SetNSigEvents_unweighted(suw);
   node->SetNBkgEvents_unweighted(buw);
   node->SetNSigEvents_unboosted(sub);
   node->SetNBkgEvents_unboosted(bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(s+b);
      node->SetNEvents_unweighted(suw+buw);
      node->SetNEvents_unboosted(sub+bub);
   }

   // save the min and max for each feature
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,xmin[ivar]);
      node->SetSampleMax(ivar,xmax[ivar]);

   }
   delete[] xmin;
   delete[] xmax;

   // I now demand the minimum number of events for both daughter nodes. Hence if the number
   // of events in the parent node is not at least two times as big, I don't even need to try
   // splitting

   // ask here for actual "events" independent of their weight.. OR the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that at the eventSample at the ROOT node has sum_of_weights == sample.size() !
   // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      } else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         /// if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
            node->SetResponse(target/(s+b));
            if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
               node->SetRMS(0);
            }else{
               node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));

            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());

      } else {

         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }

         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
         node->SetResponse(target/(s+b));
         if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight
         // those "cannot" be boosted normally. Hence, if one of those
         // is misclassified, find randomly as many events with positive weights in this
         // node as needed to get the same absolute number of weight, and mark them as
         // "not to be boosted" in order to make up for not boosting the negative weight event
      }


      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   // if (IsRootNode) this->CleanTree();
   return fNNodes;
}

#endif

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling events
/// in from the top node and seeing where they happen to end up

void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
{
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]),NULL);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling events
/// in from the top node and seeing where they happen to end up

void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
                                    TMVA::DecisionTreeNode *node )
{
   if (node == NULL) { // that's the start, take the Root node
      node = this->GetRoot();
   }

   node->IncrementNEvents( event.GetWeight() );
   node->IncrementNEvents_unweighted( );

   if (event.GetClass() == fSigClass) {
      node->IncrementNSigEvents( event.GetWeight() );
      node->IncrementNSigEvents_unweighted( );
   }
   else {
      node->IncrementNBkgEvents( event.GetWeight() );
      node->IncrementNBkgEvents_unweighted( );
   }
   node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
                                                         node->GetNBkgEvents()));

   if (node->GetNodeType() == 0) { //intermediate node --> go down
      if (node->GoesRight(event))
         this->FillEvent(event, node->GetRight());
      else
         this->FillEvent(event, node->GetLeft());
   }
}

////////////////////////////////////////////////////////////////////////////////
/// clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree

void TMVA::DecisionTree::ClearTree()
{
   if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();

}

////////////////////////////////////////////////////////////////////////////////
/// remove those last splits that result in two leaf nodes that
/// are both of the same type (i.e. both signal or both background)
/// this of course is only a reasonable thing to do when you use
/// "YesOrNo" leafs, while it might lose something if you use the
/// purity information in the nodes.
/// --> hence I don't call it automatically in the tree building

UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
{
   if (node==NULL) {
      node = this->GetRoot();
   }

   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   if (node->GetNodeType() == 0) {
      this->CleanTree(l);
      this->CleanTree(r);
      if (l->GetNodeType() * r->GetNodeType() > 0) {

         this->PruneNode(node);
      }
   }
   // update the number of nodes after the cleaning
   return this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune (get rid of internal nodes) the Decision tree to avoid overtraining
/// several different pruning methods can be applied as selected by the
/// variable "fPruneMethod".
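///
/// A hypothetical configuration sketch (normally these settings are driven by
/// the MethodBDT option string; `validationEvents` is an assumed, prepared
/// EventConstList of this example):
/// ~~~{.cpp}
/// tree.SetPruneMethod(TMVA::DecisionTree::kCostComplexityPruning);
/// tree.SetPruneStrength(-1.);        // non-positive: let the tool optimise it
/// tree.PruneTree(&validationEvents); // returns the prune strength used
/// ~~~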

Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
{
   IPruneTool* tool(NULL);
   PruningInfo* info(NULL);

   if( fPruneMethod == kNoPruning ) return 0.0;

   if      (fPruneMethod == kExpectedErrorPruning)
      //      tool = new ExpectedErrorPruneTool(logfile);
      tool = new ExpectedErrorPruneTool();
   else if (fPruneMethod == kCostComplexityPruning)
      {
         tool = new CostComplexityPruneTool();
      }
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented "
            << Endl;
   }

   if(!tool) return 0.0;

   tool->SetPruneStrength(GetPruneStrength());
   if(tool->IsAutomatic()) {
      if(validationSample == NULL){
         Log() << kFATAL << "Cannot automate the pruning algorithm without an "
               << "independent validation sample!" << Endl;
      }else if(validationSample->size() == 0) {
         Log() << kFATAL << "Cannot automate the pruning algorithm with "
               << "independent validation sample of ZERO events!" << Endl;
      }
   }

   info = tool->CalculatePruningInfo(this,validationSample);
   Double_t pruneStrength=0;
   if(!info) {
      Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
            << Endl;
   } else {
      pruneStrength = info->PruneStrength;

      //   Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
      //         << " has quality index " << info->QualityIndex << Endl;


      for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {

         PruneNode(info->PruneSequence[i]);
      }
      // update the number of nodes after the pruning
      this->CountNodes();
   }

   delete tool;
   delete info;

   return pruneStrength;
};


////////////////////////////////////////////////////////////////////////////////
/// run the validation sample through the (pruned) tree and fill in the nodes
/// the variables NSValidation and NBValidation (i.e. how many of the signal
/// and background events from the validation sample end up in each node).
/// This is then later used when asking for the "tree quality" ..

void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
{
   GetRoot()->ResetValidationData();
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// return the misclassification rate of a pruned tree
/// a "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
/// any node, hence this tree quality testing will stop there, hence test
/// the pruned tree (while the full tree is still in place for normal/later use)

Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
      return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   }
   else { // terminal leaf (in a pruned subtree of T_max at least)
      if (DoRegression()) {
         Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
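         // the expression below expands sum_i w_i*(t_i - response)^2, i.e. the
         // weighted squared deviation of the validation targets from the leaf response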
         return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
      }
      else {
         if (mode == 0) {
            if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
               return n->GetNBValidation();
            else
               return n->GetNSValidation();
         }
         else if ( mode == 1 ) {
            // calculate the weighted error using the pruning validation sample
            return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
         }
         else {
            throw std::string("Unknown ValidationQualityMode");
         }
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// pass a single validation event through a pruned decision tree
/// on the way down the tree, fill in all the "intermediate" information
/// that would normally be there from training.

void TMVA::DecisionTree::CheckEventWithPrunedTree( const Event* e ) const
{
   DecisionTreeNode* current = this->GetRoot();
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
   }

   while(current != NULL) {
      if(e->GetClass() == fSigClass)
         current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
      else
         current->SetNBValidation(current->GetNBValidation() + e->GetWeight());

      if (e->GetNTargets() > 0) {
         current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
         current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
      }

      if (current->GetRight() == NULL || current->GetLeft() == NULL) {
         current = NULL;
      }
      else {
         if (current->GoesRight(*e))
            current = (TMVA::DecisionTreeNode*)current->GetRight();
         else
            current = (TMVA::DecisionTreeNode*)current->GetLeft();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// calculate the normalization factor for a pruning validation sample

Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
{
   Double_t sumWeights = 0.0;
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
   return sumWeights;
}

////////////////////////////////////////////////////////////////////////////////
/// return the number of terminal nodes in the sub-tree below Node n

UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   UInt_t countLeafs=0;

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      countLeafs += 1;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
      }
   }
   return countLeafs;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes

void TMVA::DecisionTree::DescendTree( Node* n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // do nothing
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->DescendTree( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->DescendTree( this->GetRightDaughter(n) );
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// prune away the subtree below the node

void TMVA::DecisionTree::PruneNode( DecisionTreeNode* node )
{
   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   node->SetRight(NULL);
   node->SetLeft(NULL);
   node->SetSelector(-1);
   node->SetSeparationGain(-1);
   if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
   else node->SetNodeType(-1);
   this->DeleteNode(l);
   this->DeleteNode(r);
   // update the stored number of nodes in the Tree
   this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune a node temporarily (without actually deleting its descendants)
/// which allows testing the pruned tree quality for many different
/// pruning stages without "touching" the tree.

void TMVA::DecisionTree::PruneNodeInPlace( DecisionTreeNode* node ) {
   if(node == NULL) return;
   node->SetNTerminal(1);
   node->SetSubTreeR( node->GetNodeR() );
   node->SetAlpha( std::numeric_limits<double>::infinity( ) );
   node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
   node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
}

////////////////////////////////////////////////////////////////////////////////
/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
/// is coded as a sequence of left-right moves starting from the root, coded as
/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right)
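///
/// For example, sequence = 5 (binary 101) with depth = 3 walks
/// right (bit 0 = 1), then left (bit 1 = 0), then right (bit 2 = 1)
/// starting from the root.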

TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
{
   Node* current = this->GetRoot();

   for (UInt_t i =0; i < depth; i++) {
      ULong_t tmp = 1 << i;
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else current = this->GetLeftDaughter(current);
   }

   return current;
}

////////////////////////////////////////////////////////////////////////////////
/// randomly select the subset of variables to be considered for the node split
/// (used for randomised trees)

void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
      // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
      fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
   }
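   // e.g. fNvars = 16 gives fUseNvars = UInt_t(sqrt(16)+0.6) = UInt_t(4.6) = 4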
   if (fUsePoissonNvars) useNvars = TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t)fMyTrandom->Poisson(fUseNvars)));
   else useNvars = fUseNvars;

   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      useVariable[Int_t (bla)] = kTRUE;
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1);}
}

// Multithreaded version of DecisionTree::TrainNodeFast
#ifdef R__USE_IMT
//====================================================================================
// Had to define this struct to enable parallelization in TrainNodeFast.
// Using TrainNodeInfo, each thread can return the information needed.
// After each thread returns we can then merge the results, hence the + operator.
//====================================================================================
struct TrainNodeInfo{

   TrainNodeInfo(Int_t cNvars_, UInt_t* nBins_){

      cNvars = cNvars_;
      nBins = nBins_;

      nSelS = std::vector< std::vector<Double_t> >(cNvars);
      nSelB = std::vector< std::vector<Double_t> >(cNvars);
      nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      target = std::vector< std::vector<Double_t> >(cNvars);
      target2 = std::vector< std::vector<Double_t> >(cNvars);

      for(Int_t ivar=0; ivar<cNvars; ivar++){
         nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
      }
   };

   TrainNodeInfo(){};

   // #### malloc problem if I define this and try to destruct xmin and xmax...
   // #### Maybe someone better at C++ can figure out why and fix this if it's a
   // #### serious memory problem
   //~TrainNodeInfo(){
   //   delete [] xmin;
   //   delete [] xmax;
   //};

   Int_t cNvars = 0;
   UInt_t* nBins;

   Double_t nTotS = 0;
   Double_t nTotS_unWeighted = 0;
   Double_t nTotB = 0;
   Double_t nTotB_unWeighted = 0;

   std::vector< std::vector<Double_t> > nSelS;
   std::vector< std::vector<Double_t> > nSelB;
   std::vector< std::vector<Double_t> > nSelS_unWeighted;
   std::vector< std::vector<Double_t> > nSelB_unWeighted;
   std::vector< std::vector<Double_t> > target;
   std::vector< std::vector<Double_t> > target2;

   // Define the addition operator for TrainNodeInfo
   // Make sure both TrainNodeInfos have the same nvars if we add them
   TrainNodeInfo operator+(const TrainNodeInfo& other)
   {
      TrainNodeInfo ret(cNvars, nBins);

      // check that the two are compatible to add
      if(cNvars != other.cNvars)
      {
         std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
         return ret;
      }

      // add the signal, background, and target sums
      for (Int_t ivar=0; ivar<cNvars; ivar++) {
         for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
            ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
            ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
            ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
            ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
            ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
            ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
         }
      }

      ret.nTotS = nTotS + other.nTotS;
      ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
      ret.nTotB = nTotB + other.nTotB;
      ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;

      return ret;
   };

};
//===========================================================================
// Done with TrainNodeInfo declaration
//===========================================================================
1375 
1376 ////////////////////////////////////////////////////////////////////////////////
1377 /// Decide how to split a node using one of the variables that gives
1378 /// the best separation of signal/background. In order to do this, for each
1379 /// variable a scan of the different cut values in a grid (grid = fNCuts) is
1380 /// performed and the resulting separation gains are compared.
1381 /// in addition to the individual variables, one can also ask for a fisher
1382 /// discriminant being built out of (some) of the variables and used as a
1383 /// possible multivariate split.
1384 
1386  TMVA::DecisionTreeNode *node )
1387 {
1388  // #### OK let's comment this one to see how to parallelize it
1389  Double_t separationGainTotal = -1;
1390  Double_t *separationGain = new Double_t[fNvars+1];
1391  Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1392 
1393  // #### initialize the sep gain and cut index values
1394  for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1395  separationGain[ivar]=-1;
1396  cutIndex[ivar]=-1;
1397  }
1398  // ### set up some other variables
1399  Int_t mxVar = -1;
1400  Bool_t cutType = kTRUE;
1401  UInt_t nevents = eventSample.size();
1402 
1403 
1404  // the +1 comes from the fact that I treat later on the Fisher output as an
1405  // additional possible variable.
1406  Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1407  UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1408 
1409  std::vector<Double_t> fisherCoeff;
1410 
1411  // #### set up a map to the subset of variables using two arrays
1412  if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1413  UInt_t tmp=fUseNvars;
1414  GetRandomisedVariables(useVariable,mapVariable,tmp);
1415  }
1416  else {
1417  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1418  useVariable[ivar] = kTRUE;
1419  mapVariable[ivar] = ivar;
1420  }
1421  }
1422  // #### last variable entry is the fisher variable
1423  useVariable[fNvars] = kFALSE; //by default fisher is not used..
1424 
1425  // #### Begin Fisher calculation
1426  Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1427  if (fUseFisherCuts) {
1428  useVariable[fNvars] = kTRUE; // that's were I store the "fisher MVA"
1429 
1430  //use for the Fisher discriminant ONLY those variables that show
1431  //some reasonable linear correlation in either Signal or Background
1432  Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1433  UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1434  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1435  useVarInFisher[ivar] = kFALSE;
1436  mapVarInFisher[ivar] = ivar;
1437  }
1438 
1439  std::vector<TMatrixDSym*>* covMatrices;
1440  covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1441  if (!covMatrices){
1442  Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1443  fisherOK = kFALSE;
1444  }else{
1445  TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1446  TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1447  const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1448  const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1449 
1450  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1451  for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1452  if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1453  ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1454  useVarInFisher[ivar] = kTRUE;
1455  useVarInFisher[jvar] = kTRUE;
1456  }
1457  }
1458  }
1459  // now as you know which variables you want to use, count and map them:
1460  // such that you can use an array/matrix filled only with THOSE variables
1461  // that you used
1462  UInt_t nFisherVars = 0;
1463  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1464  //now .. pick those variables that are used in the FIsher and are also
1465  // part of the "allowed" variables in case of Randomized Trees)
1466  if (useVarInFisher[ivar] && useVariable[ivar]) {
1467  mapVarInFisher[nFisherVars++]=ivar;
1468  // now exclude the variables used in the Fisher cuts, and don't
1469  // use them anymore in the individual variable scan
1470  if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1471  }
1472  }
1473 
1474 
1475  fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1476  fisherOK = kTRUE;
1477  }
1478  delete [] useVarInFisher;
1479  delete [] mapVarInFisher;
1480 
1481  }
1482  // #### End Fisher calculation
1483 
1484 
1485  UInt_t cNvars = fNvars;
1486  if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1487 
1488  // #### set up the binning info arrays
1489  // #### each var has its own binning since some may be integers
1490  UInt_t* nBins = new UInt_t [cNvars];
1491  Double_t* binWidth = new Double_t [cNvars];
1492  Double_t* invBinWidth = new Double_t [cNvars];
1493  Double_t** cutValues = new Double_t* [cNvars];
1494 
1495  // #### set up the xmin and xmax arrays
1496  // #### each var has its own range
1497  Double_t *xmin = new Double_t[cNvars];
1498  Double_t *xmax = new Double_t[cNvars];
1499 
1500  // construct and initialize binning/cuts
1501  for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1502  // ncuts means that we need n+1 bins for each variable
1503  nBins[ivar] = fNCuts+1;
1504  if (ivar < fNvars) {
1505  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1506  nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1507  }
1508  }
1509 
1510  cutValues[ivar] = new Double_t [nBins[ivar]];
1511  }
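 // e.g. (assumed numbers) an integer variable spanning 3..7 in this node gets
 // nBins = 7 - 3 + 1 = 5 unit-width bins, one per integer value, instead of
 // the default fNCuts+1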
1512 
1513  // init the range and cutvalues for each var now that we know the binning
1514  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1515  if (ivar < fNvars){
1516  xmin[ivar]=node->GetSampleMin(ivar);
1517  xmax[ivar]=node->GetSampleMax(ivar);
1518  if (almost_equal_float(xmax[ivar], xmin[ivar])) {
1519  // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1520  // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1521  useVariable[ivar]=kFALSE;
1522  }
1523 
1524  } else { // the fisher variable
1525  xmin[ivar]=999;
1526  xmax[ivar]=-999;
1527  // too bad, for the moment I don't know how to do this without looping
1528  // once to get the "min max" and then AGAIN to fill the histogram
1529  for (UInt_t iev=0; iev<nevents; iev++) {
1530  // returns the Fisher value (no fixed range)
1531  Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1532  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1533  result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1534  if (result > xmax[ivar]) xmax[ivar]=result;
1535  if (result < xmin[ivar]) xmin[ivar]=result;
1536  }
1537  }
1538  // this loop could take a long time if nbins is large
1539  for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1540  cutValues[ivar][ibin]=0;
1541  }
1542  }
1543 
1544  // ====================================================================
1545  // ====================================================================
1546  // Parallelized Version
1547  // ====================================================================
1548  // ====================================================================
1549 
1550  // #### Figure out the cut values, loops through vars then through cuts
1551  // #### if ncuts is on the order of ndata/10 or so, we get some gains from parallelizing this
1552  // fill the cut values for the scan:
1553  auto varSeeds = ROOT::TSeqU(cNvars);
1554  auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1555 
1556  if ( useVariable[ivar] ) {
1557 
1558  //set the grid for the cut scan on the variables like this:
1559  //
1560  // | | | | | ... | |
1561  // xmin xmax
1562  //
1563  // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1564  // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1565  // --> nBins = fNCuts+1
1566  // (NOTE: the cuts at xmin or xmax would just give the whole sample and
1567  // hence can be safely omitted)
1568 
1569  binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1570  invBinWidth[ivar] = 1./binWidth[ivar];
1571  if (ivar < fNvars) {
1572  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1573  }
1574 
1575  // std::cout << "ivar="<<ivar
1576  // <<" min="<<xmin[ivar]
1577  // << " max="<<xmax[ivar]
1578  // << " width=" << istepSize
1579  // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1580  for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1581  cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1582  // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1583  }
1584  }
1585  return 0;
1586  };
1587  TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
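 // illustrative numbers for the grid above: with fNCuts=4, nBins=5; for
 // xmin=0 and xmax=10 this gives binWidth=2 and scan cuts at {2, 4, 6, 8};
 // the would-be cuts at 0 and 10 are omitted since they do not split the sample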
1588 
1589  // #### Loop through the events to get the total sig and background
1590  // #### Then loop through the vars to get the counts in each bin in each var
1591  // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts: the bin index is computed directly
1592 
1593  TrainNodeInfo nodeInfo(cNvars, nBins);
1594  UInt_t nPartitions = fNumPoolThreads;
1595 
1596  // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1597  // #### Parallelize by chunking the data into the same number of sections as we have processors
1598  if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1599  {
1600  auto seeds = ROOT::TSeqU(nPartitions);
1601 
1602  // need a lambda function to pass to TThreadExecutor::MapReduce
1603  auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1604  &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1605 
1606  UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1607  UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
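 // e.g. (assumed numbers) with 1000 events and nPartitions=4 the partitions
 // cover the index ranges [0,250), [250,500), [500,750), [750,1000)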
1608 
1609  TrainNodeInfo nodeInfof(cNvars, nBins);
1610 
1611  for(UInt_t iev=start; iev<end; iev++) {
1612 
1613  Double_t eventWeight = eventSample[iev]->GetWeight();
1614  if (eventSample[iev]->GetClass() == fSigClass) {
1615  nodeInfof.nTotS+=eventWeight;
1616  nodeInfof.nTotS_unWeighted++; }
1617  else {
1618  nodeInfof.nTotB+=eventWeight;
1619  nodeInfof.nTotB_unWeighted++;
1620  }
1621 
1622  // #### Count the number in each bin
1623  Int_t iBin=-1;
1624  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1625  // now scan through the cuts for each variable and find which one gives
1626  // the best separationGain at the current stage.
1627  if ( useVariable[ivar] ) {
1628  Double_t eventData;
1629  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1630  else { // the fisher variable
1631  eventData = fisherCoeff[fNvars];
1632  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1633  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1634 
1635  }
1636  // #### figure out which bin it belongs in ...
1637  // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1638  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
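 // e.g. (assumed numbers) with xmin=0, invBinWidth=0.5 and nBins=5:
 // eventData=7.3 gives int(3.65)=3 -> bin 3, while eventData=11.4 gives
 // int(5.7)=5, clamped to nBins-1=4, so overflows land in the last bin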
1639  if (eventSample[iev]->GetClass() == fSigClass) {
1640  nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1641  nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1642  }
1643  else {
1644  nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1645  nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1646  }
1647  if (DoRegression()) {
1648  nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1649  nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1650  }
1651  }
1652  }
1653  }
1654  return nodeInfof;
1655  };
1656 
1657  // #### Need an initial struct to pass to std::accumulate
1658  TrainNodeInfo nodeInfoInit(cNvars, nBins);
1659 
1660  // #### Run the threads in parallel then merge the results
1661  auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1662  nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
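 // note: std::accumulate inside redfunc requires TrainNodeInfo to provide an
 // addition operator that merges the per-partition totals and per-bin counts,
 // so the reduction is an element-wise sum of the partial results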
1663  }
1664 
1665 
1666  // #### When nbins is close to the order of the data this version of parallelization is faster
1667  // #### Parallelize by vectorizing the variable loop
1668  else {
1669 
1670  auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
1671 
1672  for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1673 
1674  Int_t iBin=-1;
1675  Double_t eventWeight = eventSample[iev]->GetWeight();
1676 
1677  // Only count the net signal and background once
1678  if(ivar==0){
1679  if (eventSample[iev]->GetClass() == fSigClass) {
1680  nodeInfo.nTotS+=eventWeight;
1681  nodeInfo.nTotS_unWeighted++; }
1682  else {
1683  nodeInfo.nTotB+=eventWeight;
1684  nodeInfo.nTotB_unWeighted++;
1685  }
1686  }
1687 
1688  // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1689  if ( useVariable[ivar] ) {
1690  Double_t eventData;
1691  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1692  else { // the fisher variable
1693  eventData = fisherCoeff[fNvars];
1694  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1695  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1696 
1697  }
1698  // #### figure out which bin it belongs in ...
1699  // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1700  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1701  if (eventSample[iev]->GetClass() == fSigClass) {
1702  nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1703  nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1704  }
1705  else {
1706  nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1707  nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1708  }
1709  if (DoRegression()) {
1710  nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1711  nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1712  }
1713  }
1714  }
1715  return 0;
1716  };
1717 
1718  TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
1719  }
1720 
1721  // now turn each "histogram" into a cumulative distribution
1722  // #### loops through the vars and then the bins, if the bins are on the order of the training data this is worth parallelizing
1723  // #### doesn't hurt otherwise, pretty unnoticeable
1724  auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1725  if (useVariable[ivar]) {
1726  for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1727  nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1728  nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1729  nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1730  nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1731  if (DoRegression()) {
1732  nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1733  nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1734  }
1735  }
1736  if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1737  Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1738  << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1739  << " while eventsample size = " << eventSample.size()
1740  << Endl;
1741  }
1742  double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1743  double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1744  if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
1745  Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1746  << lastBins
1747  << " while total number of events = " << totalSum
1748  << Endl;
1749  }
1750  }
1751  return 0;
1752  };
1753  TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
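 // e.g. per-bin signal counts {3, 5, 2} become the cumulative {3, 8, 10};
 // after this step nSelS[ivar][ibin] holds the summed signal weight of bins
 // 0..ibin, i.e. the event count on one side of the cut at that bin's upper edge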
1754 
1755  // #### Again, if nbins is on the order of the training data, or within an order of magnitude or so, then this is worth parallelizing
1756  // now select the optimal cuts for each varable and find which one gives
1757  // the best separationGain at the current stage
1758  auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1759  if (useVariable[ivar]) {
1760  Double_t sepTmp;
1761  for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
1762  // the separationGain is defined via the various indices (Gini, CrossEntropy, etc.)
1763  // calculated by the "SamplePurities" from the branches that would go to the
1764  // left or the right from this node if "these" cuts were used in the Node:
1765  // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1766  // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1767 
1768  // only allow splits where both daughter nodes match the specified minimum number
1769  // for this use the "unweighted" events, as you are interested in statistically
1770  // significant splits, which is determined by the actual number of entries
1771  // for a node, rather than the sum of event weights.
1772 
1773  Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1774  Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1775  Double_t s = nodeInfo.nTotS_unWeighted;
1776  Double_t b = nodeInfo.nTotB_unWeighted;
1777  Double_t slW = nodeInfo.nSelS[ivar][iBin];
1778  Double_t blW = nodeInfo.nSelB[ivar][iBin];
1779  Double_t sW = nodeInfo.nTotS;
1780  Double_t bW = nodeInfo.nTotB;
1781  Double_t sr = s-sl;
1782  Double_t br = b-bl;
1783  Double_t srW = sW-slW;
1784  Double_t brW = bW-blW;
1785  // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1786  if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1787  && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1788  ) {
1789 
1790  if (DoRegression()) {
1791  sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1792  nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1793  nodeInfo.nTotS+nodeInfo.nTotB,
1794  nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1795  } else {
1796  sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1797  }
1798  if (separationGain[ivar] < sepTmp) {
1799  separationGain[ivar] = sepTmp;
1800  cutIndex[ivar] = iBin;
1801  }
1802  }
1803  }
1804  }
1805  return 0;
1806  };
1807  TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
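 // separation-gain illustration (assumed numbers, Gini index p*(1-p) up to
 // normalisation): a parent with (50S,50B) has index 0.25; a cut giving
 // daughters (40S,10B) and (10S,40B), each with index 0.16, yields a gain of
 // 0.25 - 0.5*0.16 - 0.5*0.16 = 0.09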
1808 
1809  // you found the best separation cut for each variable, now compare the variables
1810  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1811  if (useVariable[ivar] ) {
1812  if (separationGainTotal < separationGain[ivar]) {
1813  separationGainTotal = separationGain[ivar];
1814  mxVar = ivar;
1815  }
1816  }
1817  }
1818 
1819  if (mxVar >= 0) {
1820  if (DoRegression()) {
1821  node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1822  node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1823  if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1824  node->SetRMS(0);
1825 
1826  }else{
1827  node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1828  }
1829  }
1830  else {
1831  node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1832  if (mxVar >=0){
1833  if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1834  else cutType=kFALSE;
1835  }
1836  }
1837  node->SetSelector((UInt_t)mxVar);
1838  node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
1839  node->SetCutType(cutType);
1840  node->SetSeparationGain(separationGainTotal);
1841  if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
1842  node->SetNFisherCoeff(0);
1843  fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1844  //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1845  }else{
1846  // allocate Fisher coefficients (use fNvars+1 and set the unused ones to zero; this
1847  // might on average even need less storage than also storing the mapping used otherwise).
1848  // Can always be changed relatively easily.
1849  node->SetNFisherCoeff(fNvars+1);
1850  for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1851  node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
1852  // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
1853  if (ivar<fNvars){
1854  fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1855  }
1856  }
1857  }
1858  }
1859  else {
1860  separationGainTotal = 0;
1861  }
1862 
1863  // #### Now in TrainNodeInfo, but I got a malloc segfault when I tried to destruct arrays there.
1864  // #### So, I changed these from dynamic arrays to std::vector to fix this memory problem
1865  // #### so no need to destruct them anymore. I didn't see any performance drop as a result.
1866  //for (UInt_t i=0; i<cNvars; i++) {
1867  // delete [] nodeInfo.nSelS[i];
1868  // delete [] nodeInfo.nSelB[i];
1869  // delete [] nodeInfo.nSelS_unWeighted[i];
1870  // delete [] nodeInfo.nSelB_unWeighted[i];
1871  // delete [] nodeInfo.target[i];
1872  // delete [] nodeInfo.target2[i];
1873  // delete [] cutValues[i];
1874  //}
1875  //delete [] nodeInfo.nSelS;
1876  //delete [] nodeInfo.nSelB;
1877  //delete [] nodeInfo.nSelS_unWeighted;
1878  //delete [] nodeInfo.nSelB_unWeighted;
1879  //delete [] nodeInfo.target;
1880  //delete [] nodeInfo.target2;
1881 
1882  // #### left these as dynamic arrays as they were before
1883  // #### since I didn't need to mess with them for parallelization
1884  delete [] cutValues;
1885 
1886  delete [] xmin;
1887  delete [] xmax;
1888 
1889  delete [] useVariable;
1890  delete [] mapVariable;
1891 
1892  delete [] separationGain;
1893  delete [] cutIndex;
1894 
1895  delete [] nBins;
1896  delete [] binWidth;
1897  delete [] invBinWidth;
1898 
1899  return separationGainTotal;
1900 }
1901 
1902 // Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1903 #else
1904 Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1905  TMVA::DecisionTreeNode *node )
1906 {
1907 // #### OK let's comment this one to see how to parallelize it
1908  Double_t separationGainTotal = -1, sepTmp;
1909  Double_t *separationGain = new Double_t[fNvars+1];
1910  Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1911 
1912  // #### initialize the sep gain and cut index values
1913  for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1914  separationGain[ivar]=-1;
1915  cutIndex[ivar]=-1;
1916  }
1917  // #### set up some other variables
1918  Int_t mxVar = -1;
1919  Bool_t cutType = kTRUE;
1920  Double_t nTotS, nTotB;
1921  Int_t nTotS_unWeighted, nTotB_unWeighted;
1922  UInt_t nevents = eventSample.size();
1923 
1924 
1925  // the +1 comes from the fact that I treat later on the Fisher output as an
1926  // additional possible variable.
1927  Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1928  UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1929 
1930  std::vector<Double_t> fisherCoeff;
1931 
1932  // #### set up a map to the subset of variables using two arrays
1933  if (fRandomisedTree) { // for each node splitting, choose a random subset of variables to cut on
1934  UInt_t tmp=fUseNvars;
1935  GetRandomisedVariables(useVariable,mapVariable,tmp);
1936  }
1937  else {
1938  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1939  useVariable[ivar] = kTRUE;
1940  mapVariable[ivar] = ivar;
1941  }
1942  }
1943  // #### last variable entry is the fisher variable
1944  useVariable[fNvars] = kFALSE; //by default fisher is not used..
1945 
1946  // #### Begin Fisher calculation
1947  Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1948  if (fUseFisherCuts) {
1949  useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1950 
1951  //use for the Fisher discriminant ONLY those variables that show
1952  //some reasonable linear correlation in either Signal or Background
1953  Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVarInFisher(fNvars);
1954  UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the variables used in the Fisher discriminant to the original variable number
1955  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1956  useVarInFisher[ivar] = kFALSE;
1957  mapVarInFisher[ivar] = ivar;
1958  }
1959 
1960  std::vector<TMatrixDSym*>* covMatrices;
1961  covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1962  if (!covMatrices){
1963  Log() << kWARNING << " in TrainNodeFast, the covariance matrices needed for the Fisher cuts returned an error --> revert to just normal cuts for this node" << Endl;
1964  fisherOK = kFALSE;
1965  }else{
1966  TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1967  TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1968  const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1969  const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1970 
1971  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1972  for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1973  if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1974  ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1975  useVarInFisher[ivar] = kTRUE;
1976  useVarInFisher[jvar] = kTRUE;
1977  }
1978  }
1979  }
1980  // now that we know which variables to use, count and map them,
1981  // so that arrays/matrices can be filled with only THOSE variables
1982  // that are actually used
1983  UInt_t nFisherVars = 0;
1984  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1985  // now pick those variables that are used in the Fisher and are also
1986  // part of the "allowed" variables (in case of Randomised Trees)
1987  if (useVarInFisher[ivar] && useVariable[ivar]) {
1988  mapVarInFisher[nFisherVars++]=ivar;
1989  // now exclude the variables used in the Fisher cuts, and don't
1990  // use them anymore in the individual variable scan
1991  if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1992  }
1993  }
1994 
1995 
1996  fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1997  fisherOK = kTRUE;
1998  }
1999  delete [] useVarInFisher;
2000  delete [] mapVarInFisher;
2001 
2002  }
2003  // #### End Fisher calculation
2004 
2005 
2006  UInt_t cNvars = fNvars;
2007  if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
2008 
2009  // #### OK now what's going on...
2010  // #### looks like we are setting up histograms
2011  UInt_t* nBins = new UInt_t [cNvars];
2012  Double_t* binWidth = new Double_t [cNvars];
2013  Double_t* invBinWidth = new Double_t [cNvars];
2014 
2015  Double_t** nSelS = new Double_t* [cNvars];
2016  Double_t** nSelB = new Double_t* [cNvars];
2017  Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2018  Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2019  Double_t** target = new Double_t* [cNvars];
2020  Double_t** target2 = new Double_t* [cNvars];
2021  Double_t** cutValues = new Double_t* [cNvars];
2022 
2023  // #### looping through the variables...
2024  for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2025  // #### ncuts means that we need n+1 bins for each variable
2026  nBins[ivar] = fNCuts+1;
2027  if (ivar < fNvars) {
2028  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2029  nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2030  }
2031  }
2032 
2033  // #### make some new arrays for each ith var, size=nbins for each array
2034  // #### integer features get the same number of bins as values, set later
2035  nSelS[ivar] = new Double_t [nBins[ivar]];
2036  nSelB[ivar] = new Double_t [nBins[ivar]];
2037  nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2038  nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2039  target[ivar] = new Double_t [nBins[ivar]];
2040  target2[ivar] = new Double_t [nBins[ivar]];
2041  cutValues[ivar] = new Double_t [nBins[ivar]];
2042 
2043  }
2044 
2045  // #### xmin and xmax for each variable
2046  Double_t *xmin = new Double_t[cNvars];
2047  Double_t *xmax = new Double_t[cNvars];
2048 
2049  // #### ok loop through each variable to initialize all the values
2050  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2051  if (ivar < fNvars){
2052  xmin[ivar]=node->GetSampleMin(ivar);
2053  xmax[ivar]=node->GetSampleMax(ivar);
2054  if (almost_equal_float(xmax[ivar], xmin[ivar])) {
2055  // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2056  // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2057  useVariable[ivar]=kFALSE;
2058  }
2059 
2060  } else { // the fisher variable
2061  xmin[ivar]=999;
2062  xmax[ivar]=-999;
2063  // too bad, for the moment I don't know how to do this without looping
2064  // once to get the "min max" and then AGAIN to fill the histogram
2065  for (UInt_t iev=0; iev<nevents; iev++) {
2066  // returns the Fisher value (no fixed range)
2067  Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2068  for (UInt_t jvar=0; jvar<fNvars; jvar++)
2069  result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2070  if (result > xmax[ivar]) xmax[ivar]=result;
2071  if (result < xmin[ivar]) xmin[ivar]=result;
2072  }
2073  }
2074  for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2075  nSelS[ivar][ibin]=0;
2076  nSelB[ivar][ibin]=0;
2077  nSelS_unWeighted[ivar][ibin]=0;
2078  nSelB_unWeighted[ivar][ibin]=0;
2079  target[ivar][ibin]=0;
2080  target2[ivar][ibin]=0;
2081  cutValues[ivar][ibin]=0;
2082  }
2083  }
2084 
2085  // #### Nothing to parallelize here really, no loop through events
2086  // #### only figures out the bin edge values for the "histogram" arrays
2087  // fill the cut values for the scan:
2088  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2089 
2090  if ( useVariable[ivar] ) {
2091 
2092  //set the grid for the cut scan on the variables like this:
2093  //
2094  // | | | | | ... | |
2095  // xmin xmax
2096  //
2097  // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2098  // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2099  // --> nBins = fNCuts+1
2100  // (NOTE: the cuts at xmin or xmax would just give the whole sample and
2101  // hence can be safely omitted)
2102 
2103  binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2104  invBinWidth[ivar] = 1./binWidth[ivar];
2105  if (ivar < fNvars) {
2106  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2107  }
2108 
2109  // std::cout << "ivar="<<ivar
2110  // <<" min="<<xmin[ivar]
2111  // << " max="<<xmax[ivar]
2112  // << " widht=" << istepSize
2113  // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2114  for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2115  cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2116  // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2117  }
2118  }
2119  }
2120 
2121  // #### Loop through the events to get the total sig and background
2122  nTotS=0; nTotB=0;
2123  nTotS_unWeighted=0; nTotB_unWeighted=0;
2124  for (UInt_t iev=0; iev<nevents; iev++) {
2125 
2126  Double_t eventWeight = eventSample[iev]->GetWeight();
2127  if (eventSample[iev]->GetClass() == fSigClass) {
2128  nTotS+=eventWeight;
2129  nTotS_unWeighted++; }
2130  else {
2131  nTotB+=eventWeight;
2132  nTotB_unWeighted++;
2133  }
2134 
2135  // #### Count the number in each bin (fill array "histograms")
2136  Int_t iBin=-1;
2137  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2138  // now scan through the cuts for each variable and find which one gives
2139  // the best separationGain at the current stage.
2140  if ( useVariable[ivar] ) {
2141  Double_t eventData;
2142  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2143  else { // the fisher variable
2144  eventData = fisherCoeff[fNvars];
2145  for (UInt_t jvar=0; jvar<fNvars; jvar++)
2146  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2147 
2148  }
2149  // #### figure out which bin the event belongs in ...
2150  // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
2151  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2152  if (eventSample[iev]->GetClass() == fSigClass) {
2153  nSelS[ivar][iBin]+=eventWeight;
2154  nSelS_unWeighted[ivar][iBin]++;
2155  }
2156  else {
2157  nSelB[ivar][iBin]+=eventWeight;
2158  nSelB_unWeighted[ivar][iBin]++;
2159  }
2160  if (DoRegression()) {
2161  target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2162  target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2163  }
2164  }
2165  }
2166  }
2167 
2168  // now turn the "histogram" into a cumulative distribution
2169  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2170  if (useVariable[ivar]) {
2171  for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2172  nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2173  nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2174  nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2175  nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2176  if (DoRegression()) {
2177  target[ivar][ibin] +=target[ivar][ibin-1] ;
2178  target2[ivar][ibin]+=target2[ivar][ibin-1];
2179  }
2180  }
2181  if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2182  Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2183  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2184  << " while eventsample size = " << eventSample.size()
2185  << Endl;
2186  }
2187  double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2188  double totalSum=nTotS+nTotB;
2189  if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
2190  Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2191  << lastBins
2192  << " while total number of events = " << totalSum
2193  << Endl;
2194  }
2195  }
2196  }
2197  // #### Loops over vars and bins, but not events, not worth parallelizing unless nbins is on the order of ndata/10 ish ...
2198  // now select the optimal cuts for each varable and find which one gives
2199  // the best separationGain at the current stage
2200  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2201  if (useVariable[ivar]) {
2202  for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
2203  // the separationGain is defined via the various indices (Gini, CrossEntropy, etc.)
2204  // calculated by the "SamplePurities" from the branches that would go to the
2205  // left or the right from this node if "these" cuts were used in the Node:
2206  // hereby: nSelS and nSelB would go to the right branch
2207  // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2208 
2209  // only allow splits where both daughter nodes match the specified minimum number
2210  // for this use the "unweighted" events, as you are interested in statistically
2211  // significant splits, which is determined by the actual number of entries
2212  // for a node, rather than the sum of event weights.
2213 
2214  Double_t sl = nSelS_unWeighted[ivar][iBin];
2215  Double_t bl = nSelB_unWeighted[ivar][iBin];
2216  Double_t s = nTotS_unWeighted;
2217  Double_t b = nTotB_unWeighted;
2218  Double_t slW = nSelS[ivar][iBin];
2219  Double_t blW = nSelB[ivar][iBin];
2220  Double_t sW = nTotS;
2221  Double_t bW = nTotB;
2222  Double_t sr = s-sl;
2223  Double_t br = b-bl;
2224  Double_t srW = sW-slW;
2225  Double_t brW = bW-blW;
2226  // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2227  if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2228  && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2229  ) {
2230 
2231  if (DoRegression()) {
2232  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2233  target[ivar][iBin],target2[ivar][iBin],
2234  nTotS+nTotB,
2235  target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2236  } else {
2237  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2238  }
2239  if (separationGain[ivar] < sepTmp) {
2240  separationGain[ivar] = sepTmp;
2241  cutIndex[ivar] = iBin;
2242  }
2243  }
2244  }
2245  }
2246  }
2247 
2248  // now that you have found the best separation cut for each variable, compare the variables
2249  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2250  if (useVariable[ivar] ) {
2251  if (separationGainTotal < separationGain[ivar]) {
2252  separationGainTotal = separationGain[ivar];
2253  mxVar = ivar;
2254  }
2255  }
2256  }
2257 
2258  if (mxVar >= 0) {
2259  if (DoRegression()) {
2260  node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2261  node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2262  if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2263  node->SetRMS(0);
2264  }else{
2265  node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2266  }
2267  }
2268  else {
2269  node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2270  if (mxVar >=0){
2271  if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2272  else cutType=kFALSE;
2273  }
2274  }
2275  node->SetSelector((UInt_t)mxVar);
2276  node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
2277  node->SetCutType(cutType);
2278  node->SetSeparationGain(separationGainTotal);
2279  if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
2280  node->SetNFisherCoeff(0);
2281  fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2282  //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2283  }else{
2284  // allocate Fisher coefficients (use fNvars+1 and set the unused ones to zero; this
2285  // might on average even need less storage than also storing the mapping used otherwise).
2286  // Can always be changed relatively easily.
2287  node->SetNFisherCoeff(fNvars+1);
2288  for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2289  node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
2290  // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
2291  if (ivar<fNvars){
2292  fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2293  }
2294  }
2295  }
2296  }
2297  else {
2298  separationGainTotal = 0;
2299  }
2300  // if (mxVar > -1) {
2301  // std::cout << "------------------------------------------------------------------"<<std::endl;
2302  // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2303  // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2304  // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2305  // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2306  // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2307  // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2308  // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2309  // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2310  // } else {
2311  // std::cout << "------------------------------------------------------------------"<<std::endl;
2312  // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2313  // }
2314  for (UInt_t i=0; i<cNvars; i++) {
2315  delete [] nSelS[i];
2316  delete [] nSelB[i];
2317  delete [] nSelS_unWeighted[i];
2318  delete [] nSelB_unWeighted[i];
2319  delete [] target[i];
2320  delete [] target2[i];
2321  delete [] cutValues[i];
2322  }
2323  delete [] nSelS;
2324  delete [] nSelB;
2325  delete [] nSelS_unWeighted;
2326  delete [] nSelB_unWeighted;
2327  delete [] target;
2328  delete [] target2;
2329  delete [] cutValues;
2330 
2331  delete [] xmin;
2332  delete [] xmax;
2333 
2334  delete [] useVariable;
2335  delete [] mapVariable;
2336 
2337  delete [] separationGain;
2338  delete [] cutIndex;
2339 
2340  delete [] nBins;
2341  delete [] binWidth;
2342  delete [] invBinWidth;
2343 
2344  return separationGainTotal;
2345 
2346 }
2347 #endif
2348 
2349 
2350 ////////////////////////////////////////////////////////////////////////////////
2351 /// calculate the fisher coefficients for the event sample and the variables used
2352 
2353 std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
2354  std::vector<Double_t> fisherCoeff(fNvars+1);
2355 
2356  // initialization of global matrices and vectors
2357  // average value of each variable for S, B, S+B
2358  TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
2359 
2360  // the covariance 'within class' and 'between class' matrices
2361  TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
2362  TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
2363  TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
2364 
2365  //
2366  // compute mean values of variables in each sample, and the overall means
2367  //
2368 
2369  // initialize internal sum-of-weights variables
2370  Double_t sumOfWeightsS = 0;
2371  Double_t sumOfWeightsB = 0;
2372 
2373 
2374  // init vectors
2375  Double_t* sumS = new Double_t[nFisherVars];
2376  Double_t* sumB = new Double_t[nFisherVars];
2377  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2378 
2379  UInt_t nevents = eventSample.size();
2380  // compute sample means
2381  for (UInt_t ievt=0; ievt<nevents; ievt++) {
2382 
2383  // read the Training Event into "event"
2384  const Event * ev = eventSample[ievt];
2385 
2386  // sum of weights
2387  Double_t weight = ev->GetWeight();
2388  if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2389  else sumOfWeightsB += weight;
2390 
2391  Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2392  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2393  sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2394  }
2395  }
2396  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2397  (*meanMatx)( ivar, 2 ) = sumS[ivar];
2398  (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2399 
2400  (*meanMatx)( ivar, 2 ) += sumB[ivar];
2401  (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2402 
2403  // signal + background
2404  (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2405  }
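 // at this point column 0 of meanMatx holds the signal means, column 1 the
 // background means, and column 2 the overall means
 // (sumS+sumB)/(sumOfWeightsS+sumOfWeightsB) of each Fisher variable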
2406 
2407  delete [] sumS;
2408 
2409  delete [] sumB;
2410 
2411  // the matrix of covariance 'within class' reflects the dispersion of the
2412  // events relative to the center of gravity of their own class
2413 
2414  // assert required
2415 
2416  assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2417 
2418  // product matrices (x-<x>)(y-<y>) where x;y are variables
2419 
2420  const Int_t nFisherVars2 = nFisherVars*nFisherVars;
2421  Double_t *sum2Sig = new Double_t[nFisherVars2];
2422  Double_t *sum2Bgd = new Double_t[nFisherVars2];
2423  Double_t *xval = new Double_t[nFisherVars2];
2424  memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
2425  memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
2426 
2427  // 'within class' covariance
2428  for (UInt_t ievt=0; ievt<nevents; ievt++) {
2429 
2430  // read the Training Event into "event"
2431  // const Event* ev = eventSample[ievt];
2432  const Event* ev = eventSample.at(ievt);
2433 
2434  Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2435 
2436  for (UInt_t x=0; x<nFisherVars; x++) {
2437  xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2438  }
2439  Int_t k=0;
2440  for (UInt_t x=0; x<nFisherVars; x++) {
2441  for (UInt_t y=0; y<nFisherVars; y++) {
2442  if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2443  else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2444  k++;
2445  }
2446  }
2447  }
2448  Int_t k=0;
2449  for (UInt_t x=0; x<nFisherVars; x++) {
2450  for (UInt_t y=0; y<nFisherVars; y++) {
2451  (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2452  k++;
2453  }
2454  }
2455 
2456  delete [] sum2Sig;
2457  delete [] sum2Bgd;
2458  delete [] xval;
2459 
2460 
2461  // the matrix of covariance 'between class' reflects the dispersion of the
2462  // events of a class relative to the global center of gravity of all the classes,
2463  // hence the separation between classes
2464 
2465 
2466  Double_t prodSig, prodBgd;
2467 
2468  for (UInt_t x=0; x<nFisherVars; x++) {
2469  for (UInt_t y=0; y<nFisherVars; y++) {
2470 
2471  prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2472  ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2473  prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2474  ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2475 
2476  (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
2477  }
2478  }
2479 
2480 
2481 
2482  // compute full covariance matrix from sum of within and between matrices
2483  for (UInt_t x=0; x<nFisherVars; x++)
2484  for (UInt_t y=0; y<nFisherVars; y++)
2485  (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2486 
2487  // Fisher = Sum { [coeff]*[variables] }
2488  //
2489  // let Xs be the array of the mean values of variables for signal evts
2490  // let Xb be the array of the mean values of variables for backgd evts
2491  // let InvWith be the inverse matrix of the 'within class' correlation matrix
2492  //
2493  // then the array of Fisher coefficients is
2494  // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
2495  TMatrixD* theMat = with; // Fisher's original
2496  // TMatrixD* theMat = cov; // Mahalanobis
2497 
2498  TMatrixD invCov( *theMat );
2499  if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2500  Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2501  << TMath::Abs(invCov.Determinant())
2502  << " did you use the variables that are linear combinations or highly correlated?"
2503  << Endl;
2504  }
2505  if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2506  Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2507  << TMath::Abs(invCov.Determinant())
2508  << " did you use the variables that are linear combinations?"
2509  << Endl;
2510  }
2511 
2512  invCov.Invert();
2513 
2514  // apply rescaling factor
2515  Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
2516 
2517  // compute difference of mean values
2518  std::vector<Double_t> diffMeans( nFisherVars );
2519 
2520  for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2521  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2522  for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2523  Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2524  fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
2525  }
2526 
2527  // rescale
2528  fisherCoeff[mapVarInFisher[ivar]] *= xfact;
2529  }
2530 
2531  // offset correction
2532  Double_t f0 = 0.0;
2533  for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2534  f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2535  }
2536  f0 /= -2.0;
2537 
2538  fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
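 // the resulting discriminant is F(x) = f0 + sum_i coeff_i * x_i; with
 // f0 = -1/2 * sum_i coeff_i * (<x_i>_S + <x_i>_B) it is centred midway
 // between the two class means, so positive values are signal-like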
2539 
2540  return fisherCoeff;
2541 }
2542 
2543 ////////////////////////////////////////////////////////////////////////////////
2544 /// train a node by finding the single optimal cut for a single variable
2545 /// that best separates signal and background (maximizes the separation gain)
2546 
2547 Double_t TMVA::DecisionTree::TrainNodeFull( const EventConstList & eventSample,
2548  TMVA::DecisionTreeNode *node )
2549 {
2550  Double_t nTotS = 0.0, nTotB = 0.0;
2551  Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2552 
2553  std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2554 
2555  // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
2556  // each spot in parallel no problem
2557  std::vector<Double_t> lCutValue( fNvars, 0.0 );
2558  std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2559  std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, no std::vector<bool> has been used)
2560  lCutType.assign( fNvars, Char_t(kFALSE) );
2561 
2562  // Initialize (un)weighted counters for signal & background
2563  // Construct a list of event wrappers that point to the original data
2564  for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2565  if((*it)->GetClass() == fSigClass) { // signal or background event
2566  nTotS += (*it)->GetWeight();
2567  ++nTotS_unWeighted;
2568  }
2569  else {
2570  nTotB += (*it)->GetWeight();
2571  ++nTotB_unWeighted;
2572  }
2573  bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2574  }
2575 
2576  std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, no std::vector<bool> has been used)
2577  useVariable.assign( fNvars, Char_t(kTRUE) );
2578 
2579  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2580  if (fRandomisedTree) { // for each node splitting, choose a random subset of variables to cut on
2581  if (fUseNvars ==0 ) { // no number specified ... choose something which hopefully works well
2582  // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2583  fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2584  }
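 // e.g. fNvars=16 gives fUseNvars = UInt_t(TMath::Sqrt(16)+0.6) = UInt_t(4.6) = 4
 // randomly selected variables per split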
2585  Int_t nSelectedVars = 0;
2586  while (nSelectedVars < fUseNvars) {
2587  Double_t bla = fMyTrandom->Rndm()*fNvars;
2588  useVariable[Int_t (bla)] = Char_t(kTRUE);
2589  nSelectedVars = 0;
2590  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2591  if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
2592  }
2593  }
2594  }
2595  else {
2596  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2597  }
2598  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
2599  if(!useVariable[ivar]) continue; // only optimize with selected variables
2600 
2601  TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2602 
2603  std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2604 
2605 
2606  Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2607 
2608  std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2609  for( ; it != it_end; ++it ) {
2610  if((**it)->GetClass() == fSigClass ) // specify signal or background event
2611  sigWeightCtr += (**it)->GetWeight();
2612  else
2613  bkgWeightCtr += (**it)->GetWeight();
2614  // Store the accumulated signal (background) weights
2615  it->SetCumulativeWeight(false,bkgWeightCtr);
2616  it->SetCumulativeWeight(true,sigWeightCtr);
2617  }
2618 
2619  const Double_t fPMin = 1.0e-6;
2620  Bool_t cutType = kFALSE;
2621  Long64_t index = 0;
2622  Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2623 
2624  // Locate the optimal cut for this (ivar-th) variable
2625  for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2626  if( index == 0 ) { ++index; continue; }
2627  if( *(*it) == NULL ) {
2628  Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2629  << index << ", and parent node=" << node->GetParent() << Endl;
2630  break;
2631  }
2632  dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2633  norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2634  // Only allow splits where both daughter nodes have the specified minimum number of events
2635  // Splits are only sensible when the data are ordered (e.g. don't split inside a sequence of 0's)
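 // e.g. if two adjacent sorted values are identical, dVal = 0 and the
 // |dVal/(0.5*norm + 1)| > fPMin guard below rejects the split, so a run of
 // equal values is never split apart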
2636  if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2637 
2638  sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2639  if( sepTmp > separationGain ) {
2640  separationGain = sepTmp;
2641  cutValue = it->GetVal() - 0.5*dVal;
2642  Double_t nSelS = it->GetCumulativeWeight(true);
2643  Double_t nSelB = it->GetCumulativeWeight(false);
2644  // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2645  // or by removing signal (enhancing background)
2646  if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2647  else cutType = kFALSE;
2648  }
2649  }
2650  ++index;
2651  }
2652  lCutType[ivar] = Char_t(cutType);
2653  lCutValue[ivar] = cutValue;
2654  lSepGain[ivar] = separationGain;
2655  }
2656  Double_t separationGain = -1.0;
2657  Int_t iVarIndex = -1;
2658  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2659  if( lSepGain[ivar] > separationGain ) {
2660  iVarIndex = ivar;
2661  separationGain = lSepGain[ivar];
2662  }
2663  }
2664 
2665  // #### storing the best values into the node
2666  if(iVarIndex >= 0) {
2667  node->SetSelector(iVarIndex);
2668  node->SetCutValue(lCutValue[iVarIndex]);
2669  node->SetSeparationGain(lSepGain[iVarIndex]);
2670  node->SetCutType(lCutType[iVarIndex]);
2671  fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2672  }
2673  else {
2674  separationGain = 0.0;
2675  }
2676 
2677  return separationGain;
2678 }
2679 
2680 ////////////////////////////////////////////////////////////////////////////////
2681 /// get the pointer to the leaf node where a particular event ends up ...
2682 /// (used in gradient boosting)
2683 
2684 TMVA::DecisionTreeNode* TMVA::DecisionTree::GetEventNode(const TMVA::Event& e) const
2685 {
2686  TMVA::DecisionTreeNode *current = this->GetRoot();
2687  while(current->GetNodeType() == 0) { // intermediate node in a tree
2688  current = (current->GoesRight(e)) ?
2689  (TMVA::DecisionTreeNode*)current->GetRight() :
2690  (TMVA::DecisionTreeNode*)current->GetLeft();
2691  }
2692  return current;
2693 }
2694 
2695 ////////////////////////////////////////////////////////////////////////////////
2696 /// the event e is put into the decision tree (starting at the root node)
2697 /// and the output is NodeType (signal) or (background) of the final node (basket)
2698 /// in which the given event ends up, i.e. the result of the classification of
2699 /// the event by this decision tree.
2700 
2701 Double_t TMVA::DecisionTree::CheckEvent( const TMVA::Event * e, Bool_t UseYesNoLeaf ) const
2702 {
2703  TMVA::DecisionTreeNode *current = this->GetRoot();
2704  if (!current){
2705  Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2706  return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2707  }
2708 
2709  while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2710  current = (current->GoesRight(*e)) ?
2711  current->GetRight() :
2712  current->GetLeft();
2713  if (!current) {
2714  Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2715  }
2716 
2717  }
2718 
2719  if (DoRegression()) {
2720  // Note: This path is also taken for MethodBDT with analysis type
2721  // kClassification and kMulticlass when using GradBoost.
2722  // See TMVA::MethodBDT::InitGradBoost
2723  return current->GetResponse();
2724  } else {
2725  if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2726  else return current->GetPurity();
2727  }
2728 }
2729 
2730 ////////////////////////////////////////////////////////////////////////////////
2731 /// calculates the purity S/(S+B) of a given event sample
2732 
2733 Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
2734 {
2735  Double_t sumsig=0, sumbkg=0, sumtot=0;
2736  for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2737  if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2738  else sumsig+=eventSample[ievt]->GetWeight();
2739  sumtot+=eventSample[ievt]->GetWeight();
2740  }
2741  // sanity check
2742  if (sumtot!= (sumsig+sumbkg)){
2743  Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2744  << sumtot << " " << sumsig << " " << sumbkg << Endl;
2745  }
2746  if (sumtot>0) return sumsig/(sumsig + sumbkg);
2747  else return -1;
2748 }
2749 
2750 ////////////////////////////////////////////////////////////////////////////////
2751 /// Return the relative variable importance, normalized to all
2752 /// variables together having the importance 1. The importance is
2753 /// evaluated as the total separation-gain that this variable had in
2754 /// the decision trees (weighted by the number of events)
2755 
2756 std::vector<Double_t> TMVA::DecisionTree::GetVariableImportance()
2757 {
2758  std::vector<Double_t> relativeImportance(fNvars);
2759  Double_t sum=0;
2760  for (UInt_t i=0; i< fNvars; i++) {
2761  sum += fVariableImportance[i];
2762  relativeImportance[i] = fVariableImportance[i];
2763  }
2764 
2765  for (UInt_t i=0; i< fNvars; i++) {
2766  if (TMath::Abs(sum) > std::numeric_limits<Double_t>::epsilon())
2767  relativeImportance[i] /= sum;
2768  else
2769  relativeImportance[i] = 0;
2770  }
2771  return relativeImportance;
2772 }
2773 
2774 ////////////////////////////////////////////////////////////////////////////////
2775 /// returns the relative importance of variable ivar
2776 
2778 {
2779  std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2780  if (ivar < fNvars) return relativeImportance[ivar];
2781  else {
2782  Log() << kFATAL << "<GetVariableImportance>" << Endl
2783  << "--- ivar = " << ivar << " is out of range " << Endl;
2784  }
2785 
2786  return -1;
2787 }
2788 
return the minimum of variable ivar from the training sample that pass/end up in this node ...
Float_t GetCutValue(void) const
void IncrementNBkgEvents(Float_t b)
Double_t SamplePurity(EventList eventSample)
calculates the purity S/(S+B) of a given event sample
Double_t GetNodeR() const
std::vector< Double_t > fVariableImportance
Definition: DecisionTree.h:244
void SetSeparationGain(Float_t sep)
UInt_t GetClass() const
Definition: Event.h:81
Double_t GetSumWeights(const EventConstList *validationSample) const
calculate the normalization factor for a pruning validation sample
void ResetValidationData()
temporary stored node values (number of events, etc.) that originate not from the training but from t...
void SetNBkgEvents(Float_t b)
void SetNSValidation(Double_t s)
UInt_t GetNumThreadsInPool()
Definition: DecisionTree.h:205
Class that contains all the data information.
Definition: DataSetInfo.h:60
Double_t GetSeparationGain(const Double_t nLeft, const Double_t targetLeft, const Double_t target2Left, const Double_t nTot, const Double_t targetTot, const Double_t target2Tot)
Separation Gain: the measure of how the quality of separation of the sample increases by splitting th...
UInt_t CountLeafNodes(TMVA::Node *n=NULL)
return the number of terminal nodes in the sub-tree below Node n
ROOT::TThreadExecutor & GetThreadExecutor()
Definition: Config.h:82
TString operator+(const TString &s1, const TString &s2)
Use the special concatenation constructor.
Definition: TString.cxx:1449
void AddToSumTarget(Float_t t)
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:382
Double_t TrainNodeFast(const EventConstList &eventSample, DecisionTreeNode *node)
Decide how to split a node using one of the variables that gives the best separation of signal/backgr...
void DescendTree(Node *n=NULL)
descend a tree to find all its leaf nodes
TMatrixT< Element > & Invert(Double_t *det=0)
Invert the matrix and calculate its determinant.
Definition: TMatrixT.cxx:1396
void FillEvent(const TMVA::Event &event, TMVA::DecisionTreeNode *node)
fill the existing the decision tree structure by filling event in from the top node and see where the...
UInt_t GetNTargets() const
accessor to the number of targets
Definition: Event.cxx:320
void SetNEvents(Float_t nev)
Double_t fPruneStrength
Definition: DecisionTree.h:231
TMatrixT< Double_t > TMatrixD
Definition: TMatrixDfwd.h:22
Bool_t DoRegression() const
Definition: DecisionTree.h:187
Double_t fMinLinCorrForFisher
Definition: DecisionTree.h:220
void SetTotalTreeDepth(Int_t depth)
Definition: BinaryTree.h:95
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:97
Int_t GetNodeType(void) const
void SetSubTreeR(Double_t r)
ROOT::R::TRInterface & r
Definition: Object.C:4
SVector< double, 2 > v
Definition: Dict.h:5
virtual void SetLeft(Node *l)
void SetAlpha(Double_t alpha)
UInt_t CleanTree(DecisionTreeNode *node=NULL)
remove those last splits that result in two leaf nodes that are both of the type (i.e.
void SetSampleMin(UInt_t ivar, Float_t xmin)
set the minimum of variable ivar from the training sample that pass/end up in this node ...
void SetCutValue(Float_t c)
void GetRandomisedVariables(Bool_t *useVariable, UInt_t *variableMap, UInt_t &nVars)
Implementation of a Decision Tree.
Definition: DecisionTree.h:64
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t TrainNodeFull(const EventConstList &eventSample, DecisionTreeNode *node)
train a node by finding the single optimal cut for a single variable that best separates signal and b...
void SetParentTreeInNodes(Node *n=NULL)
descend a tree to find all its leaf nodes, fill max depth reached in the tree at the same time...
void SetPurity(void)
return the S/(S+B) (purity) for the node REM: even if nodes with purity 0.01 are very PURE background...
void ReadAttr(void *node, const char *, T &value)
read attribute from xml
Definition: Tools.h:335
bool almost_equal_float(float x, float y, int ulp=4)
float xmax
Definition: THbookFile.cxx:93
Tools & gTools()
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
virtual void ReadXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
read attributes from XML
Definition: BinaryTree.cxx:144
void PruneNodeInPlace(TMVA::DecisionTreeNode *node)
prune a node temporarily (without actually deleting its descendants which allows testing the pruned t...
TMVA::DecisionTreeNode * GetEventNode(const TMVA::Event &e) const
get the pointer to the leaf node where a particular event ends up in...
void ApplyValidationSample(const EventConstList *validationSample) const
run the validation sample through the (pruned) tree and fill in the nodes the variables NSValidation ...
REAL epsilon
Definition: triangle.c:617
constexpr Double_t E()
Base of natural log: .
Definition: TMath.h:97
virtual Double_t GetSeparationIndex(const Double_t s, const Double_t b)=0
bool almost_equal_double(double x, double y, int ulp=4)
const Bool_t kFALSE
Definition: RtypesCore.h:88
static void SetVarIndex(Int_t iVar)
void AddToSumTarget2(Float_t t2)
Float_t GetPurity(void) const
void SetSampleMax(UInt_t ivar, Float_t xmax)
set the maximum of variable ivar from the training sample that pass/end up in this node ...
#define d(i)
Definition: RSha256.hxx:102
virtual Bool_t GoesRight(const Event &) const
test event if it descends the tree at this node to the right
TRandom3 * fMyTrandom
Definition: DecisionTree.h:242
#define ClassImp(name)
Definition: Rtypes.h:359
A class to prune a decision tree using the Cost Complexity method.
double Double_t
Definition: RtypesCore.h:55
Double_t GetNBValidation() const
Node * GetNode(ULong_t sequence, UInt_t depth)
retrieve node from the tree.
void IncrementNSigEvents(Float_t s)
void ClearTree()
clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree ...
const TMatrixD * GetCorrelationMatrix(const TMatrixD *covMat)
turns covariance into correlation matrix
Definition: Tools.cxx:336
void SetAlphaMinSubtree(Double_t g)
int type
Definition: TGX11.cxx:120
Types::EAnalysisType fAnalysisType
Definition: DecisionTree.h:251
unsigned long ULong_t
Definition: RtypesCore.h:51
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
re-create a new tree (decision tree or search tree) from XML
Double_t y[n]
Definition: legend1.C:17
IPruneTool - a helper interface class to prune a decision tree.
Definition: IPruneTool.h:70
void SetNEvents_unboosted(Float_t nev)
static constexpr double s
you should not use this method at all Int_t Int_t Double_t Double_t Double_t e
Definition: TRolke.cxx:630
UInt_t GetTotalTreeDepth() const
Definition: BinaryTree.h:93
VariableInfo & GetVariableInfo(Int_t i)
Definition: DataSetInfo.h:96
void SetNSigEvents_unboosted(Float_t s)
void SetTerminal(Bool_t s=kTRUE)
RegressionVariance * fRegType
Definition: DecisionTree.h:224
void SetNSigEvents(Float_t s)
UInt_t CountNodes(Node *n=NULL)
return the number of nodes in the tree. (make a new count –> takes time)
Definition: BinaryTree.cxx:103
void SetNBkgEvents_unboosted(Float_t b)
SeparationBase * fSepType
Definition: DecisionTree.h:223
void SetNBkgEvents_unweighted(Float_t b)
char Char_t
Definition: RtypesCore.h:29
Double_t PruneTree(const EventConstList *validationSample=NULL)
prune (get rid of internal nodes) the Decision tree to avoid overtraining several different pruning m...
auto MapReduce(F func, unsigned nTimes, R redfunc) -> typename std::result_of< F()>::type
This method behaves just like Map, but an additional redfunc function must be provided.
Node * GetRightDaughter(Node *n)
get right daughter node current node "n"
Definition: BinaryTree.cxx:95
Node for the BinarySearch or Decision Trees.
Definition: Node.h:56
auto Map(F func, unsigned nTimes) -> std::vector< typename std::result_of< F()>::type >
Execute func (with no arguments) nTimes in parallel.
auto * l
Definition: textangle.C:4
Float_t GetResponse(void) const
Double_t GetNSValidation() const
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:200
Double_t GetOriginalWeight() const
Definition: Event.h:79
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=NULL)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter...
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Definition: TRolke.cxx:630
virtual DecisionTreeNode * GetLeft() const
virtual DecisionTreeNode * GetRight() const
Node * GetLeftDaughter(Node *n)
get left daughter node current node "n"
Definition: BinaryTree.cxx:87
void ClearNodeAndAllDaughters()
clear the nodes (their S/N, Nevents etc), just keep the structure of the tree
void SetSeparationIndex(Float_t sep)
void SetRoot(Node *r)
Definition: BinaryTree.h:80
Short_t GetSelector() const
virtual Int_t Poisson(Double_t mean)
Generates a random integer N according to a Poisson law.
Definition: TRandom.cxx:383
Double_t Sqrt(Double_t x)
Definition: TMath.h:690
std::vector< TMatrixDSym * > * CalcCovarianceMatrices(const std::vector< Event *> &events, Int_t maxCls, VariableTransformBase *transformBase=0)
compute covariance matrices
Definition: Tools.cxx:1526
DecisionTree(void)
default constructor using the GiniIndex as separation criterion, no restrictions on minium number of ...
void SetPruneStrength(Double_t alpha)
Definition: IPruneTool.h:88
Double_t GetPruneStrength() const
Definition: DecisionTree.h:146
static constexpr double sr
const Bool_t kTRUE
Definition: RtypesCore.h:87
const Int_t n
Definition: legend1.C:16
A helper class to prune a decision tree using the expected error (C4.5) method.
void SetPos(char s)
Definition: Node.h:117
void SetNEvents_unweighted(Float_t nev)
void PruneNode(TMVA::DecisionTreeNode *node)
prune away the subtree below the node