// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TMVA::DecisionTree                                                    *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation of a Decision Tree                                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch> - CERN, Switzerland            *
 *      Helge Voss        <Helge.Voss@cern.ch>     - MPI-K Heidelberg, Germany    *
 *      Kai Voss          <Kai.Voss@cern.ch>       - U. of Victoria, Canada       *
 *      Eckhard v. Toerne <evt@uni-bonn.de>        - U. of Bonn, Germany          *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>   - U. of Bonn, Germany          *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/license.txt)                                      *
 *                                                                                *
 **********************************************************************************/

/*! \class TMVA::DecisionTree
\ingroup TMVA

Implementation of a Decision Tree

In a decision tree successive decision nodes are used to categorize the
events out of the sample as either signal or background. Each node
uses only a single discriminating variable to decide if the event is
signal-like ("goes right") or background-like ("goes left"). This
forms a tree-like structure with "baskets" at the end (leaf nodes),
and an event is classified as either signal or background according to
whether the basket where it ends up has been classified as signal or
background during the training. Training of a decision tree is the
process of defining the "cut criteria" for each node. The training
starts with the root node. Here one takes the full training event
sample and selects the variable and corresponding cut value that give
the best separation between signal and background at this stage. Using
this cut criterion, the sample is then divided into two subsamples, a
signal-like (right) and a background-like (left) sample. Two new nodes
are then created for each of the two sub-samples and they are
constructed using the same mechanism as described for the root
node. The division is stopped once a certain node has reached either a
minimum number of events, or a minimum or maximum signal purity. These
leaf nodes are then called "signal" or "background" if they contain
more signal or background events, respectively, from the training sample.

*/
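
// A minimal usage sketch (illustrative only, not part of the original file):
// it assumes an already configured DataSetInfo object ("dataSetInfo") and a
// vector of training events ("trainingEvents"); the argument values are
// examples, and all remaining constructor arguments keep their defaults.
//
//    TMVA::DecisionTree tree( new TMVA::GiniIndex(), // separation criterion
//                             2.5,                   // min node size, in % of the training sample
//                             20,                    // nCuts: grid points scanned per variable
//                             &dataSetInfo,          // variable/class bookkeeping
//                             0 );                   // index of the signal class
//    tree.BuildTree( trainingEvents );               // std::vector<const TMVA::Event*>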

#include <iostream>
#include <algorithm>
#include <vector>
#include <limits>
#include <cassert>

#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"

#include "TMVA/MsgLogger.h"
#include "TMVA/DecisionTree.h"
#include "TMVA/DecisionTreeNode.h"
#include "TMVA/BinarySearchTree.h"

#include "TMVA/Tools.h"
#include "TMVA/Config.h"

#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/Event.h"
#include "TMVA/BDTEventWrapper.h"
#include "TMVA/IPruneTool.h"
#include "TMVA/CostComplexityPruneTool.h"
#include "TMVA/ExpectedErrorPruneTool.h"

const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds

using std::vector;

bool almost_equal_float(float x, float y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<float>::min();
}

bool almost_equal_double(double x, double y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<double>::min();
}
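
// Illustrative note (not part of the original file): these ULP-scaled
// comparisons accept tiny rounding differences that a plain == would reject,
// e.g. almost_equal_double(0.1 + 0.2, 0.3) is true although
// (0.1 + 0.2) == 0.3 evaluates to false in IEEE-754 double precision.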

////////////////////////////////////////////////////////////////////////////////
/// default constructor using the GiniIndex as separation criterion,
/// no restrictions on minimum number of events in a leaf node or the
/// separation gain in the node splitting

TMVA::DecisionTree::DecisionTree():
   BinaryTree(),
   fNvars          (0),
   fNCuts          (-1),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (NULL),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (1),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(0.5),
   fRandomisedTree (kFALSE),
   fUseNvars       (0),
   fUsePoissonNvars(kFALSE),
   fMyTrandom      (NULL),
   fMaxDepth       (999999),
   fSigClass       (0),
   fTreeID         (0),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (NULL)
{}

////////////////////////////////////////////////////////////////////////////////
/// constructor specifying the separation type, the minimum number of
/// events in a node that is still subjected to further splitting, and the
/// number of bins in the grid used in applying the cut for the node
/// splitting.

TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts, DataSetInfo* dataInfo, UInt_t cls,
                                  Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
                                  UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
   BinaryTree(),
   fNvars          (0),
   fNCuts          (nCuts),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (sepType),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (minSize),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars       (useNvars),
   fUsePoissonNvars(usePoissonNvars),
   fMyTrandom      (new TRandom3(iSeed)),
   fMaxDepth       (nMaxDepth),
   fSigClass       (cls),
   fTreeID         (treeID),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (dataInfo)
{
   if (sepType == NULL) { // it is interpreted as a regression tree, where
                          // currently the separation type (simple least square)
                          // cannot be chosen freely
      fAnalysisType = Types::kRegression;
      fRegType = new RegressionVariance();
      if ( nCuts <= 0 ) {
         fNCuts = 200;
         Log() << kWARNING << " You had requested the training mode using optimal cuts\n"
               << " (by setting NCuts < 0). As this mode does not exist yet, NCuts is\n"
               << " reset to " << fNCuts << " and the corresponding grid of cuts is used"
               << Endl;
      }
   }
   else {
      fAnalysisType = Types::kClassification;
   }
}

////////////////////////////////////////////////////////////////////////////////
/// copy constructor that creates a true copy, i.e. a completely independent tree;
/// the node copy will recursively copy all the nodes

TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
   BinaryTree(),
   fNvars          (d.fNvars),
   fNCuts          (d.fNCuts),
   fUseFisherCuts  (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType        (d.fSepType),
   fRegType        (d.fRegType),
   fMinSize        (d.fMinSize),
   fMinNodeSize    (d.fMinNodeSize),
   fMinSepGain     (d.fMinSepGain),
   fUseSearchTree  (d.fUseSearchTree),
   fPruneStrength  (d.fPruneStrength),
   fPruneMethod    (d.fPruneMethod),
   fNodePurityLimit(d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars       (d.fUseNvars),
   fUsePoissonNvars(d.fUsePoissonNvars),
   fMyTrandom      (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
   fMaxDepth       (d.fMaxDepth),
   fSigClass       (d.fSigClass),
   fTreeID         (d.fTreeID),
   fAnalysisType   (d.fAnalysisType),
   fDataSetInfo    (d.fDataSetInfo)
{
   this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
   this->SetParentTreeInNodes();
   fNNodes = d.fNNodes;
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DecisionTree::~DecisionTree()
{
   // destruction of the tree nodes is done in the "base class" BinaryTree

   if (fMyTrandom) delete fMyTrandom;
   if (fRegType) delete fRegType;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes, filling in the maximal depth
/// reached in the tree at the same time.

void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" << Endl;
         return;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetRightDaughter(n) );
      }
   }
   n->SetParentTree(this);
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// re-create a new tree (decision tree or search tree) from XML

TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML(void* node, UInt_t tmva_Version_Code ) {
   std::string type("");
   gTools().ReadAttr(node,"type", type);
   DecisionTree* dt = new DecisionTree();

   dt->ReadXML( node, tmva_Version_Code );
   return dt;
}

// #### Multithreaded DecisionTree::BuildTree
#ifdef R__USE_IMT
//====================================================================================
// This struct enables the parallelization of BuildTree.
// Using BuildNodeInfo, each thread can return the information it gathered.
// After each thread returns, the results are merged, hence the + operator.
//====================================================================================
struct BuildNodeInfo{

   BuildNodeInfo(Int_t fNvars, const TMVA::Event* evt){

      nvars = fNvars;
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);

      // #### the initial min and max for each feature
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         xmin[ivar]=val;
         xmax[ivar]=val;
      }
   };

   BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){

      nvars = fNvars;
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);

      // #### the initial min and max for each feature
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         xmin[ivar]=inxmin[ivar];
         xmax[ivar]=inxmax[ivar];
      }
   };
   BuildNodeInfo(){};

   Int_t nvars = 0;
   Double_t s = 0;       // sum of signal weights
   Double_t suw = 0;     // number of (unweighted) signal events
   Double_t sub = 0;     // sum of unboosted signal weights
   Double_t b = 0;       // sum of background weights
   Double_t buw = 0;     // number of (unweighted) background events
   Double_t bub = 0;     // sum of unboosted background weights
   Double_t target = 0;  // regression: weighted sum of the targets
   Double_t target2 = 0; // regression: weighted sum of the squared targets
   std::vector<Float_t> xmin;
   std::vector<Float_t> xmax;

   // #### Define the addition operator for BuildNodeInfo
   // #### Make sure both BuildNodeInfos have the same nvars if we add them
   BuildNodeInfo operator+(const BuildNodeInfo& other)
   {
      BuildNodeInfo ret(nvars, xmin, xmax);
      if(nvars != other.nvars)
      {
         std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
         return ret;
      }
      ret.s = s + other.s;
      ret.suw = suw + other.suw;
      ret.sub = sub + other.sub;
      ret.b = b + other.b;
      ret.buw = buw + other.buw;
      ret.bub = bub + other.bub;
      ret.target = target + other.target;
      ret.target2 = target2 + other.target2;

      // xmin is the min of both, xmax is the max of both
      for(Int_t i=0; i<nvars; i++)
      {
         ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
         ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
      }
      return ret;
   };

};
//===========================================================================
// Done with BuildNodeInfo declaration
//===========================================================================
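
// Illustrative example (not part of the original file) of how two per-thread
// partial results merge; evtA/evtB are hypothetical seed events:
//
//    BuildNodeInfo partA(nvars, evtA), partB(nvars, evtB); // filled by two threads
//    BuildNodeInfo total = partA + partB; // total.s == partA.s + partB.s, and
//                                         // total.xmin[i]/xmax[i] span both ranges
//
// This elementwise-mergeable layout is what allows BuildTree below to hand a
// std::accumulate reduction to MapReduce.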


////////////////////////////////////////////////////////////////////////////////
/// building the decision tree by recursively calling the splitting of
/// one (root-) node into two daughter nodes (returns the number of nodes)

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << "% is translated to an actual number of events = " << fMinSize << " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   // sum up the totals
   // sig and bkg for classification
   // err and err2 for regression

   // #### Set up prerequisite info for multithreading
   UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
   auto seeds = ROOT::TSeqU(nPartitions);

   // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
   auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){

      Int_t start = 1.0*partition/nPartitions*eventSample.size();
      Int_t end   = (partition+1.0)/nPartitions*eventSample.size();

      BuildNodeInfo nodeInfof(fNvars, eventSample[0]);

      for(Int_t iev=start; iev<end; iev++){
         const TMVA::Event* evt = eventSample[iev];
         const Double_t weight = evt->GetWeight();
         const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
         if (evt->GetClass() == fSigClass) {
            nodeInfof.s += weight;
            nodeInfof.suw += 1;
            nodeInfof.sub += orgWeight;
         }
         else {
            nodeInfof.b += weight;
            nodeInfof.buw += 1;
            nodeInfof.bub += orgWeight;
         }
         if ( DoRegression() ) {
            const Double_t tgt = evt->GetTarget(0);
            nodeInfof.target += weight*tgt;
            nodeInfof.target2 += weight*tgt*tgt;
         }

         // save the min and max for each feature
         for (UInt_t ivar=0; ivar<fNvars; ivar++) {
            const Double_t val = evt->GetValueFast(ivar);
            if (iev==start){
               nodeInfof.xmin[ivar]=val;
               nodeInfof.xmax[ivar]=val;
            }
            if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
            if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
         }
      }
      return nodeInfof;
   };
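
   // Worked example of the chunking above (illustrative, not part of the
   // original file): with eventSample.size() == 1000 and nPartitions == 4,
   // partition 2 gets start = 2/4*1000 = 500 and end = 3/4*1000 = 750,
   // i.e. it processes the events in [500, 750).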

   // #### Need an initial struct to pass to std::accumulate
   BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);

   // #### Run the threads in parallel then merge the results
   auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
   BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);

   if (nodeInfo.s+nodeInfo.b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<") Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help, maybe you want to try the option NoNegWeightsInTraining, which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg << Endl;
   }

   node->SetNSigEvents(nodeInfo.s);
   node->SetNBkgEvents(nodeInfo.b);
   node->SetNSigEvents_unweighted(nodeInfo.suw);
   node->SetNBkgEvents_unweighted(nodeInfo.buw);
   node->SetNSigEvents_unboosted(nodeInfo.sub);
   node->SetNBkgEvents_unboosted(nodeInfo.bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(nodeInfo.s+nodeInfo.b);
      node->SetNEvents_unweighted(nodeInfo.suw+nodeInfo.buw);
      node->SetNEvents_unboosted(nodeInfo.sub+nodeInfo.bub);
   }

   // save the min and max for each feature
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
      node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
   }

   // The minimum number of events is demanded for both daughter nodes. Hence if the
   // number of events in the parent node is not at least twice as large, there is no
   // point in even trying to split it.

   // ask here for actual "events" independent of their weight, OR for the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
   if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {

      // Train the node and figure out the separation gain and split points
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      }
      else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }

      // The node has been trained and there is nothing to be gained by splitting
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
            node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
            if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
               node->SetRMS(0);
            }
            else{
               node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
      }
      else {
         // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
         // #### ... would need to avoid the push_back or use some threadsafe mutex-locked version...
         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }
         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
         node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
         if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
            node->SetRMS(0);
         }
         else{
            node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight;
         // those "cannot" be boosted normally. Hence, if one of those is misclassified,
         // find randomly as many events with positive weights in this node as needed to
         // get the same absolute amount of weight, and mark them as "not to be boosted"
         // in order to make up for not boosting the negative-weight event
      }

      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   return fNNodes;
}

// Standard DecisionTree::BuildTree (multithreading is not enabled)
#else

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << "% is translated to an actual number of events = " << fMinSize << " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   Double_t s=0, b=0;
   Double_t suw=0, buw=0;
   Double_t sub=0, bub=0; // unboosted!
   Double_t target=0, target2=0;
   Float_t *xmin = new Float_t[fNvars];
   Float_t *xmax = new Float_t[fNvars];

   // initializing xmin and xmax for each variable
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      xmin[ivar]=xmax[ivar]=0;
   }
   // sum up the totals
   // sig and bkg for classification
   // err and err2 for regression
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      const TMVA::Event* evt = eventSample[iev];
      const Double_t weight = evt->GetWeight();
      const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
      if (evt->GetClass() == fSigClass) {
         s += weight;
         suw += 1;
         sub += orgWeight;
      }
      else {
         b += weight;
         buw += 1;
         bub += orgWeight;
      }
      if ( DoRegression() ) {
         const Double_t tgt = evt->GetTarget(0);
         target += weight*tgt;
         target2 += weight*tgt*tgt;
      }

      // save the min and max for each feature
      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }

   if (s+b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<s<<" Nbkg="<<b<<") Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimum number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help, maybe you want to try the option NoNegWeightsInTraining, which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg << Endl;
   }

   node->SetNSigEvents(s);
   node->SetNBkgEvents(b);
   node->SetNSigEvents_unweighted(suw);
   node->SetNBkgEvents_unweighted(buw);
   node->SetNSigEvents_unboosted(sub);
   node->SetNBkgEvents_unboosted(bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(s+b);
      node->SetNEvents_unweighted(suw+buw);
      node->SetNEvents_unboosted(sub+bub);
   }

   // save the min and max for each feature
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,xmin[ivar]);
      node->SetSampleMax(ivar,xmax[ivar]);
   }
   delete[] xmin;
   delete[] xmax;

   // The minimum number of events is demanded for both daughter nodes. Hence if the
   // number of events in the parent node is not at least twice as large, there is no
   // point in even trying to split it.

   // ask here for actual "events" independent of their weight, OR for the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      } else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
            node->SetResponse(target/(s+b));
            if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
               node->SetRMS(0);
            }
            else{
               node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));

            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());

      } else {

         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }

         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
         node->SetResponse(target/(s+b));
         if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
            node->SetRMS(0);
         }
         else{
            node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight;
         // those "cannot" be boosted normally. Hence, if one of those is misclassified,
         // find randomly as many events with positive weights in this node as needed to
         // get the same absolute amount of weight, and mark them as "not to be boosted"
         // in order to make up for not boosting the negative-weight event
      }

      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   return fNNodes;
}

#endif

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by feeding events
/// in from the top node and seeing where they happen to end up

void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
{
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]),NULL);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by feeding a single event
/// in from the top node and seeing where it happens to end up

void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
                                    TMVA::DecisionTreeNode *node )
{
   if (node == NULL) { // that's the start, take the Root node
      node = this->GetRoot();
   }

   node->IncrementNEvents( event.GetWeight() );
   node->IncrementNEvents_unweighted( );

   if (event.GetClass() == fSigClass) {
      node->IncrementNSigEvents( event.GetWeight() );
      node->IncrementNSigEvents_unweighted( );
   }
   else {
      node->IncrementNBkgEvents( event.GetWeight() );
      node->IncrementNBkgEvents_unweighted( );
   }
   node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
                                                         node->GetNBkgEvents()));

   if (node->GetNodeType() == 0) { //intermediate node --> go down
      if (node->GoesRight(event))
         this->FillEvent(event, node->GetRight());
      else
         this->FillEvent(event, node->GetLeft());
   }
}

////////////////////////////////////////////////////////////////////////////////
/// clear the tree nodes (their S/N, Nevents etc.), just keep the structure of the tree

void TMVA::DecisionTree::ClearTree()
{
   if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();

}

////////////////////////////////////////////////////////////////////////////////
/// remove those last splits that result in two leaf nodes that
/// are both of the same type (i.e. both signal or both background).
/// This of course is only a reasonable thing to do when you use
/// "YesOrNo" leafs, while it might lose something if you use the
/// purity information in the nodes.
/// --> hence it is not called automatically in the tree building

UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
{
   if (node==NULL) {
      node = this->GetRoot();
   }

   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   if (node->GetNodeType() == 0) {
      this->CleanTree(l);
      this->CleanTree(r);
      if (l->GetNodeType() * r->GetNodeType() > 0) {

         this->PruneNode(node);
      }
   }
   // update the number of nodes after the cleaning
   return this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune (get rid of internal nodes) the Decision tree to avoid overtraining;
/// several different pruning methods can be applied as selected by the
/// variable "fPruneMethod".

Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
{
   IPruneTool* tool(NULL);
   PruningInfo* info(NULL);

   if( fPruneMethod == kNoPruning ) return 0.0;

   if (fPruneMethod == kExpectedErrorPruning)
      tool = new ExpectedErrorPruneTool();
   else if (fPruneMethod == kCostComplexityPruning)
   {
      tool = new CostComplexityPruneTool();
   }
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented "
            << Endl;
   }

   if(!tool) return 0.0;

   tool->SetPruneStrength(GetPruneStrength());
   if(tool->IsAutomatic()) {
      if(validationSample == NULL){
         Log() << kFATAL << "Cannot automate the pruning algorithm without an "
               << "independent validation sample!" << Endl;
      }else if(validationSample->size() == 0) {
         Log() << kFATAL << "Cannot automate the pruning algorithm with "
               << "independent validation sample of ZERO events!" << Endl;
      }
   }

   info = tool->CalculatePruningInfo(this,validationSample);
   Double_t pruneStrength=0;
   if(!info) {
      Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
            << Endl;
   } else {
      pruneStrength = info->PruneStrength;

      for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {

         PruneNode(info->PruneSequence[i]);
      }
      // update the number of nodes after the pruning
      this->CountNodes();
   }

   delete tool;
   delete info;

   return pruneStrength;
}
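
// Hedged usage sketch (illustrative, not part of the original file): selecting
// cost-complexity pruning before calling PruneTree. SetPruneMethod and
// SetPruneStrength are the setters assumed here; a non-positive strength
// typically makes the prune tool run its automatic optimisation, which in turn
// requires a non-empty validation sample.
//
//    tree.SetPruneMethod( TMVA::DecisionTree::kCostComplexityPruning );
//    tree.SetPruneStrength( 5.0 );
//    Double_t usedStrength = tree.PruneTree( &validationEvents );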


////////////////////////////////////////////////////////////////////////////////
/// run the validation sample through the (pruned) tree and fill in the nodes
/// the variables NSValidation and NBValidation (i.e. how many of the Signal
/// and Background events from the validation sample end up there). This is then
/// later used when asking for the "tree quality" ..

void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
{
   GetRoot()->ResetValidationData();
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// return the misclassification rate of a pruned tree.
/// A "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
/// any node, hence this tree quality testing will stop there, hence it tests
/// the pruned tree (while the full tree is still in place for normal/later use)

Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
      return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   }
   else { // terminal leaf (in a pruned subtree of T_max at least)
      if (DoRegression()) {
         Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
         return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
      }
      else {
         if (mode == 0) {
            if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
               return n->GetNBValidation();
            else
               return n->GetNSValidation();
         }
         else if ( mode == 1 ) {
            // calculate the weighted error using the pruning validation sample
            return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
         }
         else {
            throw std::string("Unknown ValidationQualityMode");
         }
      }
   }
}
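
// Worked example for mode == 1 (illustrative numbers, not from the original
// file): for a leaf with purity 0.8, NBValidation == 10 and NSValidation == 40,
// the weighted error contribution is 0.8*10 + (1 - 0.8)*40 = 16.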

////////////////////////////////////////////////////////////////////////////////
/// pass a single validation event through a pruned decision tree;
/// on the way down the tree, fill in all the "intermediate" information
/// that would normally be there from training.

void TMVA::DecisionTree::CheckEventWithPrunedTree( const TMVA::Event* e ) const
{
   DecisionTreeNode* current = this->GetRoot();
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
   }

   while(current != NULL) {
      if(e->GetClass() == fSigClass)
         current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
      else
         current->SetNBValidation(current->GetNBValidation() + e->GetWeight());

      if (e->GetNTargets() > 0) {
         current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
         current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
      }

      if (current->GetRight() == NULL || current->GetLeft() == NULL) {
         current = NULL;
      }
      else {
         if (current->GoesRight(*e))
            current = (TMVA::DecisionTreeNode*)current->GetRight();
         else
            current = (TMVA::DecisionTreeNode*)current->GetLeft();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// calculate the normalization factor for a pruning validation sample

Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
{
   Double_t sumWeights = 0.0;
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
   return sumWeights;
}

////////////////////////////////////////////////////////////////////////////////
/// return the number of terminal nodes in the sub-tree below Node n

UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   UInt_t countLeafs=0;

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      countLeafs += 1;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
      }
   }
   return countLeafs;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes

void TMVA::DecisionTree::DescendTree( Node* n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // do nothing
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->DescendTree( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->DescendTree( this->GetRightDaughter(n) );
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// prune away the subtree below the node

void TMVA::DecisionTree::PruneNode( DecisionTreeNode *node )
{
   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   node->SetRight(NULL);
   node->SetLeft(NULL);
   node->SetSelector(-1);
   node->SetSeparationGain(-1);
   if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
   else node->SetNodeType(-1);
   this->DeleteNode(l);
   this->DeleteNode(r);
   // update the stored number of nodes in the Tree
   this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune a node temporarily (without actually deleting its descendants),
/// which allows testing the pruned tree quality for many different
/// pruning stages without "touching" the tree.

void TMVA::DecisionTree::PruneNodeInPlace( DecisionTreeNode* node ) {
   if(node == NULL) return;
   node->SetNTerminal(1);
   node->SetSubTreeR( node->GetNodeR() );
   node->SetAlpha( std::numeric_limits<double>::infinity( ) );
   node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
   node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants; FIXME not needed
}

////////////////////////////////////////////////////////////////////////////////
/// retrieve a node from the tree. Its position (up to a maximal tree depth of 64)
/// is coded as a sequence of left-right moves starting from the root, stored as
/// 0-1 bit patterns in the "long integer" sequence (0:left ; 1:right)

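// Worked example (illustrative, not part of the original file): with depth == 3
// and sequence == 0b101, the walk below tests bit 0 (set -> right), bit 1
// (unset -> left) and bit 2 (set -> right), i.e. it returns the node reached
// via right, left, right from the root.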
TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
{
   Node* current = this->GetRoot();

   for (UInt_t i =0; i < depth; i++) {
      ULong_t tmp = 1 << i;
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else current = this->GetLeftDaughter(current);
   }

   return current;
}

////////////////////////////////////////////////////////////////////////////////
/// select a random subset of variables to be offered for the split search
/// in randomised trees

void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   if (fUseNvars==0) { // no number specified ... choose something that hopefully works well
      // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
      fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
   }
   if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t) fMyTrandom->Poisson(fUseNvars)));
   else useNvars = fUseNvars;

   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      useVariable[Int_t (bla)] = kTRUE;
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandisedVariables()... sorry" << std::endl; std::exit(1);}
}
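
// Worked example of the fallback above (illustrative): with fNvars == 16 and
// fUseNvars == 0, fUseNvars becomes UInt_t(TMath::Sqrt(16)+0.6) == UInt_t(4.6)
// == 4, i.e. roughly sqrt(Nvars) variables are offered to each split.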

// Multithreaded version of DecisionTree::TrainNodeFast
#ifdef R__USE_IMT
//====================================================================================
// This struct enables the parallelization of TrainNodeFast.
// Using TrainNodeInfo, each thread can return the information it gathered.
// After each thread returns, the results are merged, hence the + operator.
//====================================================================================
struct TrainNodeInfo{

   TrainNodeInfo(Int_t cNvars_, UInt_t* nBins_){

      cNvars = cNvars_;
      nBins = nBins_;

      nSelS = std::vector< std::vector<Double_t> >(cNvars);
      nSelB = std::vector< std::vector<Double_t> >(cNvars);
      nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      target = std::vector< std::vector<Double_t> >(cNvars);
      target2 = std::vector< std::vector<Double_t> >(cNvars);

      for(Int_t ivar=0; ivar<cNvars; ivar++){
         nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
      }
   };

   TrainNodeInfo(){};

   // #### malloc problem if I define this and try to destruct xmin and xmax...
   // #### Maybe someone better at C++ can figure out why and fix this if it's a
   // #### serious memory problem
   //~TrainNodeInfo(){
   //   delete [] xmin;
   //   delete [] xmax;
   //};

   Int_t cNvars = 0;
   UInt_t* nBins;

   Double_t nTotS = 0;
   Double_t nTotS_unWeighted = 0;
   Double_t nTotB = 0;
   Double_t nTotB_unWeighted = 0;

   std::vector< std::vector<Double_t> > nSelS;
   std::vector< std::vector<Double_t> > nSelB;
   std::vector< std::vector<Double_t> > nSelS_unWeighted;
   std::vector< std::vector<Double_t> > nSelB_unWeighted;
   std::vector< std::vector<Double_t> > target;
   std::vector< std::vector<Double_t> > target2;

   // Define the addition operator for TrainNodeInfo
   // Make sure both TrainNodeInfos have the same nvars if we add them
   TrainNodeInfo operator+(const TrainNodeInfo& other)
   {
      TrainNodeInfo ret(cNvars, nBins);

      // check that the two are compatible to add
      if(cNvars != other.cNvars)
      {
         std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
         return ret;
      }

      // add the signal, background, and target sums
      for (Int_t ivar=0; ivar<cNvars; ivar++) {
         for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
            ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
            ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
            ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
            ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
            ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
            ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
         }
      }

      ret.nTotS = nTotS + other.nTotS;
      ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
      ret.nTotB = nTotB + other.nTotB;
      ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;

      return ret;
   };

};
//===========================================================================
// Done with TrainNodeInfo declaration
//===========================================================================

////////////////////////////////////////////////////////////////////////////////
/// Decide how to split a node using one of the variables that gives
/// the best separation of signal/background. In order to do this, for each
/// variable a scan of the different cut values in a grid (grid = fNCuts) is
/// performed and the resulting separation gains are compared.
/// In addition to the individual variables, one can also ask for a Fisher
/// discriminant being built out of (some of) the variables and used as a
/// possible multivariate split.

Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
                                            TMVA::DecisionTreeNode *node )
{
   // #### OK let's comment this one to see how to parallelize it
   Double_t separationGainTotal = -1;
   Double_t *separationGain = new Double_t[fNvars+1];
   Int_t *cutIndex = new Int_t[fNvars+1];

   // #### initialize the sep gain and cut index values
   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
      separationGain[ivar]=-1;
      cutIndex[ivar]=-1;
   }
   // #### set up some other variables
   Int_t mxVar = -1;
   Bool_t cutType = kTRUE;
   UInt_t nevents = eventSample.size();

   // the +1 comes from the fact that later on the Fisher output is treated as an
   // additional possible variable.
   Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
   UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )

   std::vector<Double_t> fisherCoeff;

   // #### set up a map to the subset of variables using two arrays
   if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
      UInt_t tmp=fUseNvars;
      GetRandomisedVariables(useVariable,mapVariable,tmp);
   }
   else {
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVariable[ivar] = kTRUE;
         mapVariable[ivar] = ivar;
      }
   }
   // #### last variable entry is the fisher variable
   useVariable[fNvars] = kFALSE; // by default fisher is not used..
1413 
1414  // #### Begin Fisher calculation
1415  Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1416  if (fUseFisherCuts) {
1417  useVariable[fNvars] = kTRUE; // that's were I store the "fisher MVA"
1418 
1419  //use for the Fisher discriminant ONLY those variables that show
1420  //some reasonable linear correlation in either Signal or Background
1421  Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1422  UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1423  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1424  useVarInFisher[ivar] = kFALSE;
1425  mapVarInFisher[ivar] = ivar;
1426  }
1427 
1428  std::vector<TMatrixDSym*>* covMatrices;
1429  covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1430  if (!covMatrices){
1431  Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1432  fisherOK = kFALSE;
1433  }else{
1434  TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1435  TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1436  const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1437  const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1438 
1439  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1440  for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1441  if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1442  ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1443  useVarInFisher[ivar] = kTRUE;
1444  useVarInFisher[jvar] = kTRUE;
1445  }
1446  }
1447  }
1448  // now as you know which variables you want to use, count and map them:
1449  // such that you can use an array/matrix filled only with THOSE variables
1450  // that you used
1451  UInt_t nFisherVars = 0;
1452  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1453  // now pick those variables that are used in the Fisher and are also
1454  // part of the "allowed" variables (in case of Randomised Trees)
1455  if (useVarInFisher[ivar] && useVariable[ivar]) {
1456  mapVarInFisher[nFisherVars++]=ivar;
1457  // now exclude the variables used in the Fisher cuts, and don't
1458  // use them anymore in the individual variable scan
1459  if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1460  }
1461  }
1462 
1463 
1464  fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1465  fisherOK = kTRUE;
1466  }
1467  delete [] useVarInFisher;
1468  delete [] mapVarInFisher;
1469 
1470  }
1471  // #### End Fisher calculation
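 // #### Illustrative sketch (added annotation, not original code): once
 // #### fisherOK is set, the "Fisher variable" of an event x is evaluated
 // #### further down simply as the linear combination
 // ####    Double_t F = fisherCoeff[fNvars];          // constant offset
 // ####    for (UInt_t j = 0; j < fNvars; j++)
 // ####       F += fisherCoeff[j] * x[j];
 // #### with the offset stored in the last slot of fisherCoeff
 // #### (see GetFisherCoefficients below).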
1472 
1473 
1474  UInt_t cNvars = fNvars;
1475  if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1476 
1477  // #### set up the binning info arrays
1478  // #### each var has its own binning since some may be integers
1479  UInt_t* nBins = new UInt_t [cNvars];
1480  Double_t* binWidth = new Double_t [cNvars];
1481  Double_t* invBinWidth = new Double_t [cNvars];
1482  Double_t** cutValues = new Double_t* [cNvars];
1483 
1484  // #### set up the xmin and xmax arrays
1485  // #### each var has its own range
1486  Double_t *xmin = new Double_t[cNvars];
1487  Double_t *xmax = new Double_t[cNvars];
1488 
1489  // construct and initialize binning/cuts
1490  for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1491  // ncuts means that we need n+1 bins for each variable
1492  nBins[ivar] = fNCuts+1;
1493  if (ivar < fNvars) {
1494  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1495  nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1496  }
1497  }
1498 
1499  cutValues[ivar] = new Double_t [nBins[ivar]];
1500  }
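 // #### Worked example (added annotation, not original code): in the loop
 // #### above, an integer variable ('I') whose node sample spans [3,7] gets
 // #### nBins = 7 - 3 + 1 = 5, i.e. one bin per integer value, while a
 // #### continuous variable always gets the fNCuts+1 equal-width bins.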
1501 
1502  // init the range and cutvalues for each var now that we know the binning
1503  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1504  if (ivar < fNvars){
1505  xmin[ivar]=node->GetSampleMin(ivar);
1506  xmax[ivar]=node->GetSampleMax(ivar);
1507  if (almost_equal_float(xmax[ivar], xmin[ivar])) {
1508  // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1509  // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1510  useVariable[ivar]=kFALSE;
1511  }
1512 
1513  } else { // the fisher variable
1514  xmin[ivar]=999;
1515  xmax[ivar]=-999;
1516  // too bad, for the moment I don't know how to do this without looping
1517  // once to get the "min max" and then AGAIN to fill the histogram
1518  for (UInt_t iev=0; iev<nevents; iev++) {
1519  // returns the Fisher value (no fixed range)
1520  Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1521  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1522  result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1523  if (result > xmax[ivar]) xmax[ivar]=result;
1524  if (result < xmin[ivar]) xmin[ivar]=result;
1525  }
1526  }
1527  // this loop could take a long time if nbins is large
1528  for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1529  cutValues[ivar][ibin]=0;
1530  }
1531  }
1532 
1533  // ====================================================================
1534  // ====================================================================
1535  // Parallelized Version
1536  // ====================================================================
1537  // ====================================================================
1538 
1539  // #### Figure out the cut values, loops through vars then through cuts
1540  // #### if ncuts is on the order of the amount of training data/10 - ish then we get some gains from parallelizing this
1541  // fill the cut values for the scan:
1542  auto varSeeds = ROOT::TSeqU(cNvars);
1543  auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1544 
1545  if ( useVariable[ivar] ) {
1546 
1547  //set the grid for the cut scan on the variables like this:
1548  //
1549  // | | | | | ... | |
1550  // xmin xmax
1551  //
1552  // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1553  // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1554  // --> nBins = fNCuts+1
1555  // (NOTE, the cuts at xmin or xmax would just give the whole sample and
1556  // hence can be safely omitted)
1557 
1558  binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1559  invBinWidth[ivar] = 1./binWidth[ivar];
1560  if (ivar < fNvars) {
1561  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1562  }
1563 
1564  // std::cout << "ivar="<<ivar
1565  // <<" min="<<xmin[ivar]
1566  // << " max="<<xmax[ivar]
1567  // << " width=" << istepSize
1568  // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1569  for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1570  cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1571  // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1572  }
1573  }
1574  return 0;
1575  };
1576  TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
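 // #### Worked example (added annotation, not original code): with fNCuts = 4
 // #### and a variable spanning xmin = 0, xmax = 10, we get nBins = 5 and
 // #### binWidth = 2, so the grid filled above is
 // ####    cutValues = { 2, 4, 6, 8 }
 // #### (cuts exactly at 0 or 10 would keep or reject the whole sample).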
1577 
1578  // #### Loop through the events to get the total sig and background
1579  // #### Then loop through the vars to get the counts in each bin in each var
1580  // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts this is a calculation
1581 
1582  TrainNodeInfo nodeInfo(cNvars, nBins);
1583  UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
1584 
1585  // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1586  // #### Parallelize by chunking the data into the same number of sections as we have processors
1587  if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1588  {
1589  auto seeds = ROOT::TSeqU(nPartitions);
1590 
1591  // need a lambda function to pass to TThreadExecutor::MapReduce
1592  auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1593  &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1594 
1595  UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1596  UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
1597 
1598  TrainNodeInfo nodeInfof(cNvars, nBins);
1599 
1600  for(UInt_t iev=start; iev<end; iev++) {
1601 
1602  Double_t eventWeight = eventSample[iev]->GetWeight();
1603  if (eventSample[iev]->GetClass() == fSigClass) {
1604  nodeInfof.nTotS+=eventWeight;
1605  nodeInfof.nTotS_unWeighted++; }
1606  else {
1607  nodeInfof.nTotB+=eventWeight;
1608  nodeInfof.nTotB_unWeighted++;
1609  }
1610 
1611  // #### Count the number in each bin
1612  Int_t iBin=-1;
1613  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1614  // now scan through the cuts for each variable and find which one gives
1615  // the best separationGain at the current stage.
1616  if ( useVariable[ivar] ) {
1617  Double_t eventData;
1618  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1619  else { // the fisher variable
1620  eventData = fisherCoeff[fNvars];
1621  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1622  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1623 
1624  }
1625  // #### figure out which bin it belongs in ...
1626  // "maximum" is nBins-1 (the "-1" because we start counting from 0 !)
1627  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1628  if (eventSample[iev]->GetClass() == fSigClass) {
1629  nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1630  nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1631  }
1632  else {
1633  nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1634  nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1635  }
1636  if (DoRegression()) {
1637  nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1638  nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1639  }
1640  }
1641  }
1642  }
1643  return nodeInfof;
1644  };
1645 
1646  // #### Need an initial struct to pass to std::accumulate
1647  TrainNodeInfo nodeInfoInit(cNvars, nBins);
1648 
1649  // #### Run the threads in parallel then merge the results
1650  auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1651  nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
1652  }
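 // #### Worked example (added annotation, not original code): with
 // #### nPartitions = 4 and eventSample.size() = 1000, partition 1 processes
 // #### the half-open range [250, 500); each chunk fills its own private
 // #### TrainNodeInfo, and redfunc merges the chunks with std::accumulate,
 // #### which relies on TrainNodeInfo supporting addition.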
1653 
1654 
1655  // #### When nbins is close to the order of the data this version of parallelization is faster
1656  // #### Parallelize by vectorizing the variable loop
1657  else {
1658 
1659  auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
1660 
1661  for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1662 
1663  Int_t iBin=-1;
1664  Double_t eventWeight = eventSample[iev]->GetWeight();
1665 
1666  // Only count the net signal and background once
1667  if(ivar==0){
1668  if (eventSample[iev]->GetClass() == fSigClass) {
1669  nodeInfo.nTotS+=eventWeight;
1670  nodeInfo.nTotS_unWeighted++; }
1671  else {
1672  nodeInfo.nTotB+=eventWeight;
1673  nodeInfo.nTotB_unWeighted++;
1674  }
1675  }
1676 
1677  // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1678  if ( useVariable[ivar] ) {
1679  Double_t eventData;
1680  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1681  else { // the fisher variable
1682  eventData = fisherCoeff[fNvars];
1683  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1684  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1685 
1686  }
1687  // #### figure out which bin it belongs in ...
1688  // "maximum" is nBins-1 (the "-1" because we start counting from 0 !)
1689  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1690  if (eventSample[iev]->GetClass() == fSigClass) {
1691  nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1692  nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1693  }
1694  else {
1695  nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1696  nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1697  }
1698  if (DoRegression()) {
1699  nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1700  nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1701  }
1702  }
1703  }
1704  return 0;
1705  };
1706 
1707  TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
1708  }
1709 
1710  // now turn each "histogram" into a cumulative distribution
1711  // #### loops through the vars and then the bins, if the bins are on the order of the training data this is worth parallelizing
1712  // #### doesn't hurt otherwise, pretty unnoticeable
1713  auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1714  if (useVariable[ivar]) {
1715  for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1716  nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1717  nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1718  nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1719  nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1720  if (DoRegression()) {
1721  nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1722  nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1723  }
1724  }
1725  if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1726  Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1727  << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1728  << " while eventsample size = " << eventSample.size()
1729  << Endl;
1730  }
1731  double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1732  double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1733  if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
1734  Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1735  << lastBins
1736  << " while total number of events = " << totalSum
1737  << Endl;
1738  }
1739  }
1740  return 0;
1741  };
1742  TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
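 // #### Worked example (added annotation, not original code): per-bin signal
 // #### counts { 3, 5, 2 } become { 3, 8, 10 } after the prefix sum above, so
 // #### nSelS[ivar][iBin] now holds the weighted signal on one side of the
 // #### (iBin+1)-th candidate cut and every cut can be scored in O(1) without
 // #### another pass over the events.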
1743 
1744  // #### Again, if nbins is on the order of the training data (or within an order or so), this is worth parallelizing
1745  // now select the optimal cuts for each variable and find which one gives
1746  // the best separationGain at the current stage
1747  auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1748  if (useVariable[ivar]) {
1749  Double_t sepTmp;
1750  for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
1751  // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
1752  // calculated by the "SamplePurities" from the branches that would go to the
1753  // left or the right from this node if "these" cuts were used in the Node:
1754  // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1755  // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1756 
1757  // only allow splits where both daughter nodes match the specified minimum number
1758  // for this use the "unweighted" events, as you are interested in statistically
1759  // significant splits, which is determined by the actual number of entries
1760  // for a node, rather than the sum of event weights.
1761 
1762  Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1763  Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1764  Double_t s = nodeInfo.nTotS_unWeighted;
1765  Double_t b = nodeInfo.nTotB_unWeighted;
1766  Double_t slW = nodeInfo.nSelS[ivar][iBin];
1767  Double_t blW = nodeInfo.nSelB[ivar][iBin];
1768  Double_t sW = nodeInfo.nTotS;
1769  Double_t bW = nodeInfo.nTotB;
1770  Double_t sr = s-sl;
1771  Double_t br = b-bl;
1772  Double_t srW = sW-slW;
1773  Double_t brW = bW-blW;
1774  // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1775  if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1776  && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1777  ) {
1778 
1779  if (DoRegression()) {
1780  sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1781  nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1782  nodeInfo.nTotS+nodeInfo.nTotB,
1783  nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1784  } else {
1785  sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1786  }
1787  if (separationGain[ivar] < sepTmp) {
1788  separationGain[ivar] = sepTmp;
1789  cutIndex[ivar] = iBin;
1790  }
1791  }
1792  }
1793  }
1794  return 0;
1795  };
1796  TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
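 // #### Schematic (added annotation, not original code): for classification
 // #### the quantity maximised above is, schematically,
 // ####    gain = Index(S,B) - wL*Index(S_L,B_L) - wR*Index(S_R,B_R)
 // #### where Index is the impurity of fSepType (e.g. a Gini-style index
 // #### ~ p*(1-p) with p = s/(s+b)) and wL, wR are the weight fractions sent
 // #### to each daughter node; cutIndex[ivar] keeps the best bin per variable.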
1797 
1798  // you found the best separation cut for each variable, now compare the variables
1799  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1800  if (useVariable[ivar] ) {
1801  if (separationGainTotal < separationGain[ivar]) {
1802  separationGainTotal = separationGain[ivar];
1803  mxVar = ivar;
1804  }
1805  }
1806  }
1807 
1808  if (mxVar >= 0) {
1809  if (DoRegression()) {
1810  node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1811  node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1812  if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1813  node->SetRMS(0);
1814 
1815  }else{
1816  node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1817  }
1818  }
1819  else {
1820  node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1821  if (mxVar >=0){
1822  if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1823  else cutType=kFALSE;
1824  }
1825  }
1826  node->SetSelector((UInt_t)mxVar);
1827  node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
1828  node->SetCutType(cutType);
1829  node->SetSeparationGain(separationGainTotal);
1830  if (mxVar < (Int_t) fNvars){ // the Fisher cut is not used in this node, hence there is no need to store Fisher components
1831  node->SetNFisherCoeff(0);
1832  fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1833  //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1834  }else{
1835  // allocate Fisher coefficients (use fNvars+1, and set the unused ones to zero; this might
1836  // even take less storage space on average than also storing the mapping used otherwise,
1837  // and can always be changed relatively easily)
1838  node->SetNFisherCoeff(fNvars+1);
1839  for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1840  node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
1841  // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though
1842  if (ivar<fNvars){
1843  fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1844  }
1845  }
1846  }
1847  }
1848  else {
1849  separationGainTotal = 0;
1850  }
1851 
1852  // #### Now in TrainNodeInfo, but I got a malloc segfault when I tried to destruct arrays there.
1853  // #### So, I changed these from dynamic arrays to std::vector to fix this memory problem
1854  // #### so no need to destruct them anymore. I didn't see any performance drop as a result.
1855  for (UInt_t i=0; i<cNvars; i++) {
1856  // delete [] nodeInfo.nSelS[i];
1857  // delete [] nodeInfo.nSelB[i];
1858  // delete [] nodeInfo.nSelS_unWeighted[i];
1859  // delete [] nodeInfo.nSelB_unWeighted[i];
1860  // delete [] nodeInfo.target[i];
1861  // delete [] nodeInfo.target2[i];
1862  delete [] cutValues[i];
1863  }
1864  //delete [] nodeInfo.nSelS;
1865  //delete [] nodeInfo.nSelB;
1866  //delete [] nodeInfo.nSelS_unWeighted;
1867  //delete [] nodeInfo.nSelB_unWeighted;
1868  //delete [] nodeInfo.target;
1869  //delete [] nodeInfo.target2;
1870 
1871  // #### left these as dynamic arrays as they were before
1872  // #### since I didn't need to mess with them for parallelization
1873  delete [] cutValues;
1874 
1875  delete [] xmin;
1876  delete [] xmax;
1877 
1878  delete [] useVariable;
1879  delete [] mapVariable;
1880 
1881  delete [] separationGain;
1882  delete [] cutIndex;
1883 
1884  delete [] nBins;
1885  delete [] binWidth;
1886  delete [] invBinWidth;
1887 
1888  return separationGainTotal;
1889 }
1890 
1891 // Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1892 #else
1893 Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1894  TMVA::DecisionTreeNode *node )
1895 {
1896 // #### OK let's comment this one to see how to parallelize it
1897  Double_t separationGainTotal = -1, sepTmp;
1898  Double_t *separationGain = new Double_t[fNvars+1];
1899  Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1900 
1901  // #### initialize the sep gain and cut index values
1902  for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1903  separationGain[ivar]=-1;
1904  cutIndex[ivar]=-1;
1905  }
1906  // ### set up some other variables
1907  Int_t mxVar = -1;
1908  Bool_t cutType = kTRUE;
1909  Double_t nTotS, nTotB;
1910  Int_t nTotS_unWeighted, nTotB_unWeighted;
1911  UInt_t nevents = eventSample.size();
1912 
1913 
1914  // the +1 comes from the fact that I treat later on the Fisher output as an
1915  // additional possible variable.
1916  Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1917  UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1918 
1919  std::vector<Double_t> fisherCoeff;
1920 
1921  // #### set up a map to the subset of variables using two arrays
1922  if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1923  UInt_t tmp=fUseNvars;
1924  GetRandomisedVariables(useVariable,mapVariable,tmp);
1925  }
1926  else {
1927  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1928  useVariable[ivar] = kTRUE;
1929  mapVariable[ivar] = ivar;
1930  }
1931  }
1932  // #### last variable entry is the fisher variable
1933  useVariable[fNvars] = kFALSE; //by default fisher is not used..
1934 
1935  // #### Begin Fisher calculation
1936  Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1937  if (fUseFisherCuts) {
1938  useVariable[fNvars] = kTRUE; // that's where I store the "Fisher MVA"
1939 
1940  //use for the Fisher discriminant ONLY those variables that show
1941  //some reasonable linear correlation in either Signal or Background
1942  Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1943  UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1944  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1945  useVarInFisher[ivar] = kFALSE;
1946  mapVarInFisher[ivar] = ivar;
1947  }
1948 
1949  std::vector<TMatrixDSym*>* covMatrices;
1950  covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1951  if (!covMatrices){
1952  Log() << kWARNING << " in TrainNodeFast, the covariance matrices needed for the Fisher cuts returned an error --> reverting to plain cuts for this node" << Endl;
1953  fisherOK = kFALSE;
1954  }else{
1955  TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1956  TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1957  const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1958  const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1959 
1960  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1961  for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1962  if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1963  ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1964  useVarInFisher[ivar] = kTRUE;
1965  useVarInFisher[jvar] = kTRUE;
1966  }
1967  }
1968  }
1969  // now as you know which variables you want to use, count and map them:
1970  // such that you can use an array/matrix filled only with THOSE variables
1971  // that you used
1972  UInt_t nFisherVars = 0;
1973  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1974  // now pick those variables that are used in the Fisher and are also
1975  // part of the "allowed" variables (in case of Randomised Trees)
1976  if (useVarInFisher[ivar] && useVariable[ivar]) {
1977  mapVarInFisher[nFisherVars++]=ivar;
1978  // now exclude the variables used in the Fisher cuts, and don't
1979  // use them anymore in the individual variable scan
1980  if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1981  }
1982  }
1983 
1984 
1985  fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1986  fisherOK = kTRUE;
1987  }
1988  delete [] useVarInFisher;
1989  delete [] mapVarInFisher;
1990 
1991  }
1992  // #### End Fisher calculation
1993 
1994 
1995  UInt_t cNvars = fNvars;
1996  if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1997 
1998  // #### OK now what's going on...
1999  // #### looks like we are setting up histograms
2000  UInt_t* nBins = new UInt_t [cNvars];
2001  Double_t* binWidth = new Double_t [cNvars];
2002  Double_t* invBinWidth = new Double_t [cNvars];
2003 
2004  Double_t** nSelS = new Double_t* [cNvars];
2005  Double_t** nSelB = new Double_t* [cNvars];
2006  Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2007  Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2008  Double_t** target = new Double_t* [cNvars];
2009  Double_t** target2 = new Double_t* [cNvars];
2010  Double_t** cutValues = new Double_t* [cNvars];
2011 
2012  // #### looping through the variables...
2013  for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2014  // #### ncuts means that we need n+1 bins for each variable
2015  nBins[ivar] = fNCuts+1;
2016  if (ivar < fNvars) {
2017  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2018  nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2019  }
2020  }
2021 
2022  // #### make some new arrays for each ith var, size=nbins for each array
2023  // #### integer features get the same number of bins as values, set later
2024  nSelS[ivar] = new Double_t [nBins[ivar]];
2025  nSelB[ivar] = new Double_t [nBins[ivar]];
2026  nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2027  nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2028  target[ivar] = new Double_t [nBins[ivar]];
2029  target2[ivar] = new Double_t [nBins[ivar]];
2030  cutValues[ivar] = new Double_t [nBins[ivar]];
2031 
2032  }
2033 
2034  // #### xmin and xmax for each variable
2035  Double_t *xmin = new Double_t[cNvars];
2036  Double_t *xmax = new Double_t[cNvars];
2037 
2038  // #### ok loop through each variable to initialize all the values
2039  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2040  if (ivar < fNvars){
2041  xmin[ivar]=node->GetSampleMin(ivar);
2042  xmax[ivar]=node->GetSampleMax(ivar);
2043  if (almost_equal_float(xmax[ivar], xmin[ivar])) {
2044  // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2045  // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2046  useVariable[ivar]=kFALSE;
2047  }
2048 
2049  } else { // the fisher variable
2050  xmin[ivar]=999;
2051  xmax[ivar]=-999;
2052  // too bad, for the moment I don't know how to do this without looping
2053  // once to get the "min max" and then AGAIN to fill the histogram
2054  for (UInt_t iev=0; iev<nevents; iev++) {
2055  // returns the Fisher value (no fixed range)
2056  Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2057  for (UInt_t jvar=0; jvar<fNvars; jvar++)
2058  result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2059  if (result > xmax[ivar]) xmax[ivar]=result;
2060  if (result < xmin[ivar]) xmin[ivar]=result;
2061  }
2062  }
2063  for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2064  nSelS[ivar][ibin]=0;
2065  nSelB[ivar][ibin]=0;
2066  nSelS_unWeighted[ivar][ibin]=0;
2067  nSelB_unWeighted[ivar][ibin]=0;
2068  target[ivar][ibin]=0;
2069  target2[ivar][ibin]=0;
2070  cutValues[ivar][ibin]=0;
2071  }
2072  }
2073 
2074  // #### Nothing to parallelize here really, no loop through events
2075  // #### only figures out the bin edge values for the "histogram" arrays
2076  // fill the cut values for the scan:
2077  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2078 
2079  if ( useVariable[ivar] ) {
2080 
2081  //set the grid for the cut scan on the variables like this:
2082  //
2083  // | | | | | ... | |
2084  // xmin xmax
2085  //
2086  // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2087  // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2088  // --> nBins = fNCuts+1
2089  // (NOTE, the cuts at xmin or xmax would just give the whole sample and
2090  // hence can be safely omitted)
2091 
2092  binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2093  invBinWidth[ivar] = 1./binWidth[ivar];
2094  if (ivar < fNvars) {
2095  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2096  }
2097 
2098  // std::cout << "ivar="<<ivar
2099  // <<" min="<<xmin[ivar]
2100  // << " max="<<xmax[ivar]
2101  // << " width=" << istepSize
2102  // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2103  for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2104  cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2105  // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2106  }
2107  }
2108  }
2109 
2110  // #### Loop through the events to get the total sig and background
2111  nTotS=0; nTotB=0;
2112  nTotS_unWeighted=0; nTotB_unWeighted=0;
2113  for (UInt_t iev=0; iev<nevents; iev++) {
2114 
2115  Double_t eventWeight = eventSample[iev]->GetWeight();
2116  if (eventSample[iev]->GetClass() == fSigClass) {
2117  nTotS+=eventWeight;
2118  nTotS_unWeighted++; }
2119  else {
2120  nTotB+=eventWeight;
2121  nTotB_unWeighted++;
2122  }
2123 
2124  // #### Count the number in each bin (fill array "histograms")
2125  Int_t iBin=-1;
2126  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2127  // now scan through the cuts for each variable and find which one gives
2128  // the best separationGain at the current stage.
2129  if ( useVariable[ivar] ) {
2130  Double_t eventData;
2131  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2132  else { // the fisher variable
2133  eventData = fisherCoeff[fNvars];
2134  for (UInt_t jvar=0; jvar<fNvars; jvar++)
2135  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2136 
2137  }
2138  // #### figure out which bin the event belongs in ...
2139  // "maximum" is nBins-1 (the "-1" because we start counting from 0 !)
2140  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2141  if (eventSample[iev]->GetClass() == fSigClass) {
2142  nSelS[ivar][iBin]+=eventWeight;
2143  nSelS_unWeighted[ivar][iBin]++;
2144  }
2145  else {
2146  nSelB[ivar][iBin]+=eventWeight;
2147  nSelB_unWeighted[ivar][iBin]++;
2148  }
2149  if (DoRegression()) {
2150  target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2151  target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2152  }
2153  }
2154  }
2155  }
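 // #### Worked example (added annotation, not original code): the clamp above,
 // ####    iBin = min(nBins-1, max(0, int(invBinWidth*(x - xmin)))),
 // #### maps x = 7.3 with xmin = 0, binWidth = 2 and nBins = 5 to
 // #### iBin = min(4, max(0, int(3.65))) = 3, and safely clips values at or
 // #### beyond xmax into the last bin.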
2156 
2157  // now turn the "histogram" into a cumulative distribution
2158  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2159  if (useVariable[ivar]) {
2160  for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2161  nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2162  nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2163  nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2164  nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2165  if (DoRegression()) {
2166  target[ivar][ibin] +=target[ivar][ibin-1] ;
2167  target2[ivar][ibin]+=target2[ivar][ibin-1];
2168  }
2169  }
2170  if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2171  Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2172  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2173  << " while eventsample size = " << eventSample.size()
2174  << Endl;
2175  }
2176  double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2177  double totalSum=nTotS+nTotB;
2178  if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
2179  Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2180  << lastBins
2181  << " while total number of events = " << totalSum
2182  << Endl;
2183  }
2184  }
2185  }
2186  // #### Loops over vars and bins, but not events, not worth parallelizing unless nbins is on the order of ndata/10 ish ...
2187  // now select the optimal cuts for each variable and find which one gives
2188  // the best separationGain at the current stage
2189  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2190  if (useVariable[ivar]) {
2191  for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
2192  // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
2193  // calculated by the "SamplePurities" from the branches that would go to the
2194  // left or the right from this node if "these" cuts were used in the Node:
2195  // hereby: nSelS and nSelB would go to the right branch
2196  // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2197 
2198  // only allow splits where both daughter nodes match the specified minimum number
2199  // for this use the "unweighted" events, as you are interested in statistically
2200  // significant splits, which is determined by the actual number of entries
2201  // for a node, rather than the sum of event weights.
2202 
2203  Double_t sl = nSelS_unWeighted[ivar][iBin];
2204  Double_t bl = nSelB_unWeighted[ivar][iBin];
2205  Double_t s = nTotS_unWeighted;
2206  Double_t b = nTotB_unWeighted;
2207  Double_t slW = nSelS[ivar][iBin];
2208  Double_t blW = nSelB[ivar][iBin];
2209  Double_t sW = nTotS;
2210  Double_t bW = nTotB;
2211  Double_t sr = s-sl;
2212  Double_t br = b-bl;
2213  Double_t srW = sW-slW;
2214  Double_t brW = bW-blW;
2215  // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2216  if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2217  && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2218  ) {
2219 
2220  if (DoRegression()) {
2221  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2222  target[ivar][iBin],target2[ivar][iBin],
2223  nTotS+nTotB,
2224  target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2225  } else {
2226  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2227  }
2228  if (separationGain[ivar] < sepTmp) {
2229  separationGain[ivar] = sepTmp;
2230  cutIndex[ivar] = iBin;
2231  }
2232  }
2233  }
2234  }
2235  }
2236 
2237  //now you have found the best separation cut for each variable, now compare the variables
2238  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2239  if (useVariable[ivar] ) {
2240  if (separationGainTotal < separationGain[ivar]) {
2241  separationGainTotal = separationGain[ivar];
2242  mxVar = ivar;
2243  }
2244  }
2245  }
2246 
2247  if (mxVar >= 0) {
2248  if (DoRegression()) {
2249  node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2250  node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2251  if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2252  node->SetRMS(0);
2253  }else{
2254  node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2255  }
2256  }
2257  else {
2258  node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2259  if (mxVar >=0){
2260  if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2261  else cutType=kFALSE;
2262  }
2263  }
2264  node->SetSelector((UInt_t)mxVar);
2265  node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
2266  node->SetCutType(cutType);
2267  node->SetSeparationGain(separationGainTotal);
2268  if (mxVar < (Int_t) fNvars){ // the Fisher cut is not used in this node, hence there is no need to store Fisher components
2269  node->SetNFisherCoeff(0);
2270  fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2271  //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2272  }else{
2273  // allocate Fisher coefficients (use fNvars+1, and set the unused ones to zero; this might
2274  // even take less storage space on average than also storing the mapping used otherwise,
2275  // and can always be changed relatively easily)
2276  node->SetNFisherCoeff(fNvars+1);
2277  for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2278  node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
2279  // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though
2280  if (ivar<fNvars){
2281  fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2282  }
2283  }
2284  }
2285  }
2286  else {
2287  separationGainTotal = 0;
2288  }
2289  // if (mxVar > -1) {
2290  // std::cout << "------------------------------------------------------------------"<<std::endl;
2291  // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2292  // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2293  // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2294  // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2295  // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2296  // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2297  // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2298  // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2299  // } else {
2300  // std::cout << "------------------------------------------------------------------"<<std::endl;
2301  // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2302  // }
2303  for (UInt_t i=0; i<cNvars; i++) {
2304  delete [] nSelS[i];
2305  delete [] nSelB[i];
2306  delete [] nSelS_unWeighted[i];
2307  delete [] nSelB_unWeighted[i];
2308  delete [] target[i];
2309  delete [] target2[i];
2310  delete [] cutValues[i];
2311  }
2312  delete [] nSelS;
2313  delete [] nSelB;
2314  delete [] nSelS_unWeighted;
2315  delete [] nSelB_unWeighted;
2316  delete [] target;
2317  delete [] target2;
2318  delete [] cutValues;
2319 
2320  delete [] xmin;
2321  delete [] xmax;
2322 
2323  delete [] useVariable;
2324  delete [] mapVariable;
2325 
2326  delete [] separationGain;
2327  delete [] cutIndex;
2328 
2329  delete [] nBins;
2330  delete [] binWidth;
2331  delete [] invBinWidth;
2332 
2333  return separationGainTotal;
2334 
2335 }
2336 #endif
2337 
2338 
2339 ////////////////////////////////////////////////////////////////////////////////
2340 /// calculate the fisher coefficients for the event sample and the variables used
2341 
2342 std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
2343  std::vector<Double_t> fisherCoeff(fNvars+1);
2344 
2345  // initialization of global matrices and vectors
2346  // average value of each variables for S, B, S+B
2347  TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
2348 
2349  // the covariance 'within class' and 'between class' matrices
2350  TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
2351  TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
2352  TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
2353 
2354  //
2355  // compute mean values of variables in each sample, and the overall means
2356  //
2357 
2358  // initialize internal sum-of-weights variables
2359  Double_t sumOfWeightsS = 0;
2360  Double_t sumOfWeightsB = 0;
2361 
2362 
2363  // init vectors
2364  Double_t* sumS = new Double_t[nFisherVars];
2365  Double_t* sumB = new Double_t[nFisherVars];
2366  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2367 
2368  UInt_t nevents = eventSample.size();
2369  // compute sample means
2370  for (UInt_t ievt=0; ievt<nevents; ievt++) {
2371 
2372  // read the Training Event into "event"
2373  const Event * ev = eventSample[ievt];
2374 
2375  // sum of weights
2376  Double_t weight = ev->GetWeight();
2377  if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2378  else sumOfWeightsB += weight;
2379 
2380  Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2381  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2382  sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2383  }
2384  }
2385  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2386  (*meanMatx)( ivar, 2 ) = sumS[ivar];
2387  (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2388 
2389  (*meanMatx)( ivar, 2 ) += sumB[ivar];
2390  (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2391 
2392  // signal + background
2393  (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2394  }
2395 
2396  delete [] sumS;
2397 
2398  delete [] sumB;
2399 
2400  // the matrix of covariance 'within class' reflects the dispersion of the
2401  // events relative to the center of gravity of their own class
2402 
2403  // assert required
2404 
2405  assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2406 
2407  // product matrices (x-<x>)(y-<y>) where x;y are variables
2408 
2409  const Int_t nFisherVars2 = nFisherVars*nFisherVars;
2410  Double_t *sum2Sig = new Double_t[nFisherVars2];
2411  Double_t *sum2Bgd = new Double_t[nFisherVars2];
2412  Double_t *xval = new Double_t[nFisherVars2];
2413  memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
2414  memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
2415 
2416  // 'within class' covariance
2417  for (UInt_t ievt=0; ievt<nevents; ievt++) {
2418 
2419  // read the Training Event into "event"
2420  // const Event* ev = eventSample[ievt];
2421  const Event* ev = eventSample.at(ievt);
2422 
2423  Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2424 
2425  for (UInt_t x=0; x<nFisherVars; x++) {
2426  xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2427  }
2428  Int_t k=0;
2429  for (UInt_t x=0; x<nFisherVars; x++) {
2430  for (UInt_t y=0; y<nFisherVars; y++) {
2431  if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2432  else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2433  k++;
2434  }
2435  }
2436  }
2437  Int_t k=0;
2438  for (UInt_t x=0; x<nFisherVars; x++) {
2439  for (UInt_t y=0; y<nFisherVars; y++) {
2440  (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2441  k++;
2442  }
2443  }
2444 
2445  delete [] sum2Sig;
2446  delete [] sum2Bgd;
2447  delete [] xval;
2448 
2449 
2450  // the matrix of covariance 'between class' reflects the dispersion of the
2451  // events of a class relative to the global center of gravity of all the classes,
2452  // hence the separation between classes
2453 
2454 
2455  Double_t prodSig, prodBgd;
2456 
2457  for (UInt_t x=0; x<nFisherVars; x++) {
2458  for (UInt_t y=0; y<nFisherVars; y++) {
2459 
2460  prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2461  ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2462  prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2463  ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2464 
2465  (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
2466  }
2467  }
2468 
2469 
2470 
2471  // compute full covariance matrix from sum of within and between matrices
2472  for (UInt_t x=0; x<nFisherVars; x++)
2473  for (UInt_t y=0; y<nFisherVars; y++)
2474  (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2475 
2476  // Fisher = Sum { [coeff]*[variables] }
2477  //
2478  // let Xs be the array of the mean values of variables for signal evts
2479  // let Xb be the array of the mean values of variables for backgd evts
2480  // let InvWith be the inverse matrix of the 'within class' correlation matrix
2481  //
2482  // then the array of Fisher coefficients is
2483  // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
2484  TMatrixD* theMat = with; // Fisher's original
2485  // TMatrixD* theMat = cov; // Mahalanobis
2486 
2487  TMatrixD invCov( *theMat );
2488  if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2489  Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2490  << TMath::Abs(invCov.Determinant())
2491  << " did you use variables that are linear combinations or highly correlated?"
2492  << Endl;
2493  }
2494  if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2495  Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2496  << TMath::Abs(invCov.Determinant())
2497  << " did you use variables that are linear combinations?"
2498  << Endl;
2499  }
2500 
2501  invCov.Invert();
2502 
2503  // apply rescaling factor
2504  Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
2505 
2506  // compute difference of mean values
2507  std::vector<Double_t> diffMeans( nFisherVars );
2508 
2509  for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2510  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2511  for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2512  Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2513  fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
2514  }
2515 
2516  // rescale
2517  fisherCoeff[mapVarInFisher[ivar]] *= xfact;
2518  }
2519 
2520  // offset correction
2521  Double_t f0 = 0.0;
2522  for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2523  f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2524  }
2525  f0 /= -2.0;
2526 
2527  fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
2528 
2529  return fisherCoeff;
2530 }
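 // #### Schematic recap (added annotation, not original code): with Xs, Xb the
 // #### class mean vectors and W the 'within class' matrix inverted above, the
 // #### coefficients computed here are
 // ####    coeff = xfact * W^{-1} * (Xs - Xb),   xfact = sqrt(wS*wB)/(wS+wB),
 // #### and the offset f0 = -0.5 * coeff . (Xs + Xb) ends up in the last slot,
 // #### fisherCoeff[fNvars]; slots of variables not in the Fisher stay zero.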
2531 
2532 ////////////////////////////////////////////////////////////////////////////////
2533 /// train a node by finding the single optimal cut for a single variable
2534 /// that best separates signal and background (maximizes the separation gain)
2535 
2536 Double_t TMVA::DecisionTree::TrainNodeFull( const EventConstList & eventSample,
2537  TMVA::DecisionTreeNode *node )
2538 {
2539  Double_t nTotS = 0.0, nTotB = 0.0;
2540  Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2541 
2542  std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2543 
2544  // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
2545  // each spot in parallel no problem
2546  std::vector<Double_t> lCutValue( fNvars, 0.0 );
2547  std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2548  std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, no std::vector<bool> has been taken)
2549  lCutType.assign( fNvars, Char_t(kFALSE) );
2550 
2551  // Initialize (un)weighted counters for signal & background
2552  // Construct a list of event wrappers that point to the original data
2553  for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2554  if((*it)->GetClass() == fSigClass) { // signal or background event
2555  nTotS += (*it)->GetWeight();
2556  ++nTotS_unWeighted;
2557  }
2558  else {
2559  nTotB += (*it)->GetWeight();
2560  ++nTotB_unWeighted;
2561  }
2562  bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2563  }
2564 
2565  std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, no std::vector<bool> has been taken)
2566  useVariable.assign( fNvars, Char_t(kTRUE) );
2567 
2568  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2569  if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
2570  if (fUseNvars ==0 ) { // no number specified ... choose something that hopefully works well
2571  // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2572  fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2573  }
2574  Int_t nSelectedVars = 0;
2575  while (nSelectedVars < fUseNvars) {
2576  Double_t bla = fMyTrandom->Rndm()*fNvars;
2577  useVariable[Int_t (bla)] = Char_t(kTRUE);
2578  nSelectedVars = 0;
2579  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2580  if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
2581  }
2582  }
2583  }
2584  else {
2585  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2586  }
2587  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
2588  if(!useVariable[ivar]) continue; // only optimize with selected variables
2589 
2590  TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2591 
2592  std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2593 
2594 
2595  Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2596 
2597  std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2598  for( ; it != it_end; ++it ) {
2599  if((**it)->GetClass() == fSigClass ) // specify signal or background event
2600  sigWeightCtr += (**it)->GetWeight();
2601  else
2602  bkgWeightCtr += (**it)->GetWeight();
2603  // Store the accumulated signal (background) weights
2604  it->SetCumulativeWeight(false,bkgWeightCtr);
2605  it->SetCumulativeWeight(true,sigWeightCtr);
2606  }
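 // #### Illustrative note (added annotation, not original code): after the
 // #### sort, sorted signal weights { 1, 2, 1 } are stored as the running sums
 // #### { 1, 3, 4 }, so any candidate split boundary knows its left-side
 // #### signal and background weight without another pass over the data.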
2607 
2608  const Double_t fPMin = 1.0e-6;
2609  Bool_t cutType = kFALSE;
2610  Long64_t index = 0;
2611  Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2612 
2613  // Locate the optimal cut for this (ivar-th) variable
2614  for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2615  if( index == 0 ) { ++index; continue; }
2616  if( *(*it) == NULL ) {
2617  Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2618  << index << ", and parent node=" << node->GetParent() << Endl;
2619  break;
2620  }
2621  dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2622  norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2623  // Only allow splits where both daughter nodes have the specified minimum number of events
2624  // Splits are only sensible when the data are ordered (e.g. don't split inside a sequence of 0's)
2625  if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2626 
2627  sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2628  if( sepTmp > separationGain ) {
2629  separationGain = sepTmp;
2630  cutValue = it->GetVal() - 0.5*dVal;
2631  Double_t nSelS = it->GetCumulativeWeight(true);
2632  Double_t nSelB = it->GetCumulativeWeight(false);
2633  // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2634  // or by removing signal (enhancing background)
2635  if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2636  else cutType = kFALSE;
2637  }
2638  }
2639  ++index;
2640  }
2641  lCutType[ivar] = Char_t(cutType);
2642  lCutValue[ivar] = cutValue;
2643  lSepGain[ivar] = separationGain;
2644  }
2645  Double_t separationGain = -1.0;
2646  Int_t iVarIndex = -1;
2647  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2648  if( lSepGain[ivar] > separationGain ) {
2649  iVarIndex = ivar;
2650  separationGain = lSepGain[ivar];
2651  }
2652  }
2653 
2654  // #### storing the best values into the node
2655  if(iVarIndex >= 0) {
2656  node->SetSelector(iVarIndex);
2657  node->SetCutValue(lCutValue[iVarIndex]);
2658  node->SetSeparationGain(lSepGain[iVarIndex]);
2659  node->SetCutType(lCutType[iVarIndex]);
2660  fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2661  }
2662  else {
2663  separationGain = 0.0;
2664  }
2665 
2666  return separationGain;
2667 }
2668 
2669 ////////////////////////////////////////////////////////////////////////////////
2670 /// get the pointer to the leaf node where a particular event ends up
2671 /// (used in gradient boosting)
2672 
2673 TMVA::DecisionTreeNode* TMVA::DecisionTree::GetEventNode(const TMVA::Event & e) const
2674 {
2675  TMVA::DecisionTreeNode *current = (TMVA::DecisionTreeNode*)this->GetRoot();
2676  while(current->GetNodeType() == 0) { // intermediate node in a tree
2677  current = (current->GoesRight(e)) ?
2678  (TMVA::DecisionTreeNode*)current->GetRight() :
2679  (TMVA::DecisionTreeNode*)current->GetLeft();
2680  }
2681  return current;
2682 }
2683 
2684 ////////////////////////////////////////////////////////////////////////////////
2685 /// the event e is put into the decision tree (starting at the root node)
2686 /// and the output is NodeType (signal) or (background) of the final node (basket)
2687 /// in which the given event ends up, i.e. the result of the classification of
2688 /// the event for this decision tree.
2689 
2690 Double_t TMVA::DecisionTree::CheckEvent( const TMVA::Event * e, Bool_t UseYesNoLeaf ) const
2691 {
2692  TMVA::DecisionTreeNode *current = this->GetRoot();
2693  if (!current){
2694  Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2695  return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2696  }
2697 
2698  while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2699  current = (current->GoesRight(*e)) ?
2700  current->GetRight() :
2701  current->GetLeft();
2702  if (!current) {
2703  Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2704  }
2705 
2706  }
2707 
2708  if (DoRegression()) {
2709  // Note: This path is also taken for MethodBDT with analysis type
2710  // kClassification and kMulticlass when using GradBoost.
2711  // See TMVA::MethodBDT::InitGradBoost
2712  return current->GetResponse();
2713  } else {
2714  if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2715  else return current->GetPurity();
2716  }
2717 }
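 // #### Usage sketch (added annotation, not original code; assumes a trained
 // #### tree `dt` and an event pointer `ev`):
 // ####    Double_t r = dt->CheckEvent(ev, kTRUE);  // leaf node type (+-1)
 // ####    Double_t p = dt->CheckEvent(ev, kFALSE); // leaf purity S/(S+B)
 // #### for regression trees either call returns the leaf response instead.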
2718 
2719 ////////////////////////////////////////////////////////////////////////////////
2720 /// calculates the purity S/(S+B) of a given event sample
2721 
2722 Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
2723 {
2724  Double_t sumsig=0, sumbkg=0, sumtot=0;
2725  for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2726  if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2727  else sumsig+=eventSample[ievt]->GetWeight();
2728  sumtot+=eventSample[ievt]->GetWeight();
2729  }
2730  // sanity check
2731  if (sumtot!= (sumsig+sumbkg)){
2732  Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2733  << sumtot << " " << sumsig << " " << sumbkg << Endl;
2734  }
2735  if (sumtot>0) return sumsig/(sumsig + sumbkg);
2736  else return -1;
2737 }
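 // #### Worked example (added annotation, not original code): summed signal
 // #### weight 30 and background weight 10 give a purity of 30/(30+10) = 0.75;
 // #### an empty sample returns -1 as a sentinel.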
2738 
2739 ////////////////////////////////////////////////////////////////////////////////
2740 /// Return the relative variable importance, normalized to all
2741 /// variables together having the importance 1. The importance is
2742 /// evaluated as the total separation-gain that this variable had in
2743 /// the decision trees (weighted by the number of events)
2744 
2745 std::vector< Double_t > TMVA::DecisionTree::GetVariableImportance()
2746 {
2747  std::vector<Double_t> relativeImportance(fNvars);
2748  Double_t sum=0;
2749  for (UInt_t i=0; i< fNvars; i++) {
2750  sum += fVariableImportance[i];
2751  relativeImportance[i] = fVariableImportance[i];
2752  }
2753 
2754  for (UInt_t i=0; i< fNvars; i++) {
2755  if (sum > std::numeric_limits<double>::epsilon())
2756  relativeImportance[i] /= sum;
2757  else
2758  relativeImportance[i] = 0;
2759  }
2760  return relativeImportance;
2761 }
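 // #### Worked example (added annotation, not original code): raw importances
 // #### { 4, 1, 0 } are normalised above to { 0.8, 0.2, 0.0 }, so all
 // #### variables together sum to 1 (or all stay 0 if nothing was ever gained).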
2762 
2763 ////////////////////////////////////////////////////////////////////////////////
2764 /// returns the relative importance of variable ivar
2765 
2766 Double_t TMVA::DecisionTree::GetVariableImportance( UInt_t ivar )
2767 {
2768  std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2769  if (ivar < fNvars) return relativeImportance[ivar];
2770  else {
2771  Log() << kFATAL << "<GetVariableImportance>" << Endl
2772  << "--- ivar = " << ivar << " is out of range " << Endl;
2773  }
2774 
2775  return -1;
2776 }
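// --- Editor's note: illustrative sketch, not part of DecisionTree.cxx ---
// Hypothetical loop over a trained (non-const) tree `dt` with `nVars`
// input variables, printing each variable's share of the total gain;
// any ivar >= nVars would trigger the kFATAL branch above.
//
//   for (UInt_t ivar = 0; ivar < nVars; ++ivar)
//      std::cout << "var " << ivar << " importance: "
//                << dt.GetVariableImportance(ivar) << std::endl;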
2777 