DecisionTree.cxx
1// @(#)root/tmva $Id$
2// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TMVA::DecisionTree *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Implementation of a Decision Tree *
12 * *
13 * Authors (alphabetical): *
14 * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15 * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
16 * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
17 * Eckhard v. Toerne <evt@uni-bonn.de> - U of Bonn, Germany *
18 * Jan Therhaag <Jan.Therhaag@cern.ch> - U of Bonn, Germany *
19 * *
20 * Copyright (c) 2005-2011: *
21 * CERN, Switzerland *
22 * U. of Victoria, Canada *
23 * MPI-K Heidelberg, Germany *
24 * U. of Bonn, Germany *
25 * *
26 * Redistribution and use in source and binary forms, with or without *
27 * modification, are permitted according to the terms listed in LICENSE *
28 * (http://mva.sourceforge.net/license.txt) *
29 * *
30 **********************************************************************************/
31
32/*! \class TMVA::DecisionTree
33\ingroup TMVA
34
35Implementation of a Decision Tree
36
37In a decision tree successive decision nodes are used to categorize the
38events of the sample as either signal or background. Each node
39uses only a single discriminating variable to decide if the event is
40signal-like ("goes right") or background-like ("goes left"). This
41forms a tree-like structure with "baskets" at the end (leaf nodes),
42and an event is classified as either signal or background according to
43whether the basket where it ends up has been classified as signal or
44background during the training. Training of a decision tree is the
45process of defining the "cut criteria" for each node. The training
46starts with the root node. Here one takes the full training event
47sample and selects the variable and corresponding cut value that gives
48the best separation between signal and background at this stage. Using
49this cut criterion, the sample is then divided into two subsamples, a
50signal-like (right) and a background-like (left) sample. Two new nodes
51are then created for each of the two sub-samples and they are
52constructed using the same mechanism as described for the root
53node. The division is stopped once a node has reached either a
54minimum number of events or a minimum or maximum signal purity. These
55leaf nodes are then called "signal" or "background" if they contain
56more signal or background events, respectively, from the training sample.
57
58*/
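// --------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original file): roughly how
// MethodBDT grows and evaluates a single tree. The names "dataSetInfo",
// "trainingEvents" (a std::vector<const TMVA::Event*>) and "evt" are
// assumptions made for this example only.
//
//    TMVA::SeparationBase *sepType = new TMVA::GiniIndex();
//    TMVA::DecisionTree tree(sepType, /*minNodeSize %*/ 2.5, /*nCuts*/ 20,
//                            &dataSetInfo, /*signal class*/ 0);
//    tree.SetAnalysisType(TMVA::Types::kClassification);
//    tree.BuildTree(trainingEvents);           // recursive node splitting
//    Double_t response = tree.CheckEvent(evt); // follow one event to a leaf
// --------------------------------------------------------------------------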
59
60#include <iostream>
61#include <algorithm>
62#include <vector>
63#include <limits>
64#include <fstream>
65#include <cassert>
66
67#include "TRandom3.h"
68#include "TMath.h"
69#include "TMatrix.h"
70#include "TStopwatch.h"
71
72#include "TMVA/MsgLogger.h"
73#include "TMVA/DecisionTree.h"
74#include "TMVA/DecisionTreeNode.h"
75#include "TMVA/BinarySearchTree.h"
76
77#include "TMVA/Tools.h"
78#include "TMVA/Config.h"
79
80#include "TMVA/GiniIndex.h"
81#include "TMVA/CrossEntropy.h"
82#include "TMVA/MisClassificationError.h"
83#include "TMVA/SdivSqrtSplusB.h"
84#include "TMVA/Event.h"
85#include "TMVA/BDTEventWrapper.h"
86#include "TMVA/IPruneTool.h"
87#include "TMVA/CostComplexityPruneTool.h"
88#include "TMVA/ExpectedErrorPruneTool.h"
89
90const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds
91
92using std::vector;
93
94ClassImp(TMVA::DecisionTree);
95
96bool almost_equal_float(float x, float y, int ulp=4){
97 // the machine epsilon has to be scaled to the magnitude of the values used
98 // and multiplied by the desired precision in ULPs (units in the last place)
99 return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
100 // unless the result is subnormal
101 || std::abs(x-y) < std::numeric_limits<float>::min();
102}
103
104bool almost_equal_double(double x, double y, int ulp=4){
105 // the machine epsilon has to be scaled to the magnitude of the values used
106 // and multiplied by the desired precision in ULPs (units in the last place)
107 return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
108 // unless the result is subnormal
109 || std::abs(x-y) < std::numeric_limits<double>::min();
110}
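// For instance, almost_equal_double(0.1 + 0.2, 0.3) returns true although the
// plain comparison (0.1 + 0.2) == 0.3 is false in IEEE-754 double precision,
// because the two values differ by only a few ULPs.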
111
112////////////////////////////////////////////////////////////////////////////////
113/// default constructor using the GiniIndex as separation criterion,
114/// no restrictions on the minimum number of events in a leaf node or on the
115/// separation gain in the node splitting
116
117TMVA::DecisionTree::DecisionTree():
118 BinaryTree(),
119 fNvars (0),
120 fNCuts (-1),
121 fUseFisherCuts (kFALSE),
122 fMinLinCorrForFisher (1),
123 fUseExclusiveVars (kTRUE),
124 fSepType (NULL),
125 fRegType (NULL),
126 fMinSize (0),
127 fMinNodeSize (1),
128 fMinSepGain (0),
129 fUseSearchTree(kFALSE),
130 fPruneStrength(0),
131 fPruneMethod (kNoPruning),
132 fNNodesBeforePruning(0),
133 fNodePurityLimit(0.5),
134 fRandomisedTree (kFALSE),
135 fUseNvars (0),
136 fUsePoissonNvars(kFALSE),
137 fMyTrandom (NULL),
138 fMaxDepth (999999),
139 fSigClass (0),
140 fTreeID (0),
141 fAnalysisType (Types::kClassification),
142 fDataSetInfo (NULL)
143
144{}
145
146////////////////////////////////////////////////////////////////////////////////
147/// constructor specifying the separation type, the min number of
148/// events in a node that is still subjected to further splitting, and the
149/// number of bins in the grid used in applying the cut for the node
150/// splitting.
151
152TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts, TMVA::DataSetInfo* dataInfo, UInt_t cls,
153 Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
154 UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
155 BinaryTree(),
156 fNvars (0),
157 fNCuts (nCuts),
158 fUseFisherCuts (kFALSE),
159 fMinLinCorrForFisher (1),
160 fUseExclusiveVars (kTRUE),
161 fSepType (sepType),
162 fRegType (NULL),
163 fMinSize (0),
164 fMinNodeSize (minSize),
165 fMinSepGain (0),
166 fUseSearchTree (kFALSE),
167 fPruneStrength (0),
168 fPruneMethod (kNoPruning),
169 fNNodesBeforePruning(0),
170 fNodePurityLimit(purityLimit),
171 fRandomisedTree (randomisedTree),
172 fUseNvars (useNvars),
173 fUsePoissonNvars(usePoissonNvars),
174 fMyTrandom (new TRandom3(iSeed)),
175 fMaxDepth (nMaxDepth),
176 fSigClass (cls),
177 fTreeID (treeID),
178 fAnalysisType (Types::kClassification),
179 fDataSetInfo (dataInfo)
180{
181 if (sepType == NULL) { // it is interpreted as a regression tree, where
182 // currently the separation type (simple least square)
183 // cannot be chosen freely)
184 fAnalysisType = Types::kRegression;
185 fRegType = new RegressionVariance();
186 if ( nCuts <=0 ) {
187 fNCuts = 200;
188 Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
189 << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
190 << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
191 << Endl;
192 }
193 }else{
194 fAnalysisType = Types::kClassification;
195 }
196}
197
198////////////////////////////////////////////////////////////////////////////////
199/// copy constructor that creates a true copy, i.e. a completely independent tree
200/// the node copy will recursively copy all the nodes
201
202TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
203 BinaryTree(),
204 fNvars (d.fNvars),
205 fNCuts (d.fNCuts),
206 fUseFisherCuts (d.fUseFisherCuts),
207 fMinLinCorrForFisher (d.fMinLinCorrForFisher),
208 fUseExclusiveVars (d.fUseExclusiveVars),
209 fSepType (d.fSepType),
210 fRegType (d.fRegType),
211 fMinSize (d.fMinSize),
212 fMinNodeSize(d.fMinNodeSize),
213 fMinSepGain (d.fMinSepGain),
214 fUseSearchTree (d.fUseSearchTree),
215 fPruneStrength (d.fPruneStrength),
216 fPruneMethod (d.fPruneMethod),
217 fNodePurityLimit(d.fNodePurityLimit),
218 fRandomisedTree (d.fRandomisedTree),
219 fUseNvars (d.fUseNvars),
220 fUsePoissonNvars(d.fUsePoissonNvars),
221 fMyTrandom (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
222 fMaxDepth (d.fMaxDepth),
223 fSigClass (d.fSigClass),
224 fTreeID (d.fTreeID),
225 fAnalysisType(d.fAnalysisType),
226 fDataSetInfo (d.fDataSetInfo)
227{
228 this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
229 this->SetParentTreeInNodes();
230 fNNodes = d.fNNodes;
231
232}
233
234
235////////////////////////////////////////////////////////////////////////////////
236/// destructor
237
238TMVA::DecisionTree::~DecisionTree()
239{
240 // destruction of the tree nodes done in the "base class" BinaryTree
241
242 if (fMyTrandom) delete fMyTrandom;
243 if (fRegType) delete fRegType;
244}
245
246////////////////////////////////////////////////////////////////////////////////
247/// descend a tree to find all its leaf nodes, fill max depth reached in the
248/// tree at the same time.
249
250void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
251{
252 if (n == NULL) { //default, start at the tree top, then descend recursively
253 n = this->GetRoot();
254 if (n == NULL) {
255 Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
256 return ;
257 }
258 }
259
260 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
261 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
262 return;
263 } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
264 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
265 return;
266 }
267 else {
268 if (this->GetLeftDaughter(n) != NULL) {
269 this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
270 }
271 if (this->GetRightDaughter(n) != NULL) {
272 this->SetParentTreeInNodes( this->GetRightDaughter(n) );
273 }
274 }
275 n->SetParentTree(this);
276 if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
277 return;
278}
279
280////////////////////////////////////////////////////////////////////////////////
281/// re-create a new tree (decision tree or search tree) from XML
282
283TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML(void* node, UInt_t tmva_Version_Code ) {
284 std::string type("");
285 gTools().ReadAttr(node,"type", type);
286 DecisionTree* dt = new DecisionTree();
287
288 dt->ReadXML( node, tmva_Version_Code );
289 return dt;
290}
291
292// #### Multithreaded DecisionTree::BuildTree
293#ifdef R__USE_IMT
294//====================================================================================
295// Had to define this struct to enable parallelization.
296// Using BuildNodeInfo, each thread can return the information needed.
297// After each thread returns we can then merge the results, hence the + operator.
298//====================================================================================
299struct BuildNodeInfo{
300
301 BuildNodeInfo(Int_t fNvars, const TMVA::Event* evt){
302
303 nvars = fNvars;
304 xmin = std::vector<Float_t>(nvars);
305 xmax = std::vector<Float_t>(nvars);
306
307 // #### the initial min and max for each feature
308 for (Int_t ivar=0; ivar<fNvars; ivar++) {
309 const Double_t val = evt->GetValueFast(ivar);
310 xmin[ivar]=val;
311 xmax[ivar]=val;
312 }
313 };
314
315 BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){
316
317 nvars = fNvars;
318 xmin = std::vector<Float_t>(nvars);
319 xmax = std::vector<Float_t>(nvars);
320
321 // #### the initial min and max for each feature
322 for (Int_t ivar=0; ivar<fNvars; ivar++) {
323 xmin[ivar]=inxmin[ivar];
324 xmax[ivar]=inxmax[ivar];
325 }
326 };
327 BuildNodeInfo(){};
328
329 Int_t nvars = 0;
330 Double_t s = 0;
331 Double_t suw = 0;
332 Double_t sub = 0;
333 Double_t b = 0;
334 Double_t buw = 0;
335 Double_t bub = 0;
336 Double_t target = 0;
337 Double_t target2 = 0;
338 std::vector<Float_t> xmin;
339 std::vector<Float_t> xmax;
340
341 // #### Define the addition operator for BuildNodeInfo
342 // #### Make sure both BuildNodeInfos have the same nvars if we add them
343 BuildNodeInfo operator+(const BuildNodeInfo& other)
344 {
345 BuildNodeInfo ret(nvars, xmin, xmax);
346 if(nvars != other.nvars)
347 {
348 std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
349 return ret;
350 }
351 ret.s = s + other.s;
352 ret.suw = suw + other.suw;
353 ret.sub = sub + other.sub;
354 ret.b = b + other.b;
355 ret.buw = buw + other.buw;
356 ret.bub = bub + other.bub;
357 ret.target = target + other.target;
358 ret.target2 = target2 + other.target2;
359
360 // xmin is the min of both, xmax is the max of both
361 for(Int_t i=0; i<nvars; i++)
362 {
363 ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
364 ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
365 }
366 return ret;
367 };
368
369};
370//===========================================================================
371// Done with BuildNodeInfo declaration
372//===========================================================================
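// Illustrative sketch (not part of the original file): each thread fills one
// BuildNodeInfo over its own event range and the partial results are merged
// with the operator+ above, which sums the event counts/targets and takes the
// per-variable min/max of the ranges:
//
//    BuildNodeInfo partA(nvars, firstEventOfRangeA);
//    BuildNodeInfo partB(nvars, firstEventOfRangeB);
//    // ... each thread accumulates s, b, target, xmin, xmax ...
//    BuildNodeInfo total = partA + partB;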
373
374
375////////////////////////////////////////////////////////////////////////////////
376/// building the decision tree by recursively calling the splitting of
377/// one (root-) node into two daughter nodes (returns the number of nodes)
378
379UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
380 TMVA::DecisionTreeNode *node)
381{
382 if (node==NULL) {
383 //start with the root node
384 node = new TMVA::DecisionTreeNode();
385 fNNodes = 1;
386 this->SetRoot(node);
387 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
388 this->GetRoot()->SetPos('s');
389 this->GetRoot()->SetDepth(0);
390 this->GetRoot()->SetParentTree(this);
391 fMinSize = fMinNodeSize/100. * eventSample.size();
392 if (GetTreeID()==0){
393 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
394 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
395 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
396 }
397 }
398
399 UInt_t nevents = eventSample.size();
400
401 if (nevents > 0 ) {
402 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
403 fVariableImportance.resize(fNvars);
404 }
405 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
406
407 // sum up the totals
408 // sig and bkg for classification
409 // err and err2 for regression
410
411 // #### Set up prerequisite info for multithreading
412 UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
413 auto seeds = ROOT::TSeqU(nPartitions);
414
415 // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
416 auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){
417
418 Int_t start = 1.0*partition/nPartitions*eventSample.size();
419 Int_t end = (partition+1.0)/nPartitions*eventSample.size();
420
421 BuildNodeInfo nodeInfof(fNvars, eventSample[0]);
422
423 for(Int_t iev=start; iev<end; iev++){
424 const TMVA::Event* evt = eventSample[iev];
425 const Double_t weight = evt->GetWeight();
426 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
427 if (evt->GetClass() == fSigClass) {
428 nodeInfof.s += weight;
429 nodeInfof.suw += 1;
430 nodeInfof.sub += orgWeight;
431 }
432 else {
433 nodeInfof.b += weight;
434 nodeInfof.buw += 1;
435 nodeInfof.bub += orgWeight;
436 }
437 if ( DoRegression() ) {
438 const Double_t tgt = evt->GetTarget(0);
439 nodeInfof.target +=weight*tgt;
440 nodeInfof.target2+=weight*tgt*tgt;
441 }
442
443 // save the min and max for each feature
444 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
445 const Double_t val = evt->GetValueFast(ivar);
446 if (iev==start){
447 nodeInfof.xmin[ivar]=val;
448 nodeInfof.xmax[ivar]=val;
449 }
450 if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
451 if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
452 }
453 }
454 return nodeInfof;
455 };
456
457 // #### Need an initial struct to pass to std::accumulate
458 BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);
459
460 // #### Run the threads in parallel then merge the results
461 auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
462 BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
463 //NodeInfo nodeInfo(fNvars);
464
465 if (nodeInfo.s+nodeInfo.b < 0) {
466 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
467 << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
468 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
469 << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
470 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
471 << "to allow for reasonable averaging!!!" << Endl
472 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
473 << "with negative weight in the training." << Endl;
474 double nBkg=0.;
475 for (UInt_t i=0; i<eventSample.size(); i++) {
476 if (eventSample[i]->GetClass() != fSigClass) {
477 nBkg += eventSample[i]->GetWeight();
478 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
479 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
480 }
481 }
482 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
483 }
484
485 node->SetNSigEvents(nodeInfo.s);
486 node->SetNBkgEvents(nodeInfo.b);
487 node->SetNSigEvents_unweighted(nodeInfo.suw);
488 node->SetNBkgEvents_unweighted(nodeInfo.buw);
489 node->SetNSigEvents_unboosted(nodeInfo.sub);
490 node->SetNBkgEvents_unboosted(nodeInfo.bub);
491 node->SetPurity();
492 if (node == this->GetRoot()) {
493 node->SetNEvents(nodeInfo.s+nodeInfo.b);
494 node->SetNEvents_unweighted(nodeInfo.suw+nodeInfo.buw);
495 node->SetNEvents_unboosted(nodeInfo.sub+nodeInfo.bub);
496 }
497
498 // save the min and max for each feature
499 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
500 node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
501 node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
502 }
503
504 // I now demand the minimum number of events for both daughter nodes. Hence if the number
505 // of events in the parent node is not at least two times as big, I don't even need to try
506 // splitting
507
508 // ask here for actual "events" independent of their weight.. OR the weighted events
509 // to exceed the min requested number of events per daughter node
510 // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
511 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
512 // std::cout << "------------------------------------------------------------------"<<std::endl;
513 // std::cout << "------------------------------------------------------------------"<<std::endl;
514 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
515 if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
516 && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {
517
518 // Train the node and figure out the separation gain and split points
519 Double_t separationGain;
520 if (fNCuts > 0){
521 separationGain = this->TrainNodeFast(eventSample, node);
522 }
523 else {
524 separationGain = this->TrainNodeFull(eventSample, node);
525 }
526
527 // The node has been trained and there is nothing to be gained by splitting
528 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
529 // no cut can actually do anything to improve the node
530 // hence, naturally, the current node is a leaf node
531 if (DoRegression()) {
532 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
533 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
534 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
535 node->SetRMS(0);
536 }else{
537 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
538 }
539 }
540 else {
541 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
542 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
543 else node->SetNodeType(-1);
544 }
545 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
546 }
547 else {
548 // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
549 // #### ... would need to avoid the push_back or use some threadsafe mutex locked version...
550 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
551 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
552
553 Double_t nRight=0, nLeft=0;
554 Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
555
556 for (UInt_t ie=0; ie< nevents ; ie++) {
557 if (node->GoesRight(*eventSample[ie])) {
558 rightSample.push_back(eventSample[ie]);
559 nRight += eventSample[ie]->GetWeight();
560 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
561 }
562 else {
563 leftSample.push_back(eventSample[ie]);
564 nLeft += eventSample[ie]->GetWeight();
565 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
566 }
567 }
568 // sanity check
569 if (leftSample.empty() || rightSample.empty()) {
570
571 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
572 << "--- Hence new node == old node ... check" << Endl
573 << "--- left:" << leftSample.size()
574 << " right:" << rightSample.size() << Endl
575 << " while the separation is thought to be " << separationGain
576 << "\n when cutting on variable " << node->GetSelector()
577 << " at value " << node->GetCutValue()
578 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
579 }
580
581 // continue building daughter nodes for the left and the right eventsample
582 TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
583 fNNodes++;
584 rightNode->SetNEvents(nRight);
585 rightNode->SetNEvents_unboosted(nRightUnBoosted);
586 rightNode->SetNEvents_unweighted(rightSample.size());
587
588 TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');
589
590 fNNodes++;
591 leftNode->SetNEvents(nLeft);
592 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
593 leftNode->SetNEvents_unweighted(leftSample.size());
594
595 node->SetNodeType(0);
596 node->SetLeft(leftNode);
597 node->SetRight(rightNode);
598
599 this->BuildTree(rightSample, rightNode);
600 this->BuildTree(leftSample, leftNode );
601
602 }
603 }
604 else{ // it is a leaf node
605 if (DoRegression()) {
606 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
607 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
608 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
609 node->SetRMS(0);
610 }else{
611 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
612 }
613 }
614 else {
615 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
616 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
617 else node->SetNodeType(-1);
618 // loop through the event sample ending up in this node and check for events with negative weight
619 // those "cannot" be boosted normally. Hence, if one of those
620 // is misclassified, find randomly as many events with positive weights in this
621 // node as needed to get the same absolute number of weight, and mark them as
622 // "not to be boosted" in order to make up for not boosting the negative weight event
623 }
624
625
626 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
627 }
628
629 // if (IsRootNode) this->CleanTree();
630 return fNNodes;
631}
632
633// Standard DecisionTree::BuildTree (multithreading is not enabled)
634#else
635
636UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
637 TMVA::DecisionTreeNode *node)
638{
639 if (node==NULL) {
640 //start with the root node
641 node = new TMVA::DecisionTreeNode();
642 fNNodes = 1;
643 this->SetRoot(node);
644 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
645 this->GetRoot()->SetPos('s');
646 this->GetRoot()->SetDepth(0);
647 this->GetRoot()->SetParentTree(this);
648 fMinSize = fMinNodeSize/100. * eventSample.size();
649 if (GetTreeID()==0){
650 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
651 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
652 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
653 }
654 }
655
656 UInt_t nevents = eventSample.size();
657
658 if (nevents > 0 ) {
659 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
660 fVariableImportance.resize(fNvars);
661 }
662 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
663
664 Double_t s=0, b=0;
665 Double_t suw=0, buw=0;
666 Double_t sub=0, bub=0; // unboosted!
667 Double_t target=0, target2=0;
668 Float_t *xmin = new Float_t[fNvars];
669 Float_t *xmax = new Float_t[fNvars];
670
671 // initializing xmin and xmax for each variable
672 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
673 xmin[ivar]=xmax[ivar]=0;
674 }
675 // sum up the totals
676 // sig and bkg for classification
677 // err and err2 for regression
678 for (UInt_t iev=0; iev<eventSample.size(); iev++) {
679 const TMVA::Event* evt = eventSample[iev];
680 const Double_t weight = evt->GetWeight();
681 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
682 if (evt->GetClass() == fSigClass) {
683 s += weight;
684 suw += 1;
685 sub += orgWeight;
686 }
687 else {
688 b += weight;
689 buw += 1;
690 bub += orgWeight;
691 }
692 if ( DoRegression() ) {
693 const Double_t tgt = evt->GetTarget(0);
694 target +=weight*tgt;
695 target2+=weight*tgt*tgt;
696 }
697
698 // save the min and max for each feature
699 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
700 const Double_t val = evt->GetValueFast(ivar);
701 if (iev==0) xmin[ivar]=xmax[ivar]=val;
702 if (val < xmin[ivar]) xmin[ivar]=val;
703 if (val > xmax[ivar]) xmax[ivar]=val;
704 }
705 }
706
707
708 if (s+b < 0) {
709 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
710 << "(Nsig="<<s<<" Nbkg="<<b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
711 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
712 << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
713 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
714 << "to allow for reasonable averaging!!!" << Endl
715 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
716 << "with negative weight in the training." << Endl;
717 double nBkg=0.;
718 for (UInt_t i=0; i<eventSample.size(); i++) {
719 if (eventSample[i]->GetClass() != fSigClass) {
720 nBkg += eventSample[i]->GetWeight();
721 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
722 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
723 }
724 }
725 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
726 }
727
728 node->SetNSigEvents(s);
729 node->SetNBkgEvents(b);
730 node->SetNSigEvents_unweighted(suw);
731 node->SetNBkgEvents_unweighted(buw);
732 node->SetNSigEvents_unboosted(sub);
733 node->SetNBkgEvents_unboosted(bub);
734 node->SetPurity();
735 if (node == this->GetRoot()) {
736 node->SetNEvents(s+b);
737 node->SetNEvents_unweighted(suw+buw);
738 node->SetNEvents_unboosted(sub+bub);
739 }
740
741 // save the min and max for each feature
742 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
743 node->SetSampleMin(ivar,xmin[ivar]);
744 node->SetSampleMax(ivar,xmax[ivar]);
745
746 }
747 delete[] xmin;
748 delete[] xmax;
749
750 // I now demand the minimum number of events for both daughter nodes. Hence if the number
751 // of events in the parent node is not at least two times as big, I don't even need to try
752 // splitting
753
754 // ask here for actual "events" independent of their weight.. OR the weighted events
755 // to exceed the min requested number of events per daughter node
756 // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
757 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
758 // std::cout << "------------------------------------------------------------------"<<std::endl;
759 // std::cout << "------------------------------------------------------------------"<<std::endl;
760 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
761 if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
762 && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
763 Double_t separationGain;
764 if (fNCuts > 0){
765 separationGain = this->TrainNodeFast(eventSample, node);
766 } else {
767 separationGain = this->TrainNodeFull(eventSample, node);
768 }
769 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
770 /// if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
771 // no cut can actually do anything to improve the node
772 // hence, naturally, the current node is a leaf node
773 if (DoRegression()) {
774 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
775 node->SetResponse(target/(s+b));
776 if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
777 node->SetRMS(0);
778 }else{
779 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
780 }
781 }
782 else {
783 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
784
785 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
786 else node->SetNodeType(-1);
787 }
788 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
789
790 } else {
791
792 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
793 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
794
795 Double_t nRight=0, nLeft=0;
796 Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
797
798 for (UInt_t ie=0; ie< nevents ; ie++) {
799 if (node->GoesRight(*eventSample[ie])) {
800 rightSample.push_back(eventSample[ie]);
801 nRight += eventSample[ie]->GetWeight();
802 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
803 }
804 else {
805 leftSample.push_back(eventSample[ie]);
806 nLeft += eventSample[ie]->GetWeight();
807 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
808 }
809 }
810
811 // sanity check
812 if (leftSample.empty() || rightSample.empty()) {
813
814 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
815 << "--- Hence new node == old node ... check" << Endl
816 << "--- left:" << leftSample.size()
817 << " right:" << rightSample.size() << Endl
818 << " while the separation is thought to be " << separationGain
819 << "\n when cutting on variable " << node->GetSelector()
820 << " at value " << node->GetCutValue()
821 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
822 }
823
824 // continue building daughter nodes for the left and the right eventsample
825 TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
826 fNNodes++;
827 rightNode->SetNEvents(nRight);
828 rightNode->SetNEvents_unboosted(nRightUnBoosted);
829 rightNode->SetNEvents_unweighted(rightSample.size());
830
831 TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');
832
833 fNNodes++;
834 leftNode->SetNEvents(nLeft);
835 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
836 leftNode->SetNEvents_unweighted(leftSample.size());
837
838 node->SetNodeType(0);
839 node->SetLeft(leftNode);
840 node->SetRight(rightNode);
841
842 this->BuildTree(rightSample, rightNode);
843 this->BuildTree(leftSample, leftNode );
844
845 }
846 }
847 else{ // it is a leaf node
848 if (DoRegression()) {
849 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
850 node->SetResponse(target/(s+b));
851 if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
852 node->SetRMS(0);
853 }else{
854 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
855 }
856 }
857 else {
858 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
859 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
860 else node->SetNodeType(-1);
861 // loop through the event sample ending up in this node and check for events with negative weight
862 // those "cannot" be boosted normally. Hence, if one of those
863 // is misclassified, find randomly as many events with positive weights in this
864 // node as needed to get the same absolute number of weight, and mark them as
865 // "not to be boosted" in order to make up for not boosting the negative weight event
866 }
867
868
869 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
870 }
871
872 // if (IsRootNode) this->CleanTree();
873 return fNNodes;
874}
875
876#endif
877
878////////////////////////////////////////////////////////////////////////////////
879/// fill the existing decision tree structure by filling events
880/// in from the top node and seeing where they end up
881
882void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
883{
884 for (UInt_t i=0; i<eventSample.size(); i++) {
885 this->FillEvent(*(eventSample[i]),NULL);
886 }
887}
888
889////////////////////////////////////////////////////////////////////////////////
890/// fill the existing decision tree structure by filling an event
891/// in from the top node and seeing where it ends up
892
893void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
894 TMVA::DecisionTreeNode *node )
895{
896 if (node == NULL) { // that's the start, take the Root node
897 node = this->GetRoot();
898 }
899
900 node->IncrementNEvents( event.GetWeight() );
901 node->IncrementNEvents_unweighted( );
902
903 if (event.GetClass() == fSigClass) {
904 node->IncrementNSigEvents( event.GetWeight() );
905 node->IncrementNSigEvents_unweighted( );
906 }
907 else {
908 node->IncrementNBkgEvents( event.GetWeight() );
909 node->IncrementNBkgEvents_unweighted( );
910 }
911 node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
912 node->GetNBkgEvents()));
913
914 if (node->GetNodeType() == 0) { //intermediate node --> go down
915 if (node->GoesRight(event))
916 this->FillEvent(event, node->GetRight());
917 else
918 this->FillEvent(event, node->GetLeft());
919 }
920}
921
922////////////////////////////////////////////////////////////////////////////////
923/// clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
924
925void TMVA::DecisionTree::ClearTree()
926{
927 if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();
928
929}
930
931////////////////////////////////////////////////////////////////////////////////
932/// remove those last splits that result in two leaf nodes that
933/// are both of the same type (i.e. both signal or both background).
934/// This of course is only a reasonable thing to do when you use
935/// "YesOrNo" leaves, while it might lose something if you use the
936/// purity information in the nodes.
937/// --> hence I don't call it automatically in the tree building
938
939UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
940{
941 if (node==NULL) {
942 node = this->GetRoot();
943 }
944
945 DecisionTreeNode *l = node->GetLeft();
946 DecisionTreeNode *r = node->GetRight();
947
948 if (node->GetNodeType() == 0) {
949 this->CleanTree(l);
950 this->CleanTree(r);
951 if (l->GetNodeType() * r->GetNodeType() > 0) {
952
953 this->PruneNode(node);
954 }
955 }
956 // update the number of nodes after the cleaning
957 return this->CountNodes();
958
959}
960
961////////////////////////////////////////////////////////////////////////////////
962/// prune (get rid of internal nodes) the Decision tree to avoid overtraining
963/// several different pruning methods can be applied as selected by the
964/// variable "fPruneMethod".
965
966Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
967{
968 IPruneTool* tool(NULL);
969 PruningInfo* info(NULL);
970
971 if( fPruneMethod == kNoPruning ) return 0.0;
972
973 if (fPruneMethod == kExpectedErrorPruning)
974 // tool = new ExpectedErrorPruneTool(logfile);
975 tool = new ExpectedErrorPruneTool();
976 else if (fPruneMethod == kCostComplexityPruning)
977 {
978 tool = new CostComplexityPruneTool();
979 }
980 else {
981 Log() << kFATAL << "Selected pruning method not yet implemented "
982 << Endl;
983 }
984
985 if(!tool) return 0.0;
986
987 tool->SetPruneStrength(GetPruneStrength());
988 if(tool->IsAutomatic()) {
989 if(validationSample == NULL){
990 Log() << kFATAL << "Cannot automate the pruning algorithm without an "
991 << "independent validation sample!" << Endl;
992 }else if(validationSample->size() == 0) {
993 Log() << kFATAL << "Cannot automate the pruning algorithm with "
994 << "independent validation sample of ZERO events!" << Endl;
995 }
996 }
997
998 info = tool->CalculatePruningInfo(this,validationSample);
999 Double_t pruneStrength=0;
1000 if(!info) {
1001 Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
1002 << Endl;
1003 } else {
1004 pruneStrength = info->PruneStrength;
1005
1006 // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
1007 // << " has quality index " << info->QualityIndex << Endl;
1008
1009
1010 for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {
1011
1012 PruneNode(info->PruneSequence[i]);
1013 }
1014 // update the number of nodes after the pruning
1015 this->CountNodes();
1016 }
1017
1018 delete tool;
1019 delete info;
1020
1021 return pruneStrength;
1022};
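// Illustrative sketch (not part of the original file): a typical pruning
// setup as driven from MethodBDT; "validationSample" (a
// std::vector<const TMVA::Event*>) is an assumed name for this example.
//
//    tree.SetPruneMethod(TMVA::DecisionTree::kCostComplexityPruning);
//    tree.SetPruneStrength(-1.);        // non-positive strength -> determined automatically
//    tree.PruneTree(&validationSample); // returns the prune strength that was used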
1023
1024
1025////////////////////////////////////////////////////////////////////////////////
1026/// run the validation sample through the (pruned) tree and fill in the nodes
1027/// the variables NSValidation and NBValidation (i.e. how many of the signal
1028/// and background events from the validation sample end up there). This is then later used
1029/// when asking for the "tree quality" ..
1030
1031void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
1032{
1033 GetRoot()->ResetValidationData();
1034 for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
1035 CheckEventWithPrunedTree((*validationSample)[ievt]);
1036 }
1037}
1038
1039////////////////////////////////////////////////////////////////////////////////
1040/// return the misclassification rate of a pruned tree
1041/// a "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
1042/// any node, hence this tree quality testing will stop there, hence test
1043/// the pruned tree (while the full tree is still in place for normal/later use)
1044
1045Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
1046{
1047 if (n == NULL) { // default, start at the tree top, then descend recursively
1048 n = this->GetRoot();
1049 if (n == NULL) {
1050 Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
1051 return 0;
1052 }
1053 }
1054
1055 if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
1056 return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
1057 TestPrunedTreeQuality( n->GetRight(), mode ));
1058 }
1059 else { // terminal leaf (in a pruned subtree of T_max at least)
1060 if (DoRegression()) {
1061 Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
1062 return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
1063 }
1064 else {
1065 if (mode == 0) {
1066 if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
1067 return n->GetNBValidation();
1068 else
1069 return n->GetNSValidation();
1070 }
1071 else if ( mode == 1 ) {
1072 // calculate the weighted error using the pruning validation sample
1073 return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
1074 }
1075 else {
1076 throw std::string("Unknown ValidationQualityMode");
1077 }
1078 }
1079 }
1080}
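// Example (not part of the original file): with mode = 0 the recursion adds up,
// over all pruned-tree leaves, the validation events that fall on the "wrong"
// side (background events in signal leaves and vice versa); with mode = 1 the
// same sum is weighted by the training purity of each leaf.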
1081
1082////////////////////////////////////////////////////////////////////////////////
1083/// pass a single validation event through a pruned decision tree
1084/// on the way down the tree, fill in all the "intermediate" information
1085/// that would normally be there from training.
1086
1087void TMVA::DecisionTree::CheckEventWithPrunedTree( const TMVA::Event* e ) const
1088{
1089 DecisionTreeNode* current = this->GetRoot();
1090 if (current == NULL) {
1091 Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
1092 }
1093
1094 while(current != NULL) {
1095 if(e->GetClass() == fSigClass)
1096 current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
1097 else
1098 current->SetNBValidation(current->GetNBValidation() + e->GetWeight());
1099
1100 if (e->GetNTargets() > 0) {
1101 current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
1102 current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
1103 }
1104
1105 if (current->GetRight() == NULL || current->GetLeft() == NULL) {
1106 current = NULL;
1107 }
1108 else {
1109 if (current->GoesRight(*e))
1110 current = (TMVA::DecisionTreeNode*)current->GetRight();
1111 else
1112 current = (TMVA::DecisionTreeNode*)current->GetLeft();
1113 }
1114 }
1115}
1116
1117////////////////////////////////////////////////////////////////////////////////
1118/// calculate the normalization factor for a pruning validation sample
1119
1120Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
1121{
1122 Double_t sumWeights = 0.0;
1123 for( EventConstList::const_iterator it = validationSample->begin();
1124 it != validationSample->end(); ++it ) {
1125 sumWeights += (*it)->GetWeight();
1126 }
1127 return sumWeights;
1128}
1129
1130////////////////////////////////////////////////////////////////////////////////
1131/// return the number of terminal nodes in the sub-tree below Node n
1132
1133UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
1134{
1135 if (n == NULL) { // default, start at the tree top, then descend recursively
1136 n = this->GetRoot();
1137 if (n == NULL) {
1138 Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
1139 return 0;
1140 }
1141 }
1142
1143 UInt_t countLeafs=0;
1144
1145 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1146 countLeafs += 1;
1147 }
1148 else {
1149 if (this->GetLeftDaughter(n) != NULL) {
1150 countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
1151 }
1152 if (this->GetRightDaughter(n) != NULL) {
1153 countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
1154 }
1155 }
1156 return countLeafs;
1157}
1158
1159////////////////////////////////////////////////////////////////////////////////
1160/// descend a tree to find all its leaf nodes
1161
1162void TMVA::DecisionTree::DescendTree( Node* n )
1163{
1164 if (n == NULL) { // default, start at the tree top, then descend recursively
1165 n = this->GetRoot();
1166 if (n == NULL) {
1167 Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
1168 return ;
1169 }
1170 }
1171
1172 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1173 // do nothing
1174 }
1175 else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
1176 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1177 return;
1178 }
1179 else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
1180 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1181 return;
1182 }
1183 else {
1184 if (this->GetLeftDaughter(n) != NULL) {
1185 this->DescendTree( this->GetLeftDaughter(n) );
1186 }
1187 if (this->GetRightDaughter(n) != NULL) {
1188 this->DescendTree( this->GetRightDaughter(n) );
1189 }
1190 }
1191}
1192
1193////////////////////////////////////////////////////////////////////////////////
1194/// prune away the subtree below the node
1195
1196void TMVA::DecisionTree::PruneNode( TMVA::DecisionTreeNode *node )
1197{
1198 DecisionTreeNode *l = node->GetLeft();
1199 DecisionTreeNode *r = node->GetRight();
1200
1201 node->SetRight(NULL);
1202 node->SetLeft(NULL);
1203 node->SetSelector(-1);
1204 node->SetSeparationGain(-1);
1205 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
1206 else node->SetNodeType(-1);
1207 this->DeleteNode(l);
1208 this->DeleteNode(r);
1209 // update the stored number of nodes in the Tree
1210 this->CountNodes();
1211
1212}
1213
1214////////////////////////////////////////////////////////////////////////////////
1215/// prune a node temporarily (without actually deleting its descendants),
1216/// which allows testing the pruned tree quality for many different
1217/// pruning stages without "touching" the tree.
1218
1219void TMVA::DecisionTree::PruneNodeInPlace( TMVA::DecisionTreeNode *node ) {
1220 if(node == NULL) return;
1221 node->SetNTerminal(1);
1222 node->SetSubTreeR( node->GetNodeR() );
1223 node->SetAlpha( std::numeric_limits<double>::infinity( ) );
1224 node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
1225 node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
1226}
1227
1228////////////////////////////////////////////////////////////////////////////////
1229/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
1230/// is coded as a sequence of left-right moves starting from the root, coded as
1231/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right)
1232
1233TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
1234{
1235 Node* current = this->GetRoot();
1236
1237 for (UInt_t i =0; i < depth; i++) {
1238 ULong_t tmp = 1 << i;
1239 if ( tmp & sequence) current = this->GetRightDaughter(current);
1240 else current = this->GetLeftDaughter(current);
1241 }
1242
1243 return current;
1244}
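// Example (not part of the original file): GetNode(0b101 /* = 5 */, 3) walks
// bit 0 (set -> right daughter), bit 1 (unset -> left daughter), bit 2
// (set -> right daughter) and returns the node three levels below the root.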
1245
1246////////////////////////////////////////////////////////////////////////////////
1247/// select the random subset of variables that is offered for the splitting of this node (used for randomised trees)
1248
1249void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
1250 for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
1251 if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
1252 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
1253 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
1254 }
1255 if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t) fMyTrandom->Poisson(fUseNvars)));
1256 else useNvars = fUseNvars;
1257
1258 UInt_t nSelectedVars = 0;
1259 while (nSelectedVars < useNvars) {
1260 Double_t bla = fMyTrandom->Rndm()*fNvars;
1261 useVariable[Int_t (bla)] = kTRUE;
1262 nSelectedVars = 0;
1263 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1264 if (useVariable[ivar] == kTRUE) {
1265 mapVariable[nSelectedVars] = ivar;
1266 nSelectedVars++;
1267 }
1268 }
1269 }
1270 if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1);}
1271}
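// Worked example (not part of the original file): with fNvars = 10 input
// variables and fUseNvars left at 0, the default above gives
// UInt_t(TMath::Sqrt(10) + 0.6) = UInt_t(3.76) = 3, so (without the Poisson
// option) three randomly chosen variables are offered to each node split.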
1272
1273// Multithreaded version of DecisionTree::TrainNodeFast
1274#ifdef R__USE_IMT
1275//====================================================================================
1276// Had to define this struct to enable parallelization in TrainNodeFast.
1277// Using TrainNodeInfo, each thread can return the information needed.
1278// After each thread returns we can then merge the results, hence the + operator.
1279//====================================================================================
1280struct TrainNodeInfo{
1281
1282 TrainNodeInfo(Int_t cNvars_, UInt_t* nBins_){
1283
1284 cNvars = cNvars_;
1285 nBins = nBins_;
1286
1287 nSelS = std::vector< std::vector<Double_t> >(cNvars);
1288 nSelB = std::vector< std::vector<Double_t> >(cNvars);
1289 nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1290 nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1291 target = std::vector< std::vector<Double_t> >(cNvars);
1292 target2 = std::vector< std::vector<Double_t> >(cNvars);
1293
1294 for(Int_t ivar=0; ivar<cNvars; ivar++){
1295 nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1296 nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1297 nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1298 nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1299 target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1300 target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1301 }
1302 };
1303
1304 TrainNodeInfo(){};
1305
1306 // #### malloc problem if I define this and try to destruct xmin and xmax...
1307 // #### Maybe someone better at C++ can figure out why and fix this if it's a
1308 // ### serious memory problem
1309 //~TrainNodeInfo(){
1310 // delete [] xmin;
1311 // delete [] xmax;
1312 //};
1313
1314 Int_t cNvars = 0;
1315 UInt_t* nBins;
1316
1317 Double_t nTotS = 0;
1318 Double_t nTotS_unWeighted = 0;
1319 Double_t nTotB = 0;
1320 Double_t nTotB_unWeighted = 0;
1321
1322 std::vector< std::vector<Double_t> > nSelS;
1323 std::vector< std::vector<Double_t> > nSelB;
1324 std::vector< std::vector<Double_t> > nSelS_unWeighted;
1325 std::vector< std::vector<Double_t> > nSelB_unWeighted;
1326 std::vector< std::vector<Double_t> > target;
1327 std::vector< std::vector<Double_t> > target2;
1328
1329 // Define the addition operator for TrainNodeInfo
1330 // Make sure both TrainNodeInfos have the same nvars if we add them
1331 TrainNodeInfo operator+(const TrainNodeInfo& other)
1332 {
1333 TrainNodeInfo ret(cNvars, nBins);
1334
1335 // check that the two are compatible to add
1336 if(cNvars != other.cNvars)
1337 {
1338 std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
1339 return ret;
1340 }
1341
1342 // add the signal, background, and target sums
1343 for (Int_t ivar=0; ivar<cNvars; ivar++) {
1344 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1345 ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
1346 ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
1347 ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
1348 ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
1349 ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
1350 ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
1351 }
1352 }
1353
1354 ret.nTotS = nTotS + other.nTotS;
1355 ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
1356 ret.nTotB = nTotB + other.nTotB;
1357 ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;
1358
1359 return ret;
1360 };
1361
1362};
1363//===========================================================================
1364// Done with TrainNodeInfo declaration
1365//===========================================================================
1366
1367////////////////////////////////////////////////////////////////////////////////
1368/// Decide how to split a node using one of the variables that gives
1369/// the best separation of signal/background. In order to do this, for each
1370/// variable a scan of the different cut values in a grid (grid = fNCuts) is
1371/// performed and the resulting separation gains are compared.
1372/// In addition to the individual variables, one can also ask for a Fisher
1373/// discriminant being built out of (some) of the variables and used as a
1374/// possible multivariate split.
1375
1376Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1377 TMVA::DecisionTreeNode *node )
1378{
1379 // #### OK let's comment this one to see how to parallelize it
1380 Double_t separationGainTotal = -1;
1381 Double_t *separationGain = new Double_t[fNvars+1];
1382 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1383
1384 // #### initialize the sep gain and cut index values
1385 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1386 separationGain[ivar]=-1;
1387 cutIndex[ivar]=-1;
1388 }
1389 // ### set up some other variables
1390 Int_t mxVar = -1;
1391 Bool_t cutType = kTRUE;
1392 UInt_t nevents = eventSample.size();
1393
1394
1395 // the +1 comes from the fact that I treat later on the Fisher output as an
1396 // additional possible variable.
1397 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1398 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1399
1400 std::vector<Double_t> fisherCoeff;
1401
1402 // #### set up a map to the subset of variables using two arrays
1403 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1404 UInt_t tmp=fUseNvars;
1405 GetRandomisedVariables(useVariable,mapVariable,tmp);
1406 }
1407 else {
1408 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1409 useVariable[ivar] = kTRUE;
1410 mapVariable[ivar] = ivar;
1411 }
1412 }
1413 // #### last variable entry is the fisher variable
1414 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1415
1416 // #### Begin Fisher calculation
1417 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1418 if (fUseFisherCuts) {
1419 useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1420
1421 //use for the Fisher discriminant ONLY those variables that show
1422 //some reasonable linear correlation in either Signal or Background
1423 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1424 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1425 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1426 useVarInFisher[ivar] = kFALSE;
1427 mapVarInFisher[ivar] = ivar;
1428 }
1429
1430 std::vector<TMatrixDSym*>* covMatrices;
1431 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1432 if (!covMatrices){
1433 Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1434 fisherOK = kFALSE;
1435 }else{
1436 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1437 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1438 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1439 const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1440
1441 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1442 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1443 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1444 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1445 useVarInFisher[ivar] = kTRUE;
1446 useVarInFisher[jvar] = kTRUE;
1447 }
1448 }
1449 }
1450 // now as you know which variables you want to use, count and map them:
1451 // such that you can use an array/matrix filled only with THOSE variables
1452 // that you used
1453 UInt_t nFisherVars = 0;
1454 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1455 //now .. pick those variables that are used in the Fisher and are also
1456 // part of the "allowed" variables in case of Randomized Trees)
1457 if (useVarInFisher[ivar] && useVariable[ivar]) {
1458 mapVarInFisher[nFisherVars++]=ivar;
1459 // now exclude the variables used in the Fisher cuts, and don't
1460 // use them anymore in the individual variable scan
1461 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1462 }
1463 }
1464
1465
1466 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1467 fisherOK = kTRUE;
1468 }
1469 delete [] useVarInFisher;
1470 delete [] mapVarInFisher;
1471
1472 }
1473 // #### End Fisher calculation
1474
1475
1476 UInt_t cNvars = fNvars;
1477 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simple as additional variable
1478
1479 // #### set up the binning info arrays
1480 // #### each var has its own binning since some may be integers
1481 UInt_t* nBins = new UInt_t [cNvars];
1482 Double_t* binWidth = new Double_t [cNvars];
1483 Double_t* invBinWidth = new Double_t [cNvars];
1484 Double_t** cutValues = new Double_t* [cNvars];
1485
1486 // #### set up the xmin and xmax arrays
1487 // #### each var has its own range
1488 Double_t *xmin = new Double_t[cNvars];
1489 Double_t *xmax = new Double_t[cNvars];
1490
 1491 // construct and initialize binning/cuts
1492 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1493 // ncuts means that we need n+1 bins for each variable
1494 nBins[ivar] = fNCuts+1;
1495 if (ivar < fNvars) {
1496 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1497 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1498 }
1499 }
1500
1501 cutValues[ivar] = new Double_t [nBins[ivar]];
1502 }
1503
1504 // init the range and cutvalues for each var now that we know the binning
1505 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1506 if (ivar < fNvars){
1507 xmin[ivar]=node->GetSampleMin(ivar);
1508 xmax[ivar]=node->GetSampleMax(ivar);
1509 if (almost_equal_float(xmax[ivar], xmin[ivar])) {
1510 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1511 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1512 useVariable[ivar]=kFALSE;
1513 }
1514
1515 } else { // the fisher variable
1516 xmin[ivar]=999;
1517 xmax[ivar]=-999;
1518 // too bad, for the moment I don't know how to do this without looping
1519 // once to get the "min max" and then AGAIN to fill the histogram
1520 for (UInt_t iev=0; iev<nevents; iev++) {
1521 // returns the Fisher value (no fixed range)
1522 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1523 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1524 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1525 if (result > xmax[ivar]) xmax[ivar]=result;
1526 if (result < xmin[ivar]) xmin[ivar]=result;
1527 }
1528 }
1529 // this loop could take a long time if nbins is large
1530 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1531 cutValues[ivar][ibin]=0;
1532 }
1533 }
1534
1535 // ====================================================================
1536 // ====================================================================
1537 // Parallelized Version
1538 // ====================================================================
1539 // ====================================================================
1540
1541 // #### Figure out the cut values, loops through vars then through cuts
1542 // #### if ncuts is on the order of the amount of training data/10 - ish then we get some gains from parallelizing this
1543 // fill the cut values for the scan:
1544 auto varSeeds = ROOT::TSeqU(cNvars);
1545 auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1546
1547 if ( useVariable[ivar] ) {
1548
1549 //set the grid for the cut scan on the variables like this:
1550 //
1551 // | | | | | ... | |
1552 // xmin xmax
1553 //
1554 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1555 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1556 // --> nBins = fNCuts+1
1557 // (NOTE, the cuts at xmin or xmax would just give the whole sample and
 1558 // hence can be safely omitted)
1559
1560 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1561 invBinWidth[ivar] = 1./binWidth[ivar];
1562 if (ivar < fNvars) {
1563 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1564 }
1565
1566 // std::cout << "ivar="<<ivar
1567 // <<" min="<<xmin[ivar]
1568 // << " max="<<xmax[ivar]
1569 // << " width=" << istepSize
1570 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1571 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1572 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1573 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1574 }
1575 }
1576 return 0;
1577 };
1578 TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
1579
1580 // #### Loop through the events to get the total sig and background
1581 // #### Then loop through the vars to get the counts in each bin in each var
 1582 // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts; this part is just counting
1583
1584 TrainNodeInfo nodeInfo(cNvars, nBins);
 1585 UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
 1586
1587 // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1588 // #### Parallelize by chunking the data into the same number of sections as we have processors
1589 if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1590 {
1591 auto seeds = ROOT::TSeqU(nPartitions);
1592
1593 // need a lambda function to pass to TThreadExecutor::MapReduce
1594 auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1595 &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1596
1597 UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1598 UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
1599
1600 TrainNodeInfo nodeInfof(cNvars, nBins);
1601
1602 for(UInt_t iev=start; iev<end; iev++) {
1603
1604 Double_t eventWeight = eventSample[iev]->GetWeight();
1605 if (eventSample[iev]->GetClass() == fSigClass) {
1606 nodeInfof.nTotS+=eventWeight;
1607 nodeInfof.nTotS_unWeighted++; }
1608 else {
1609 nodeInfof.nTotB+=eventWeight;
1610 nodeInfof.nTotB_unWeighted++;
1611 }
1612
1613 // #### Count the number in each bin
1614 Int_t iBin=-1;
1615 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
 1616 // now scan through the cuts for each variable and find which one gives
1617 // the best separationGain at the current stage.
1618 if ( useVariable[ivar] ) {
1619 Double_t eventData;
1620 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1621 else { // the fisher variable
1622 eventData = fisherCoeff[fNvars];
1623 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1624 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1625
1626 }
1627 // #### figure out which bin it belongs in ...
 1628 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!)
1629 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1630 if (eventSample[iev]->GetClass() == fSigClass) {
1631 nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1632 nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1633 }
1634 else {
1635 nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1636 nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1637 }
1638 if (DoRegression()) {
1639 nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1640 nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1641 }
1642 }
1643 }
1644 }
1645 return nodeInfof;
1646 };
1647
 1648 // #### Need an initial struct to pass to std::accumulate
1649 TrainNodeInfo nodeInfoInit(cNvars, nBins);
1650
1651 // #### Run the threads in parallel then merge the results
1652 auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1653 nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
1654 }
1655
1656
1657 // #### When nbins is close to the order of the data this version of parallelization is faster
1658 // #### Parallelize by vectorizing the variable loop
1659 else {
1660
1661 auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
1662
1663 for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1664
1665 Int_t iBin=-1;
1666 Double_t eventWeight = eventSample[iev]->GetWeight();
1667
1668 // Only count the net signal and background once
1669 if(ivar==0){
1670 if (eventSample[iev]->GetClass() == fSigClass) {
1671 nodeInfo.nTotS+=eventWeight;
1672 nodeInfo.nTotS_unWeighted++; }
1673 else {
1674 nodeInfo.nTotB+=eventWeight;
1675 nodeInfo.nTotB_unWeighted++;
1676 }
1677 }
1678
1679 // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1680 if ( useVariable[ivar] ) {
1681 Double_t eventData;
1682 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1683 else { // the fisher variable
1684 eventData = fisherCoeff[fNvars];
1685 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1686 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1687
1688 }
1689 // #### figure out which bin it belongs in ...
 1690 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!)
1691 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1692 if (eventSample[iev]->GetClass() == fSigClass) {
1693 nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1694 nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1695 }
1696 else {
1697 nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1698 nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1699 }
1700 if (DoRegression()) {
1701 nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1702 nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1703 }
1704 }
1705 }
1706 return 0;
1707 };
1708
1709 TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
1710 }
1711
1712 // now turn each "histogram" into a cumulative distribution
1713 // #### loops through the vars and then the bins, if the bins are on the order of the training data this is worth parallelizing
1714 // #### doesn't hurt otherwise, pretty unnoticeable
1715 auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1716 if (useVariable[ivar]) {
1717 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1718 nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1719 nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1720 nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1721 nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1722 if (DoRegression()) {
1723 nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1724 nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1725 }
1726 }
1727 if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1728 Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1729 << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1730 << " while eventsample size = " << eventSample.size()
1731 << Endl;
1732 }
1733 double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1734 double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1735 if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
1736 Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1737 << lastBins
1738 << " while total number of events = " << totalSum
1739 << Endl;
1740 }
1741 }
1742 return 0;
1743 };
1744 TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
1745
 1746 // #### Again, if nbins is on the order of the training data (or within an order of magnitude or so), then this is worth parallelizing
 1747 // now select the optimal cuts for each variable and find which one gives
1748 // the best separationGain at the current stage
1749 auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1750 if (useVariable[ivar]) {
1751 Double_t sepTmp;
1752 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
 1753 // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
1754 // calculated by the "SamplePurities" from the branches that would go to the
1755 // left or the right from this node if "these" cuts were used in the Node:
1756 // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1757 // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1758
1759 // only allow splits where both daughter nodes match the specified minimum number
1760 // for this use the "unweighted" events, as you are interested in statistically
1761 // significant splits, which is determined by the actual number of entries
1762 // for a node, rather than the sum of event weights.
1763
1764 Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1765 Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1766 Double_t s = nodeInfo.nTotS_unWeighted;
1767 Double_t b = nodeInfo.nTotB_unWeighted;
1768 Double_t slW = nodeInfo.nSelS[ivar][iBin];
1769 Double_t blW = nodeInfo.nSelB[ivar][iBin];
1770 Double_t sW = nodeInfo.nTotS;
1771 Double_t bW = nodeInfo.nTotB;
1772 Double_t sr = s-sl;
1773 Double_t br = b-bl;
1774 Double_t srW = sW-slW;
1775 Double_t brW = bW-blW;
1776 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1777 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1778 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1779 ) {
1780
1781 if (DoRegression()) {
1782 sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1783 nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1784 nodeInfo.nTotS+nodeInfo.nTotB,
1785 nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1786 } else {
1787 sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1788 }
1789 if (separationGain[ivar] < sepTmp) {
1790 separationGain[ivar] = sepTmp;
1791 cutIndex[ivar] = iBin;
1792 }
1793 }
1794 }
1795 }
1796 return 0;
1797 };
1798 TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
1799
1800 // you found the best separation cut for each variable, now compare the variables
1801 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1802 if (useVariable[ivar] ) {
1803 if (separationGainTotal < separationGain[ivar]) {
1804 separationGainTotal = separationGain[ivar];
1805 mxVar = ivar;
1806 }
1807 }
1808 }
1809
1810 if (mxVar >= 0) {
1811 if (DoRegression()) {
1812 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1813 node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1814 if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1815 node->SetRMS(0);
1816
1817 }else{
1818 node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1819 }
1820 }
1821 else {
1822 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1823 if (mxVar >=0){
1824 if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1825 else cutType=kFALSE;
1826 }
1827 }
1828 node->SetSelector((UInt_t)mxVar);
1829 node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
1830 node->SetCutType(cutType);
1831 node->SetSeparationGain(separationGainTotal);
1832 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
1833 node->SetNFisherCoeff(0);
1834 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1835 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1836 }else{
1837 // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero. Might
1838 // be even less storage space on average than storing also the mapping used otherwise
1839 // can always be changed relatively easy
1840 node->SetNFisherCoeff(fNvars+1);
1841 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1842 node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
 1843 // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
1844 if (ivar<fNvars){
1845 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1846 }
1847 }
1848 }
1849 }
1850 else {
1851 separationGainTotal = 0;
1852 }
1853
1854 // #### Now in TrainNodeInfo, but I got a malloc segfault when I tried to destruct arrays there.
1855 // #### So, I changed these from dynamic arrays to std::vector to fix this memory problem
1856 // #### so no need to destruct them anymore. I didn't see any performance drop as a result.
1857 for (UInt_t i=0; i<cNvars; i++) {
1858 // delete [] nodeInfo.nSelS[i];
1859 // delete [] nodeInfo.nSelB[i];
1860 // delete [] nodeInfo.nSelS_unWeighted[i];
1861 // delete [] nodeInfo.nSelB_unWeighted[i];
1862 // delete [] nodeInfo.target[i];
1863 // delete [] nodeInfo.target2[i];
1864 delete [] cutValues[i];
1865 }
1866 //delete [] nodeInfo.nSelS;
1867 //delete [] nodeInfo.nSelB;
1868 //delete [] nodeInfo.nSelS_unWeighted;
1869 //delete [] nodeInfo.nSelB_unWeighted;
1870 //delete [] nodeInfo.target;
1871 //delete [] nodeInfo.target2;
1872
1873 // #### left these as dynamic arrays as they were before
1874 // #### since I didn't need to mess with them for parallelization
1875 delete [] cutValues;
1876
1877 delete [] xmin;
1878 delete [] xmax;
1879
1880 delete [] useVariable;
1881 delete [] mapVariable;
1882
1883 delete [] separationGain;
1884 delete [] cutIndex;
1885
1886 delete [] nBins;
1887 delete [] binWidth;
1888 delete [] invBinWidth;
1889
1890 return separationGainTotal;
1891}
1892
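The data-chunking pattern used above can be summarised in a few lines. The following is a minimal, self-contained sketch (not TMVA code; PartialCounts, countSlice and mergeCounts are made-up names): the event range is split into nPartitions contiguous slices, each slice produces its own partial counts, and the partial results are merged with std::accumulate, just like the redfunc lambda passed to MapReduce above.

#include <numeric>
#include <utility>
#include <vector>

// Partial per-slice sums of signal and background weights (illustrative only).
struct PartialCounts {
   double nTotS = 0.0, nTotB = 0.0;
   PartialCounts operator+(const PartialCounts& o) const {
      PartialCounts r; r.nTotS = nTotS + o.nTotS; r.nTotB = nTotB + o.nTotB; return r;
   }
};

// Count one contiguous slice [start, end) of the sample, as each worker does above.
// 'sample' holds (weight, isSignal) pairs.
PartialCounts countSlice(const std::vector<std::pair<double, bool> >& sample,
                         unsigned partition, unsigned nPartitions)
{
   unsigned start = 1.0 * partition / nPartitions * sample.size();
   unsigned end   = (partition + 1.0) / nPartitions * sample.size();
   PartialCounts c;
   for (unsigned i = start; i < end; ++i)
      (sample[i].second ? c.nTotS : c.nTotB) += sample[i].first;
   return c;
}

// Merge step, equivalent to the reduction over the per-partition TrainNodeInfo objects.
PartialCounts mergeCounts(const std::vector<PartialCounts>& partials)
{
   return std::accumulate(partials.begin(), partials.end(), PartialCounts());
}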
1893// Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1894#else
 1895 Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
 1896 TMVA::DecisionTreeNode *node )
 1897 {
1898// #### OK let's comment this one to see how to parallelize it
1899 Double_t separationGainTotal = -1, sepTmp;
1900 Double_t *separationGain = new Double_t[fNvars+1];
1901 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1902
1903 // #### initialize the sep gain and cut index values
1904 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1905 separationGain[ivar]=-1;
1906 cutIndex[ivar]=-1;
1907 }
1908 // ### set up some other variables
1909 Int_t mxVar = -1;
1910 Bool_t cutType = kTRUE;
1911 Double_t nTotS, nTotB;
1912 Int_t nTotS_unWeighted, nTotB_unWeighted;
1913 UInt_t nevents = eventSample.size();
1914
1915
1916 // the +1 comes from the fact that I treat later on the Fisher output as an
1917 // additional possible variable.
1918 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1919 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1920
1921 std::vector<Double_t> fisherCoeff;
1922
1923 // #### set up a map to the subset of variables using two arrays
1924 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1925 UInt_t tmp=fUseNvars;
1926 GetRandomisedVariables(useVariable,mapVariable,tmp);
1927 }
1928 else {
1929 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1930 useVariable[ivar] = kTRUE;
1931 mapVariable[ivar] = ivar;
1932 }
1933 }
1934 // #### last variable entry is the fisher variable
1935 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1936
1937 // #### Begin Fisher calculation
1938 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1939 if (fUseFisherCuts) {
 1940 useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1941
1942 //use for the Fisher discriminant ONLY those variables that show
1943 //some reasonable linear correlation in either Signal or Background
1944 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1945 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1946 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1947 useVarInFisher[ivar] = kFALSE;
1948 mapVarInFisher[ivar] = ivar;
1949 }
1950
1951 std::vector<TMatrixDSym*>* covMatrices;
1952 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1953 if (!covMatrices){
 1954 Log() << kWARNING << " in TrainNodeFast, the covariance matrices needed for the Fisher cuts returned an error --> reverting to just normal cuts for this node" << Endl;
1955 fisherOK = kFALSE;
1956 }else{
1957 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1958 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1959 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1960 const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1961
1962 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1963 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1964 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1965 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1966 useVarInFisher[ivar] = kTRUE;
1967 useVarInFisher[jvar] = kTRUE;
1968 }
1969 }
1970 }
1971 // now as you know which variables you want to use, count and map them:
1972 // such that you can use an array/matrix filled only with THOSE variables
1973 // that you used
1974 UInt_t nFisherVars = 0;
1975 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
 1976 // now pick those variables that are used in the Fisher and are also
 1977 // part of the "allowed" variables (in case of Randomized Trees)
1978 if (useVarInFisher[ivar] && useVariable[ivar]) {
1979 mapVarInFisher[nFisherVars++]=ivar;
 1980 // now exclude the variables used in the Fisher cuts, and don't
1981 // use them anymore in the individual variable scan
1982 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1983 }
1984 }
1985
1986
1987 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1988 fisherOK = kTRUE;
1989 }
1990 delete [] useVarInFisher;
1991 delete [] mapVarInFisher;
1992
1993 }
1994 // #### End Fisher calculation
1995
1996
1997 UInt_t cNvars = fNvars;
 1998 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1999
2000 // #### OK now what's going on...
2001 // #### looks like we are setting up histograms
2002 UInt_t* nBins = new UInt_t [cNvars];
2003 Double_t* binWidth = new Double_t [cNvars];
2004 Double_t* invBinWidth = new Double_t [cNvars];
2005
2006 Double_t** nSelS = new Double_t* [cNvars];
2007 Double_t** nSelB = new Double_t* [cNvars];
2008 Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2009 Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2010 Double_t** target = new Double_t* [cNvars];
2011 Double_t** target2 = new Double_t* [cNvars];
2012 Double_t** cutValues = new Double_t* [cNvars];
2013
2014 // #### looping through the variables...
2015 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2016 // #### ncuts means that we need n+1 bins for each variable
2017 nBins[ivar] = fNCuts+1;
2018 if (ivar < fNvars) {
2019 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2020 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2021 }
2022 }
2023
2024 // #### make some new arrays for each ith var, size=nbins for each array
2025 // #### integer features get the same number of bins as values, set later
2026 nSelS[ivar] = new Double_t [nBins[ivar]];
2027 nSelB[ivar] = new Double_t [nBins[ivar]];
2028 nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2029 nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2030 target[ivar] = new Double_t [nBins[ivar]];
2031 target2[ivar] = new Double_t [nBins[ivar]];
2032 cutValues[ivar] = new Double_t [nBins[ivar]];
2033
2034 }
2035
 2036 // #### xmin and xmax for each variable
2037 Double_t *xmin = new Double_t[cNvars];
2038 Double_t *xmax = new Double_t[cNvars];
2039
2040 // #### ok loop through each variable to initialize all the values
2041 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2042 if (ivar < fNvars){
2043 xmin[ivar]=node->GetSampleMin(ivar);
2044 xmax[ivar]=node->GetSampleMax(ivar);
2045 if (almost_equal_float(xmax[ivar], xmin[ivar])) {
2046 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2047 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2048 useVariable[ivar]=kFALSE;
2049 }
2050
2051 } else { // the fisher variable
2052 xmin[ivar]=999;
2053 xmax[ivar]=-999;
2054 // too bad, for the moment I don't know how to do this without looping
2055 // once to get the "min max" and then AGAIN to fill the histogram
2056 for (UInt_t iev=0; iev<nevents; iev++) {
2057 // returns the Fisher value (no fixed range)
2058 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2059 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2060 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2061 if (result > xmax[ivar]) xmax[ivar]=result;
2062 if (result < xmin[ivar]) xmin[ivar]=result;
2063 }
2064 }
2065 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2066 nSelS[ivar][ibin]=0;
2067 nSelB[ivar][ibin]=0;
2068 nSelS_unWeighted[ivar][ibin]=0;
2069 nSelB_unWeighted[ivar][ibin]=0;
2070 target[ivar][ibin]=0;
2071 target2[ivar][ibin]=0;
2072 cutValues[ivar][ibin]=0;
2073 }
2074 }
2075
2076 // #### Nothing to parallelize here really, no loop through events
2077 // #### only figures out the bin edge values for the "histogram" arrays
2078 // fill the cut values for the scan:
2079 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2080
2081 if ( useVariable[ivar] ) {
2082
2083 //set the grid for the cut scan on the variables like this:
2084 //
2085 // | | | | | ... | |
2086 // xmin xmax
2087 //
2088 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2089 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2090 // --> nBins = fNCuts+1
2091 // (NOTE, the cuts at xmin or xmax would just give the whole sample and
 2092 // hence can be safely omitted)
2093
2094 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2095 invBinWidth[ivar] = 1./binWidth[ivar];
2096 if (ivar < fNvars) {
2097 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2098 }
2099
2100 // std::cout << "ivar="<<ivar
2101 // <<" min="<<xmin[ivar]
2102 // << " max="<<xmax[ivar]
 2103 // << " width=" << istepSize
2104 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2105 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2106 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2107 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2108 }
2109 }
2110 }
2111
2112 // #### Loop through the events to get the total sig and background
2113 nTotS=0; nTotB=0;
2114 nTotS_unWeighted=0; nTotB_unWeighted=0;
2115 for (UInt_t iev=0; iev<nevents; iev++) {
2116
2117 Double_t eventWeight = eventSample[iev]->GetWeight();
2118 if (eventSample[iev]->GetClass() == fSigClass) {
2119 nTotS+=eventWeight;
2120 nTotS_unWeighted++; }
2121 else {
2122 nTotB+=eventWeight;
2123 nTotB_unWeighted++;
2124 }
2125
2126 // #### Count the number in each bin (fill array "histograms")
2127 Int_t iBin=-1;
2128 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
 2129 // now scan through the cuts for each variable and find which one gives
2130 // the best separationGain at the current stage.
2131 if ( useVariable[ivar] ) {
2132 Double_t eventData;
2133 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2134 else { // the fisher variable
2135 eventData = fisherCoeff[fNvars];
2136 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2137 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2138
2139 }
2140 // #### figure out which bin the event belongs in ...
 2141 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!)
2142 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2143 if (eventSample[iev]->GetClass() == fSigClass) {
2144 nSelS[ivar][iBin]+=eventWeight;
2145 nSelS_unWeighted[ivar][iBin]++;
2146 }
2147 else {
2148 nSelB[ivar][iBin]+=eventWeight;
2149 nSelB_unWeighted[ivar][iBin]++;
2150 }
2151 if (DoRegression()) {
2152 target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2153 target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2154 }
2155 }
2156 }
2157 }
2158
2159 // now turn the "histogram" into a cumulative distribution
2160 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2161 if (useVariable[ivar]) {
2162 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2163 nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2164 nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2165 nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2166 nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2167 if (DoRegression()) {
2168 target[ivar][ibin] +=target[ivar][ibin-1] ;
2169 target2[ivar][ibin]+=target2[ivar][ibin-1];
2170 }
2171 }
2172 if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2173 Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2174 << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2175 << " while eventsample size = " << eventSample.size()
2176 << Endl;
2177 }
2178 double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2179 double totalSum=nTotS+nTotB;
2180 if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
2181 Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2182 << lastBins
2183 << " while total number of events = " << totalSum
2184 << Endl;
2185 }
2186 }
2187 }
2188 // #### Loops over vars and bins, but not events, not worth parallelizing unless nbins is on the order of ndata/10 ish ...
 2189 // now select the optimal cuts for each variable and find which one gives
2190 // the best separationGain at the current stage
2191 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2192 if (useVariable[ivar]) {
2193 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
 2194 // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
 2195 // calculated by the "SamplePurities" from the branches that would go to the
2196 // left or the right from this node if "these" cuts were used in the Node:
2197 // hereby: nSelS and nSelB would go to the right branch
2198 // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2199
 2200 // only allow splits where both daughter nodes match the specified minimum number
2201 // for this use the "unweighted" events, as you are interested in statistically
2202 // significant splits, which is determined by the actual number of entries
2203 // for a node, rather than the sum of event weights.
2204
2205 Double_t sl = nSelS_unWeighted[ivar][iBin];
2206 Double_t bl = nSelB_unWeighted[ivar][iBin];
2207 Double_t s = nTotS_unWeighted;
2208 Double_t b = nTotB_unWeighted;
2209 Double_t slW = nSelS[ivar][iBin];
2210 Double_t blW = nSelB[ivar][iBin];
2211 Double_t sW = nTotS;
2212 Double_t bW = nTotB;
2213 Double_t sr = s-sl;
2214 Double_t br = b-bl;
2215 Double_t srW = sW-slW;
2216 Double_t brW = bW-blW;
2217 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2218 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2219 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2220 ) {
2221
2222 if (DoRegression()) {
2223 sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2224 target[ivar][iBin],target2[ivar][iBin],
2225 nTotS+nTotB,
2226 target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2227 } else {
2228 sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2229 }
2230 if (separationGain[ivar] < sepTmp) {
2231 separationGain[ivar] = sepTmp;
2232 cutIndex[ivar] = iBin;
2233 }
2234 }
2235 }
2236 }
2237 }
2238
 2239 // now that you have found the best separation cut for each variable, compare the variables
2240 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2241 if (useVariable[ivar] ) {
2242 if (separationGainTotal < separationGain[ivar]) {
2243 separationGainTotal = separationGain[ivar];
2244 mxVar = ivar;
2245 }
2246 }
2247 }
2248
2249 if (mxVar >= 0) {
2250 if (DoRegression()) {
2251 node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2252 node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2253 if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2254 node->SetRMS(0);
2255 }else{
2256 node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2257 }
2258 }
2259 else {
2260 node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2261 if (mxVar >=0){
2262 if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2263 else cutType=kFALSE;
2264 }
2265 }
2266 node->SetSelector((UInt_t)mxVar);
2267 node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
2268 node->SetCutType(cutType);
2269 node->SetSeparationGain(separationGainTotal);
2270 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
2271 node->SetNFisherCoeff(0);
2272 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2273 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2274 }else{
2275 // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero. Might
2276 // be even less storage space on average than storing also the mapping used otherwise
2277 // can always be changed relatively easy
2278 node->SetNFisherCoeff(fNvars+1);
2279 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2280 node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
 2281 // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
2282 if (ivar<fNvars){
2283 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2284 }
2285 }
2286 }
2287 }
2288 else {
2289 separationGainTotal = 0;
2290 }
2291 // if (mxVar > -1) {
2292 // std::cout << "------------------------------------------------------------------"<<std::endl;
2293 // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2294 // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2295 // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2296 // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2297 // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2298 // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2299 // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2300 // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2301 // } else {
2302 // std::cout << "------------------------------------------------------------------"<<std::endl;
2303 // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2304 // }
2305 for (UInt_t i=0; i<cNvars; i++) {
2306 delete [] nSelS[i];
2307 delete [] nSelB[i];
2308 delete [] nSelS_unWeighted[i];
2309 delete [] nSelB_unWeighted[i];
2310 delete [] target[i];
2311 delete [] target2[i];
2312 delete [] cutValues[i];
2313 }
2314 delete [] nSelS;
2315 delete [] nSelB;
2316 delete [] nSelS_unWeighted;
2317 delete [] nSelB_unWeighted;
2318 delete [] target;
2319 delete [] target2;
2320 delete [] cutValues;
2321
2322 delete [] xmin;
2323 delete [] xmax;
2324
2325 delete [] useVariable;
2326 delete [] mapVariable;
2327
2328 delete [] separationGain;
2329 delete [] cutIndex;
2330
2331 delete [] nBins;
2332 delete [] binWidth;
2333 delete [] invBinWidth;
2334
2335 return separationGainTotal;
2336
2337}
2338#endif
2339
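Both versions of TrainNodeFast implement the same idea: histogram each variable, turn the per-bin signal/background counts into cumulative sums, and evaluate every bin boundary as a candidate cut. Below is a minimal, self-contained sketch of that procedure for a single variable (illustrative only, not TMVA code; ToyEvent and scanBestCut are made-up names, and a plain Gini impurity stands in for whatever fSepType provides).

#include <algorithm>
#include <vector>

struct ToyEvent { double value; double weight; bool isSignal; };

// Return the best cut position for one variable, assuming xmax > xmin and nCuts > 0.
double scanBestCut(const std::vector<ToyEvent>& events, double xmin, double xmax, int nCuts)
{
   const int nBins = nCuts + 1;                       // nBins = fNCuts+1, as above
   const double binWidth = (xmax - xmin) / nBins;
   std::vector<double> selS(nBins, 0.0), selB(nBins, 0.0);
   double totS = 0.0, totB = 0.0;

   // fill the per-bin "histograms"
   for (const ToyEvent& ev : events) {
      int bin = std::min(nBins - 1, std::max(0, int((ev.value - xmin) / binWidth)));
      (ev.isSignal ? selS[bin] : selB[bin]) += ev.weight;
      (ev.isSignal ? totS : totB) += ev.weight;
   }
   // turn them into cumulative distributions, as done per variable above
   for (int i = 1; i < nBins; ++i) { selS[i] += selS[i-1]; selB[i] += selB[i-1]; }

   // Gini impurity of an (s, b) mixture; the real code delegates this to fSepType
   auto gini = [](double s, double b) { double n = s + b; return n > 0 ? s*b/(n*n) : 0.0; };

   double bestGain = -1.0, bestCut = xmin;
   for (int i = 0; i < nBins - 1; ++i) {              // the last bin holds all events --> skip
      double sLow = selS[i], bLow = selB[i];            // events below the candidate cut
      double sHigh = totS - sLow, bHigh = totB - bLow;  // events above it
      double gain = gini(totS, totB)
                  - (sLow + bLow) / (totS + totB) * gini(sLow, bLow)
                  - (sHigh + bHigh) / (totS + totB) * gini(sHigh, bHigh);
      if (gain > bestGain) { bestGain = gain; bestCut = xmin + (i + 1) * binWidth; }
   }
   return bestCut;
}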
2340
2341////////////////////////////////////////////////////////////////////////////////
2342/// calculate the fisher coefficients for the event sample and the variables used
2343
2344std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
2345 std::vector<Double_t> fisherCoeff(fNvars+1);
2346
2347 // initialization of global matrices and vectors
 2348 // average value of each variable for S, B, S+B
2349 TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
2350
2351 // the covariance 'within class' and 'between class' matrices
2352 TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
2353 TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
2354 TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
2355
2356 //
2357 // compute mean values of variables in each sample, and the overall means
2358 //
2359
2360 // initialize internal sum-of-weights variables
2361 Double_t sumOfWeightsS = 0;
2362 Double_t sumOfWeightsB = 0;
2363
2364
2365 // init vectors
2366 Double_t* sumS = new Double_t[nFisherVars];
2367 Double_t* sumB = new Double_t[nFisherVars];
2368 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2369
2370 UInt_t nevents = eventSample.size();
2371 // compute sample means
2372 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2373
2374 // read the Training Event into "event"
2375 const Event * ev = eventSample[ievt];
2376
2377 // sum of weights
2378 Double_t weight = ev->GetWeight();
2379 if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2380 else sumOfWeightsB += weight;
2381
2382 Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2383 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2384 sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2385 }
2386 }
2387 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2388 (*meanMatx)( ivar, 2 ) = sumS[ivar];
2389 (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2390
2391 (*meanMatx)( ivar, 2 ) += sumB[ivar];
2392 (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2393
2394 // signal + background
2395 (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2396 }
2397
2398 delete [] sumS;
2399
2400 delete [] sumB;
2401
2402 // the matrix of covariance 'within class' reflects the dispersion of the
2403 // events relative to the center of gravity of their own class
2404
2405 // assert required
2406
2407 assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2408
2409 // product matrices (x-<x>)(y-<y>) where x;y are variables
2410
2411 const Int_t nFisherVars2 = nFisherVars*nFisherVars;
2412 Double_t *sum2Sig = new Double_t[nFisherVars2];
2413 Double_t *sum2Bgd = new Double_t[nFisherVars2];
2414 Double_t *xval = new Double_t[nFisherVars2];
2415 memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
2416 memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
2417
2418 // 'within class' covariance
2419 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2420
2421 // read the Training Event into "event"
2422 // const Event* ev = eventSample[ievt];
2423 const Event* ev = eventSample.at(ievt);
2424
2425 Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2426
2427 for (UInt_t x=0; x<nFisherVars; x++) {
2428 xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2429 }
2430 Int_t k=0;
2431 for (UInt_t x=0; x<nFisherVars; x++) {
2432 for (UInt_t y=0; y<nFisherVars; y++) {
2433 if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2434 else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2435 k++;
2436 }
2437 }
2438 }
2439 Int_t k=0;
2440 for (UInt_t x=0; x<nFisherVars; x++) {
2441 for (UInt_t y=0; y<nFisherVars; y++) {
2442 (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2443 k++;
2444 }
2445 }
2446
2447 delete [] sum2Sig;
2448 delete [] sum2Bgd;
2449 delete [] xval;
2450
2451
2452 // the matrix of covariance 'between class' reflects the dispersion of the
 2453 // events of a class relative to the global center of gravity of all the classes,
2454 // hence the separation between classes
2455
2456
2457 Double_t prodSig, prodBgd;
2458
2459 for (UInt_t x=0; x<nFisherVars; x++) {
2460 for (UInt_t y=0; y<nFisherVars; y++) {
2461
2462 prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2463 ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2464 prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2465 ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2466
2467 (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
2468 }
2469 }
2470
2471
2472
2473 // compute full covariance matrix from sum of within and between matrices
2474 for (UInt_t x=0; x<nFisherVars; x++)
2475 for (UInt_t y=0; y<nFisherVars; y++)
2476 (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2477
2478 // Fisher = Sum { [coeff]*[variables] }
2479 //
2480 // let Xs be the array of the mean values of variables for signal evts
2481 // let Xb be the array of the mean values of variables for backgd evts
2482 // let InvWith be the inverse matrix of the 'within class' correlation matrix
2483 //
2484 // then the array of Fisher coefficients is
2485 // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
2486 TMatrixD* theMat = with; // Fishers original
2487 // TMatrixD* theMat = cov; // Mahalanobis
2488
2489 TMatrixD invCov( *theMat );
2490 if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2491 Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2492 << TMath::Abs(invCov.Determinant())
 2493 << " did you use variables that are linear combinations or highly correlated?"
2494 << Endl;
2495 }
2496 if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2497 Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2498 << TMath::Abs(invCov.Determinant())
 2499 << " did you use variables that are linear combinations?"
2500 << Endl;
2501 }
2502
2503 invCov.Invert();
2504
2505 // apply rescaling factor
2506 Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
2507
2508 // compute difference of mean values
2509 std::vector<Double_t> diffMeans( nFisherVars );
2510
2511 for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2512 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2513 for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2514 Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2515 fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
2516 }
2517
2518 // rescale
2519 fisherCoeff[mapVarInFisher[ivar]] *= xfact;
2520 }
2521
2522 // offset correction
2523 Double_t f0 = 0.0;
2524 for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2525 f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2526 }
2527 f0 /= -2.0;
2528
2529 fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
2530
2531 return fisherCoeff;
2532}
2533
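As a worked illustration of the coefficient formula implemented above, here is a minimal two-variable sketch with made-up numbers (not TMVA code): invert the 'within class' matrix, multiply by the difference of the class mean vectors, rescale by sqrt(Ns*Nb)/(Ns+Nb), and set the offset to -1/2 times the coefficient-weighted sum of the class means.

#include <cmath>
#include <cstdio>

int main()
{
   // made-up inputs for two variables
   double meanS[2] = {1.0, 0.5}, meanB[2] = {0.2, -0.3};  // class means
   double W[2][2]  = {{1.0, 0.3}, {0.3, 2.0}};            // 'within class' matrix
   double sumS = 100.0, sumB = 150.0;                     // sums of weights

   // invert the 2x2 within-class matrix (the real code uses TMatrixD::Invert)
   double det = W[0][0]*W[1][1] - W[0][1]*W[1][0];
   double invW[2][2] = {{ W[1][1]/det, -W[0][1]/det},
                        {-W[1][0]/det,  W[0][0]/det}};

   // coeff = xfact * invW * (meanS - meanB), with xfact = sqrt(Ns*Nb)/(Ns+Nb)
   double xfact = std::sqrt(sumS*sumB) / (sumS + sumB);
   double coeff[2] = {0.0, 0.0};
   for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
         coeff[i] += invW[i][j] * (meanS[j] - meanB[j]);
   for (int i = 0; i < 2; ++i) coeff[i] *= xfact;

   // offset: f0 = -1/2 * sum_i coeff_i * (meanS_i + meanB_i)
   double f0 = 0.0;
   for (int i = 0; i < 2; ++i) f0 += coeff[i] * (meanS[i] + meanB[i]);
   f0 /= -2.0;

   std::printf("coeff = (%g, %g), offset = %g\n", coeff[0], coeff[1], f0);
   return 0;
}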
2534////////////////////////////////////////////////////////////////////////////////
2535/// train a node by finding the single optimal cut for a single variable
2536/// that best separates signal and background (maximizes the separation gain)
2537
2540{
2541 Double_t nTotS = 0.0, nTotB = 0.0;
2542 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2543
2544 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2545
2546 // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
 2547 // each spot could be filled in parallel, no problem
2548 std::vector<Double_t> lCutValue( fNvars, 0.0 );
2549 std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2550 std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, no std::vector<bool> has been taken)
2551 lCutType.assign( fNvars, Char_t(kFALSE) );
2552
2553 // Initialize (un)weighted counters for signal & background
2554 // Construct a list of event wrappers that point to the original data
2555 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2556 if((*it)->GetClass() == fSigClass) { // signal or background event
2557 nTotS += (*it)->GetWeight();
2558 ++nTotS_unWeighted;
2559 }
2560 else {
2561 nTotB += (*it)->GetWeight();
2562 ++nTotB_unWeighted;
2563 }
2564 bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2565 }
2566
2567 std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, no std::vector<bool> has been taken)
2568 useVariable.assign( fNvars, Char_t(kTRUE) );
2569
2570 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2571 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
2572 if (fUseNvars ==0 ) { // no number specified ... choose s.th. which hopefully works well
2573 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2574 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2575 }
2576 Int_t nSelectedVars = 0;
2577 while (nSelectedVars < fUseNvars) {
2578 Double_t bla = fMyTrandom->Rndm()*fNvars;
2579 useVariable[Int_t (bla)] = Char_t(kTRUE);
2580 nSelectedVars = 0;
2581 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2582 if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
2583 }
2584 }
2585 }
2586 else {
2587 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2588 }
2589 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
 2590 if(!useVariable[ivar]) continue; // only optimize with selected variables
2591
2592 TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2593
2594 std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2595
2596
2597 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2598
2599 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2600 for( ; it != it_end; ++it ) {
2601 if((**it)->GetClass() == fSigClass ) // specify signal or background event
2602 sigWeightCtr += (**it)->GetWeight();
2603 else
2604 bkgWeightCtr += (**it)->GetWeight();
2605 // Store the accumulated signal (background) weights
2606 it->SetCumulativeWeight(false,bkgWeightCtr);
2607 it->SetCumulativeWeight(true,sigWeightCtr);
2608 }
2609
2610 const Double_t fPMin = 1.0e-6;
2611 Bool_t cutType = kFALSE;
2612 Long64_t index = 0;
2613 Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2614
2615 // Locate the optimal cut for this (ivar-th) variable
2616 for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2617 if( index == 0 ) { ++index; continue; }
2618 if( *(*it) == NULL ) {
2619 Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2620 << index << ", and parent node=" << node->GetParent() << Endl;
2621 break;
2622 }
2623 dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2624 norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2625 // Only allow splits where both daughter nodes have the specified minimum number of events
2626 // Splits are only sensible when the data are ordered (eg. don't split inside a sequence of 0's)
2627 if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2628
2629 sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2630 if( sepTmp > separationGain ) {
2631 separationGain = sepTmp;
2632 cutValue = it->GetVal() - 0.5*dVal;
2633 Double_t nSelS = it->GetCumulativeWeight(true);
2634 Double_t nSelB = it->GetCumulativeWeight(false);
2635 // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2636 // or by removing signal (enhancing background)
2637 if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2638 else cutType = kFALSE;
2639 }
2640 }
2641 ++index;
2642 }
2643 lCutType[ivar] = Char_t(cutType);
2644 lCutValue[ivar] = cutValue;
2645 lSepGain[ivar] = separationGain;
2646 }
2647 Double_t separationGain = -1.0;
2648 Int_t iVarIndex = -1;
2649 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2650 if( lSepGain[ivar] > separationGain ) {
2651 iVarIndex = ivar;
2652 separationGain = lSepGain[ivar];
2653 }
2654 }
2655
2656 // #### storing the best values into the node
2657 if(iVarIndex >= 0) {
2658 node->SetSelector(iVarIndex);
2659 node->SetCutValue(lCutValue[iVarIndex]);
2660 node->SetSeparationGain(lSepGain[iVarIndex]);
2661 node->SetCutType(lCutType[iVarIndex]);
2662 fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2663 }
2664 else {
2665 separationGain = 0.0;
2666 }
2667
2668 return separationGain;
2669}
2670
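TrainNodeFull above avoids binning altogether: per variable it sorts the events, accumulates signal and background weights along the sorted order, and considers a cut between every pair of sufficiently separated neighbouring values. A minimal, self-contained sketch of that pattern follows (illustrative only; ToySortedEvent and bestCutSorted are made-up names, and a plain Gini impurity again stands in for fSepType).

#include <algorithm>
#include <cstddef>
#include <vector>

struct ToySortedEvent { double value; double weight; bool isSignal; };

// Sort-based scan over one variable, mirroring the cumulative-weight loop above.
double bestCutSorted(std::vector<ToySortedEvent> events, std::size_t minSize)
{
   std::sort(events.begin(), events.end(),
             [](const ToySortedEvent& a, const ToySortedEvent& b) { return a.value < b.value; });

   double totS = 0.0, totB = 0.0;
   for (const ToySortedEvent& ev : events) (ev.isSignal ? totS : totB) += ev.weight;

   auto gini = [](double s, double b) { double n = s + b; return n > 0 ? s*b/(n*n) : 0.0; };

   double cumS = 0.0, cumB = 0.0, bestGain = -1.0, bestCut = 0.0;
   for (std::size_t i = 0; i + 1 < events.size(); ++i) {
      (events[i].isSignal ? cumS : cumB) += events[i].weight;             // cumulative weights up to event i
      if (i + 1 < minSize || events.size() - (i + 1) < minSize) continue; // both daughters need minSize entries
      double dVal = events[i+1].value - events[i].value;
      if (dVal <= 0.0) continue;                                          // only cut between distinct values
      double gain = gini(totS, totB)
                  - (cumS + cumB) / (totS + totB) * gini(cumS, cumB)
                  - (totS - cumS + totB - cumB) / (totS + totB) * gini(totS - cumS, totB - cumB);
      if (gain > bestGain) { bestGain = gain; bestCut = events[i].value + 0.5 * dVal; }
   }
   return bestCut;
}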
2671////////////////////////////////////////////////////////////////////////////////
 2672 /// get the pointer to the leaf node where a particular event ends up
2673/// (used in gradient boosting)
2674
2676{
2677 TMVA::DecisionTreeNode *current = (TMVA::DecisionTreeNode*)this->GetRoot();
2678 while(current->GetNodeType() == 0) { // intermediate node in a tree
2679 current = (current->GoesRight(e)) ?
2680 (TMVA::DecisionTreeNode*)current->GetRight() :
2681 (TMVA::DecisionTreeNode*)current->GetLeft();
2682 }
2683 return current;
2684}
2685
2686////////////////////////////////////////////////////////////////////////////////
2687/// the event e is put into the decision tree (starting at the root node)
2688/// and the output is NodeType (signal) or (background) of the final node (basket)
 2689 /// in which the given event ends up, i.e. the result of the classification of
 2690 /// the event for this decision tree.
2691
2693{
2694 TMVA::DecisionTreeNode *current = this->GetRoot();
2695 if (!current){
2696 Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2697 return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2698 }
2699
2700 while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2701 current = (current->GoesRight(*e)) ?
2702 current->GetRight() :
2703 current->GetLeft();
2704 if (!current) {
2705 Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2706 }
2707
2708 }
2709
2710 if (DoRegression()) {
2711 // Note: This path is also taken for MethodBDT with analysis type
2712 // kClassification and kMulticlass when using GradBoost.
2713 // See TMVA::MethodBDT::InitGradBoost
2714 return current->GetResponse();
2715 } else {
2716 if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2717 else return current->GetPurity();
2718 }
2719}
2720
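As a usage sketch of the two accessors above (assuming a trained DecisionTree* tree and a const TMVA::Event* ev already exist; neither is constructed here):

// 'tree' and 'ev' are assumed to exist already; this only illustrates the call pattern.
Double_t vote   = tree->CheckEvent(ev, kTRUE);          // node type (+1/-1) of the final leaf
Double_t purity = tree->CheckEvent(ev, kFALSE);         // S/(S+B) purity of the final leaf
TMVA::DecisionTreeNode* leaf = tree->GetEventNode(*ev); // the leaf itself, as used in gradient boosting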
2721////////////////////////////////////////////////////////////////////////////////
2722/// calculates the purity S/(S+B) of a given event sample
2723
2724Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
2725{
2726 Double_t sumsig=0, sumbkg=0, sumtot=0;
2727 for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2728 if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2729 else sumsig+=eventSample[ievt]->GetWeight();
2730 sumtot+=eventSample[ievt]->GetWeight();
2731 }
2732 // sanity check
2733 if (sumtot!= (sumsig+sumbkg)){
2734 Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2735 << sumtot << " " << sumsig << " " << sumbkg << Endl;
2736 }
2737 if (sumtot>0) return sumsig/(sumsig + sumbkg);
2738 else return -1;
2739}
2740
2741////////////////////////////////////////////////////////////////////////////////
2742/// Return the relative variable importance, normalized to all
 2743 /// variables together having the importance 1. The importance is
2744/// evaluated as the total separation-gain that this variable had in
2745/// the decision trees (weighted by the number of events)
2746
2748{
2749 std::vector<Double_t> relativeImportance(fNvars);
2750 Double_t sum=0;
2751 for (UInt_t i=0; i< fNvars; i++) {
2752 sum += fVariableImportance[i];
2753 relativeImportance[i] = fVariableImportance[i];
2754 }
2755
2756 for (UInt_t i=0; i< fNvars; i++) {
 2757 if (sum > std::numeric_limits<double>::epsilon())
 2758 relativeImportance[i] /= sum;
2759 else
2760 relativeImportance[i] = 0;
2761 }
2762 return relativeImportance;
2763}
2764
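A minimal numeric sketch of the normalization performed above, with made-up raw importances (each accumulated during training as separation-gain squared times the squared number of events at the node): every entry is divided by the total so the relative importances sum to one.

#include <cstdio>
#include <vector>

int main()
{
   // made-up raw importances for four variables
   std::vector<double> importance = {4.0, 1.0, 0.0, 5.0};
   double sum = 0.0;
   for (double v : importance) sum += v;
   for (double& v : importance) v = (sum > 0.0) ? v / sum : 0.0;
   for (double v : importance) std::printf("%.2f ", v);   // prints: 0.40 0.10 0.00 0.50
   std::printf("\n");
   return 0;
}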
2765////////////////////////////////////////////////////////////////////////////////
2766/// returns the relative importance of variable ivar
2767
2769{
2770 std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2771 if (ivar < fNvars) return relativeImportance[ivar];
2772 else {
2773 Log() << kFATAL << "<GetVariableImportance>" << Endl
2774 << "--- ivar = " << ivar << " is out of range " << Endl;
2775 }
2776
2777 return -1;
2778}
2779
Double_t TestPrunedTreeQuality(const DecisionTreeNode *dt=NULL, Int_t mode=0) const
return the misclassification rate of a pruned tree a "pruned tree" may have set the variable "IsTermi...
void ClearTree()
clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
re-create a new tree (decision tree or search tree) from XML
Double_t SamplePurity(EventList eventSample)
calculates the purity S/(S+B) of a given event sample
void SetParentTreeInNodes(Node *n=NULL)
descend a tree to find all its leaf nodes, fill max depth reached in the tree at the same time.
Node * GetNode(ULong_t sequence, UInt_t depth)
retrieve node from the tree.
std::vector< Double_t > GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher)
calculate the fisher coefficients for the event sample and the variables used
Double_t TrainNodeFast(const EventConstList &eventSample, DecisionTreeNode *node)
Decide how to split a node using one of the variables that gives the best separation of signal/backgr...
void DescendTree(Node *n=NULL)
descend a tree to find all its leaf nodes
RegressionVariance * fRegType
Definition: DecisionTree.h:211
DecisionTree(void)
default constructor using the GiniIndex as separation criterion, no restrictions on minium number of ...
Double_t GetSumWeights(const EventConstList *validationSample) const
calculate the normalization factor for a pruning validation sample
Double_t GetOriginalWeight() const
Definition: Event.h:84
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not.
Definition: Event.cxx:381
UInt_t GetClass() const
Definition: Event.h:86
Float_t GetValueFast(UInt_t ivar) const
Definition: Event.h:93
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:102
unsigned int GetPoolSize() const
Definition: Executor.h:99
auto MapReduce(F func, ROOT::TSeq< INTEGER > args, R redfunc) -> typename std::result_of< F(INTEGER)>::type
Wrap TExecutor::MapReduce functions.
Definition: Executor.h:145
auto Map(F func, unsigned nTimes) -> std::vector< typename std::result_of< F()>::type >
Wrap TExecutor::Map functions.
Definition: Executor.h:133
A helper class to prune a decision tree using the expected error (C4.5) method.
IPruneTool - a helper interface class to prune a decision tree.
Definition: IPruneTool.h:70
void SetPruneStrength(Double_t alpha)
Definition: IPruneTool.h:88
virtual PruningInfo * CalculatePruningInfo(DecisionTree *dt, const EventSample *testEvents=NULL, Bool_t isAutomatic=kFALSE)=0
Bool_t IsAutomatic() const
Definition: IPruneTool.h:95
Node for the BinarySearch or Decision Trees.
Definition: Node.h:56
UInt_t GetDepth() const
Definition: Node.h:114
std::vector< DecisionTreeNode * > PruneSequence
the regularization parameter for pruning
Definition: IPruneTool.h:47
Double_t PruneStrength
quality measure for a pruned subtree T of T_max
Definition: IPruneTool.h:46
Calculate the "SeparationGain" for Regression analysis separation criteria used in various training a...
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const TMatrixD * GetCorrelationMatrix(const TMatrixD *covMat)
turns covariance into correlation matrix
Definition: Tools.cxx:335
void ReadAttr(void *node, const char *, T &value)
read attribute from xml
Definition: Tools.h:335
std::vector< TMatrixDSym * > * CalcCovarianceMatrices(const std::vector< Event * > &events, Int_t maxCls, VariableTransformBase *transformBase=0)
compute covariance matrices
Definition: Tools.cxx:1525
Singleton class for Global types used by TMVA.
Definition: Types.h:73
@ kClassification
Definition: Types.h:128
@ kRegression
Definition: Types.h:129
TMatrixT< Element > & Invert(Double_t *det=0)
Invert the matrix and calculate its determinant.
Definition: TMatrixT.cxx:1399
virtual Double_t Determinant() const
Return the matrix determinant.
Definition: TMatrixT.cxx:1364
Random number generator class based on M.
Definition: TRandom3.h:27
Double_t y[n]
Definition: legend1.C:17
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
TClass * GetClass(T *)
Definition: TClass.h:658
TSeq< unsigned int > TSeqU
Definition: TSeq.hxx:195
static constexpr double sr
static constexpr double s
Tools & gTools()
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:212
constexpr Double_t E()
Base of natural log:
Definition: TMath.h:97
Double_t Log(Double_t x)
Definition: TMath.h:750
Double_t Sqrt(Double_t x)
Definition: TMath.h:681
Short_t Min(Short_t a, Short_t b)
Definition: TMathBase.h:180
Short_t Abs(Short_t d)
Definition: TMathBase.h:120
auto * l
Definition: textangle.C:4
static long int sum(long int i)
Definition: Factory.cxx:2275
REAL epsilon
Definition: triangle.c:617