ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
CostComplexityPruneTool.h
Go to the documentation of this file.
1 /**********************************************************************************
2  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
3  * Package: TMVA *
4  * Class : TMVA::DecisionTree *
5  * Web : http://tmva.sourceforge.net *
6  * *
7  * Description: *
8  * Implementation of a Decision Tree *
9  * *
10  * Authors (alphabetical): *
11  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
12  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
13  * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
14  * Doug Schouten <dschoute@sfu.ca> - Simon Fraser U., Canada *
15  * *
16  * Copyright (c) 2005: *
17  * CERN, Switzerland *
18  * U. of Victoria, Canada *
19  * MPI-K Heidelberg, Germany *
20  * *
21  * Redistribution and use in source and binary forms, with or without *
22  * modification, are permitted according to the terms listed in LICENSE *
23  * (http://mva.sourceforge.net/license.txt) *
24  * *
25  **********************************************************************************/
26 
27 #ifndef ROOT_TMVA_CostComplexityPruneTool
28 #define ROOT_TMVA_CostComplexityPruneTool
29 
30 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
31 // CostComplexityPruneTool - a class to prune a decision tree using the Cost Complexity method //
32 // (see "Classification and Regression Trees" by Leo Breiman et al) //
33 // //
34 // Some definitions: //
35 // //
36 // T_max - the initial, usually highly overtrained tree, that is to be pruned back //
37 // R(T) - quality index (Gini, misclassification rate, or other) of a tree T //
38 // ~T - set of terminal nodes in T //
39 // T' - the pruned subtree of T_max that has the best quality index R(T') //
40 // alpha - the prune strength parameter in Cost Complexity pruning (R_alpha(T) = R(T) + alpha*|~T|) //
41 // //
42 // There are two running modes in CostComplexityPruneTool: (i) one may select a prune strength and prune //
43 // the tree T_max until the criterion //
44 // R(T) - R(t) //
45 // alpha < ---------- //
46 // |~T_t| - 1 //
47 // //
48 // is true for all nodes t in T, or (ii) the algorithm finds the sequence of critical points //
49 // alpha_k < alpha_k+1 ... < alpha_K such that T_K = root(T_max) and then selects the optimally-pruned //
50 // subtree, defined to be the subtree with the best quality index for the validation sample. //
51 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
52 
53 #ifndef ROOT_TMVA_SeparationBase
54 #include "TMVA/SeparationBase.h"
55 #endif
56 #ifndef ROOT_TMVA_GiniIndex
57 #include "TMVA/GiniIndex.h"
58 #endif
59 #ifndef ROOT_TMVA_DecisionTree
60 #include "TMVA/DecisionTree.h"
61 #endif
62 #ifndef ROOT_TMVA_Event
63 #include "TMVA/Event.h"
64 #endif
65 #ifndef ROOT_TMVA_IPruneTool
66 #include "TMVA/IPruneTool.h"
67 #endif
68 
69 namespace TMVA {
70 
72  public:
73  CostComplexityPruneTool( SeparationBase* qualityIndex = NULL );
74  virtual ~CostComplexityPruneTool( );
75 
76  // calculate the prune sequence for a given tree
77  virtual PruningInfo* CalculatePruningInfo( DecisionTree* dt, const IPruneTool::EventSample* testEvents = NULL, Bool_t isAutomatic = kFALSE );
78 
79  private:
80  SeparationBase* fQualityIndexTool; //! the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
81 
82  std::vector<DecisionTreeNode*> fPruneSequence; //! map of weakest links (i.e., branches to prune) -> pruning index
83  std::vector<Double_t> fPruneStrengthList; //! map of alpha -> pruning index
84  std::vector<Double_t> fQualityIndexList; //! map of R(T) -> pruning index
85 
86  Int_t fOptimalK; //! the optimal index of the prune sequence
87 
88  private:
89  // set the meta data used for cost complexity pruning
91 
92  // optimize the pruning sequence
93  void Optimize( DecisionTree* dt, Double_t weights );
94 
95  mutable MsgLogger* fLogger; //! output stream to save logging information
96  MsgLogger& Log() const { return *fLogger; }
97 
98  };
99 }
100 
101 
102 #endif
Int_t fOptimalK
map of R(T) -> pruning index
virtual ~CostComplexityPruneTool()
the destructor for the cost complexity prunig
CostComplexityPruneTool(SeparationBase *qualityIndex=NULL)
the constructor for the cost complexity prunig
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
std::vector< const Event * > EventSample
Definition: IPruneTool.h:76
void Optimize(DecisionTree *dt, Double_t weights)
after the critical alpha values (at which the corresponding nodes would be pruned away) had been esta...
MsgLogger & Log() const
output stream to save logging information
double Double_t
Definition: RtypesCore.h:55
std::vector< Double_t > fPruneStrengthList
map of weakest links (i.e., branches to prune) -> pruning index
std::vector< Double_t > fQualityIndexList
map of alpha -> pruning index
std::vector< DecisionTreeNode * > fPruneSequence
the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
void InitTreePruningMetaData(DecisionTreeNode *n)
the optimal index of the prune sequence
#define NULL
Definition: Rtypes.h:82
const Int_t n
Definition: legend1.C:16
virtual PruningInfo * CalculatePruningInfo(DecisionTree *dt, const IPruneTool::EventSample *testEvents=NULL, Bool_t isAutomatic=kFALSE)