Logo ROOT   6.10/09
Reference Guide
CostComplexityPruneTool.h
Go to the documentation of this file.
1 /**********************************************************************************
2  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
3  * Package: TMVA *
4  * Class : TMVA::DecisionTree *
5  * Web : http://tmva.sourceforge.net *
6  * *
7  * Description: *
8  * Implementation of a Decision Tree *
9  * *
10  * Authors (alphabetical): *
11  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
12  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
13  * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
14  * Doug Schouten <dschoute@sfu.ca> - Simon Fraser U., Canada *
15  * *
16  * Copyright (c) 2005: *
17  * CERN, Switzerland *
18  * U. of Victoria, Canada *
19  * MPI-K Heidelberg, Germany *
20  * *
21  * Redistribution and use in source and binary forms, with or without *
22  * modification, are permitted according to the terms listed in LICENSE *
23  * (http://mva.sourceforge.net/license.txt) *
24  * *
25  **********************************************************************************/
26 
27 #ifndef ROOT_TMVA_CostComplexityPruneTool
28 #define ROOT_TMVA_CostComplexityPruneTool
29 
30 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
31 // CostComplexityPruneTool - a class to prune a decision tree using the Cost Complexity method //
32 // (see "Classification and Regression Trees" by Leo Breiman et al) //
33 // //
34 // Some definitions: //
35 // //
36 // T_max - the initial, usually highly overtrained tree, that is to be pruned back //
37 // R(T) - quality index (Gini, misclassification rate, or other) of a tree T //
38 // ~T - set of terminal nodes in T //
39 // T' - the pruned subtree of T_max that has the best quality index R(T') //
40 // alpha - the prune strength parameter in Cost Complexity pruning (R_alpha(T) = R(T) + alpha*|~T|) //
41 // //
42 // There are two running modes in CostComplexityPruneTool: (i) one may select a prune strength and prune //
43 // the tree T_max until the criterion //
44 // R(T) - R(t) //
45 // alpha < ---------- //
46 // |~T_t| - 1 //
47 // //
48 // is true for all nodes t in T, or (ii) the algorithm finds the sequence of critical points //
49 // alpha_k < alpha_k+1 ... < alpha_K such that T_K = root(T_max) and then selects the optimally-pruned //
50 // subtree, defined to be the subtree with the best quality index for the validation sample. //
51 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
52 
53 #include "TMVA/SeparationBase.h"
54 #include "TMVA/GiniIndex.h"
55 #include "TMVA/DecisionTree.h"
56 #include "TMVA/Event.h"
57 #include "TMVA/IPruneTool.h"
58 
59 namespace TMVA {
60 
62  public:
63  CostComplexityPruneTool( SeparationBase* qualityIndex = NULL );
64  virtual ~CostComplexityPruneTool( );
65 
66  // calculate the prune sequence for a given tree
67  virtual PruningInfo* CalculatePruningInfo( DecisionTree* dt, const IPruneTool::EventSample* testEvents = NULL, Bool_t isAutomatic = kFALSE );
68 
69  private:
70  SeparationBase* fQualityIndexTool; //! the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
71 
72  std::vector<DecisionTreeNode*> fPruneSequence; //! map of weakest links (i.e., branches to prune) -> pruning index
73  std::vector<Double_t> fPruneStrengthList; //! map of alpha -> pruning index
74  std::vector<Double_t> fQualityIndexList; //! map of R(T) -> pruning index
75 
76  Int_t fOptimalK; //! the optimal index of the prune sequence
77 
78  private:
79  // set the meta data used for cost complexity pruning
81 
82  // optimize the pruning sequence
83  void Optimize( DecisionTree* dt, Double_t weights );
84 
85  mutable MsgLogger* fLogger; //! output stream to save logging information
86  MsgLogger& Log() const { return *fLogger; }
87 
88  };
89 }
90 
91 
92 #endif
Int_t fOptimalK
map of R(T) -> pruning index
virtual ~CostComplexityPruneTool()
the destructor for the cost complexity pruning
CostComplexityPruneTool(SeparationBase *qualityIndex=NULL)
the constructor for the cost complexity pruning
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
#define NULL
Definition: RtypesCore.h:88
std::vector< const Event * > EventSample
Definition: IPruneTool.h:74
void Optimize(DecisionTree *dt, Double_t weights)
after the critical values (at which the corresponding nodes would be pruned away) had been establish...
Implementation of a Decision Tree.
Definition: DecisionTree.h:59
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const Bool_t kFALSE
Definition: RtypesCore.h:92
A class to prune a decision tree using the Cost Complexity method.
double Double_t
Definition: RtypesCore.h:55
IPruneTool - a helper interface class to prune a decision tree.
Definition: IPruneTool.h:70
std::vector< Double_t > fPruneStrengthList
map of weakest links (i.e., branches to prune) -> pruning index
std::vector< Double_t > fQualityIndexList
map of alpha -> pruning index
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
std::vector< DecisionTreeNode * > fPruneSequence
the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
Abstract ClassifierFactory template that handles arbitrary types.
void InitTreePruningMetaData(DecisionTreeNode *n)
the optimal index of the prune sequence
const Int_t n
Definition: legend1.C:16
virtual PruningInfo * CalculatePruningInfo(DecisionTree *dt, const IPruneTool::EventSample *testEvents=NULL, Bool_t isAutomatic=kFALSE)
the routine that basically "steers" the pruning process.
MsgLogger & Log() const
output stream to save logging information