Logo ROOT  
Reference Guide
CostComplexityPruneTool.h
Go to the documentation of this file.
1/**********************************************************************************
2 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
3 * Package: TMVA *
4 * Class : TMVA::DecisionTree *
5 * Web : http://tmva.sourceforge.net *
6 * *
7 * Description: *
8 * Implementation of a Decision Tree *
9 * *
10 * Authors (alphabetical): *
11 * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
12 * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
13 * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
14 * Doug Schouten <dschoute@sfu.ca> - Simon Fraser U., Canada *
15 * *
16 * Copyright (c) 2005: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * *
21 * Redistribution and use in source and binary forms, with or without *
22 * modification, are permitted according to the terms listed in LICENSE *
23 * (http://mva.sourceforge.net/license.txt) *
24 * *
25 **********************************************************************************/
26
27#ifndef ROOT_TMVA_CostComplexityPruneTool
28#define ROOT_TMVA_CostComplexityPruneTool
29
30////////////////////////////////////////////////////////////////////////////////////////////////////////////
31// CostComplexityPruneTool - a class to prune a decision tree using the Cost Complexity method //
32// (see "Classification and Regression Trees" by Leo Breiman et al) //
33// //
34// Some definitions: //
35// //
36// T_max - the initial, usually highly overtrained tree, that is to be pruned back //
37// R(T) - quality index (Gini, misclassification rate, or other) of a tree T //
38// ~T - set of terminal nodes in T //
39// T' - the pruned subtree of T_max that has the best quality index R(T') //
40// alpha - the prune strength parameter in Cost Complexity pruning (R_alpha(T) = R(T) + alpha*|~T|) //
41// //
42// There are two running modes in CostComplexityPruneTool: (i) one may select a prune strength and prune //
43// the tree T_max until the criterion //
44// R(T) - R(t) //
45// alpha < ---------- //
46// |~T_t| - 1 //
47// //
48// is true for all nodes t in T, or (ii) the algorithm finds the sequence of critical points //
49// alpha_k < alpha_k+1 ... < alpha_K such that T_K = root(T_max) and then selects the optimally-pruned //
50// subtree, defined to be the subtree with the best quality index for the validation sample. //
51////////////////////////////////////////////////////////////////////////////////////////////////////////////
52
53#include "TMVA/SeparationBase.h"
54#include "TMVA/GiniIndex.h"
55#include "TMVA/DecisionTree.h"
56#include "TMVA/Event.h"
57#include "TMVA/IPruneTool.h"
58
59namespace TMVA {
60
62 public:
63 CostComplexityPruneTool( SeparationBase* qualityIndex = NULL );
64 virtual ~CostComplexityPruneTool( );
65
66 // calculate the prune sequence for a given tree
67 virtual PruningInfo* CalculatePruningInfo( DecisionTree* dt, const IPruneTool::EventSample* testEvents = NULL, Bool_t isAutomatic = kFALSE );
68
69 private:
70 SeparationBase* fQualityIndexTool; //! the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
71
72 std::vector<DecisionTreeNode*> fPruneSequence; //! map of weakest links (i.e., branches to prune) -> pruning index
73 std::vector<Double_t> fPruneStrengthList; //! map of alpha -> pruning index
74 std::vector<Double_t> fQualityIndexList; //! map of R(T) -> pruning index
75
76 Int_t fOptimalK; //! the optimal index of the prune sequence
77
78 private:
79 // set the meta data used for cost complexity pruning
81
82 // optimize the pruning sequence
83 void Optimize( DecisionTree* dt, Double_t weights );
84
85 mutable MsgLogger* fLogger; //! output stream to save logging information
86 MsgLogger& Log() const { return *fLogger; }
87
88 };
89}
90
91
92#endif
const Bool_t kFALSE
Definition: RtypesCore.h:90
double Double_t
Definition: RtypesCore.h:57
A class to prune a decision tree using the Cost Complexity method.
void InitTreePruningMetaData(DecisionTreeNode *n)
the optimal index of the prune sequence
std::vector< Double_t > fPruneStrengthList
map of weakest links (i.e., branches to prune) -> pruning index
virtual ~CostComplexityPruneTool()
the destructor for the cost complexity pruning
virtual PruningInfo * CalculatePruningInfo(DecisionTree *dt, const IPruneTool::EventSample *testEvents=NULL, Bool_t isAutomatic=kFALSE)
the routine that basically "steers" the pruning process.
std::vector< DecisionTreeNode * > fPruneSequence
the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
std::vector< Double_t > fQualityIndexList
map of alpha -> pruning index
void Optimize(DecisionTree *dt, Double_t weights)
after the critical values (at which the corresponding nodes would be pruned away) had been establish...
MsgLogger & Log() const
output stream to save logging information
CostComplexityPruneTool(SeparationBase *qualityIndex=NULL)
the constructor for the cost complexity pruning
Int_t fOptimalK
map of R(T) -> pruning index
Implementation of a Decision Tree.
Definition: DecisionTree.h:64
IPruneTool - a helper interface class to prune a decision tree.
Definition: IPruneTool.h:70
std::vector< const Event * > EventSample
Definition: IPruneTool.h:74
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const Int_t n
Definition: legend1.C:16
create variable transformations