ROOT logo
/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TMVA::DecisionTree                                                    *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation of a Decision Tree                                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland              *
 *      Helge Voss      <Helge.Voss@cern.ch>     - MPI-K Heidelberg, Germany      *
 *      Kai Voss        <Kai.Voss@cern.ch>       - U. of Victoria, Canada         *
 *      Doug Schouten   <dschoute@sfu.ca>        - Simon Fraser U., Canada        *
 *                                                                                *
 * Copyright (c) 2005:                                                            *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://mva.sourceforge.net/license.txt)                                       *
 *                                                                                *
 **********************************************************************************/

#ifndef ROOT_TMVA_CostComplexityPruneTool
#define ROOT_TMVA_CostComplexityPruneTool

////////////////////////////////////////////////////////////////////////////////////////////////////////////
// CostComplexityPruneTool - a class to prune a decision tree using the Cost Complexity method            //
// (see "Classification and Regression Trees" by Leo Breiman et al)                                       //
//                                                                                                        //
// Some definitions:                                                                                      //
//                                                                                                        //
// T_max - the initial, usually highly overtrained tree, that is to be pruned back                        //
// R(T) - quality index (Gini, misclassification rate, or other) of a tree T                              //
// ~T - set of terminal nodes in T                                                                        //
// T' - the pruned subtree of T_max that has the best quality index R(T')                                 //
// alpha - the prune strength parameter in Cost Complexity pruning (R_alpha(T) = R(T) + alpha*|~T|)       //
//                                                                                                        //
// There are two running modes in CostComplexityPruneTool: (i) one may select a prune strength and prune  //
// the tree T_max until the criterion                                                                     //
//             R(T) - R(t)                                                                                //
//  alpha <    ----------                                                                                 //
//             |~T_t| - 1                                                                                 //
//                                                                                                        //
// is true for all nodes t in T, or (ii) the algorithm finds the sequence of critical points              //
// alpha_k < alpha_k+1 ... < alpha_K such that T_K = root(T_max) and then selects the optimally-pruned    //
// subtree, defined to be the subtree with the best quality index for the validation sample.              //
////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef ROOT_TMVA_SeparationBase
#include "TMVA/SeparationBase.h"
#endif
#ifndef ROOT_TMVA_GiniIndex
#include "TMVA/GiniIndex.h"
#endif
#ifndef ROOT_TMVA_DecisionTree
#include "TMVA/DecisionTree.h"
#endif
#ifndef ROOT_TMVA_Event
#include "TMVA/Event.h"
#endif
#ifndef ROOT_TMVA_IPruneTool
#include "TMVA/IPruneTool.h"
#endif

namespace TMVA {

   class CostComplexityPruneTool : public IPruneTool {
   public:
      CostComplexityPruneTool( SeparationBase* qualityIndex = NULL );
      virtual ~CostComplexityPruneTool( );

      // calculate the prune sequence for a given tree
      virtual PruningInfo* CalculatePruningInfo( DecisionTree* dt, const IPruneTool::EventSample* testEvents = NULL, Bool_t isAutomatic = kFALSE );

   private:
      SeparationBase* fQualityIndexTool; //! the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }

      std::vector<DecisionTreeNode*> fPruneSequence; //! map of weakest links (i.e., branches to prune) -> pruning index
      std::vector<Double_t> fPruneStrengthList; //! map of alpha -> pruning index
      std::vector<Double_t> fQualityIndexList; //! map of R(T) -> pruning index

      Int_t fOptimalK; //! the optimal index of the prune sequence

   private:
      // set the meta data used for cost complexity pruning
      void InitTreePruningMetaData( DecisionTreeNode* n );

      // optimize the pruning sequence
      void Optimize( DecisionTree* dt, Double_t weights );

      mutable MsgLogger* fLogger; //! output stream to save logging information
      MsgLogger& Log() const { return *fLogger; }

   };
}


#endif
 CostComplexityPruneTool.h:1
 CostComplexityPruneTool.h:2
 CostComplexityPruneTool.h:3
 CostComplexityPruneTool.h:4
 CostComplexityPruneTool.h:5
 CostComplexityPruneTool.h:6
 CostComplexityPruneTool.h:7
 CostComplexityPruneTool.h:8
 CostComplexityPruneTool.h:9
 CostComplexityPruneTool.h:10
 CostComplexityPruneTool.h:11
 CostComplexityPruneTool.h:12
 CostComplexityPruneTool.h:13
 CostComplexityPruneTool.h:14
 CostComplexityPruneTool.h:15
 CostComplexityPruneTool.h:16
 CostComplexityPruneTool.h:17
 CostComplexityPruneTool.h:18
 CostComplexityPruneTool.h:19
 CostComplexityPruneTool.h:20
 CostComplexityPruneTool.h:21
 CostComplexityPruneTool.h:22
 CostComplexityPruneTool.h:23
 CostComplexityPruneTool.h:24
 CostComplexityPruneTool.h:25
 CostComplexityPruneTool.h:26
 CostComplexityPruneTool.h:27
 CostComplexityPruneTool.h:28
 CostComplexityPruneTool.h:29
 CostComplexityPruneTool.h:30
 CostComplexityPruneTool.h:31
 CostComplexityPruneTool.h:32
 CostComplexityPruneTool.h:33
 CostComplexityPruneTool.h:34
 CostComplexityPruneTool.h:35
 CostComplexityPruneTool.h:36
 CostComplexityPruneTool.h:37
 CostComplexityPruneTool.h:38
 CostComplexityPruneTool.h:39
 CostComplexityPruneTool.h:40
 CostComplexityPruneTool.h:41
 CostComplexityPruneTool.h:42
 CostComplexityPruneTool.h:43
 CostComplexityPruneTool.h:44
 CostComplexityPruneTool.h:45
 CostComplexityPruneTool.h:46
 CostComplexityPruneTool.h:47
 CostComplexityPruneTool.h:48
 CostComplexityPruneTool.h:49
 CostComplexityPruneTool.h:50
 CostComplexityPruneTool.h:51
 CostComplexityPruneTool.h:52
 CostComplexityPruneTool.h:53
 CostComplexityPruneTool.h:54
 CostComplexityPruneTool.h:55
 CostComplexityPruneTool.h:56
 CostComplexityPruneTool.h:57
 CostComplexityPruneTool.h:58
 CostComplexityPruneTool.h:59
 CostComplexityPruneTool.h:60
 CostComplexityPruneTool.h:61
 CostComplexityPruneTool.h:62
 CostComplexityPruneTool.h:63
 CostComplexityPruneTool.h:64
 CostComplexityPruneTool.h:65
 CostComplexityPruneTool.h:66
 CostComplexityPruneTool.h:67
 CostComplexityPruneTool.h:68
 CostComplexityPruneTool.h:69
 CostComplexityPruneTool.h:70
 CostComplexityPruneTool.h:71
 CostComplexityPruneTool.h:72
 CostComplexityPruneTool.h:73
 CostComplexityPruneTool.h:74
 CostComplexityPruneTool.h:75
 CostComplexityPruneTool.h:76
 CostComplexityPruneTool.h:77
 CostComplexityPruneTool.h:78
 CostComplexityPruneTool.h:79
 CostComplexityPruneTool.h:80
 CostComplexityPruneTool.h:81
 CostComplexityPruneTool.h:82
 CostComplexityPruneTool.h:83
 CostComplexityPruneTool.h:84
 CostComplexityPruneTool.h:85
 CostComplexityPruneTool.h:86
 CostComplexityPruneTool.h:87
 CostComplexityPruneTool.h:88
 CostComplexityPruneTool.h:89
 CostComplexityPruneTool.h:90
 CostComplexityPruneTool.h:91
 CostComplexityPruneTool.h:92
 CostComplexityPruneTool.h:93
 CostComplexityPruneTool.h:94
 CostComplexityPruneTool.h:95
 CostComplexityPruneTool.h:96
 CostComplexityPruneTool.h:97
 CostComplexityPruneTool.h:98
 CostComplexityPruneTool.h:99
 CostComplexityPruneTool.h:100
 CostComplexityPruneTool.h:101
 CostComplexityPruneTool.h:102