Implementation of a Decision Tree

In a decision tree, successive decision nodes are used to categorize the events of the sample as either signal or background. Each node uses only a single discriminating variable to decide whether the event is signal-like ("goes right") or background-like ("goes left"). This forms a tree-like structure with "baskets" at the end (leaf nodes), and an event is classified as either signal or background according to whether the basket where it ends up has been classified as signal or background during the training.

Training a decision tree is the process of defining the "cut criteria" for each node. The training starts with the root node. Here one takes the full training event sample and selects the variable and corresponding cut value that give the best separation between signal and background at this stage. Using this cut criterion, the sample is then divided into two subsamples, a signal-like (right) and a background-like (left) sample. Two new nodes are then created, one for each of the two subsamples, and they are constructed using the same mechanism as described for the root node. The division is stopped once a node has reached either a minimum number of events or a minimum or maximum signal purity. These leaf nodes are then called "signal" or "background" depending on whether they contain more signal or background events from the training sample.
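As a rough illustration of this recursive training procedure, the following minimal sketch uses simplified stand-in types (SimpleEvent, SimpleNode) and a hypothetical helper FindBestSplit; none of these are the actual TMVA classes or methods, and the stopping criteria are reduced to the node size and purity limits described above. A possible event-loop implementation of FindBestSplit is sketched further below, after the description of the node-splitting cut scan.

#include <cstddef>
#include <vector>

// Simplified stand-ins for TMVA::Event and TMVA::DecisionTreeNode (illustrative only).
struct SimpleEvent {
   std::vector<double> vars;       // discriminating variables of the event
   bool                isSignal;   // true for signal, false for background
};

struct SimpleNode {
   int         cutVariable  = -1;      // index of the variable used at this node
   double      cutValue     = 0;       // events with vars[cutVariable] > cutValue go right
   bool        isLeaf       = false;
   bool        leafIsSignal = false;   // classification assigned to a leaf node
   SimpleNode* left  = nullptr;        // background-like daughter
   SimpleNode* right = nullptr;        // signal-like daughter
};

// Hypothetical helper: find the variable and cut value giving the best
// signal/background separation for this sample (see the cut-scan sketch below).
void FindBestSplit(const std::vector<SimpleEvent>& sample, SimpleNode* node);

// Recursively split a node until it holds too few events or is (almost) pure.
SimpleNode* BuildNode(const std::vector<SimpleEvent>& sample,
                      std::size_t minSize, double minPurity, double maxPurity)
{
   SimpleNode* node = new SimpleNode;

   std::size_t nSig = 0;
   for (const SimpleEvent& ev : sample) if (ev.isSignal) ++nSig;
   const double purity = sample.empty() ? 0.0 : double(nSig) / double(sample.size());

   // stopping criteria: minimum node size or minimum/maximum signal purity reached
   if (sample.size() < minSize || purity < minPurity || purity > maxPurity) {
      node->isLeaf       = true;
      node->leafIsSignal = (purity > 0.5);   // majority of the training events decides
      return node;
   }

   // select the best variable/cut combination for this node
   FindBestSplit(sample, node);

   // divide the sample according to the chosen cut
   std::vector<SimpleEvent> leftSample, rightSample;
   for (const SimpleEvent& ev : sample) {
      if (ev.vars[node->cutVariable] > node->cutValue) rightSample.push_back(ev);
      else                                             leftSample.push_back(ev);
   }

   // guard against degenerate splits (no valid cut found or one empty daughter)
   if (node->cutVariable < 0 || leftSample.empty() || rightSample.empty()) {
      node->isLeaf       = true;
      node->leafIsSignal = (purity > 0.5);
      return node;
   }

   node->right = BuildNode(rightSample, minSize, minPurity, maxPurity);
   node->left  = BuildNode(leftSample,  minSize, minPurity, maxPurity);
   return node;
}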
void | TMVA::BinaryTree::DeleteNode(TMVA::Node*) |
void | FindMinAndMax(vector<TMVA::Event*>& eventSample, vector<Double_t>& min, vector<Double_t>& max) |
Double_t | GetNodeError(TMVA::DecisionTreeNode* node) |
Double_t | GetSubTreeError(TMVA::DecisionTreeNode* node) |
Double_t | SamplePurity(vector<TMVA::Event*> eventSample) |
void | SetCutPoints(vector<Double_t>& cut_points, Double_t xmin, Double_t xmax, Int_t num_gridpoints) |
UInt_t | TMVA::BinaryTree::fDepth | maximal depth reached in the tree |
TMVA::MsgLogger | TMVA::BinaryTree::fLogger | message logger |
Int_t | TMVA::BinaryTree::fNNodes | total number of nodes in the tree (counted) |
multimap<Double_t,TMVA::DecisionTreeNode*> | fLinkStrengthMap | prune strengths at which the subtree below the node would be pruned |
Double_t | fMinSepGain | minimum separation gain required to perform node splitting |
Double_t | fMinSize | minimum number of events in a node |
TRandom2* | fMyTrandom | random number generator for randomised trees |
Int_t | fNCuts | number of grid points in the variable cut scans |
Int_t | fNvars | number of variables used to separate S and B |
TMVA::DecisionTree::EPruneMethod | fPruneMethod | method used for pruning |
Double_t | fPruneStrength | a parameter setting the "amount" of pruning; needs to be adjusted |
multimap<Double_t,TMVA::DecisionTreeNode*> | fQualityGainMap | the quality-gain of pre-leaf nodes |
TMVA::SeparationBase* | fQualityIndex | separation/quality criterion for CC-pruning |
multimap<Double_t,TMVA::DecisionTreeNode*> | fQualityMap | the quality of leaf nodes |
Bool_t | fRandomisedTree | choose at each node splitting a random set of variables |
TMVA::SeparationBase* | fSepType | the separation criterion |
Int_t | fUseNvars | the number of variables used in randomised trees |
Bool_t | fUseSearchTree | cut scan done with a binary search tree or a simple event loop |
vector<Double_t> | fVariableImportance | the relative importance of the different variables |
static const Int_t | fgDebugLevel | debug level determining some printout/control plots etc. |
default constructor using the GiniIndex as separation criterion; no restrictions on the minimum number of events in a leaf node or on the separation gain in the node splitting
default constructor using the GiniIndex as separation criterion; no restrictions on the minimum number of events in a leaf node or on the separation gain in the node splitting
constructor specifying the separation type, the minimum number of events in a node that is still subjected to further splitting, and the number of bins in the grid used when applying the cut for the node splitting.
copy constructor that creates a true copy, i.e. a completely independent tree; the node copy will recursively copy all the nodes
descend the tree to find all its leaf nodes and, at the same time, fill in the maximal depth reached in the tree
build the decision tree by recursively splitting one (root) node into two daughter nodes (returns the number of nodes)
fill the existing decision tree structure by feeding events in from the top node and seeing where they end up
fill the existing decision tree structure by feeding events in from the top node and seeing where they end up
remove those last splits that result in two leaf nodes that are both of the type background
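A hedged sketch of such a clean-up step, reusing the illustrative SimpleNode type from the training sketch above (this is not the TMVA implementation, only one possible reading of the description):

void RemoveBackgroundSplits(SimpleNode* node)
{
   if (node == nullptr || node->isLeaf) return;

   // first clean the daughters, which may turn them into leaves themselves
   RemoveBackgroundSplits(node->left);
   RemoveBackgroundSplits(node->right);

   // if both daughters ended up as background-type leaves, the split is useless:
   // remove it and turn this node into a background leaf
   if (node->left && node->right &&
       node->left->isLeaf && node->right->isLeaf &&
       !node->left->leafIsSignal && !node->right->leafIsSignal) {
      delete node->left;   node->left  = nullptr;
      delete node->right;  node->right = nullptr;
      node->isLeaf       = true;
      node->leafIsSignal = false;
   }
}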
prune the decision tree (i.e. get rid of internal nodes) to avoid overtraining. Several different pruning methods can be applied, as selected by the variable "fPruneMethod". Currently, however, only the Expected Error Pruning is implemented.
recursive pruning of nodes using the Expected Error Pruning (EEP); if the node is an internal node, then prune
pruning of nodes using the Cost Complexity criterion. The pruning is performed until a minimum of the cost complexity CC(alpha) is reached, with CC(alpha) = alpha * N_leaves + sum over leaves [ N * Quality(leaf) ], where Quality(leaf) is given by 1 - purity (for the Misclassification Error), purity * (1 - purity) for the Gini Index, etc. Typically the Misclassification Error is used for guiding the pruning.
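A minimal sketch evaluating CC(alpha) from per-leaf event counts and qualities; the LeafInfo struct and the function name are illustrative, not part of TMVA:

#include <cstddef>
#include <vector>

// Hypothetical per-leaf summary (not a TMVA class): number of training events
// in the leaf and its quality, e.g. 1 - purity for the Misclassification Error.
struct LeafInfo {
   double nEvents;
   double quality;
};

// CC(alpha) = alpha * N_leaves + sum over leaves of [ N * Quality(leaf) ]
double CostComplexity(const std::vector<LeafInfo>& leaves, double alpha)
{
   double cc = alpha * static_cast<double>(leaves.size());
   for (const LeafInfo& leaf : leaves) cc += leaf.nEvents * leaf.quality;
   return cc;
}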
Similar to the Cost Complexity pruning, only here the "prune strength" (= alpha, the regularisation parameter in the Cost Complexity) for which the respective subtree below a node would be pruned is calculated immediately. Pruning then continues until all nodes have such a value larger than the specified prune strength.
loop over all non-leaf nodes of the tree and calculate for each of these nodes the prune strength ("alpha") at which this node would win against its subtree (hence its subtree would be pruned). This is given by:

alpha < ( R(t) - R(T_t) ) / ( |T_t| - 1 )

where R(t) is the misclassification cost of the node t, R(T_t) is the misclassification cost of the subtree below t, and |T_t| is the number of terminal nodes in the subtree T_t.
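A minimal sketch of this per-node quantity, under the assumption that the misclassification costs and the terminal-node count are obtained from helpers like those described below (all names here are illustrative):

// The prune strength at which a node t would "win" against its subtree: for alpha
// above this value the subtree below t would be pruned away.
double CriticalPruneStrength(double costNode,        // R(t)
                             double costSubTree,     // R(T_t)
                             int    nTerminalNodes)  // |T_t|, >= 2 for a non-leaf node
{
   // only meaningful for internal nodes, where the subtree has at least two leaves
   return (costNode - costSubTree) / (nTerminalNodes - 1);
}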
get the misclassification cost of the subtree
get the misclassification cost of the subtree
return the number of terminal nodes in the sub-tree below Node n
returns the cost complexity criterion for the decision tree; see L. Breiman, J. H. Friedman, R. A. Olshen, C. J. Stone, "Classification and Regression Trees", Wadsworth International Group (1984), Chapman & Hall/CRC (1984)
returns the cost complexity criterion for the decision tree; see L. Breiman, J. H. Friedman, R. A. Olshen, C. J. Stone, "Classification and Regression Trees", Wadsworth International Group (1984), Chapman & Hall/CRC (1984)
traverse the whole tree and fill the map of QualityGain - Node for pre-leaf nodes, deciding which node is the next prune candidate
traverse the whole tree and find the leaf nodes, and then fill the Quality Criterion for the leaf nodes (Used in the Pruning)
get the pruning candidate node as the one pre-leaf node that gives the smallest quality gain
calculate an UPPER limit on the error made by the classification done by this node. If the purity S/(S+B) of the node is f, then, according to the training sample, the error rate (fraction of events misclassified by this node) is (1-f). Now f has a statistical uncertainty according to the binomial distribution, hence the error on f can be estimated (the same error as the binomial error for efficiency calculations: sigma = sqrt( eff*(1-eff)/nEvts )).
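One plausible reading of this upper limit, as a hedged sketch only (the function name and the simple one-sigma addition are assumptions, not necessarily the exact TMVA formula):

#include <cmath>

// If f = S/(S+B) is the node purity determined from nEvts training events, the
// error rate is (1 - f) and its binomial uncertainty is sigma = sqrt( f*(1-f)/nEvts );
// adding the two gives an upper estimate of the error of the node.
double NodeErrorUpperLimit(double purity, double nEvts)
{
   const double error = 1.0 - purity;
   const double sigma = std::sqrt(purity * (1.0 - purity) / nEvts);
   return error + sigma;
}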
calculate the expected statistical error on the subtree below "node", which is used in the expected error pruning
get the left daughter node of the current node "n"
get the right daughter node of the current node "n"
retrieve a node from the tree. Its position (up to a maximal tree depth of 64) is coded as a sequence of left-right moves starting from the root, stored as 0-1 bit patterns in the "long integer" (i.e. 0: left, 1: right)
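A minimal sketch of walking the tree according to such a bit pattern, reusing the illustrative SimpleNode type from the training sketch above; the bit order (lowest bit = first move from the root) is an assumption of this example:

// Walk from the root: a 0 bit means "go to the left daughter", a 1 bit means
// "go to the right daughter".
SimpleNode* GetNodeBySequence(SimpleNode* root, unsigned long sequence, unsigned int depth)
{
   SimpleNode* current = root;
   for (unsigned int i = 0; i < depth && current != nullptr; ++i) {
      const bool goRight = (sequence >> i) & 1UL;   // assumed: lowest bit = first move
      current = goRight ? current->right : current->left;
   }
   return current;
}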
helper function which determines the minimum and maximum values of the event variables in the current event sample
helper function which calculates the grid points used for the cut scan
decide how to split a node: at each node, ONE of the variables is chosen, namely the one which gives the best separation between signal and background on the sample that enters the node. In order to do this, for each variable a scan of the different cut values on a grid (of fNCuts points) is performed and the resulting separation gains are compared. This cut scan uses either a binary search tree or a simple loop over the events, depending on the number of events in the sample
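A minimal event-loop version of such a grid scan, in terms of the illustrative SimpleEvent/SimpleNode types from the training sketch above; the Gini-index-based separation gain used here is only one of the criteria TMVA offers, and the binary-search-tree variant is not shown:

#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// Gini index p*(1-p) of a (sub)sample, multiplied by its size; used here as one
// possible separation measure (TMVA supports several criteria via fSepType).
static double GiniTimesN(double nSig, double nBkg)
{
   const double n = nSig + nBkg;
   if (n <= 0.0) return 0.0;
   const double p = nSig / n;
   return n * p * (1.0 - p);
}

// Simple event-loop variant of the cut scan: for every variable, scan a regular
// grid of cut values and keep the variable/cut combination with the largest
// separation gain.  The grid size plays the role of fNCuts.
void FindBestSplit(const std::vector<SimpleEvent>& sample, SimpleNode* node)
{
   if (sample.empty()) return;
   const std::size_t nVars = sample.front().vars.size();
   const int nCuts = 20;   // number of grid points (fNCuts)

   double nSigTot = 0.0, nBkgTot = 0.0;
   for (const SimpleEvent& ev : sample) (ev.isSignal ? nSigTot : nBkgTot) += 1.0;
   const double parentIndex = GiniTimesN(nSigTot, nBkgTot);

   double bestGain = -std::numeric_limits<double>::max();
   for (std::size_t ivar = 0; ivar < nVars; ++ivar) {
      // scan range given by the minimum and maximum of this variable in the sample
      double xmin = sample.front().vars[ivar], xmax = xmin;
      for (const SimpleEvent& ev : sample) {
         xmin = std::min(xmin, ev.vars[ivar]);
         xmax = std::max(xmax, ev.vars[ivar]);
      }
      for (int icut = 1; icut < nCuts; ++icut) {
         const double cut = xmin + (xmax - xmin) * icut / nCuts;
         double nSigRight = 0.0, nBkgRight = 0.0;
         for (const SimpleEvent& ev : sample) {
            if (ev.vars[ivar] > cut) (ev.isSignal ? nSigRight : nBkgRight) += 1.0;
         }
         // separation gain = parent index - sum of the two daughter indices
         const double gain = parentIndex
                             - GiniTimesN(nSigRight, nBkgRight)
                             - GiniTimesN(nSigTot - nSigRight, nBkgTot - nBkgRight);
         if (gain > bestGain) {
            bestGain          = gain;
            node->cutVariable = static_cast<int>(ivar);
            node->cutValue    = cut;
         }
      }
   }
}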
the event e is put into the decision tree (starting at the root node) and the output is the NodeType (signal or background) of the final node (basket) in which the given event ends up, i.e. the result of the classification of the event by this decision tree.
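A hedged sketch of this classification step with the illustrative types from the training sketch above (ClassifyEvent is an illustrative name, not the TMVA method):

// Descend from the root, at each node following the right (signal-like) or left
// (background-like) daughter, until a leaf node (basket) is reached.
bool ClassifyEvent(const SimpleNode* root, const SimpleEvent& ev)
{
   const SimpleNode* current = root;
   while (current != nullptr && !current->isLeaf) {
      current = (ev.vars[current->cutVariable] > current->cutValue) ? current->right
                                                                    : current->left;
   }
   return (current != nullptr) && current->leafIsSignal;   // true: signal, false: background
}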
calculates the purity S/(S+B) of a given event sample
return the relative variable importance, normalized such that the importance of all variables together adds up to 1. The importance is evaluated as the total separation gain that this variable provided in the decision tree (weighted by the number of events)
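A minimal sketch of the final normalisation step; the function name and the assumption that the per-variable separation gains have already been accumulated are illustrative:

#include <numeric>
#include <vector>

// Normalise the per-variable separation gains (each accumulated over the nodes using
// that variable and weighted by the number of events at those nodes) so that all
// importances together add up to one.
std::vector<double> RelativeVariableImportance(std::vector<double> separationGains)
{
   const double total = std::accumulate(separationGains.begin(), separationGains.end(), 0.0);
   if (total > 0.0)
      for (double& g : separationGains) g /= total;
   return separationGains;
}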
{ fPruneMethod = m; }