Logo ROOT   6.08/07
Reference Guide
MethodRuleFit.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Fredrik Tegenfeldt
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodRuleFit *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Friedman's RuleFit method *
12  * *
13  * Authors (alphabetical): *
14  * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
15  * *
16  * Copyright (c) 2005: *
17  * CERN, Switzerland *
18  * Iowa State U. *
19  * MPI-K Heidelberg, Germany *
20  * *
21  * Redistribution and use in source and binary forms, with or without *
22  * modification, are permitted according to the terms listed in LICENSE *
23  * *
24  **********************************************************************************/
25 
26 #ifndef ROOT_TMVA_MethodRuleFit
27 #define ROOT_TMVA_MethodRuleFit
28 
29 //////////////////////////////////////////////////////////////////////////
30 // //
31 // MethodRuleFit //
32 // //
33 // J Friedman's RuleFit method //
34 // //
35 //////////////////////////////////////////////////////////////////////////
36 
37 #ifndef ROOT_TMVA_MethodBase
38 #include "TMVA/MethodBase.h"
39 #endif
40 #ifndef ROOT_TMatrixDfwd
41 #include "TMatrixDfwd.h"
42 #endif
43 #ifndef ROOT_TVectorD
44 #include "TVectorD.h"
45 #endif
46 #ifndef ROOT_TMVA_DecisionTree
47 #include "TMVA/DecisionTree.h"
48 #endif
49 #ifndef ROOT_TMVA_RuleFit
50 #include "TMVA/RuleFit.h"
51 #endif
52 
53 namespace TMVA {
54 
55  class SeparationBase;
56 
57  class MethodRuleFit : public MethodBase {
58 
59  public:
60 
61  MethodRuleFit( const TString& jobName,
62  const TString& methodTitle,
63  DataSetInfo& theData,
64  const TString& theOption = "");
65 
66  MethodRuleFit( DataSetInfo& theData,
67  const TString& theWeightFile);
68 
69  virtual ~MethodRuleFit( void );
70 
71  virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ );
72 
73  // training method
74  void Train( void );
75 
77 
78  // write weights to file
79  void AddWeightsXMLTo ( void* parent ) const;
80 
81  // read weights from file
82  void ReadWeightsFromStream( std::istream& istr );
83  void ReadWeightsFromXML ( void* wghtnode );
84 
85  // calculate the MVA value
86  Double_t GetMvaValue( Double_t* err = 0, Double_t* errUpper = 0 );
87 
88  // write method specific histos to target file
89  void WriteMonitoringHistosToFile( void ) const;
90 
91  // ranking of input variables
92  const Ranking* CreateRanking();
93 
94  Bool_t UseBoost() const { return fUseBoost; }
95 
96  // accessors
97  RuleFit* GetRuleFitPtr() { return &fRuleFit; }
98  const RuleFit* GetRuleFitConstPtr() const { return &fRuleFit; }
99  TDirectory* GetMethodBaseDir() const { return BaseDir(); }
100  const std::vector<TMVA::Event*>& GetTrainingEvents() const { return fEventSample; }
101  const std::vector<TMVA::DecisionTree*>& GetForest() const { return fForest; }
102  Int_t GetNTrees() const { return fNTrees; }
104  const SeparationBase* GetSeparationBaseConst() const { return fSepType; }
110  Int_t GetNCuts() const { return fNCuts; }
111  //
112  Int_t GetGDNPathSteps() const { return fGDNPathSteps; }
113  Double_t GetGDPathStep() const { return fGDPathStep; }
114  Double_t GetGDErrScale() const { return fGDErrScale; }
117  //
119 
120  const TString GetRFWorkDir() const { return fRFWorkDir; }
121  Int_t GetRFNrules() const { return fRFNrules; }
122  Int_t GetRFNendnodes() const { return fRFNendnodes; }
123 
124  protected:
125 
126  // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
127  void MakeClassSpecific( std::ostream&, const TString& ) const;
128 
129  void MakeClassRuleCuts( std::ostream& ) const;
130 
131  void MakeClassLinear( std::ostream& ) const;
132 
133  // get help message text
134  void GetHelpMessage() const;
135 
136  // initialize rulefit
137  void Init( void );
138 
139  // copy all training events into a std::vector (fEventSample)
140  void InitEventSample( void );
141 
142  // initialize monitor ntuple
143  void InitMonitorNtuple();
144 
145  void TrainTMVARuleFit();
146  void TrainJFRuleFit();
147 
148  private:
149 
150  // check variable range and set var to lower or upper if out of range
151  template<typename T>
152  inline Bool_t VerifyRange( MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax );
153 
154  template<typename T>
155  inline Bool_t VerifyRange( MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax, const T& vdef );
156 
157  template<typename T>
158  inline Int_t VerifyRange( const T& var, const T& vmin, const T& vmax );
159 
160  // the option handling methods
161  void DeclareOptions();
162  void ProcessOptions();
163 
164  RuleFit fRuleFit; // RuleFit instance
165  std::vector<TMVA::Event *> fEventSample; // the complete training sample
166  Double_t fSignalFraction; // scalefactor for bkg events to modify initial s/b fraction in training data
167 
168  // ntuple
169  TTree *fMonitorNtuple; // pointer to monitor rule ntuple
170  Double_t fNTImportance; // ntuple: rule importance
171  Double_t fNTCoefficient; // ntuple: rule coefficient
172  Double_t fNTSupport; // ntuple: rule support
173  Int_t fNTNcuts; // ntuple: rule number of cuts
174  Int_t fNTNvars; // ntuple: rule number of vars
175  Double_t fNTPtag; // ntuple: rule P(tag)
176  Double_t fNTPss; // ntuple: rule P(tag s, true s)
177  Double_t fNTPsb; // ntuple: rule P(tag s, true b)
178  Double_t fNTPbs; // ntuple: rule P(tag b, true s)
179  Double_t fNTPbb; // ntuple: rule P(tag b, true b)
180  Double_t fNTSSB; // ntuple: rule S/(S+B)
181  Int_t fNTType; // ntuple: rule type (+1->signal, -1->bkg)
182 
183  // options
184  TString fRuleFitModuleS;// which rulefit module to use
185  Bool_t fUseRuleFitJF; // if true interface with J.Friedmans RuleFit module
186  TString fRFWorkDir; // working directory from Friedmans module
187  Int_t fRFNrules; // max number of rules (only Friedmans module)
188  Int_t fRFNendnodes; // presumably max number of end nodes (only Friedmans module) - NOTE(review): old comment duplicated fRFNrules; confirm
189  std::vector<DecisionTree *> fForest; // the forest
190  Int_t fNTrees; // number of trees in forest
191  Double_t fTreeEveFrac; // fraction of events used for training each tree
192  SeparationBase *fSepType; // the separation used in node splitting
193  Double_t fMinFracNEve; // min fraction of number events
194  Double_t fMaxFracNEve; // ditto max
195  Int_t fNCuts; // grid used in cut applied in node splitting
196  TString fSepTypeS; // forest generation: separation type - see DecisionTree
197  TString fPruneMethodS; // forest generation: prune method - see DecisionTree
198  TMVA::DecisionTree::EPruneMethod fPruneMethod; // forest generation: method used for pruning - see DecisionTree
199  Double_t fPruneStrength; // forest generation: prune strength - see DecisionTree
200  TString fForestTypeS; // forest generation: how the trees are generated
201  Bool_t fUseBoost; // use boosted events for forest generation
202  //
203  Double_t fGDPathEveFrac; // GD path: fraction of subsamples used for the fitting
204  Double_t fGDValidEveFrac; // GD path: presumably fraction of subsamples used for validation - NOTE(review): old comment duplicated fGDPathEveFrac; confirm
205  Double_t fGDTau; // GD path: def threshold fraction [0..1]
206  Double_t fGDTauPrec; // GD path: precision of estimated tau
207  Double_t fGDTauMin; // GD path: min threshold fraction [0..1]
208  Double_t fGDTauMax; // GD path: max threshold fraction [0..1]
209  UInt_t fGDTauScan; // GD path: number of points to scan
210  Double_t fGDPathStep; // GD path: step size in path
211  Int_t fGDNPathSteps; // GD path: number of steps
212  Double_t fGDErrScale; // GD path: stop
213  Double_t fMinimp; // rule/linear: minimum importance
214  //
215  TString fModelTypeS; // rule ensemble: which model (rule,linear or both)
216  Double_t fRuleMinDist; // rule min distance - see RuleEnsemble
217  Double_t fLinQuantile; // quantile cut to remove outliers - see RuleEnsemble
218 
219  ClassDef(MethodRuleFit,0); // Friedman's RuleFit method
220  };
221 
222 } // namespace TMVA
223 
224 
225 //_______________________________________________________________________
226 template<typename T>
227 inline Int_t TMVA::MethodRuleFit::VerifyRange( const T& var, const T& vmin, const T& vmax )
228 {
229  // compare var with [vmin,vmax] (bounds inclusive): return +1 if var>vmax, -1 if var<vmin, 0 if inside
230  if (var>vmax) return 1;
231  if (var<vmin) return -1;
232  return 0;
233 }
234 
235 //_______________________________________________________________________
236 template<typename T>
237 inline Bool_t TMVA::MethodRuleFit::VerifyRange( TMVA::MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax )
238 {
239  // verify that var lies in [vmin,vmax]; if it does not, set var to the violated limit (vmax or vmin)
240  // and print a kWARNING naming option varstr to mlog; returns kTRUE if var was modified, kFALSE otherwise
241  Int_t dir = TMVA::MethodRuleFit::VerifyRange(var,vmin,vmax);
242  Bool_t modif=kFALSE;
243  if (dir==1) {
244  modif = kTRUE;
245  var=vmax;
246  }
247  if (dir==-1) {
248  modif = kTRUE;
249  var=vmin;
250  }
251  if (modif) {
252  mlog << kWARNING << "Option <" << varstr << "> " << (dir==1 ? "above":"below") << " allowed range. Reset to new value = " << var << Endl;
253  }
254  return modif;
255 }
256 
257 //_______________________________________________________________________
258 template<typename T>
259 inline Bool_t TMVA::MethodRuleFit::VerifyRange( TMVA::MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax, const T& vdef )
260 {
261  // verify that var lies in [vmin,vmax]; if it does not, set var to the given default vdef (vdef itself
262  // is not range-checked) and print a kWARNING to mlog; returns kTRUE if var was modified, kFALSE otherwise
263  Int_t dir = TMVA::MethodRuleFit::VerifyRange(var,vmin,vmax);
264  Bool_t modif=kFALSE;
265  if (dir!=0) {
266  modif = kTRUE;
267  var=vdef;
268  }
269  if (modif) {
270  mlog << kWARNING << "Option <" << varstr << "> " << (dir==1 ? "above":"below") << " allowed range. Reset to default value = " << var << Endl;
271  }
272  return modif;
273 }
274 
275 
276 #endif // ROOT_TMVA_MethodRuleFit
const std::vector< TMVA::Event * > & GetTrainingEvents() const
void DeclareOptions()
define the options (their key words) that can be set in the option string (known options).
void Init(void)
default initialization
void ReadWeightsFromXML(void *wghtnode)
read rules from XML node
Double_t GetTreeEveFrac() const
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
Double_t GetGDErrScale() const
void WriteMonitoringHistosToFile(void) const
write special monitoring histograms to file (here ntuple)
void ReadWeightsFromStream(std::istream &istr)
read rules from an std::istream
double T(double x)
Definition: ChebyshevPol.h:34
void InitMonitorNtuple()
initialize the monitoring ntuple
EAnalysisType
Definition: Types.h:129
Basic string class.
Definition: TString.h:137
Double_t GetGDValidEveFrac() const
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
const Ranking * CreateRanking()
computes ranking of input variables
void TrainJFRuleFit()
training of rules using Jerome Friedmans implementation
MethodRuleFit(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
standard constructor
const std::vector< TMVA::DecisionTree * > & GetForest() const
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
returns MVA value for given event
Double_t GetGDPathStep() const
TMVA::DecisionTree::EPruneMethod fPruneMethod
#define ClassDef(name, id)
Definition: Rtypes.h:254
void ProcessOptions()
process the options specified by the user
TMVA::DecisionTree::EPruneMethod GetPruneMethod() const
void MakeClassSpecific(std::ostream &, const TString &) const
write specific classifier response
void MakeClassLinear(std::ostream &) const
print out the linear terms
std::vector< TMVA::Event * > fEventSample
SeparationBase * GetSeparationBase() const
void TrainTMVARuleFit()
training of rules using TMVA implementation
Double_t GetPruneStrength() const
SeparationBase * fSepType
Double_t GetGDPathEveFrac() const
Int_t GetRFNrules() const
Double_t GetLinQuantile() const
unsigned int UInt_t
Definition: RtypesCore.h:42
RuleFit * GetRuleFitPtr()
Definition: MethodRuleFit.h:97
const SeparationBase * GetSeparationBaseConst() const
Int_t GetRFNendnodes() const
void GetHelpMessage() const
get help message text
Bool_t VerifyRange(MsgLogger &mlog, const char *varstr, T &var, const T &vmin, const T &vmax)
const TString GetRFWorkDir() const
Int_t GetNCuts() const
double Double_t
Definition: RtypesCore.h:55
const RuleFit * GetRuleFitConstPtr() const
Definition: MethodRuleFit.h:98
Int_t GetGDNPathSteps() const
Describe directory structure in memory.
Definition: TDirectory.h:44
int type
Definition: TGX11.cxx:120
Int_t GetNTrees() const
Bool_t UseBoost() const
Definition: MethodRuleFit.h:94
Abstract ClassifierFactory template that handles arbitrary types.
TDirectory * BaseDir() const
returns the ROOT directory where info/histograms etc of the corresponding MVA method instance are sto...
TDirectory * GetMethodBaseDir() const
Definition: MethodRuleFit.h:99
A TTree object has a header with a name and a title.
Definition: TTree.h:98
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
RuleFit can handle classification with 2 classes.
virtual ~MethodRuleFit(void)
destructor
void MakeClassRuleCuts(std::ostream &) const
print out the rule cuts
virtual void ReadWeightsFromStream(std::istream &)=0
const Bool_t kTRUE
Definition: Rtypes.h:91
std::vector< DecisionTree * > fForest
void AddWeightsXMLTo(void *parent) const
add the rules to XML node
Double_t GetMaxFracNEve() const
void InitEventSample(void)
write all Events from the Tree into a vector of Events, that are more easily manipulated.
Double_t GetMinFracNEve() const