ROOT  6.06/09
Reference Guide
MethodC50.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/rmva $Id$
2 // Author: Omar Zapata,Lorenzo Moneta, Sergei Gleyzer 2015
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodC50 *
8  * Web : http://oproject.org *
9  * *
10  * Description: *
11  * Decision Trees and Rule-Based Models *
12  * *
13  * *
14  * Redistribution and use in source and binary forms, with or without *
15  * modification, are permitted according to the terms listed in LICENSE *
16  * (http://tmva.sourceforge.net/LICENSE) *
17  * *
18  **********************************************************************************/
19 
20 #include <iomanip>
21 
22 #include "TMath.h"
23 #include "Riostream.h"
24 #include "TMatrix.h"
25 #include "TMatrixD.h"
26 #include "TVectorD.h"
27 
29 #include "TMVA/MethodC50.h"
30 #include "TMVA/Tools.h"
31 #include "TMVA/Config.h"
32 #include "TMVA/Ranking.h"
33 #include "TMVA/Types.h"
34 #include "TMVA/PDF.h"
35 #include "TMVA/ClassifierFactory.h"
36 
37 #include "TMVA/Results.h"
38 
// All definitions below are written without the TMVA:: qualifier.
39 using namespace TMVA;
40 
// NOTE(review): presumably registers the C50 method with the TMVA classifier
// factory (macro comes from TMVA/ClassifierFactory.h) - confirm.
41 REGISTER_METHOD(C50)
42 
44 
45 // Static flag holding the result of asking the R interface to load the R package "C50".
// It is tested later in this file, where an error is reported and a fatal
// message is logged if the package could not be loaded.
46 Bool_t MethodC50::IsModuleLoaded = ROOT::R::TRInterface::Instance().Require("C50");
47 
48 //_______________________________________________________________________
// Standard constructor.
//
// jobName, methodTitle, dsi, theOption and theTargetDir are forwarded unchanged
// to the RMethodBase constructor together with the method type Types::kC50.
//
// The initializer list sets the defaults of the two C5.0 options (one boosting
// trial, no rule decomposition), names the R functions used through the members
// predict ("predict.C5.0"), C50 ("C5.0"), C50Control ("C5.0Control") and
// asfactor ("as.factor"), and starts without a model (fModel == NULL).
49 MethodC50::MethodC50(const TString &jobName,
50  const TString &methodTitle,
51  DataSetInfo &dsi,
52  const TString &theOption,
53  TDirectory *theTargetDir) : RMethodBase(jobName, Types::kC50, methodTitle, dsi, theOption, theTargetDir),
54  fNTrials(1),
55  fRules(kFALSE),
56  fMvaCounter(0),
57  predict("predict.C5.0"),
58  C50("C5.0"),
59  C50Control("C5.0Control"),
60  asfactor("as.factor"),
61  fModel(NULL)
62 {
63  // standard constructor for the C50
64 
65  // Defaults for the C5.0Control options; each one is exposed as a "Control..." option.
66  fControlSubset = kTRUE;
67  fControlBands = 0;
68  fControlWinnow = kFALSE;
69  fControlNoGlobalPruning = kFALSE;
70  fControlCF = 0.25;
71  fControlMinCases = 2;
72  fControlFuzzyThreshold = kFALSE;
73  fControlSample = 0;
// The default seed is the value of this R expression (sample.int(4096, size = 1)
// draws one integer from 1..4096, so the result lies in 0..4095).
74  r["sample.int(4096, size = 1) - 1L"] >> fControlSeed;
75  fControlEarlyStopping = kTRUE;
76 
// Keep a copy of the variable names reported by the data set info.
77  ListOfVariables = DataInfo().GetListOfVariables();
78 // use the weight-file directory from the global TMVA configuration
79  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
80 }
81 
82 //_______________________________________________________________________
// Constructor used when the method is created from a weight file.
//
// theData, theWeightFile and theTargetDir are forwarded unchanged to the
// RMethodBase constructor together with the method type Types::kC50.
//
// The initializer list and the C5.0Control defaults are the same as in the
// standard constructor, so every fControl* member has a defined value before
// it is handed to R.
83 MethodC50::MethodC50(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
84  : RMethodBase(Types::kC50, theData, theWeightFile, theTargetDir),
85  fNTrials(1),
86  fRules(kFALSE),
87  fMvaCounter(0),
88  predict("predict.C5.0"),
89  C50("C5.0"),
90  C50Control("C5.0Control"),
91  asfactor("as.factor"),
92  fModel(NULL)
93 {
94 
95  // constructor from weight file
96  fControlSubset = kTRUE;
97  fControlBands = 0;
98  fControlWinnow = kFALSE;
99  fControlNoGlobalPruning = kFALSE;
100  fControlCF = 0.25;
101  fControlMinCases = 2;
102  fControlFuzzyThreshold = kFALSE;
103  fControlSample = 0;
// The default seed is the value of this R expression (sample.int(4096, size = 1)
// draws one integer from 1..4096, so the result lies in 0..4095).
104  r["sample.int(4096, size = 1) - 1L"] >> fControlSeed;
105  fControlEarlyStopping = kTRUE;
106 // use the weight-file directory from the global TMVA configuration
107  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
108 }
109 
110 
111 //_______________________________________________________________________
// Destructor: releases the model object held in fModel, if there is one.
112 MethodC50::~MethodC50()
113 {
114  if (fModel) delete fModel;
115 }
116 
117 //_______________________________________________________________________
// Reports which analysis types this method supports.
//
// type          - requested analysis type
// numberClasses - number of classes in the problem
// (third arg)   - number of targets; not used by the decision
//
// Returns kTRUE only for classification with exactly two classes, kFALSE otherwise.
118 Bool_t MethodC50::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/)
119 {
120  if (type == Types::kClassification && numberClasses == 2) return kTRUE;
121  return kFALSE;
122 }
123 
124 
125 //_______________________________________________________________________
// Initialisation hook: only verifies that the R package "C50" was loaded.
// When the static flag IsModuleLoaded is false, the problem is reported through
// Error() and additionally logged at kFATAL level.
126 void MethodC50::Init()
127 {
128 
129  if (!IsModuleLoaded) {
130  Error("Init", "R's package C50 can not be loaded.");
131  Log() << kFATAL << " R's package C50 can not be loaded."
132  << Endl;
133  return;
134  }
135 }
136 
// Trains the C5.0 model in R and stores it.
//
// Logs a kFATAL message when the data set has no training events. Otherwise the
// R function C5.0 is called with the training data frame (x), the class labels
// converted with as.factor (y), the number of trials, the rules flag, the event
// weights and the control object. The resulting R object is kept in fModel and
// also saved from R to "<weight file dir>/C50Model.RData".
137 void MethodC50::Train()
138 {
139  if (Data()->GetNTrainingEvents() == 0) Log() << kFATAL << "<Train> Data() has zero events" << Endl;
140  SEXP Model = C50(ROOT::R::Label["x"] = fDfTrain, \
141  ROOT::R::Label["y"] = asfactor(fFactorTrain), \
142  ROOT::R::Label["trials"] = fNTrials, \
143  ROOT::R::Label["rules"] = fRules, \
144  ROOT::R::Label["weights"] = fWeightTrain, \
145  ROOT::R::Label["control"] = fModelControl);
146  fModel = new ROOT::R::TRObject(Model);
147  TString path = GetWeightFileDir() + "/C50Model.RData";
148  Log() << Endl;
149  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
150  Log() << Endl;
// Publish the model under the R name C50Model so that R's save() can write it.
151  r["C50Model"] << Model;
152  r << "save(C50Model,file='" + path + "')";
153 }
154 
155 //_______________________________________________________________________
// Declares the configuration options of this method.
//
// Each DeclareOptionRef call binds one data member to an option name and a
// help text: "NTrials" and "Rules" are passed to C5.0 itself, the "Control..."
// options are the members later handed to C5.0Control.
156 void MethodC50::DeclareOptions()
157 {
158  // options of the C5.0 call itself
159  DeclareOptionRef(fNTrials, "NTrials", "An integer specifying the number of boosting iterations");
160  DeclareOptionRef(fRules, "Rules", "A logical: should the tree be decomposed into a rule-basedmodel?");
161 
162  //C5.0Control Options
163  DeclareOptionRef(fControlSubset, "ControlSubset", "A logical: should the model evaluate groups of discrete \
164  predictors for splits? Note: the C5.0 command line version defaults this \
165  parameter to ‘FALSE’, meaning no attempted gropings will be evaluated \
166  during the tree growing stage.");
167  DeclareOptionRef(fControlBands, "ControlBands", "An integer between 2 and 1000. If ‘TRUE’, the model orders \
168  the rules by their affect on the error rate and groups the \
169  rules into the specified number of bands. This modifies the \
170  output so that the effect on the error rate can be seen for \
171  the groups of rules within a band. If this options is \
172  selected and ‘rules = kFALSE’, a warning is issued and ‘rules’ \
173  is changed to ‘kTRUE’.");
174  DeclareOptionRef(fControlWinnow, "ControlWinnow", "A logical: should predictor winnowing (i.e feature selection) be used?");
175  DeclareOptionRef(fControlNoGlobalPruning, "ControlNoGlobalPruning", "A logical to toggle whether the final, global pruning \
176  step to simplify the tree.");
177  DeclareOptionRef(fControlCF, "ControlCF", "A number in (0, 1) for the confidence factor.");
178  DeclareOptionRef(fControlMinCases, "ControlMinCases", "an integer for the smallest number of samples that must be \
179  put in at least two of the splits.");
180 
181  DeclareOptionRef(fControlFuzzyThreshold, "ControlFuzzyThreshold", "A logical toggle to evaluate possible advanced splits \
182  of the data. See Quinlan (1993) for details and examples.");
183  DeclareOptionRef(fControlSample, "ControlSample", "A value between (0, .999) that specifies the random \
184  proportion of the data should be used to train the model. By \
185  default, all the samples are used for model training. Samples \
186  not used for training are used to evaluate the accuracy of \
187  the model in the printed output.");
188  DeclareOptionRef(fControlSeed, "ControlSeed", " An integer for the random number seed within the C code.");
189  DeclareOptionRef(fControlEarlyStopping, "ControlEarlyStopping", " A logical to toggle whether the internal method for \
190  stopping boosting should be used.");
191 
192 
193 }
194 
195 //_______________________________________________________________________
// Validates the parsed options and builds the C5.0 control object.
//
// A non-positive fNTrials is reported at kERROR level and replaced by 1.
// Afterwards the R function C5.0Control is called with one labelled argument
// per fControl* member and the result is stored in fModelControl.
196 void MethodC50::ProcessOptions()
197 {
198  if (fNTrials <= 0) {
199  Log() << kERROR << " fNTrials <=0... that does not work !! "
200  << " I set it to 1 .. just so that the program does not crash"
201  << Endl;
202  fNTrials = 1;
203  }
204  fModelControl = C50Control(ROOT::R::Label["subset"] = fControlSubset, \
205  ROOT::R::Label["bands"] = fControlBands, \
206  ROOT::R::Label["winnow"] = fControlWinnow, \
207  ROOT::R::Label["noGlobalPruning"] = fControlNoGlobalPruning, \
208  ROOT::R::Label["CF"] = fControlCF, \
209  ROOT::R::Label["minCases"] = fControlMinCases, \
210  ROOT::R::Label["fuzzyThreshold"] = fControlFuzzyThreshold, \
211  ROOT::R::Label["sample"] = fControlSample, \
212  ROOT::R::Label["seed"] = fControlSeed, \
213  ROOT::R::Label["earlyStopping"] = fControlEarlyStopping);
214 }
215 
216 //_______________________________________________________________________
// Logs an informational message and then runs the base-class classification test.
217 void MethodC50::TestClassification()
218 {
219  Log() << kINFO << "Testing Classification C50 METHOD " << Endl;
220  MethodBase::TestClassification();
221 }
222 
223 
224 //_______________________________________________________________________
// Evaluates the trained model for the current event.
//
// errLower, errUpper - optional error outputs; they are only passed to
//                      NoErrorCalc(), no error estimate is computed here.
//
// The variable values of the current event are copied into a data frame keyed
// by variable name. If no model is in memory yet, it is first read back from
// the state file. The R predict function is then called with type = "prob" and
// element 1 of the returned vector (the signal probability) is returned.
225 Double_t MethodC50::GetMvaValue(Double_t *errLower, Double_t *errUpper)
226 {
227  NoErrorCalc(errLower, errUpper);
228  Double_t mvaValue;
229  const TMVA::Event *ev = GetEvent();
230  const UInt_t nvar = DataInfo().GetNVariables();
231  ROOT::R::TRDataFrame fDfEvent;
232  for (UInt_t i = 0; i < nvar; i++) {
233  fDfEvent[DataInfo().GetListOfVariables()[i].Data()] = ev->GetValues()[i];
234  }
235  //if using persistence model
236  if (!fModel) {
237  ReadStateFromFile();
238  }
239  TVectorD result = predict(*fModel, fDfEvent, ROOT::R::Label["type"] = "prob");
240  mvaValue = result[1]; //returning signal prob
241  return mvaValue;
242 }
243 
244 //_______________________________________________________________________
// Writes the help text of this method to the logger: a short description
// followed by the (currently empty) optimisation and tuning sections.
245 void MethodC50::GetHelpMessage() const
246 {
247 // get help message text
248 //
249 // typical length of text line:
250 // "|--------------------------------------------------------------|"
251  Log() << Endl;
252  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
253  Log() << Endl;
254  Log() << "Decision Trees and Rule-Based Models " << Endl;
255  Log() << Endl;
256  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
257  Log() << Endl;
258  Log() << Endl;
259  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
260  Log() << Endl;
261  Log() << "<None>" << Endl;
262 }
263 
264 //_______________________________________________________________________
// Restores the trained model from "<weight file dir>/C50Model.RData".
//
// The C50 package is requested from R, the file is loaded with R's load(), and
// the R variable C50Model is wrapped in a new TRObject stored in fModel.
// NOTE(review): a previously held fModel is not deleted here - confirm that
// this is only reached while fModel is NULL.
265 void TMVA::MethodC50::ReadStateFromFile()
266 {
267  ROOT::R::TRInterface::Instance().Require("C50");
268  TString path = GetWeightFileDir() + "/C50Model.RData";
269  Log() << Endl;
270  Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
271  Log() << Endl;
272  r << "load('" + path + "')";
273  SEXP Model;
274  r["C50Model"] >> Model;
275  fModel = new ROOT::R::TRObject(Model);
276 
277 }
278 
279 //_______________________________________________________________________
// Empty implementation: nothing is written and theClassFileName is ignored.
280 void TMVA::MethodC50::MakeClass(const TString &theClassFileName) const
281 {
282 }
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
Definition: MethodC50.cxx:225
const TString & GetWeightFileDir() const
Definition: MethodBase.h:407
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
Int_t fControlSeed
Definition: MethodC50.h:96
Namespace for new ROOT classes and functions.
Definition: ROOT.py:1
Config & gConfig()
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
DataSet * Data() const
Definition: MethodBase.h:363
EAnalysisType
Definition: Types.h:124
Bool_t fControlWinnow
Definition: MethodC50.h:90
Basic string class.
Definition: TString.h:137
TString as(SEXP s)
Definition: RExports.h:85
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
Bool_t fControlSubset
Definition: MethodC50.h:88
UInt_t GetNVariables() const
Definition: DataSetInfo.h:128
Double_t fControlCF
Definition: MethodC50.h:92
ROOT::R::TRFunctionImport C50Control
Definition: MethodC50.h:104
TVectorD fWeightTrain
Definition: RMethodBase.h:96
Tools & gTools()
Definition: Tools.cxx:79
Bool_t fControlFuzzyThreshold
Definition: MethodC50.h:94
Double_t fControlSample
Definition: MethodC50.h:95
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:918
static Bool_t IsModuleLoaded
Definition: MethodC50.h:100
ROOT::R::TRFunctionImport asfactor
Definition: MethodC50.h:105
Bool_t Require(TString pkg)
Method to load an R's package.
MethodC50(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="", TDirectory *theTargetDir=NULL)
virtual void TestClassification()
initialization
Definition: MethodC50.cxx:217
This is a class to get ROOT's objects from R's objects
Definition: TRObject.h:73
std::vector< std::string > fFactorTrain
Definition: RMethodBase.h:98
ROOT::R::TRInterface & r
Definition: Object.C:4
Bool_t fRules
Definition: MethodC50.h:85
unsigned int UInt_t
Definition: RtypesCore.h:42
ROOT::R::TRInterface & r
Definition: RMethodBase.h:53
const Event * GetEvent() const
Definition: MethodBase.h:667
virtual void MakeClass(const TString &classFileName=TString("")) const
create reader class for method (classification only at present)
Definition: MethodC50.cxx:280
Bool_t fControlEarlyStopping
Definition: MethodC50.h:97
ROOT::R::TRObject fModelControl
Definition: MethodC50.h:107
#define ClassImp(name)
Definition: Rtypes.h:279
ROOT::R::TRFunctionImport predict
Definition: MethodC50.h:102
double Double_t
Definition: RtypesCore.h:55
UInt_t fControlMinCases
Definition: MethodC50.h:93
Describe directory structure in memory.
Definition: TDirectory.h:41
void GetHelpMessage() const
Definition: MethodC50.cxx:245
int type
Definition: TGX11.cxx:120
Bool_t fControlNoGlobalPruning
Definition: MethodC50.h:91
static TRInterface & Instance()
static method to get an TRInterface instance reference
MsgLogger & Log() const
Definition: Configurable.h:130
ROOT::R::TRFunctionImport C50
Definition: MethodC50.h:103
DataSetInfo & DataInfo() const
Definition: MethodBase.h:364
void DeclareOptions()
Definition: MethodC50.cxx:156
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
Definition: MethodC50.cxx:118
UInt_t fControlBands
Definition: MethodC50.h:89
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
#define REGISTER_METHOD(CLASS)
for example
Abstract ClassifierFactory template that handles arbitrary types.
std::vector< Float_t > & GetValues()
Definition: Event.h:93
ROOT::R::TRDataFrame fDfTrain
Definition: RMethodBase.h:94
void SetWeightFileDir(TString fileDir)
set directory of weight file
ROOT::R::TRObject * fModel
Definition: MethodC50.h:106
void ProcessOptions()
Definition: MethodC50.cxx:196
void ReadStateFromFile()
Definition: MethodC50.cxx:265
std::vector< TString > GetListOfVariables() const
returns list of variables
double result[121]
Rcpp::internal::NamedPlaceHolder Label
Definition: RExports.cxx:14
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void TestClassification()
initialization
TRandom3 R
a TMatrixD.
Definition: testIO.cxx:28
Definition: math.cpp:60
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:820
This is a class to create DataFrames from ROOT to R
Definition: TRDataFrame.h:183