ROOT  6.06/09
Reference Guide
MethodPyAdaBoost.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/pymva $Id$
2 // Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodPyAdaBoost *
8  * Web : http://oproject.org *
9  * *
10  * Description: *
11  * AdaBoost Classifier from scikit-learn *
12  * *
13  * *
14  * Redistribution and use in source and binary forms, with or without *
15  * modification, are permitted according to the terms listed in LICENSE *
16  * (http://tmva.sourceforge.net/LICENSE) *
17  * *
18  **********************************************************************************/
19 #pragma GCC diagnostic ignored "-Wunused-parameter"
20 #include <iomanip>
21 #include <fstream>
22 
23 #include <Python.h>
24 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
25 #include <numpy/arrayobject.h>
26 
27 #include "TMath.h"
28 #include "Riostream.h"
29 #include "TMatrix.h"
30 #include "TMatrixD.h"
31 #include "TVectorD.h"
32 
34 #include "TMVA/MethodPyAdaBoost.h"
35 #include "TMVA/Tools.h"
36 #include "TMVA/Ranking.h"
37 #include "TMVA/Types.h"
38 #include "TMVA/Config.h"
39 #include "TMVA/PDF.h"
40 #include "TMVA/ClassifierFactory.h"
41 
42 #include "TMVA/Results.h"
43 
44 
45 
46 using namespace TMVA;
47 
48 REGISTER_METHOD(PyAdaBoost)
49 
51 
52 //_______________________________________________________________________
54  const TString &methodTitle,
55  DataSetInfo &dsi,
56  const TString &theOption,
57  TDirectory *theTargetDir) :
58  PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption, theTargetDir),
59  base_estimator("None"),
60  n_estimators(50),
61  learning_rate(1.0),
62  algorithm("SAMME.R"),
63  random_state("None")
64 {
65  // standard constructor for the PyAdaBoost
66  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
67 
68 }
69 
70 //_______________________________________________________________________
71 MethodPyAdaBoost::MethodPyAdaBoost(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
72  : PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile, theTargetDir),
73  base_estimator("None"),
74  n_estimators(50),
75  learning_rate(1.0),
76  algorithm("SAMME.R"),
77  random_state("None")
78 {
79  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
80 }
81 
82 
83 //_______________________________________________________________________
85 {
86 }
87 
88 //_______________________________________________________________________
90 {
91  if (type == Types::kClassification && numberClasses == 2) return kTRUE;
92  return kFALSE;
93 }
94 
95 
96 //_______________________________________________________________________
98 {
100 
101  DeclareOptionRef(base_estimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
102  The base estimator from which the boosted ensemble is built.\
103  Support for sample weighting is required, as well as proper `classes_`\
104  and `n_classes_` attributes.");
105 
106  DeclareOptionRef(n_estimators, "NEstimators", "integer, optional (default=50)\
107  The maximum number of estimators at which boosting is terminated.\
108  In case of perfect fit, the learning procedure is stopped early.");
109 
110  DeclareOptionRef(learning_rate, "LearningRate", "float, optional (default=1.)\
111  Learning rate shrinks the contribution of each classifier by\
112  ``learning_rate``. There is a trade-off between ``learning_rate`` and\
113  ``n_estimators``.");
114 
115  DeclareOptionRef(algorithm, "Algorithm", "{'SAMME', 'SAMME.R'}, optional (default='SAMME.R')\
116  If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
117  ``base_estimator`` must support calculation of class probabilities.\
118  If 'SAMME' then use the SAMME discrete boosting algorithm.\
119  The SAMME.R algorithm typically converges faster than SAMME,\
120  achieving a lower test error with fewer boosting iterations.");
121 
122  DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
123  If int, random_state is the seed used by the random number generator;\
124  If RandomState instance, random_state is the random number generator;\
125  If None, the random number generator is the RandomState instance used\
126  by `np.random`.");
127 }
128 
129 //_______________________________________________________________________
131 {
132  PyObject *pobase_estimator = Eval(base_estimator);
133  if (!pobase_estimator) {
134  Log() << kFATAL << Form(" BaseEstimator = %s... that does not work !! ", base_estimator.Data())
135  << " The options are Object or None."
136  << Endl;
137  }
138  Py_DECREF(pobase_estimator);
139 
140  if (n_estimators <= 0) {
141  Log() << kERROR << " NEstimators <=0... that does not work !! "
142  << " I set it to 10 .. just so that the program does not crash"
143  << Endl;
144  n_estimators = 10;
145  }
146  if (learning_rate <= 0) {
147  Log() << kERROR << " LearningRate <=0... that does not work !! "
148  << " I set it to 1.0 .. just so that the program does not crash"
149  << Endl;
150  learning_rate = 1.0;
151  }
152 
153  if (algorithm != "SAMME" && algorithm != "SAMME.R") {
154  Log() << kFATAL << Form(" Algorithm = %s... that does not work !! ", algorithm.Data())
155  << " The options are SAMME of SAMME.R."
156  << Endl;
157  }
158  PyObject *porandom_state = Eval(random_state);
159  if (!porandom_state) {
160  Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
161  << "If int, random_state is the seed used by the random number generator;"
162  << "If RandomState instance, random_state is the random number generator;"
163  << "If None, the random number generator is the RandomState instance used by `np.random`."
164  << Endl;
165  }
166  Py_DECREF(porandom_state);
167 }
168 
169 
170 //_______________________________________________________________________
172 {
173  ProcessOptions();
174  _import_array();//require to use numpy arrays
175 
176  //Import sklearn
177  // Convert the file name to a Python string.
178  PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
179  // Import the file as a Python module.
180  fModule = PyImport_Import(pName);
181  Py_DECREF(pName);
182 
183  if (!fModule) {
184  Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
185  Log() << Endl;
186  }
187 
188 
189  //Training data
190  UInt_t fNvars = Data()->GetNVariables();
191  int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
192  int *dims = new int[2];
193  dims[0] = fNrowsTraining;
194  dims[1] = fNvars;
195  fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
196  float *TrainData = (float *)(PyArray_DATA(fTrainData));
197 
198 
199  fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
200  float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));
201 
202  fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
203  float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));
204 
205  for (int i = 0; i < fNrowsTraining; i++) {
206  const TMVA::Event *e = Data()->GetTrainingEvent(i);
207  for (UInt_t j = 0; j < fNvars; j++) {
208  TrainData[j + i * fNvars] = e->GetValue(j);
209  }
210  if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
211  else TrainDataClasses[i] = TMVA::Types::kBackground;
212 
213  TrainDataWeights[i] = e->GetWeight();
214  }
215 }
216 
218 {
219 // base_estimator("None"),
220 // n_estimators(50),
221 // learning_rate(1.0),
222 // algorithm("SAMME.R"),
223 // random_state("None")
224  PyObject *pobase_estimator = Eval(base_estimator);
225  PyObject *porandom_state = Eval(random_state);
226 
227  PyObject *args = Py_BuildValue("(OifsO)", pobase_estimator, n_estimators, learning_rate, algorithm.Data(), porandom_state);
228  PyObject_Print(args, stdout, 0);
229  std::cout << std::endl;
230  PyObject *pDict = PyModule_GetDict(fModule);
231  PyObject *fClassifierClass = PyDict_GetItemString(pDict, "AdaBoostClassifier");
232 
233  // Create an instance of the class
234  if (PyCallable_Check(fClassifierClass)) {
235  //instance
236  fClassifier = PyObject_CallObject(fClassifierClass , args);
237  PyObject_Print(fClassifier, stdout, 0);
238 
239  Py_DECREF(args);
240  } else {
241  PyErr_Print();
242  Py_DECREF(pDict);
243  Py_DECREF(fClassifierClass);
244  Log() << kFATAL << "Can't call function AdaBoostClassifier" << Endl;
245  Log() << Endl;
246 
247  }
248 
249  fClassifier = PyObject_CallMethod(fClassifier, (char *)"fit", (char *)"(OOO)", fTrainData, fTrainDataClasses, fTrainDataWeights);
250 // PyObject_Print(fClassifier, stdout, 0);
251 // std::cout<<std::endl;
252  // pValue =PyObject_CallObject(fClassifier, PyUnicode_FromString("classes_"));
253  // PyObject_Print(pValue, stdout, 0);
254 
255  TString path = GetWeightFileDir() + "/PyAdaBoostModel.PyData";
256  Log() << Endl;
257  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
258  Log() << Endl;
259 
260  Serialize(path,fClassifier);
261 }
262 
263 //_______________________________________________________________________
265 {
267 }
268 
269 
270 //_______________________________________________________________________
272 {
273  // cannot determine error
274  NoErrorCalc(errLower, errUpper);
275 
277 
278  Double_t mvaValue;
279  const TMVA::Event *e = Data()->GetEvent();
280  UInt_t nvars = e->GetNVariables();
281  int *dims = new int[2];
282  dims[0] = 1;
283  dims[1] = nvars;
284  PyArrayObject *pEvent= (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
285  float *pValue = (float *)(PyArray_DATA(pEvent));
286 
287  for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);
288 
289  PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
290  double *proba = (double *)(PyArray_DATA(result));
291  mvaValue = proba[0]; //getting signal prob
292  Py_DECREF(result);
293  Py_DECREF(pEvent);
294  delete dims;
295  return mvaValue;
296 }
297 
298 //_______________________________________________________________________
300 {
301  if (!PyIsInitialized()) {
302  PyInitialize();
303  }
304 
305  TString path = GetWeightFileDir() + "/PyAdaBoostModel.PyData";
306  Log() << Endl;
307  Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
308  Log() << Endl;
309  UnSerialize(path,&fClassifier);
310 }
311 
312 //_______________________________________________________________________
314 {
315  // get help message text
316  //
317  // typical length of text line:
318  // "|--------------------------------------------------------------|"
319  Log() << Endl;
320  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
321  Log() << Endl;
322  Log() << "Decision Trees and Rule-Based Models " << Endl;
323  Log() << Endl;
324  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
325  Log() << Endl;
326  Log() << Endl;
327  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
328  Log() << Endl;
329  Log() << "<None>" << Endl;
330 }
331 
const TString & GetWeightFileDir() const
Definition: MethodBase.h:407
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
PyObject * fClassifier
Definition: PyMethodBase.h:114
const Event * GetTrainingEvent(Long64_t ievt) const
Definition: DataSet.h:96
Config & gConfig()
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
DataSet * Data() const
Definition: MethodBase.h:363
EAnalysisType
Definition: Types.h:124
Basic string class.
Definition: TString.h:137
ClassImp(MethodPyAdaBoost) MethodPyAdaBoost
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
virtual void TestClassification()
initialization
static void Serialize(TString file, PyObject *classifier)
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:376
PyArrayObject * fTrainDataClasses
Definition: PyMethodBase.h:118
static int PyIsInitialized()
static void PyInitialize()
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
Definition: Event.cxx:231
const char * Data() const
Definition: TString.h:349
Tools & gTools()
Definition: Tools.cxx:79
static PyObject * Eval(TString code)
PyArrayObject * fTrainDataWeights
Definition: PyMethodBase.h:117
UInt_t GetNVariables() const
accessor to the number of variables
Definition: Event.cxx:303
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
#define None
Definition: TGWin32.h:68
PyObject * fModule
Definition: PyMethodBase.h:113
unsigned int UInt_t
Definition: RtypesCore.h:42
char * Form(const char *fmt,...)
PyArrayObject * fTrainData
Definition: PyMethodBase.h:116
const Event * GetEvent() const
Definition: DataSet.cxx:180
double Double_t
Definition: RtypesCore.h:55
Describe directory structure in memory.
Definition: TDirectory.h:41
int type
Definition: TGX11.cxx:120
MsgLogger & Log() const
Definition: Configurable.h:130
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
Definition: DataSet.cxx:194
UInt_t GetClass() const
Definition: Event.h:86
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
#define REGISTER_METHOD(CLASS)
for example
Abstract ClassifierFactory template that handles arbitrary types.
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Definition: MethodBase.cxx:599
void SetWeightFileDir(TString fileDir)
set directory of weight file
MethodPyAdaBoost(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="", TDirectory *theTargetDir=NULL)
Long64_t GetNTrainingEvents() const
Definition: DataSet.h:90
double result[121]
static void UnSerialize(TString file, PyObject **obj)
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void TestClassification()
initialization
TRandom3 R
a TMatrixD.
Definition: testIO.cxx:28
_object PyObject
Definition: TPyArg.h:22
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:820