22 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 23 #include <numpy/arrayobject.h> 56 const TString &methodTitle,
58 const TString &theOption) :
65 fMinWeightFractionLeaf(0),
66 fMaxFeatures(
"'auto'"),
67 fMaxLeafNodes(
"None"),
120 The function to measure the quality of a split. Supported criteria are \ 121 'gini' for the Gini impurity and 'entropy' for the information gain. \ 122 Note: this parameter is tree-specific.");
125 The maximum depth of the tree. If None, then nodes are expanded until \ 126 all leaves are pure or until all leaves contain less than \ 127 min_samples_split samples. \ 128 Ignored if ``max_leaf_nodes`` is not None.");
131 The minimum number of samples required to split an internal node.");
134 The minimum number of samples in newly created leaves. A split is \ 135 discarded if after the split, one of the leaves would contain less then \ 136 ``min_samples_leaf`` samples.");
138 The minimum weighted fraction of the input samples required to be at a \ 143 Grow trees with ``max_leaf_nodes`` in best-first fashion.\ 144 Best nodes are defined as relative reduction in impurity.\ 145 If None then unlimited number of leaf nodes.\ 146 If not None then ``max_depth`` will be ignored.");
149 Whether bootstrap samples are used when building trees.");
152 the generalization error.");
155 The number of jobs to run in parallel for both `fit` and `predict`. \ 156 If -1, then the number of jobs is set to the number of cores.");
159 If int, random_state is the seed used by the random number generator;\ 160 If RandomState instance, random_state is the random number generator;\ 161 If None, the random number generator is the RandomState instance used\ 165 Controls the verbosity of the tree building process.");
168 When set to ``True``, reuse the solution of the previous call to fit\ 169 and add more estimators to the ensemble, otherwise, just fit a whole\ 173 Weights associated with classes in the form ``{class_label: weight}``.\ 174 If not given, all classes are supposed to have weight one. For\ 175 multi-output problems, a list of dicts can be provided in the same\ 176 order as the columns of y.\ 177 The \"auto\" mode uses the values of y to automatically adjust\ 178 weights inversely proportional to class frequencies in the input data.\ 179 The \"subsample\" mode is the same as \"auto\" except that weights are\ 180 computed based on the bootstrap sample for every tree grown.\ 181 For multi-output, the weights of each column of y will be multiplied.\ 182 Note that these weights will be multiplied with sample_weight (passed\ 183 through the fit method) if sample_weight is specified.");
186 "Store trained classifier in this file");
194 Log() << kFATAL <<
" NEstimators <=0... that does not work !! " <<
Endl;
200 Log() << kFATAL <<
Form(
" Criterion = %s... that does not work !! ",
fCriterion.Data())
201 <<
" The options are `gini` or `entropy`." <<
Endl;
204 PyDict_SetItemString(
fLocalNS,
"criterion", pCriterion);
209 Log() << kFATAL <<
Form(
" MaxDepth = %s... that does not work !! ",
fMaxDepth.Data())
210 <<
" The options are None or integer." <<
Endl;
214 Log() << kFATAL <<
" MinSamplesSplit < 0... that does not work !! " <<
Endl;
220 Log() << kFATAL <<
" MinSamplesLeaf < 0... that does not work !! " <<
Endl;
226 Log() << kERROR <<
" MinWeightFractionLeaf < 0... that does not work !! " <<
Endl;
235 PyDict_SetItemString(
fLocalNS,
"maxFeatures", pMaxFeatures);
238 Log() << kFATAL <<
Form(
" MaxFeatures = %s... that does not work !! ",
fMaxFeatures.Data())
239 <<
"int, float, string or None, optional (default='auto')" 240 <<
"The number of features to consider when looking for the best split:" 241 <<
"If int, then consider `max_features` features at each split." 242 <<
"If float, then `max_features` is a percentage and" 243 <<
"`int(max_features * n_features)` features are considered at each split." 244 <<
"If 'auto', then `max_features=sqrt(n_features)`." 245 <<
"If 'sqrt', then `max_features=sqrt(n_features)`." 246 <<
"If 'log2', then `max_features=log2(n_features)`." 247 <<
"If None, then `max_features=n_features`." <<
Endl;
251 if (!pMaxLeafNodes) {
253 <<
" The options are None or integer." <<
Endl;
255 PyDict_SetItemString(
fLocalNS,
"maxLeafNodes", pMaxLeafNodes);
259 Log() << kFATAL <<
Form(
" RandomState = %s... that does not work !! ",
fRandomState.Data())
260 <<
"If int, random_state is the seed used by the random number generator;" 261 <<
"If RandomState instance, random_state is the random number generator;" 262 <<
"If None, the random number generator is the RandomState instance used by `np.random`." <<
Endl;
268 Log() << kFATAL <<
Form(
" ClassWeight = %s... that does not work !! ",
fClassWeight.Data())
269 <<
"dict, list of dicts, 'auto', 'subsample' or None, optional" <<
Endl;
274 Log() << kFATAL <<
Form(
" NJobs = %i... that does not work !! ",
fNjobs)
275 <<
"Value has to be greater than zero." <<
Endl;
317 npy_intp dimsData[2];
318 dimsData[0] = fNrowsTraining;
320 fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
322 float *TrainData = (
float *)(PyArray_DATA(fTrainData));
324 npy_intp dimsClasses = (npy_intp) fNrowsTraining;
325 fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
327 float *TrainDataClasses = (
float *)(PyArray_DATA(fTrainDataClasses));
329 fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
331 float *TrainDataWeights = (
float *)(PyArray_DATA(fTrainDataWeights));
333 for (
int i = 0; i < fNrowsTraining; i++) {
341 TrainDataClasses[i] = e->
GetClass();
348 PyRunString(
"classifier = sklearn.ensemble.RandomForestClassifier(bootstrap=bootstrap, class_weight=classWeight, criterion=criterion, max_depth=maxDepth, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_samples_leaf=minSamplesLeaf, min_samples_split=minSamplesSplit, min_weight_fraction_leaf=minWeightFractionLeaf, n_estimators=nEstimators, n_jobs=nJobs, oob_score=oobScore, random_state=randomState, verbose=verbose, warm_start=warmStart)",
349 "Failed to setup classifier");
353 PyRunString(
"dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
"Failed to train classifier");
358 Log() << kFATAL <<
"Can't create classifier object from RandomForestClassifier" <<
Endl;
384 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
385 if (firstEvt < 0) firstEvt = 0;
386 nEvents = lastEvt-firstEvt;
392 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
393 float *pValue = (
float *)(PyArray_DATA(pEvent));
395 for (
Int_t ievt=0; ievt<nEvents; ievt++) {
399 pValue[ievt * fNvars + i] = e->
GetValue(i);
404 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier, const_cast<char *>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
405 double *proba = (
double *)(PyArray_DATA(result));
409 for (
int i = 0; i < nEvents; ++i) {
433 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
434 float *pValue = (
float *)(PyArray_DATA(pEvent));
438 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier, const_cast<char *>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
439 double *proba = (
double *)(PyArray_DATA(result));
462 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
463 float *pValue = (
float *)(PyArray_DATA(pEvent));
467 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier, const_cast<char *>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
468 double *proba = (
double *)(PyArray_DATA(result));
512 PyArrayObject* pRanking = (PyArrayObject*) PyObject_GetAttrString(
fClassifier,
"feature_importances_");
513 if(pRanking == 0)
Log() << kFATAL <<
"Failed to get ranking from classifier" <<
Endl;
532 Log() <<
"A random forest is a meta estimator that fits a number of decision" <<
Endl;
533 Log() <<
"tree classifiers on various sub-samples of the dataset and use" <<
Endl;
534 Log() <<
"averaging to improve the predictive accuracy and control over-fitting." <<
Endl;
536 Log() <<
"Check out the scikit-learn documentation for more information." <<
Endl;
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
std::vector< Double_t > mvaValues
~MethodPyRandomForest(void)
void SetCurrentEvent(Long64_t ievt) const
MsgLogger & Endl(MsgLogger &ml)
Singleton class for Global types used by TMVA.
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
PyObject * pMinSamplesLeaf
Ranking for variables in method (implementation)
UInt_t GetNClasses() const
static void Serialize(TString file, PyObject *classifier)
Serialize Python object.
TString fFilenameClassifier
PyArrayObject * fTrainDataClasses
static int PyIsInitialized()
Check Python interpreter initialization status.
static void PyInitialize()
Initialize Python interpreter.
const TString & GetInputLabel(Int_t i) const
const TString & GetWeightFileDir() const
const Ranking * CreateRanking()
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=Py_single_input)
Execute Python code from string.
Double_t fMinWeightFractionLeaf
PyObject * Eval(TString code)
Evaluate Python code.
DataSetInfo & DataInfo() const
Class that contains all the data information.
PyArrayObject * fTrainDataWeights
Double_t GetWeight() const
Return the event weight, depending on whether the flag IgnoreNegWeightsInTraining is set or not...
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
Long64_t GetNTrainingEvents() const
const Event * GetTrainingEvent(Long64_t ievt) const
const char * GetName() const
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
PyObject * pMinSamplesSplit
char * Form(const char *fmt,...)
PyArrayObject * fTrainData
UInt_t GetNVariables() const
Float_t GetValue(UInt_t ivar) const
Return the value of the i'th variable.
static Int_t UnSerialize(TString file, PyObject **obj)
Unserialize Python object.
void GetHelpMessage() const
you should not use this method at all Int_t Int_t Double_t Double_t Double_t e
#define REGISTER_METHOD(CLASS)
for example
Abstract ClassifierFactory template that handles arbitrary types.
virtual void AddRank(const Rank &rank)
Add a new rank and take ownership of it.
virtual void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility; they are hence without any...
virtual void TestClassification()
initialization
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
std::vector< Double_t > GetMvaValues(Long64_t firstEvt=0, Long64_t lastEvt=-1, Bool_t logProgress=false)
Get all the MVA values for the events of the current data type.
std::vector< Float_t > classValues
virtual void TestClassification()
initialization
const Event * GetEvent() const
PyObject * pMinWeightFractionLeaf
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
std::vector< Float_t > & GetMulticlassValues()
Bool_t IsModelPersistence()