Logo ROOT  
Reference Guide
VariableImportance.cxx
Go to the documentation of this file.
1// @(#)root/tmva $Id$
2// Author: Omar Zapata and Sergei Gleyzer
3
4/*! \class TMVA::VariableImportanceResult
5\ingroup TMVA
6*/
7
8/*! \class TMVA::VariableImportance
9\ingroup TMVA
10*/
11
13
14#include "TMVA/Config.h"
15#include "TMVA/DataSetInfo.h"
16#include "TMVA/Envelope.h"
17#include "TMVA/Factory.h"
18#include "TMVA/OptionMap.h"
19#include "TMVA/MethodBase.h"
20#include "TMVA/MethodCategory.h"
21#include "TMVA/MsgLogger.h"
22#include "TMVA/Types.h"
24
25#include "TAxis.h"
26#include "TGraph.h"
27#include "TCanvas.h"
28#include "TH1.h"
29#include "TRandom3.h"
30#include "TStyle.h"
31#include "TSystem.h"
32
33#include <bitset>
34#include <iostream>
35#include <memory>
36#include <utility>
37
38
39//number of bits for bitset
40#define NBITS 32
41
42////////////////////////////////////////////////////////////////////////////////
43
44TMVA::VariableImportanceResult::VariableImportanceResult():fImportanceValues("VariableImportance"),
45 fImportanceHist(nullptr)
46{
47
48}
49
50////////////////////////////////////////////////////////////////////////////////
51
53{
// Member-wise copy from `obj` (declared on the signature line, which is not
// part of this listing).
54 fImportanceValues = obj.fImportanceValues;
// fImportanceHist is a std::shared_ptr<TH1F>, so after this assignment both
// objects refer to the same histogram rather than to separate copies.
55 fImportanceHist = obj.fImportanceHist;
56}
57
58////////////////////////////////////////////////////////////////////////////////
59
61{
64
65 MsgLogger fLogger("VariableImportance");
66 if(fType==VIType::kShort)
67 {
68 fLogger<<kINFO<<"Variable Importance Results (Short)"<<Endl;
69 }else if(fType==VIType::kAll)
70 {
71 fLogger<<kINFO<<"Variable Importance Results (All)"<<Endl;
72 }else{
73 fLogger<<kINFO<<"Variable Importance Results (Random)"<<Endl;
74 }
75
76 fImportanceValues.Print();
78}
79
80////////////////////////////////////////////////////////////////////////////////
81
83{
84 TCanvas *c=new TCanvas(name.Data());
85 fImportanceHist->Draw("");
86 fImportanceHist->GetXaxis()->SetTitle(" Variable Names ");
87 fImportanceHist->GetYaxis()->SetTitle(" Importance (%) ");
88 c->Draw();
89 return c;
90}
91
92////////////////////////////////////////////////////////////////////////////////
93
94TMVA::VariableImportance::VariableImportance(TMVA::DataLoader *dataloader):TMVA::Envelope("VariableImportance",dataloader,nullptr),fType(VIType::kShort)
95{
96 fClassifier=std::unique_ptr<Factory>(new TMVA::Factory("VariableImportance","!V:!ROC:!ModelPersistence:Silent:Color:!DrawProgressBar:AnalysisType=Classification"));
97}
98
99////////////////////////////////////////////////////////////////////////////////
100
102{
// fClassifier is a std::unique_ptr<Factory> (it is assigned one in the
// constructor), so assigning nullptr destroys the Factory it currently owns.
103 fClassifier=nullptr;
104}
105
106////////////////////////////////////////////////////////////////////////////////
107
109{
110
111 //NOTE: Put the type of VI Algorithm in the results Print
112 if(fType==VIType::kShort)
113 {
114 EvaluateImportanceShort();
115 }else if(fType==VIType::kAll)
116 {
117 EvaluateImportanceAll();
118 }else{
119 UInt_t nbits=fDataLoader->GetDefaultDataSetInfo().GetNVariables();
120 if(nbits<10)
121 Log()<<kERROR<<"Running variable importance with less that 10 varibales in Random mode "<<
122 "can to produce inconsisten results"<<Endl;
123 EvaluateImportanceRandom(pow(nbits,2));
124 }
125 fResults.fType = fType;
128 Log()<<kINFO<<"Evaluation done."<<Endl;
130}
131
132////////////////////////////////////////////////////////////////////////////////
133
135{
136 ULong_t sum=0;
137 for(ULong_t n=0;n<i;n++) sum+=pow(2,n);
138 return sum;
139}
140
141////////////////////////////////////////////////////////////////////////////////
142
143TH1F* TMVA::VariableImportance::GetImportance(const UInt_t nbits,std::vector<Float_t> &importances,std::vector<TString> &varNames)
144{
145 TH1F *vihist = new TH1F("vihist", "", nbits, 0, nbits);
146
147 gStyle->SetOptStat(000000);
148
149 Float_t normalization = 0.0;
150 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
151
152 Float_t roc = 0.0;
153
156
157
158 for (UInt_t i = 1; i < nbits + 1; i++) {
159 roc = 100.0 * importances[i - 1] / normalization;
160 vihist->GetXaxis()->SetBinLabel(i, varNames[i - 1].Data());
161 vihist->SetBinContent(i, roc);
162 }
163
164 vihist->LabelsOption("v >", "X");
165 vihist->SetBarWidth(0.97);
166 vihist->SetFillColor(TColor::GetColor("#006600"));
167
168 vihist->GetXaxis()->SetTitle(" Variable Names ");
169 vihist->GetXaxis()->SetTitleSize(0.045);
170 vihist->GetXaxis()->CenterTitle();
171 vihist->GetXaxis()->SetTitleOffset(1.24);
172
173 vihist->GetYaxis()->SetTitle(" Importance (%)");
174 vihist->GetYaxis()->SetTitleSize(0.045);
175 vihist->GetYaxis()->CenterTitle();
176 vihist->GetYaxis()->SetTitleOffset(1.24);
177
178 vihist->GetYaxis()->SetRangeUser(-7, 50);
179 vihist->SetDirectory(0);
180
181 return vihist;
182}
183
184////////////////////////////////////////////////////////////////////////////////
185
187{
188 for (auto &meth : fMethods) {
189 TString methodName = meth.GetValue<TString>("MethodName");
190 TString methodTitle = meth.GetValue<TString>("MethodTitle");
191 TString methodOptions = meth.GetValue<TString>("MethodOptions");
192
193 uint32_t x = 0;
194 uint32_t y = 0;
195 // getting number of variables and variable names from loader
196 const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
197 std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
198
199 ULong_t range = Sum(nbits);
200
201 // vector to save importances
202 std::vector<Float_t> importances(nbits);
203 for (UInt_t i = 0; i < nbits; i++)
204 importances[i] = 0;
205
206 Float_t SROC, SSROC; // computed ROC value for every Seed and SubSeed
207
208 x = range;
209
210 std::bitset<NBITS> xbitset(x);
211 if (x == 0)
212 Log() << kFATAL << "Error: need at least one variable."; // dataloader need at least one variable
213
214 // creating loader for seed
215 TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
216
217 // adding variables from seed
218 for (UInt_t index = 0; index < nbits; index++) {
219 if (xbitset[index])
220 seeddl->AddVariable(varNames[index], 'F');
221 }
222
223 // Loading Dataset
224 DataLoaderCopy(seeddl, fDataLoader.get());
225
226 // Booking Seed
227 fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
228
229 // Train/Test/Evaluation
230 fClassifier->TrainAllMethods();
231 fClassifier->TestAllMethods();
232 fClassifier->EvaluateAllMethods();
233
234 // getting ROC
235 SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
236
237 delete seeddl;
238 fClassifier->DeleteAllMethods();
239 fClassifier->fMethodsMap.clear();
240
241 for (uint32_t i = 0; i < NBITS; ++i) {
242 if (x & (1 << i)) {
243 y = x & ~(1 << i);
244 std::bitset<NBITS> ybitset(y);
245 //need at least one variable
246 //NOTE: if subssed is zero then is the special case
247 //that count in xbitset is 1
248 Double_t ny = log(x - y) / 0.693147;
249 if (y == 0) {
250 importances[ny] = SROC - 0.5;
251 continue;
252 }
253
254 //creating loader for subseed
255 TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
256 //adding variables from subseed
257 for (UInt_t index = 0; index < nbits; index++) {
258 if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
259 }
260
261 //Loading Dataset
262 DataLoaderCopy(subseeddl,fDataLoader.get());
263
264 //Booking SubSeed
265 fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
266
267 //Train/Test/Evaluation
268 fClassifier->TrainAllMethods();
269 fClassifier->TestAllMethods();
270 fClassifier->EvaluateAllMethods();
271
272 //getting ROC
273 SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
274 importances[ny] += SROC - SSROC;
275
276 delete subseeddl;
277 fClassifier->DeleteAllMethods();
278 fClassifier->fMethodsMap.clear();
279 }
280 }
281 Float_t normalization = 0.0;
282 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
283
284 for(UInt_t i=0;i<nbits;i++){
285 //adding values
286 fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
287 //adding sufix
288 fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
289 }
290 fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
291 }
292}
293
294////////////////////////////////////////////////////////////////////////////////
295
297{
298 for (auto &meth : fMethods) {
299
300 TString methodName = meth.GetValue<TString>("MethodName");
301 TString methodTitle = meth.GetValue<TString>("MethodTitle");
302 TString methodOptions = meth.GetValue<TString>("MethodOptions");
303
304 TRandom3 *rangen = new TRandom3(0); // Random Gen.
305
306 uint32_t x = 0;
307 uint32_t y = 0;
308
309 // getting number of variables and variable names from loader
310 const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
311 std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
312
313 ULong_t range = pow(2, nbits);
314
315 // vector to save importances
316 std::vector<Float_t> importances(nbits);
317 Float_t importances_norm = 0;
318
319 for (UInt_t i = 0; i < nbits; i++)
320 importances[i] = 0;
321
322 Float_t SROC, SSROC; // computed ROC value for every Seed and SubSeed
323
324 x = range;
325
326 for (UInt_t n = 0; n < seeds; n++) {
327 x = rangen->Integer(range);
328
329 std::bitset<NBITS> xbitset(x);
330 if (x == 0)
331 continue; // dataloader need at least one variable
332
333 // creating loader for seed
334 TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
335
336 // adding variables from seed
337 for (UInt_t index = 0; index < nbits; index++) {
338 if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
339 }
340
341 //Loading Dataset
342 DataLoaderCopy(seeddl,fDataLoader.get());
343
344 //Booking Seed
345 fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
346
347 //Train/Test/Evaluation
348 fClassifier->TrainAllMethods();
349 fClassifier->TestAllMethods();
350 fClassifier->EvaluateAllMethods();
351
352 //getting ROC
353 SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
354
355 delete seeddl;
356 fClassifier->DeleteAllMethods();
357 fClassifier->fMethodsMap.clear();
358
359 for (uint32_t i = 0; i < 32; ++i) {
360 if (x & (1 << i)) {
361 y = x & ~(1 << i);
362 std::bitset<NBITS> ybitset(y);
363 //need at least one variable
364 //NOTE: if subssed is zero then is the special case
365 //that count in xbitset is 1
366 Double_t ny = log(x - y) / 0.693147;
367 if (y == 0) {
368 importances[ny] = SROC - 0.5;
369 importances_norm += importances[ny];
370 continue;
371 }
372
373 //creating loader for subseed
374 TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
375 //adding variables from subseed
376 for (UInt_t index = 0; index < nbits; index++) {
377 if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
378 }
379
380 //Loading Dataset
381 DataLoaderCopy(subseeddl,fDataLoader.get());
382
383 //Booking SubSeed
384 fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
385
386 //Train/Test/Evaluation
387 fClassifier->TrainAllMethods();
388 fClassifier->TestAllMethods();
389 fClassifier->EvaluateAllMethods();
390
391 //getting ROC
392 SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
393 importances[ny] += SROC - SSROC;
394
395 delete subseeddl;
396 fClassifier->DeleteAllMethods();
397 fClassifier->fMethodsMap.clear();
398 }
399 }
400 }
401
402 Float_t normalization = 0.0;
403 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
404
405 for(UInt_t i=0;i<nbits;i++){
406 //adding values
407 fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
408 //adding sufix
409 fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
410 }
411 fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
412 delete rangen;
413 }
414}
415
416////////////////////////////////////////////////////////////////////////////////
417
419{
420 for (auto &meth : fMethods) {
421 TString methodName = meth.GetValue<TString>("MethodName");
422 TString methodTitle = meth.GetValue<TString>("MethodTitle");
423 TString methodOptions = meth.GetValue<TString>("MethodOptions");
424
425 uint32_t x = 0;
426 uint32_t y = 0;
427
428 // getting number of variables and variable names from loader
429 const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
430 std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
431
432 ULong_t range = pow(2, nbits);
433
434 // vector to save importances
435 std::vector<Float_t> importances(nbits);
436
437 // vector to save ROC-Integral values
438 std::vector<Float_t> ROC(range);
439 ROC[0] = 0.5;
440 for (UInt_t i = 0; i < nbits; i++)
441 importances[i] = 0;
442
443 Float_t SROC, SSROC; // computed ROC value
444 for (x = 1; x < range; x++) {
445
446 std::bitset<NBITS> xbitset(x);
447 if (x == 0)
448 continue; // dataloader need at least one variable
449
450 // creating loader for seed
451 TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
452
453 // adding variables from seed
454 for (UInt_t index = 0; index < nbits; index++) {
455 if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
456 }
457
458 DataLoaderCopy(seeddl,fDataLoader.get());
459
460 seeddl->PrepareTrainingAndTestTree(fDataLoader->GetDefaultDataSetInfo().GetCut("Signal"), fDataLoader->GetDefaultDataSetInfo().GetCut("Background"), fDataLoader->GetDefaultDataSetInfo().GetSplitOptions());
461
462 //Booking Seed
463 fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
464
465 //Train/Test/Evaluation
466 fClassifier->TrainAllMethods();
467 fClassifier->TestAllMethods();
468 fClassifier->EvaluateAllMethods();
469
470 //getting ROC
471 ROC[x] = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
472
473 delete seeddl;
474 fClassifier->DeleteAllMethods();
475 fClassifier->fMethodsMap.clear();
476 }
477
478
479 for ( x = 0; x <range ; x++)
480 {
481 SROC=ROC[x];
482 for (uint32_t i = 0; i < NBITS; ++i) {
483 if (x & (1 << i)) {
484 y = x & ~(1 << i);
485 std::bitset<NBITS> ybitset(y);
486
487 Float_t ny = log(x - y) / 0.693147;
488 if (y == 0) {
489 importances[ny] = SROC - 0.5;
490 continue;
491 }
492
493 //getting ROC
494 SSROC = ROC[y];
495 importances[ny] += SROC - SSROC;
496 }
497
498 }
499 }
500 Float_t normalization = 0.0;
501 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
502
503 for(UInt_t i=0;i<nbits;i++){
504 //adding values
505 fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
506 //adding sufix
507 fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
508 }
509 fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
510 }
511}
PyObject * fType
#define c(i)
Definition: RSha256.hxx:101
unsigned int UInt_t
Definition: RtypesCore.h:42
const Bool_t kFALSE
Definition: RtypesCore.h:88
unsigned long ULong_t
Definition: RtypesCore.h:51
double Double_t
Definition: RtypesCore.h:55
float Float_t
Definition: RtypesCore.h:53
const Bool_t kTRUE
Definition: RtypesCore.h:87
char name[80]
Definition: TGX11.cxx:109
double pow(double, double)
double log(double)
R__EXTERN TStyle * gStyle
Definition: TStyle.h:407
#define NBITS
virtual void SetTitleOffset(Float_t offset=1)
Set distance between the axis and the axis title.
Definition: TAttAxis.cxx:294
virtual void SetTitleSize(Float_t size=0.04)
Set size of axis title.
Definition: TAttAxis.cxx:304
virtual void SetFillColor(Color_t fcolor)
Set the fill area color.
Definition: TAttFill.h:37
virtual void SetBinLabel(Int_t bin, const char *label)
Set label for bin.
Definition: TAxis.cxx:809
void CenterTitle(Bool_t center=kTRUE)
Center axis title.
Definition: TAxis.h:184
virtual void SetRangeUser(Double_t ufirst, Double_t ulast)
Set the viewing range for the axis from ufirst to ulast (in user coordinates).
Definition: TAxis.cxx:928
The Canvas class.
Definition: TCanvas.h:31
static Int_t GetColor(const char *hexcolor)
Static method returning color number for color specified by hex color string of form: "#rrggbb",...
Definition: TColor.cxx:1764
1-D histogram with a float per channel (see TH1 documentation)}
Definition: TH1.h:571
virtual void SetDirectory(TDirectory *dir)
By default when an histogram is created, it is added to the list of histogram objects in the current ...
Definition: TH1.cxx:8381
virtual void LabelsOption(Option_t *option="h", Option_t *axis="X")
Set option(s) to draw axis with labels.
Definition: TH1.cxx:5214
TAxis * GetXaxis()
Get the behaviour adopted by the object about the statoverflows. See EStatOverflows for more informat...
Definition: TH1.h:316
TAxis * GetYaxis()
Definition: TH1.h:317
virtual void SetBarWidth(Float_t width=0.5)
Definition: TH1.h:356
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content see convention for numbering bins in TH1::GetBin In case the bin number is greater th...
Definition: TH1.cxx:8666
void SetSilent(Bool_t s)
Definition: Config.h:65
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:617
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:470
Abstract base class for all high level ml algorithms, you can book ml methods like BDT,...
Definition: Envelope.h:47
This is the main MVA steering class.
Definition: Factory.h:81
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
static void EnableOutput()
Definition: MsgLogger.cxx:75
std::shared_ptr< TH1F > fImportanceHist
TCanvas * Draw(const TString name="VariableImportance") const
std::unique_ptr< Factory > fClassifier
virtual void Evaluate()
Virtual method to be implemented with your algorithm.
void EvaluateImportanceRandom(UInt_t nseeds)
VariableImportance(DataLoader *loader)
TH1F * GetImportance(const UInt_t nbits, std::vector< Float_t > &importances, std::vector< TString > &varNames)
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:164
virtual void Print(Option_t *option="") const
This method must be overridden when a class wants to print itself.
Definition: TObject.cxx:550
Random number generator class based on M.
Definition: TRandom3.h:27
Basic string class.
Definition: TString.h:131
void SetOptStat(Int_t stat=1)
The type of information printed in the histogram statistics box can be selected via the parameter mod...
Definition: TStyle.cxx:1450
void SetTitleXOffset(Float_t offset=1)
Definition: TStyle.h:387
Double_t y[n]
Definition: legend1.C:17
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
T Sum(const RVec< T > &v)
Sum elements of an RVec.
Definition: RVec.hxx:758
create variable transformations
void DataLoaderCopy(TMVA::DataLoader *des, TMVA::DataLoader *src)
Config & gConfig()
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
Double_t Log(Double_t x)
Definition: TMath.h:750
static long int sum(long int i)
Definition: Factory.cxx:2276