Logo ROOT   6.08/07
Reference Guide
VariableImportance.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Omar Zapata and Sergei Gleyzer
3 
5 
6 #include "TMVA/Config.h"
7 #include "TMVA/DataSetInfo.h"
8 #include "TMVA/Envelope.h"
9 #include "TMVA/Factory.h"
10 #include "TMVA/OptionMap.h"
11 #include "TMVA/MethodBase.h"
12 #include "TMVA/MethodCategory.h"
13 #include "TMVA/MsgLogger.h"
14 #include "TMVA/Types.h"
16 
17 #include "TAxis.h"
18 #include "TGraph.h"
19 #include "TCanvas.h"
20 #include "TH1.h"
21 #include "TRandom3.h"
22 #include "TStyle.h"
23 #include "TSystem.h"
24 
25 #include <bitset>
26 #include <iostream>
27 #include <memory>
28 #include <utility>
29 
30 
31 // Width of the std::bitset used to encode a subset of input variables (one bit per variable: bit k set means variable k is part of the subset).
32 #define NBITS 32
33 
34 TMVA::VariableImportanceResult::VariableImportanceResult():fImportanceValues("VariableImportance"),
35  fImportanceHist(nullptr)
36 {
37 
38 }
39 
41 {
44 }
45 
46 
48 {
51 
52  MsgLogger fLogger("VariableImportance");
54  {
55  fLogger<<kINFO<<"Variable Importance Results (Short)"<<Endl;
56  }else if(fType==VIType::kAll)
57  {
58  fLogger<<kINFO<<"Variable Importance Results (All)"<<Endl;
59  }else{
60  fLogger<<kINFO<<"Variable Importance Results (Random)"<<Endl;
61  }
62 
65 }
66 
67 
69 {
70  TCanvas *c=new TCanvas(name.Data());
71  fImportanceHist->Draw("");
72  fImportanceHist->GetXaxis()->SetTitle(" Variable Names ");
73  fImportanceHist->GetYaxis()->SetTitle(" Importance (%) ");
74  c->Draw();
75  return c;
76 }
77 
79 {
80  fClassifier=std::unique_ptr<Factory>(new TMVA::Factory("VariableImportance","!V:!ROC:!ModelPersistence:Silent:Color:!DrawProgressBar:AnalysisType=Classification"));
81 }
82 
84 {
85  fClassifier=nullptr;
86 }
87 
88 
90 {
91  TString methodName = fMethod.GetValue<TString>("MethodName");
92  TString methodTitle = fMethod.GetValue<TString>("MethodTitle");
93  TString methodOptions = fMethod.GetValue<TString>("MethodOptions");
94 
95  //NOTE: Put the type of VI Algorithm in the results Print
97  {
99  }else if(fType==VIType::kAll)
100  {
102  }else{
103  UInt_t nbits=fDataLoader->GetDefaultDataSetInfo().GetNVariables();
104  if(nbits<10)
105  Log()<<kERROR<<"Running variable importance with less that 10 varibales in Random mode "<<
106  "can to produce inconsisten results"<<Endl;
107  EvaluateImportanceRandom(pow(2,nbits));
108  }
109  fResults.fType = fType;
112  Log()<<kINFO<<"Evaluation done."<<Endl;
114 }
115 
117 {
118  ULong_t sum=0;
119  for(ULong_t n=0;n<i;n++) sum+=pow(2,n);
120  return sum;
121 }
122 
123 TH1F* TMVA::VariableImportance::GetImportance(const UInt_t nbits,std::vector<Float_t> &importances,std::vector<TString> &varNames)
124 {
125  TH1F *vihist = new TH1F("vihist", "", nbits, 0, nbits);
126 
127  gStyle->SetOptStat(000000);
128 
129  Float_t normalization = 0.0;
130  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
131 
132  Float_t roc = 0.0;
133 
134  gStyle->SetTitleXOffset(0.4);
135  gStyle->SetTitleXOffset(1.2);
136 
137 
138  for (UInt_t i = 1; i < nbits + 1; i++) {
139  roc = 100.0 * importances[i - 1] / normalization;
140  vihist->GetXaxis()->SetBinLabel(i, varNames[i - 1].Data());
141  vihist->SetBinContent(i, roc);
142  }
143 
144  vihist->LabelsOption("v >", "X");
145  vihist->SetBarWidth(0.97);
146  vihist->SetFillColor(TColor::GetColor("#006600"));
147 
148  vihist->GetXaxis()->SetTitle(" Variable Names ");
149  vihist->GetXaxis()->SetTitleSize(0.045);
150  vihist->GetXaxis()->CenterTitle();
151  vihist->GetXaxis()->SetTitleOffset(1.24);
152 
153  vihist->GetYaxis()->SetTitle(" Importance (%)");
154  vihist->GetYaxis()->SetTitleSize(0.045);
155  vihist->GetYaxis()->CenterTitle();
156  vihist->GetYaxis()->SetTitleOffset(1.24);
157 
158  vihist->GetYaxis()->SetRangeUser(-7, 50);
159  vihist->SetDirectory(0);
160 
161  return vihist;
162 }
163 
165 {
166  TString methodName = fMethod.GetValue<TString>("MethodName");
167  TString methodTitle = fMethod.GetValue<TString>("MethodTitle");
168  TString methodOptions = fMethod.GetValue<TString>("MethodOptions");
169 
170  uint32_t x = 0;
171  uint32_t y = 0;
172  //getting number of variables and variable names from loader
173  const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
174  std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
175 
176  ULong_t range = Sum(nbits);
177 
178  //vector to save importances
179  std::vector<Float_t> importances(nbits);
180  for (UInt_t i = 0; i < nbits; i++)importances[i] = 0;
181 
182  Float_t SROC, SSROC; //computed ROC value for every Seed and SubSeed
183 
184  x = range;
185 
186  std::bitset<NBITS> xbitset(x);
187  if (x == 0) Log()<<kFATAL<<"Error: need at least one variable."; //dataloader need at least one variable
188 
189 
190  //creating loader for seed
191  TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
192 
193  //adding variables from seed
194  for (UInt_t index = 0; index < nbits; index++){
195  if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
196  }
197 
198  //Loading Dataset
199  DataLoaderCopy(seeddl,fDataLoader.get());
200 
201  //Booking Seed
202  fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
203 
204  //Train/Test/Evaluation
205  fClassifier->TrainAllMethods();
206  fClassifier->TestAllMethods();
207  fClassifier->EvaluateAllMethods();
208 
209  //getting ROC
210  SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
211 
212  delete seeddl;
213  fClassifier->DeleteAllMethods();
214  fClassifier->fMethodsMap.clear();
215 
216  for (uint32_t i = 0; i < NBITS; ++i) {
217  if (x & (1 << i)) {
218  y = x & ~(1 << i);
219  std::bitset<NBITS> ybitset(y);
220  //need at least one variable
221  //NOTE: if subssed is zero then is the special case
222  //that count in xbitset is 1
223  Double_t ny = log(x - y) / 0.693147;
224  if (y == 0) {
225  importances[ny] = SROC - 0.5;
226  continue;
227  }
228 
229  //creating loader for subseed
230  TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
231  //adding variables from subseed
232  for (UInt_t index = 0; index < nbits; index++) {
233  if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
234  }
235 
236  //Loading Dataset
237  DataLoaderCopy(subseeddl,fDataLoader.get());
238 
239  //Booking SubSeed
240  fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
241 
242  //Train/Test/Evaluation
243  fClassifier->TrainAllMethods();
244  fClassifier->TestAllMethods();
245  fClassifier->EvaluateAllMethods();
246 
247  //getting ROC
248  SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
249  importances[ny] += SROC - SSROC;
250 
251  delete subseeddl;
252  fClassifier->DeleteAllMethods();
253  fClassifier->fMethodsMap.clear();
254  }
255  }
256  Float_t normalization = 0.0;
257  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
258 
259  for(UInt_t i=0;i<nbits;i++){
260  //adding values
261  fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
262  //adding sufix
263  fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
264  }
265  fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
266 }
267 
269 {
270  TString methodName = fMethod.GetValue<TString>("MethodName");
271  TString methodTitle = fMethod.GetValue<TString>("MethodTitle");
272  TString methodOptions = fMethod.GetValue<TString>("MethodOptions");
273 
274  TRandom3 *rangen = new TRandom3(0); //Random Gen.
275 
276  uint32_t x = 0;
277  uint32_t y = 0;
278 
279  //getting number of variables and variable names from loader
280  const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
281  std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
282 
283  ULong_t range = pow(2, nbits);
284 
285  //vector to save importances
286  std::vector<Float_t> importances(nbits);
287  Float_t importances_norm = 0;
288 
289  for (UInt_t i = 0; i < nbits; i++)importances[i] = 0;
290 
291  Float_t SROC, SSROC; //computed ROC value for every Seed and SubSeed
292 
293  x = range;
294 
295  for (UInt_t n = 0; n < seeds; n++) {
296  x = rangen -> Integer(range);
297 
298  std::bitset<NBITS> xbitset(x);
299  if (x == 0) continue; //dataloader need at least one variable
300 
301 
302  //creating loader for seed
303  TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
304 
305  //adding variables from seed
306  for (UInt_t index = 0; index < nbits; index++) {
307  if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
308  }
309 
310  //Loading Dataset
311  DataLoaderCopy(seeddl,fDataLoader.get());
312 
313  //Booking Seed
314  fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
315 
316  //Train/Test/Evaluation
317  fClassifier->TrainAllMethods();
318  fClassifier->TestAllMethods();
319  fClassifier->EvaluateAllMethods();
320 
321  //getting ROC
322  SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
323 
324  delete seeddl;
325  fClassifier->DeleteAllMethods();
326  fClassifier->fMethodsMap.clear();
327 
328  for (uint32_t i = 0; i < 32; ++i) {
329  if (x & (1 << i)) {
330  y = x & ~(1 << i);
331  std::bitset<NBITS> ybitset(y);
332  //need at least one variable
333  //NOTE: if subssed is zero then is the special case
334  //that count in xbitset is 1
335  Double_t ny = log(x - y) / 0.693147;
336  if (y == 0) {
337  importances[ny] = SROC - 0.5;
338  importances_norm += importances[ny];
339  continue;
340  }
341 
342  //creating loader for subseed
343  TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
344  //adding variables from subseed
345  for (UInt_t index = 0; index < nbits; index++) {
346  if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
347  }
348 
349  //Loading Dataset
350  DataLoaderCopy(subseeddl,fDataLoader.get());
351 
352  //Booking SubSeed
353  fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
354 
355  //Train/Test/Evaluation
356  fClassifier->TrainAllMethods();
357  fClassifier->TestAllMethods();
358  fClassifier->EvaluateAllMethods();
359 
360  //getting ROC
361  SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
362  importances[ny] += SROC - SSROC;
363 
364  delete subseeddl;
365  fClassifier->DeleteAllMethods();
366  fClassifier->fMethodsMap.clear();
367  }
368  }
369  }
370 
371  Float_t normalization = 0.0;
372  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
373 
374  for(UInt_t i=0;i<nbits;i++){
375  //adding values
376  fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
377  //adding sufix
378  fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
379  }
380  fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
381  delete rangen;
382 
383 }
384 
385 
387 {
388 
389  TString methodName = fMethod.GetValue<TString>("MethodName");
390  TString methodTitle = fMethod.GetValue<TString>("MethodTitle");
391  TString methodOptions = fMethod.GetValue<TString>("MethodOptions");
392 
393  uint32_t x = 0;
394  uint32_t y = 0;
395 
396  //getting number of variables and variable names from loader
397  const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
398  std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
399 
400  ULong_t range = pow(2, nbits);
401 
402  //vector to save importances
403  std::vector<Float_t> importances(nbits);
404 
405  //vector to save ROC-Integral values
406  std::vector<Float_t> ROC(range);
407  ROC[0]=0.5;
408  for (UInt_t i = 0; i < nbits; i++) importances[i] = 0;
409 
410  Float_t SROC, SSROC; //computed ROC value
411  for ( x = 1; x <range ; x++) {
412 
413  std::bitset<NBITS> xbitset(x);
414  if (x == 0) continue; //dataloader need at least one variable
415 
416  //creating loader for seed
417  TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
418 
419  //adding variables from seed
420  for (UInt_t index = 0; index < nbits; index++) {
421  if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
422  }
423 
424  DataLoaderCopy(seeddl,fDataLoader.get());
425 
426  seeddl->PrepareTrainingAndTestTree(fDataLoader->GetDefaultDataSetInfo().GetCut("Signal"), fDataLoader->GetDefaultDataSetInfo().GetCut("Background"), fDataLoader->GetDefaultDataSetInfo().GetSplitOptions());
427 
428  //Booking Seed
429  fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
430 
431  //Train/Test/Evaluation
432  fClassifier->TrainAllMethods();
433  fClassifier->TestAllMethods();
434  fClassifier->EvaluateAllMethods();
435 
436  //getting ROC
437  ROC[x] = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
438 
439  delete seeddl;
440  fClassifier->DeleteAllMethods();
441  fClassifier->fMethodsMap.clear();
442  }
443 
444 
445  for ( x = 0; x <range ; x++)
446  {
447  SROC=ROC[x];
448  for (uint32_t i = 0; i < NBITS; ++i) {
449  if (x & (1 << i)) {
450  y = x & ~(1 << i);
451  std::bitset<NBITS> ybitset(y);
452 
453  Float_t ny = log(x - y) / 0.693147;
454  if (y == 0) {
455  importances[ny] = SROC - 0.5;
456  continue;
457  }
458 
459  //getting ROC
460  SSROC = ROC[y];
461  importances[ny] += SROC - SSROC;
462  }
463 
464  }
465  }
466  Float_t normalization = 0.0;
467  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
468 
469  for(UInt_t i=0;i<nbits;i++){
470  //adding values
471  fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
472  //adding sufix
473  fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
474  }
475  fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
476 }
477 
478 
479 
480 
Config & gConfig()
Definition: Config.cxx:43
virtual void SetTitleOffset(Float_t offset=1)
Set distance between the axis and the axis title. Offset is a correction factor with respect to the "s...
Definition: TAttAxis.cxx:262
static long int sum(long int i)
Definition: Factory.cxx:1786
Random number generator class based on M. Matsumoto and T. Nishimura's Mersenne Twister generator.
Definition: TRandom3.h:29
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
virtual void LabelsOption(Option_t *option="h", Option_t *axis="X")
Set option(s) to draw axis with labels.
Definition: TH1.cxx:4934
float Float_t
Definition: RtypesCore.h:53
virtual void SetDirectory(TDirectory *dir)
By default when a histogram is created, it is added to the list of histogram objects in the current ...
Definition: TH1.cxx:8051
virtual void Evaluate()
Virtual method to be implemented with your algorithm.
return c
T GetValue(const TString &key)
Definition: OptionMap.h:152
VIType
Definition: Types.h:75
R__EXTERN TStyle * gStyle
Definition: TStyle.h:418
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
Definition: THist.hxx:302
MsgLogger & Log() const
Definition: Configurable.h:128
Basic string class.
Definition: TString.h:137
1-D histogram with a float per channel (see TH1 documentation)
Definition: TH1.h:575
const Bool_t kFALSE
Definition: Rtypes.h:92
void DataLoaderCopy(TMVA::DataLoader *des, TMVA::DataLoader *src)
Definition: DataLoader.cxx:802
void CenterTitle(Bool_t center=kTRUE)
Center axis title.
Definition: TAxis.h:190
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
Definition: DataLoader.cxx:456
std::shared_ptr< TH1F > fImportanceHist
virtual void SetBarWidth(Float_t width=0.5)
Definition: TH1.h:362
TCanvas * Draw(const TString name="VariableImportance") const
virtual void SetRangeUser(Double_t ufirst, Double_t ulast)
Set the viewing range for the axis from ufirst to ulast (in user coordinates).
Definition: TAxis.cxx:925
Double_t x[n]
Definition: legend1.C:17
OptionMap fMethod
Definition: Envelope.h:58
const int ny
Definition: kalman.C:17
std::unique_ptr< Factory > fClassifier
double pow(double, double)
std::vector< std::vector< double > > Data
Base class for all machine learning algorithms.
Definition: Envelope.h:55
VariableImportance(DataLoader *loader)
void Print() const
Definition: OptionMap.h:143
virtual void SetFillColor(Color_t fcolor)
Set the fill area color.
Definition: TAttFill.h:42
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content; see convention for numbering bins in TH1::GetBin. In case the bin number is greater th...
Definition: TH1.cxx:8323
unsigned int UInt_t
Definition: RtypesCore.h:42
static Int_t GetColor(const char *hexcolor)
Static method returning color number for color specified by hex color string of form: "#rrggbb"...
Definition: TColor.cxx:1706
TAxis * GetYaxis()
Definition: TH1.h:325
virtual void SetTitleSize(Float_t size=0.04)
Set size of axis title. The size is expressed in percent of the pad width.
Definition: TAttAxis.cxx:272
The Canvas class.
Definition: TCanvas.h:41
#define NBITS
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
Definition: DataLoader.cxx:580
double Double_t
Definition: RtypesCore.h:55
std::shared_ptr< DataLoader > fDataLoader
Definition: Envelope.h:59
unsigned long ULong_t
Definition: RtypesCore.h:51
Double_t y[n]
Definition: legend1.C:17
virtual void SetBinLabel(Int_t bin, const char *label)
Set label for bin.
Definition: TAxis.cxx:808
TH1F * GetImportance(const UInt_t nbits, std::vector< Float_t > &importances, std::vector< TString > &varNames)
virtual void Draw(Option_t *option="")
Draw a canvas.
Definition: TCanvas.cxx:795
void SetTitleXOffset(Float_t offset=1)
Definition: TStyle.h:398
Abstract ClassifierFactory template that handles arbitrary types.
#define nullptr
Definition: Rtypes.h:87
void SetSilent(Bool_t s)
Definition: Config.h:64
void SetOptStat(Int_t stat=1)
The type of information printed in the histogram statistics box can be selected via the parameter mod...
Definition: TStyle.cxx:1257
static void EnableOutput()
Definition: MsgLogger.cxx:70
VariableImportanceResult fResults
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:155
void EvaluateImportanceRandom(UInt_t nseeds)
const Int_t n
Definition: legend1.C:16
char name[80]
Definition: TGX11.cxx:109
double log(double)
TAxis * GetXaxis()
Definition: TH1.h:324
const char * Data() const
Definition: TString.h:349