Logo ROOT  
Reference Guide
TMVA_Higgs_Classification.C
Go to the documentation of this file.
1/// \file
2/// \ingroup tutorial_tmva
3/// \notebook
4/// Classification example of TMVA based on public Higgs UCI dataset
5///
6/// The UCI data set is a public HIGGS data set, see http://archive.ics.uci.edu/ml/datasets/HIGGS
7/// used in this paper: Baldi, P., P. Sadowski, and D. Whiteson. “Searching for Exotic Particles in High-energy Physics
8/// with Deep Learning.” Nature Communications 5 (July 2, 2014).
9///
10/// \macro_image
11/// \macro_output
12/// \macro_code
13///
14/// \author Lorenzo Moneta
15
16/***
17## Declare Factory
18
19Create the Factory class. Later you can choose the methods
20whose performance you'd like to investigate.
21
22The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass
23
24 - The first argument is the base of the name of all the output
25weightfiles in the directory weight/ that will be created with the
26method parameters
27
28 - The second argument is the output file for the training results
29
30 - The third argument is a string option defining some general configuration for the TMVA session. For example all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string
31
32**/
33
34void TMVA_Higgs_Classification() {
35
36 // options to control used methods
37
38 bool useLikelihood = true; // likelihood based discriminant
39 bool useLikelihoodKDE = false; // likelihood based discriminant
40 bool useFischer = true; // Fischer discriminant
41 bool useMLP = false; // Multi Layer Perceptron (old TMVA NN implementation)
42 bool useBDT = true; // BOosted Decision Tree
43 bool useDL = true; // TMVA Deep learning ( CPU or GPU)
44
45
46
48
49 auto outputFile = TFile::Open("Higgs_ClassificationOutput.root", "RECREATE");
50
51 TMVA::Factory factory("TMVA_Higgs_Classification", outputFile,
52 "!V:ROC:!Silent:Color:AnalysisType=Classification" );
53
54/**
55
56## Setup Dataset(s)
57
58Define now input data file and signal and background trees
59
60 **/
61
62 TString inputFileName = "Higgs_data.root";
63 TString inputFileLink = "http://root.cern.ch/files/" + inputFileName;
64
65 TFile *inputFile = nullptr;
66
67 if (!gSystem->AccessPathName(inputFileName)) {
68 // file exists
69 inputFile = TFile::Open( inputFileName );
70 }
71
72 if (!inputFile) {
73 // download file from Cernbox location
74 Info("TMVA_Higgs_Classification","Download Higgs_data.root file");
76 inputFile = TFile::Open(inputFileLink, "CACHEREAD");
77 if (!inputFile) {
78 Error("TMVA_Higgs_Classification","Input file cannot be downloaded - exit");
79 return;
80 }
81 }
82
83// --- Register the training and test trees
84
85 TTree *signalTree = (TTree*)inputFile->Get("sig_tree");
86 TTree *backgroundTree = (TTree*)inputFile->Get("bkg_tree");
87
88 signalTree->Print();
89
90/***
91## Declare DataLoader(s)
92
93The next step is to declare the DataLoader class that deals with input variables
94
95Define the input variables that shall be used for the MVA training
96note that you may also use variable expressions, which can be parsed by TTree::Draw( "expression" )]
97
98***/
99
100 TMVA::DataLoader * loader = new TMVA::DataLoader("dataset");
101
102 loader->AddVariable("m_jj");
103 loader->AddVariable("m_jjj");
104 loader->AddVariable("m_lv");
105 loader->AddVariable("m_jlv");
106 loader->AddVariable("m_bb");
107 loader->AddVariable("m_wbb");
108 loader->AddVariable("m_wwbb");
109
110/// We set now the input data trees in the TMVA DataLoader class
111
112// global event weights per tree (see below for setting event-wise weights)
113 Double_t signalWeight = 1.0;
114 Double_t backgroundWeight = 1.0;
115
116// You can add an arbitrary number of signal or background trees
117 loader->AddSignalTree ( signalTree, signalWeight );
118 loader->AddBackgroundTree( backgroundTree, backgroundWeight );
119
120
121// Set individual event weights (the variables must exist in the original TTree)
122// for signal : factory->SetSignalWeightExpression ("weight1*weight2");
123// for background: factory->SetBackgroundWeightExpression("weight1*weight2");
124//loader->SetBackgroundWeightExpression( "weight" );
125
126// Apply additional cuts on the signal and background samples (can be different)
127 TCut mycuts = ""; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1";
128 TCut mycutb = ""; // for example: TCut mycutb = "abs(var1)<0.5";
129
130// Tell the factory how to use the training and testing events
131//
132// If no numbers of events are given, half of the events in the tree are used
133// for training, and the other half for testing:
134// loader->PrepareTrainingAndTestTree( mycut, "SplitMode=random:!V" );
135// To also specify the number of testing events, use:
136
137 loader->PrepareTrainingAndTestTree( mycuts, mycutb,
138 "nTrain_Signal=7000:nTrain_Background=7000:SplitMode=Random:NormMode=NumEvents:!V" );
139
140/***
141## Booking Methods
142
143Here we book the TMVA methods. We book first a Likelihood based on KDE (Kernel Density Estimation), a Fischer discriminant, a BDT
144and a shallow neural network
145
146 */
147
148
149// Likelihood ("naive Bayes estimator")
150if (useLikelihood) {
151 factory.BookMethod(loader, TMVA::Types::kLikelihood, "Likelihood",
152 "H:!V:TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=20:NSmoothBkg[0]=20:NSmoothBkg[1]=10:NSmooth=1:NAvEvtPerBin=50" );
153}
154// Use a kernel density estimator to approximate the PDFs
155if (useLikelihoodKDE) {
156 factory.BookMethod(loader, TMVA::Types::kLikelihood, "LikelihoodKDE",
157 "!H:!V:!TransformOutput:PDFInterpol=KDE:KDEtype=Gauss:KDEiter=Adaptive:KDEFineFactor=0.3:KDEborder=None:NAvEvtPerBin=50" );
158
159}
160
161// Fisher discriminant (same as LD)
162if (useFischer) {
163 factory.BookMethod(loader, TMVA::Types::kFisher, "Fisher", "H:!V:Fisher:VarTransform=None:CreateMVAPdfs:PDFInterpolMVAPdf=Spline2:NbinsMVAPdf=50:NsmoothMVAPdf=10" );
164}
165
166//Boosted Decision Trees
167if (useBDT) {
168 factory.BookMethod(loader,TMVA::Types::kBDT, "BDT",
169 "!V:NTrees=200:MinNodeSize=2.5%:MaxDepth=2:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" );
170}
171
172//Multi-Layer Perceptron (Neural Network)
173if (useMLP) {
174 factory.BookMethod(loader, TMVA::Types::kMLP, "MLP",
175 "!H:!V:NeuronType=tanh:VarTransform=N:NCycles=100:HiddenLayers=N+5:TestRate=5:!UseRegulator" );
176}
177
178
179/// Here we book the new DNN of TMVA if we have support in ROOT. We will use GPU version if ROOT is enabled with GPU
180
181
182 /***
183
184## Booking Deep Neural Network
185
186Here we define the option string for building the Deep Neural network model.
187
188#### 1. Define DNN layout
189
190The DNN configuration is defined using a string. Note that whitespaces between characters are not allowed.
191
192We define first the DNN layout:
193
194- **input layout** : this defines the input data format for the DNN as ``input depth | height | width``.
195 In case of a dense layer as first layer the input layout should be ``1 | 1 | number of input variables`` (features)
196- **batch layout** : this defines how are the input batch. It is related to input layout but not the same.
197 If the first layer is dense it should be ``1 | batch size ! number of variables`` (features)
198
199 *(note the use of the character `|` as separator of input parameters for DNN layout)*
200
201note that in case of only dense layer the input layout could be omitted but it is required when defining more
202complex architectures
203
204- **layer layout** string defining the layer architecture. The syntax is
205 - layer type (e.g. DENSE, CONV, RNN)
206 - layer parameters (e.g. number of units)
207 - activation function (e.g TANH, RELU,...)
208
209 *the different layers are separated by the ``","`` *
210
211#### 2. Define Training Strategy
212
213We define here the training strategy parameters for the DNN. The parameters are separated by the ``","`` separator.
214One can then concatenate different training strategy with different parameters. The training strategy are separated by
215the ``"|"`` separator.
216
217 - Optimizer
218 - Learning rate
219 - Momentum (valid for SGD and RMSPROP)
220 - Regularization and Weight Decay
221 - Dropout
222 - Max number of epochs
223 - Convergence steps. if the test error will not decrease after that value the training will stop
224 - Batch size (This value must be the same specified in the input layout)
225 - Test Repetitions (the interval when the test error will be computed)
226
227
228#### 3. Define general DNN options
229
230We define the general DNN options concatenating in the final string the previously defined layout and training strategy.
231Note we use the ``":"`` separator to separate the different higher level options, as in the other TMVA methods.
232In addition to input layout, batch layout and training strategy we add now:
233
234- Type of Loss function (e.g. CROSSENTROPY)
235- Weight Initizalization (e.g XAVIER, XAVIERUNIFORM, NORMAL )
236- Variable Transformation
237- Type of Architecture (e.g. CPU, GPU, Standard)
238
239We can then book the DL method using the built option string
240
241 ***/
242
243 if (useDL) {
244
245 bool useDLGPU = false;
246#ifdef R__HAS_TMVAGPU
247 useDLGPU = true;
248#endif
249
250 // Define DNN layout
251 TString inputLayoutString = "InputLayout=1|1|7";
252 TString batchLayoutString= "BatchLayout=1|128|7";
253 TString layoutString ("Layout=DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|1|LINEAR");
254 // Define Training strategies
255 // one can catenate several training strategies
256 TString training1("LearningRate=1e-3,Momentum=0.9,"
257 "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
258 "MaxEpochs=30,WeightDecay=1e-4,Regularization=None,"
259 "Optimizer=ADAM,ADAM_beta1=0.9,ADAM_beta2=0.999,ADAM_eps=1.E-7," // ADAM default parameters
260 "DropConfig=0.0+0.0+0.0+0.");
261 // TString training2("LearningRate=1e-3,Momentum=0.9"
262 // "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
263 // "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
264 // "Optimizer=SGD,DropConfig=0.0+0.0+0.0+0.");
265
266 TString trainingStrategyString ("TrainingStrategy=");
267 trainingStrategyString += training1; // + "|" + training2;
268
269 // General Options.
270
271 TString dnnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=G:"
272 "WeightInitialization=XAVIER");
273 dnnOptions.Append (":"); dnnOptions.Append (inputLayoutString);
274 dnnOptions.Append (":"); dnnOptions.Append (batchLayoutString);
275 dnnOptions.Append (":"); dnnOptions.Append (layoutString);
276 dnnOptions.Append (":"); dnnOptions.Append (trainingStrategyString);
277
278 TString dnnMethodName = "DNN_CPU";
279 if (useDLGPU) {
280 dnnOptions += ":Architecture=GPU";
281 dnnMethodName = "DNN_GPU";
282 } else {
283 dnnOptions += ":Architecture=CPU";
284 }
285
286 factory.BookMethod(loader, TMVA::Types::kDL, dnnMethodName, dnnOptions);
287
288 }
289
290 /**
291## Train Methods
292
293Here we train all the previously booked methods.
294
295 */
296
297 factory.TrainAllMethods();
298
299/**
300 ## Test all methods
301
302 Now we test and evaluate all methods using the test data set
303*/
304
305 factory.TestAllMethods();
306
307 factory.EvaluateAllMethods();
308
309/// after we get the ROC curve and we display
310
311 auto c1 = factory.GetROCCurve(loader);
312 c1->Draw();
313
314/// at the end we close the output file which contains the evaluation result of all methods and it can be used by TMVAGUI
315/// to display additional plots
316
317 outputFile->Close();
318
319
320}
double Double_t
Definition: RtypesCore.h:59
R__EXTERN TSystem * gSystem
Definition: TSystem.h:559
A specialized string object used for TTree selections.
Definition: TCut.h:25
TObject * Get(const char *namecycle) override
Return pointer to object identified by namecycle.
A ROOT file is a suite of consecutive data records (TKey instances) with a well defined format.
Definition: TFile.h:54
static Bool_t SetCacheFileDir(ROOT::Internal::TStringView cacheDir, Bool_t operateDisconnected=kTRUE, Bool_t forceCacheread=kFALSE)
Definition: TFile.h:324
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition: TFile.cxx:4011
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
Definition: DataLoader.cxx:371
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:632
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
Definition: DataLoader.cxx:402
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:485
This is the main MVA steering class.
Definition: Factory.h:80
static Tools & Instance()
Definition: Tools.cxx:75
@ kFisher
Definition: Types.h:84
@ kBDT
Definition: Types.h:88
@ kLikelihood
Definition: Types.h:81
@ kMLP
Definition: Types.h:92
Basic string class.
Definition: TString.h:136
virtual Bool_t AccessPathName(const char *path, EAccessMode mode=kFileExists)
Returns FALSE if one can access a file using the specified access mode.
Definition: TSystem.cxx:1296
A TTree represents a columnar dataset.
Definition: TTree.h:79
virtual void Print(Option_t *option="") const
Print a summary of the tree contents.
Definition: TTree.cxx:7162
return c1
Definition: legend1.C:41
void Info(const char *location, const char *va_(fmt),...)
Definition: TClingUtils.h:809
void Error(const char *location, const char *va_(fmt),...)
Definition: TClingUtils.h:789