Logo ROOT  
Reference Guide
TMVA_Higgs_Classification.py
Go to the documentation of this file.
1## \file
2## \ingroup tutorial_tmva
3## \notebook
4## Classification example of TMVA based on public Higgs UCI dataset
5##
6## The UCI data set is a public HIGGS data set , see http://archive.ics.uci.edu/ml/datasets/HIGGS
7## used in this paper: Baldi, P., P. Sadowski, and D. Whiteson. “Searching for Exotic Particles in High-energy Physics
8## with Deep Learning.” Nature Communications 5 (July 2, 2014).
9##
10## \macro_image
11## \macro_output
12## \macro_code
13##
14## \author Harshal Shende
15
16## Declare Factory
17
18
19## Create the Factory class. Later you can choose the methods
20## whose performance you'd like to investigate.
21
22## The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass
23
24## - The first argument is the base of the name of all the output
25## weightfiles in the directory weight/ that will be created with the
26## method parameters
27
28## - The second argument is the output file for the training results
29
30## - The third argument is a string option defining some general configuration for the TMVA session. For example all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string
31
32import ROOT
33import os
34
# Short aliases for the ROOT namespace and class used throughout the script.
TMVA = ROOT.TMVA
TFile = ROOT.TFile

# Instantiate the TMVA tools singleton before any other TMVA object is created.
# NOTE(review): this call is missing from the numbered listing (listing line 38)
# but Tools::Instance() is still referenced by the page; confirm against upstream.
TMVA.Tools.Instance()
37
39
# Switches selecting which TMVA methods get booked further down.
useLikelihood = True  # likelihood-based discriminant
useLikelihoodKDE = False  # likelihood-based discriminant with KDE-estimated PDFs
useFischer = True  # Fisher discriminant (variable name keeps its original spelling)
useMLP = False  # Multi Layer Perceptron (old TMVA NN implementation)
useBDT = True  # Boosted Decision Tree
useDL = True  # TMVA Deep Learning (CPU or GPU)
useKeras = True  # Keras Deep Learning via PyMVA
48
# PyKeras is only usable when ROOT was built with PyMVA. Ask root-config:
# if PyMVA is there, initialise the Python method base; otherwise drop Keras.
if ROOT.gSystem.GetFromPipe("root-config --has-tmva-pymva") == "yes":
    TMVA.PyMethodBase.PyInitialize()
else:
    useKeras = False  # cannot use Keras if PYMVA is not available
53
# Probe for tensorflow; Keras is skipped (with a warning) when it cannot be imported.
if useKeras:
    try:
        import tensorflow  # noqa: F401 - availability probe only
    except Exception:
        # Any import-time failure means Keras cannot be used; Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        ROOT.Warning("TMVA_Higgs_Classification", "Skip using Keras since tensorflow is not available")
        useKeras = False
60
# Output file receiving the training/evaluation results; it is handed to the
# Factory and closed at the very end of the script.
outputFile = TFile.Open("Higgs_ClassificationOutput.root", "RECREATE")

# The keyword arguments form the general TMVA configuration for this session.
factory = TMVA.Factory(
    "TMVA_Higgs_Classification",
    outputFile,
    V=False,
    ROC=True,
    Silent=False,
    Color=True,
    AnalysisType="Classification",
)
65
66
67## Setup Dataset(s)
68
69# Define now input data file and signal and background trees
70
inputFileName = "Higgs_data.root"
inputFileLink = "http://root.cern.ch/files/" + inputFileName

# TSystem::AccessPathName returns True when the path is NOT accessible, so this
# branch handles the case where the file is missing locally.
if ROOT.gSystem.AccessPathName(inputFileName):
    ROOT.Info("TMVA_Higgs_Classification", "Download Higgs_data.root file")
    # "CACHEREAD" needs a cache directory; use the current working directory.
    TFile.SetCacheFileDir(".")
    inputFile = TFile.Open(inputFileLink, "CACHEREAD")
    # A failed open may yield a null or zombie TFile rather than None, so test
    # truthiness and the zombie flag instead of `is None`.
    if not inputFile or inputFile.IsZombie():
        raise FileNotFoundError("Input file cannot be downloaded - exit")
else:
    inputFile = TFile.Open(inputFileName)
84
85
# --- Fetch the signal and background trees from the input file
signalTree = inputFile.Get("sig_tree")
backgroundTree = inputFile.Get("bkg_tree")
signalTree.Print()
90
91## Declare DataLoader(s)
92
# Declare the DataLoader, which holds the input variables used for the MVA training.
# Variable expressions that can be parsed by TTree::Draw("expression") may be used too.
loader = TMVA.DataLoader("dataset")

# Seven input variables; the DNN input/batch layouts booked later use the same count.
loader.AddVariable("m_jj")
loader.AddVariable("m_jjj")
loader.AddVariable("m_lv")
loader.AddVariable("m_jlv")
loader.AddVariable("m_bb")
loader.AddVariable("m_wbb")
loader.AddVariable("m_wwbb")
105
# Hand the input trees to the DataLoader together with a global, per-tree
# event weight (event-wise weights are set separately).
signalWeight = 1.0
backgroundWeight = 1.0

# An arbitrary number of signal or background trees can be added this way.
loader.AddSignalTree(signalTree, signalWeight)
loader.AddBackgroundTree(backgroundTree, backgroundWeight)
113
# Set individual event weights (the variables must exist in the original TTree)
# for signal    : loader.SetSignalWeightExpression("weight1*weight2")
# for background: loader.SetBackgroundWeightExpression("weight1*weight2")
# e.g. loader.SetBackgroundWeightExpression("weight")
118
119
# Additional cuts on the signal and background samples (they may differ).
mycuts = ROOT.TCut("")  # for example: "abs(var1)<0.5 && abs(var2-0.5)<1"
mycutb = ROOT.TCut("")  # for example: "abs(var1)<0.5"

# Tell the DataLoader how to split the events into training and testing sets.
#
# If no numbers of events are given, half of the events in the tree are used
# for training and the other half for testing, e.g.:
#    loader.PrepareTrainingAndTestTree(mycut, SplitMode="Random", V=False)
# Here the number of training events per class is given explicitly.
loader.PrepareTrainingAndTestTree(
    mycuts,
    mycutb,
    nTrain_Signal=7000,
    nTrain_Background=7000,
    SplitMode="Random",
    NormMode="NumEvents",
    V=False,
)
134
135## Booking Methods
136
# Here we book the TMVA methods: a Likelihood (optionally based on KDE, Kernel
# Density Estimation), a Fisher discriminant, a BDT and a shallow neural network.

# Likelihood ("naive Bayes estimator")
if useLikelihood:
    factory.BookMethod(
        loader,
        TMVA.Types.kLikelihood,
        "Likelihood",
        H=True,
        V=False,
        TransformOutput=True,
        PDFInterpol="Spline2:NSmoothSig[0]=20:NSmoothBkg[0]=20:NSmoothBkg[1]=10",
        NSmooth=1,
        NAvEvtPerBin=50,
    )
152
# Likelihood variant that uses a kernel density estimator to approximate the PDFs
if useLikelihoodKDE:
    factory.BookMethod(
        loader,
        TMVA.Types.kLikelihood,
        "LikelihoodKDE",
        H=False,
        V=False,
        TransformOutput=False,
        PDFInterpol="KDE",
        KDEtype="Gauss",
        KDEiter="Adaptive",
        KDEFineFactor=0.3,
        KDEborder=None,
        NAvEvtPerBin=50,
    )
169
# Fisher discriminant (same as LD)
if useFischer:
    factory.BookMethod(
        loader,
        TMVA.Types.kFisher,
        "Fisher",
        H=True,
        V=False,
        Fisher=True,
        VarTransform=None,
        CreateMVAPdfs=True,
        PDFInterpolMVAPdf="Spline2",
        NbinsMVAPdf=50,
        NsmoothMVAPdf=10,
    )
185
# Boosted Decision Trees (AdaBoost with bagged boosting enabled)
if useBDT:
    factory.BookMethod(
        loader,
        TMVA.Types.kBDT,
        "BDT",
        V=False,
        NTrees=200,
        MinNodeSize="2.5%",
        MaxDepth=2,
        BoostType="AdaBoost",
        AdaBoostBeta=0.5,
        UseBaggedBoost=True,
        BaggedSampleFraction=0.5,
        SeparationType="GiniIndex",
        nCuts=20,
    )
203
# Multi-Layer Perceptron (the older TMVA neural network implementation)
if useMLP:
    factory.BookMethod(
        loader,
        TMVA.Types.kMLP,
        "MLP",
        H=False,
        V=False,
        NeuronType="tanh",
        VarTransform="N",
        NCycles=100,
        HiddenLayers="N+5",
        TestRate=5,
        UseRegulator=False,
    )
219
220## Here we book the new DNN of TMVA if we have support in ROOT. We will use GPU version if ROOT is enabled with GPU
221
222
223## Booking Deep Neural Network
224
225# Here we define the option string for building the Deep Neural network model.
226
227#### 1. Define DNN layout
228
229# The DNN configuration is defined using a string. Note that whitespaces between characters are not allowed.
230
231# We define first the DNN layout:
232
233# - **input layout** : this defines the input data format for the DNN as ``input depth | height | width``.
234# In case of a dense layer as first layer the input layout should be ``1 | 1 | number of input variables`` (features)
# - **batch layout** : this defines the layout of an input batch. It is related to the input layout but not the same.
# If the first layer is dense it should be ``1 | batch size | number of variables`` (features)
237
238# *(note the use of the character `|` as separator of input parameters for DNN layout)*
239
240# note that in case of only dense layer the input layout could be omitted but it is required when defining more
241# complex architectures
242
243# - **layer layout** string defining the layer architecture. The syntax is
244# - layer type (e.g. DENSE, CONV, RNN)
245# - layer parameters (e.g. number of units)
246# - activation function (e.g TANH, RELU,...)
247
248# *the different layers are separated by the ``","`` *
249
250#### 2. Define Training Strategy
251
252# We define here the training strategy parameters for the DNN. The parameters are separated by the ``","`` separator.
253# One can then concatenate different training strategy with different parameters. The training strategy are separated by
254# the ``"|"`` separator.
255
256# - Optimizer
257# - Learning rate
258# - Momentum (valid for SGD and RMSPROP)
259# - Regularization and Weight Decay
260# - Dropout
261# - Max number of epochs
262# - Convergence steps. if the test error will not decrease after that value the training will stop
263# - Batch size (This value must be the same specified in the input layout)
264# - Test Repetitions (the interval when the test error will be computed)
265
266
267#### 3. Define general DNN options
268
269# We define the general DNN options concatenating in the final string the previously defined layout and training strategy.
270# Note we use the ``":"`` separator to separate the different higher level options, as in the other TMVA methods.
271# In addition to input layout, batch layout and training strategy we add now:
272
273# - Type of Loss function (e.g. CROSSENTROPY)
# - Weight Initialization (e.g. XAVIER, XAVIERUNIFORM, NORMAL)
275# - Variable Transformation
276# - Type of Architecture (e.g. CPU, GPU, Standard)
277
# Book the TMVA deep-learning (DL) method using the option strings built here.
if useDL:
    # Use the GPU implementation only when root-config reports TMVA GPU support.
    useDLGPU = ROOT.gSystem.GetFromPipe("root-config --has-tmva-gpu") == "yes"

    # Training strategy: comma-separated parameters for one training phase.
    # Several strategies can be concatenated with the "|" separator.
    training1 = ROOT.TString(
        "LearningRate=1e-3,Momentum=0.9,"
        "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
        "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
        "Optimizer=ADAM,ADAM_beta1=0.9,ADAM_beta2=0.999,ADAM_eps=1.E-7,"  # ADAM default parameters
        "DropConfig=0.0+0.0+0.0+0."
    )
    # Alternative strategy using SGD, kept as an example:
    # training2 = ROOT.TString("LearningRate=1e-3,Momentum=0.9,"
    #                          "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
    #                          "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
    #                          "Optimizer=SGD,DropConfig=0.0+0.0+0.0+0.")

    # General options: pick the architecture and a matching method name.
    dnnMethodName = ROOT.TString("DNN_CPU")

    if useDLGPU:
        arch = "GPU"
        dnnMethodName = "DNN_GPU"
    else:
        arch = "CPU"

    # The layouts use 7 features (the seven variables added to the loader) and a
    # batch of 128, which has to agree with BatchSize in the training strategy.
    factory.BookMethod(
        loader,
        TMVA.Types.kDL,
        dnnMethodName,
        H=False,
        V=True,
        ErrorStrategy="CROSSENTROPY",
        VarTransform="G",
        WeightInitialization="XAVIER",
        InputLayout="1|1|7",
        BatchLayout="1|128|7",
        Layout="DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|1|LINEAR",
        TrainingStrategy=training1,
        Architecture=arch,
    )
321
# Keras deep learning: build an untrained model, save it to disk and book it
# through the PyKeras method, which loads the model from that file.
if useKeras:
    ROOT.Info("TMVA_Higgs_Classification", "Building Deep Learning keras model")
    # Keras model with 4 hidden layers of 64 units and relu activations,
    # followed by a 2-unit sigmoid output layer.
    import tensorflow
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.layers import Dense

    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=7))  # 7 = number of input variables
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    model.save('model_higgs.h5')
    model.summary()

    # Book PyKeras only if the model file was actually written.
    if not os.path.exists("model_higgs.h5"):
        raise FileNotFoundError("Error creating Keras model file - skip using Keras")

    ROOT.Info("TMVA_Higgs_Classification", "Booking Deep Learning keras model")
    factory.BookMethod(
        loader,
        TMVA.Types.kPyKeras,
        "PyKeras",
        H=True,
        V=False,
        VarTransform=None,
        FilenameModel="model_higgs.h5",
        FilenameTrainedModel="trained_model_higgs.h5",
        NumEpochs=20,
        BatchSize=100,
        # GpuOptions="allow_growth=True",  # needed for RTX NVidia cards and to
        # stop TF from allocating all GPU memory
    )
359
360
## Train Methods

# Train all the previously booked methods.
factory.TrainAllMethods()

## Test all methods

# Test and evaluate all methods using the test data set.
factory.TestAllMethods()

factory.EvaluateAllMethods()

# Retrieve the ROC curve canvas for this dataset and display it.
c1 = factory.GetROCCurve(loader)
c1.Draw()

# Finally close the output file, which holds the evaluation results of all
# methods and can be used by TMVAGUI to display additional plots.
outputFile.Close()
static Bool_t SetCacheFileDir(ROOT::Internal::TStringView cacheDir, Bool_t operateDisconnected=kTRUE, Bool_t forceCacheread=kFALSE)
Definition: TFile.h:326
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition: TFile.cxx:4053
This is the main MVA steering class.
Definition: Factory.h:80
static void PyInitialize()
Initialize Python interpreter.
static Tools & Instance()
Definition: Tools.cxx:71