## \file
## \ingroup tutorial_tmva
## \notebook
## Classification example of TMVA based on the public Higgs UCI dataset
##
## The UCI data set is a public HIGGS data set, see http://archive.ics.uci.edu/ml/datasets/HIGGS
## used in this paper: Baldi, P., P. Sadowski, and D. Whiteson. “Searching for Exotic Particles in High-energy Physics
## with Deep Learning.” Nature Communications 5 (July 2, 2014).
##
## \macro_image
## \macro_output
## \macro_code
##
## \author Harshal Shende

## Declare Factory

## Create the Factory class. Later you can choose the methods
## whose performance you'd like to investigate.

## The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass:

## - The first argument is the base of the name of all the output
## weight files in the directory weight/ that will be created with the
## method parameters

## - The second argument is the output file for the training results

## - The third argument is a string option defining some general configuration for the TMVA session.
## For example, all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string

import ROOT
import os

TMVA = ROOT.TMVA
TFile = ROOT.TFile


# Options to control the used methods
useLikelihood = True  # likelihood based discriminant
useLikelihoodKDE = False  # likelihood based discriminant using kernel density estimation
useFischer = True  # Fisher discriminant
useMLP = False  # Multi-Layer Perceptron (old TMVA NN implementation)
useBDT = True  # Boosted Decision Tree
useDL = True  # TMVA Deep Learning (CPU or GPU)
useKeras = True  # Keras Deep Learning via PyMVA

if ROOT.gSystem.GetFromPipe("root-config --has-tmva-pymva") == "yes":
    TMVA.PyMethodBase.PyInitialize()
else:
    useKeras = False  # cannot use Keras if PyMVA is not available

if useKeras:
    try:
        import tensorflow
    except ImportError:
        ROOT.Warning("TMVA_Higgs_Classification", "Skip using Keras since tensorflow is not available")
        useKeras = False

outputFile = TFile.Open("Higgs_ClassificationOutput.root", "RECREATE")
factory = TMVA.Factory(
    "TMVA_Higgs_Classification", outputFile, V=False, ROC=True, Silent=False, Color=True, AnalysisType="Classification"
)
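
# For reference (illustrative comment only): with the classic string-based TMVA interface the
# keyword arguments above correspond roughly to a single ":"-separated option string, where a
# "!" negates a boolean flag:
# "!V:ROC:!Silent:Color:AnalysisType=Classification"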

## Setup Dataset(s)

# Define now the input data file and the signal and background trees

inputFileName = str(ROOT.gROOT.GetTutorialDir()) + "/machine_learning/data/Higgs_data.root"

inputFile = TFile.Open(inputFileName)
if not inputFile:
    raise FileNotFoundError("Input file " + inputFileName + " is not found - exit")


# --- Register the training and test trees
signalTree = inputFile.Get("sig_tree")
backgroundTree = inputFile.Get("bkg_tree")

## Declare DataLoader(s)

# The next step is to declare the DataLoader class that deals with input variables.
# Define the input variables that shall be used for the MVA training.
# Note that you may also use variable expressions, which can be parsed by TTree::Draw("expression")
# (see the commented example after the variable list below).
loader = TMVA.DataLoader("dataset")

loader.AddVariable("m_jj")
loader.AddVariable("m_jjj")
loader.AddVariable("m_lv")
loader.AddVariable("m_jlv")
loader.AddVariable("m_bb")
loader.AddVariable("m_wbb")
loader.AddVariable("m_wwbb")

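# For illustration only (not used in this training), a derived expression could also be added
# as an input variable; the name "m_ratio" is a hypothetical label:
# loader.AddVariable("m_bb/m_wwbb", "m_ratio", "", "F")
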
# We set now the input data trees in the TMVA DataLoader class
# global event weights per tree (see below for setting event-wise weights)
signalWeight = 1.0
backgroundWeight = 1.0
# You can add an arbitrary number of signal or background trees
loader.AddSignalTree(signalTree, signalWeight)
loader.AddBackgroundTree(backgroundTree, backgroundWeight)

# Set individual event weights (the variables must exist in the original TTree)
# for signal    : loader.SetSignalWeightExpression("weight1*weight2")
# for background: loader.SetBackgroundWeightExpression("weight1*weight2")
# loader.SetBackgroundWeightExpression("weight")


# Apply additional cuts on the signal and background samples (can be different)
mycuts = ROOT.TCut("")  # for example: mycuts = ROOT.TCut("abs(var1)<0.5 && abs(var2-0.5)<1")
mycutb = ROOT.TCut("")  # for example: mycutb = ROOT.TCut("abs(var1)<0.5")

# Tell the factory how to use the training and testing events
#
# If no numbers of events are given, half of the events in the tree are used
# for training, and the other half for testing:
# loader.PrepareTrainingAndTestTree(mycut, "SplitMode=Random:!V")
# To also specify the number of testing events, see the commented example after this call.

loader.PrepareTrainingAndTestTree(
    mycuts, mycutb, nTrain_Signal=7000, nTrain_Background=7000, SplitMode="Random", NormMode="NumEvents", V=False
)
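
# For illustration (not used here), the number of test events can be fixed as well;
# the values below are arbitrary:
# loader.PrepareTrainingAndTestTree(
#     mycuts, mycutb,
#     nTrain_Signal=7000, nTest_Signal=3000, nTrain_Background=7000, nTest_Background=3000,
#     SplitMode="Random", NormMode="NumEvents", V=False
# )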

## Booking Methods

# Here we book the TMVA methods. We book a standard likelihood method, a likelihood based on
# KDE (Kernel Density Estimation), a Fisher discriminant, a BDT and a shallow neural network.
# Likelihood ("naive Bayes estimator")
if useLikelihood:
    factory.BookMethod(
        loader,
        TMVA.Types.kLikelihood,
        "Likelihood",
        H=True,
        V=False,
        TransformOutput=True,
        PDFInterpol="Spline2:NSmoothSig[0]=20:NSmoothBkg[0]=20:NSmoothBkg[1]=10",
        NSmooth=1,
        NAvEvtPerBin=50,
    )

# Use a kernel density estimator to approximate the PDFs
if useLikelihoodKDE:
    factory.BookMethod(
        loader,
        TMVA.Types.kLikelihood,
        "LikelihoodKDE",
        H=False,
        V=False,
        TransformOutput=False,
        PDFInterpol="KDE",
        KDEtype="Gauss",
        KDEiter="Adaptive",
        KDEFineFactor=0.3,
        KDEborder=None,
        NAvEvtPerBin=50,
    )

# Fisher discriminant (same as LD)
if useFischer:
    factory.BookMethod(
        loader,
        TMVA.Types.kFisher,
        "Fisher",
        H=True,
        V=False,
        Fisher=True,
        VarTransform=None,
        CreateMVAPdfs=True,
        PDFInterpolMVAPdf="Spline2",
        NbinsMVAPdf=50,
        NsmoothMVAPdf=10,
    )

# Boosted Decision Trees
if useBDT:
    factory.BookMethod(
        loader,
        TMVA.Types.kBDT,
        "BDT",
        V=False,
        NTrees=200,
        MinNodeSize="2.5%",
        MaxDepth=2,
        BoostType="AdaBoost",
        AdaBoostBeta=0.5,
        UseBaggedBoost=True,
        BaggedSampleFraction=0.5,
        SeparationType="GiniIndex",
        nCuts=20,
    )

# Multi-Layer Perceptron (Neural Network)
if useMLP:
    factory.BookMethod(
        loader,
        TMVA.Types.kMLP,
        "MLP",
        H=False,
        V=False,
        NeuronType="tanh",
        VarTransform="N",
        NCycles=100,
        HiddenLayers="N+5",
        TestRate=5,
        UseRegulator=False,
    )

## Here we book the new DNN of TMVA if we have support in ROOT. We will use the GPU version if ROOT is built with GPU support.


## Booking Deep Neural Network

# Here we define the option string for building the deep neural network model.

#### 1. Define DNN layout

# The DNN configuration is defined using a string. Note that whitespaces between characters are not allowed.

# We define first the DNN layout:

# - **input layout** : this defines the input data format for the DNN as ``input depth | height | width``.
# In case of a dense layer as first layer the input layout should be ``1 | 1 | number of input variables`` (features)
# - **batch layout** : this defines how the input batch is organized. It is related to the input layout but not the same.
# If the first layer is dense it should be ``1 | batch size | number of variables`` (features)

# *(note the use of the character `|` as separator of input parameters for the DNN layout)*

# note that in the case of only dense layers the input layout could be omitted, but it is required when defining more
# complex architectures

# - **layer layout** string defining the layer architecture. The syntax is
# - layer type (e.g. DENSE, CONV, RNN)
# - layer parameters (e.g. number of units)
# - activation function (e.g. TANH, RELU, ...)

# *(the different layers are separated by the ``","`` separator; see the example below)*
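
# For example, a possible layer layout string (illustrative only, not the one booked below):
# "DENSE|128|RELU,DENSE|64|RELU,DENSE|1|LINEAR"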

#### 2. Define Training Strategy

# We define here the training strategy parameters for the DNN. The parameters are separated by the ``","`` separator.
# One can then concatenate different training strategies with different parameters. The training strategies are separated by
# the ``"|"`` separator (see the example after this list).

# - Optimizer
# - Learning rate
# - Momentum (valid for SGD and RMSPROP)
# - Regularization and Weight Decay
# - Dropout
# - Max number of epochs
# - Convergence steps: if the test error does not decrease for this number of steps, the training stops
# - Batch size (this value must be the same as the one specified in the batch layout)
# - Test Repetitions (the interval at which the test error is computed)

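# For example, two training phases with a decreasing learning rate could be concatenated as
# (illustrative values only):
# "LearningRate=1e-2,Momentum=0.9,MaxEpochs=10,BatchSize=128|LearningRate=1e-3,Momentum=0.9,MaxEpochs=10,BatchSize=128"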

#### 3. Define general DNN options

# We define the general DNN options by concatenating in the final string the previously defined layout and training strategy.
# Note we use the ``":"`` separator to separate the different higher level options, as in the other TMVA methods.
# In addition to the input layout, batch layout and training strategy we now add (a full example string is shown after this list):

# - Type of Loss function (e.g. CROSSENTROPY)
# - Weight Initialization (e.g. XAVIER, XAVIERUNIFORM, NORMAL)
# - Variable Transformation
# - Type of Architecture (e.g. CPU, GPU, Standard)

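# For example, a complete option string could look like this (illustrative only; the booking
# below passes the same options as keyword arguments instead of one string):
# "!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=G:WeightInitialization=XAVIER:InputLayout=1|1|7:BatchLayout=1|128|7:Layout=DENSE|64|TANH,DENSE|1|LINEAR:TrainingStrategy=LearningRate=1e-3,MaxEpochs=20:Architecture=CPU"
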
# We can then book the DL method using the built option string
if useDL:
    useDLGPU = ROOT.gSystem.GetFromPipe("root-config --has-tmva-gpu") == "yes"

    # Define DNN layout
    # Define training strategies
    # one can concatenate several training strategies
    training1 = ROOT.TString(
        "LearningRate=1e-3,Momentum=0.9,"
        "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
        "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
        "Optimizer=ADAM,ADAM_beta1=0.9,ADAM_beta2=0.999,ADAM_eps=1.E-7,"  # ADAM default parameters
        "DropConfig=0.0+0.0+0.0+0."
    )
    # training2 = ROOT.TString("LearningRate=1e-3,Momentum=0.9,"
    #                          "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
    #                          "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
    #                          "Optimizer=SGD,DropConfig=0.0+0.0+0.0+0.")

    # General Options.
    dnnMethodName = ROOT.TString("DNN_CPU")

    if useDLGPU:
        arch = "GPU"
        dnnMethodName = "DNN_GPU"
    else:
        arch = "CPU"

    factory.BookMethod(
        loader,
        TMVA.Types.kDL,
        dnnMethodName,
        H=False,
        V=True,
        ErrorStrategy="CROSSENTROPY",
        VarTransform="G",
        WeightInitialization="XAVIER",
        InputLayout="1|1|7",
        BatchLayout="1|128|7",
        Layout="DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|1|LINEAR",
        TrainingStrategy=training1,
        Architecture=arch,
    )

# Keras DL
if useKeras:
    ROOT.Info("TMVA_Higgs_Classification", "Building Deep Learning keras model")
    # create Keras model with 4 layers of 64 units and relu activations
    import tensorflow
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.layers import Input, Dense

    model = Sequential()
    model.add(Dense(64, activation="relu", input_dim=7))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(2, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), weighted_metrics=["accuracy"])
    model.save("model_higgs.h5")

    if not os.path.exists("model_higgs.h5"):
        raise FileNotFoundError("Error creating Keras model file - skip using Keras")
    else:
        # book the PyKeras method only if the Keras model could be created
        ROOT.Info("TMVA_Higgs_Classification", "Booking Deep Learning keras model")
        factory.BookMethod(
            loader,
            TMVA.Types.kPyKeras,
            "PyKeras",
            H=True,
            V=False,
            VarTransform=None,
            FilenameModel="model_higgs.h5",
            FilenameTrainedModel="trained_model_higgs.h5",
            NumEpochs=20,
            BatchSize=100,
        )
        # The option GpuOptions="allow_growth=True" may be needed for RTX NVidia cards
        # to avoid TF allocating all of the GPU memory.


## Train Methods

# Here we train all the previously booked methods.

factory.TrainAllMethods()

## Test all methods

# Now we test and evaluate all methods using the test data set
factory.TestAllMethods()

factory.EvaluateAllMethods()

# After training and testing we retrieve the ROC curve and display it

c1 = factory.GetROCCurve(loader)
c1.Draw()

# At the end we close the output file, which contains the evaluation results of all methods;
# it can be used by TMVAGui to display additional plots
outputFile.Close()
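
# For example (illustrative, requires an interactive graphics session), the TMVA GUI can then
# be opened on the output file with:
# ROOT.TMVA.TMVAGui("Higgs_ClassificationOutput.root")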