Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
ml_dataloader_Higgs_Classification.py
Go to the documentation of this file.
1## \file
2## \ingroup tutorial_dataframe
3## The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.
4##
5## This tutorial is a continuation of the HiggsToFourLeptons tutorial.
6## We will build a model to classify the data as Higgs or not Higgs.
7##
8##
9## \macro_code
10## \macro_output
11##
12## \date June 2026
13## \authors Jonah Ascoli (CERN), Martin Foll (CERN, University of Oslo (UiO)), Silia Taider (CERN)
14
15import matplotlib.pyplot as plt
16import ROOT
17import sklearn.metrics as skl
18import torch
19from matplotlib import use
20from torch import nn
21
22print("Loading dataframes...")
23data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
24df_train = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_train.root")
25df_val = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_val.root")
26df_test = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_test.root")
27
28
29# Classifier model with adjustable hidden layers
31 def __init__(
32 self,
33 num_features: int,
34 hidden_layers: list[int],
35 p: float = 0.2,
36 use_dropout: bool = False,
37 use_batchnorm: bool = True,
38 ):
39 super().__init__()
40
41 layers = []
42 in_dim = num_features
43
44 for out_dim in hidden_layers:
45 block = [nn.Linear(in_dim, out_dim)]
46
47 if use_batchnorm:
49
51
52 if use_dropout:
54
56 in_dim = out_dim
57
58 self.hidden = nn.Sequential(*layers)
59 self.output_layer = nn.Linear(in_dim, 1)
60
61 def forward(self, x):
62 x = self.hidden(x)
63 x = self.output_layer(x)
64 return torch.sigmoid(x)
65
66
67batch_size = 1000
68batches_in_memory = 1000
69drop_remainder = True
70columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"]
71target = "isHiggsRef"
72max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4}
73shuffle = True
74set_seed = 42
75
76# Normalize the data!
77print("Normalizing data...")
78for var in columns[:-1]:
79 if var == "m4l": # The only non-vector column
80 mean = df_train.Mean(var).GetValue()
81 stddev = df_train.StdDev(var).GetValue()
82 df_train = df_train.Redefine(var, f"({var} - {mean}) / {stddev}")
83 # The validation and testing data should be normalized based on the
84 # mean and standard deviation calculated from the training data.
85 df_val = df_val.Redefine(var, f"({var} - {mean}) / {stddev}")
86 df_test = df_test.Redefine(var, f"({var} - {mean}) / {stddev}")
87 else:
88 # Each vector event has 4 columns, and we need to take a column-wise mean and stddev
89 means = []
90 stddevs = []
91 for i in range(max_vec_sizes[var]):
92 scalar_column = f"{var}_{i}"
93 df_train = df_train.Define(scalar_column, f"{var}[{i}]")
94 means.append(df_train.Mean(scalar_column).GetValue())
95 stddevs.append(df_train.StdDev(scalar_column).GetValue())
96 mean_vec = ROOT.RVec("double")(means)
97 stddev_vec = ROOT.RVec("double")(stddevs)
98 for i in range(len(stddevs)):
99 if stddevs[i] == 0:
100 stddevs[i] = 0.01 # Avoids division by 0
101 expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var]))
102 df_train = df_train.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
103 # The validation and testing data should be normalized based on the
104 # mean and standard deviation calculated from the training data.
105 df_val = df_val.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
106 df_test = df_test.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
107
108print("Creating dataloaders...")
110 df_train,
111 batch_size=batch_size,
112 batches_in_memory=batches_in_memory,
113 drop_remainder=drop_remainder,
114 columns=columns,
115 target=target,
116 max_vec_sizes=max_vec_sizes,
117 shuffle=shuffle,
118 set_seed=set_seed,
119)
121 df_val,
122 batch_size=batch_size,
123 batches_in_memory=batches_in_memory,
124 drop_remainder=drop_remainder,
125 columns=columns,
126 target=target,
127 max_vec_sizes=max_vec_sizes,
128 shuffle=shuffle,
129 set_seed=set_seed,
130)
132 df_test,
133 batch_size=batch_size,
134 batches_in_memory=batches_in_memory,
135 drop_remainder=drop_remainder,
136 columns=columns,
137 target=target,
138 max_vec_sizes=max_vec_sizes,
139 shuffle=shuffle,
140 set_seed=set_seed,
141)
142
143# num_features must be calculated manually since the train.training_columns includes condensed vector columns.
144# Vector columns are lazily expanded while receiving batches, unless eager_loading is enabled.
145num_features = sum(max_vec_sizes.values()) + len([0 for i in train.train_columns if i not in max_vec_sizes])
146
147torch.manual_seed(set_seed)
148hidden_layers = [60, 60]
149model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False)
150loss_fn = nn.BCELoss()
151optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
152
153
154def print_epoch_summary(epoch: int, val_loss: float, val_accuracy: float):
155 print(f"Epoch {epoch} summary ==> Validation loss: {val_loss:.2f}; Validation accuracy: {val_accuracy:.2f}")
156
157
158epochs = 1000
159last_val_losses = [float("inf")] * 6
160# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
161avg_val_losses = []
162print("Starting training...")
163for epoch in range(epochs):
164 # training
166
167 for i, (x_train, y_train) in enumerate(train.as_torch()):
168 outputs = model(x_train)
169 loss = loss_fn(outputs, y_train)
170
174
175 # validation
176 model.eval()
177 val_loss = 0
178 val_correct = 0
179 val_total = 0
180
181 with torch.no_grad():
182 for j, (x_val, y_val) in enumerate(val.as_torch()):
183 outputs = model(x_val)
184 loss = loss_fn(outputs, y_val)
185 val_loss += loss.item()
186
187 preds = (outputs > 0.5).float()
188 val_correct += (preds == y_val).sum().item()
189 val_total += y_val.size(0)
190
191 avg_val_loss = val_loss / (j + 1)
192 avg_val_losses.append(avg_val_loss)
193 val_accuracy = val_correct / val_total
194
195 if epoch % 10 == 9:
196 print_epoch_summary(epoch + 1, val_loss, val_accuracy)
197 del last_val_losses[0]
198 last_val_losses.append(avg_val_loss)
199 # Early stopping check
200 if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
201 print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
202 epochs = epoch + 1
203 break
204
205# Testing
207test_loss = 0
208test_correct = 0
209test_total = 0
210
211test_preds = []
212test_true = []
213with torch.no_grad():
214 for j, (x_test, y_test) in enumerate(test.as_torch()):
215 outputs = model(x_test)
216 loss = loss_fn(outputs, y_test)
217 test_loss += loss.item()
218 test_preds += outputs.tolist()
219 test_true += y_test.tolist()
220
221 preds = (outputs > 0.5).float()
222 test_correct += (preds == y_test).sum().item()
223 test_total += y_test.size(0)
224
225avg_test_loss = test_loss / (j + 1)
226test_accuracy = test_correct / test_total
227
228print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")
229
230# Analysis
231use("Agg") # Non-interactive backend for writing to files
232
233fig = plt.figure()
234ax = plt.axes()
235ax.plot([i for i in range(epochs)], avg_val_losses)
236plt.title("Loss curve")
237plt.xlabel("Epoch")
238plt.ylabel("Validation loss")
239plt.savefig("loss_curve")
240print("Loss curve saved to loss_curve.png")
241
242fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
243fig = plt.figure()
244ax = plt.axes()
245ax.plot(fpr[:-1], tpr[:-1])
246plt.title("ROC curve")
247plt.xlabel("False positive rate")
248plt.ylabel("True positive rate")
249plt.savefig("ROC_curve")
250print("ROC curve saved to ROC_curve.png")
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t UChar_t len
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1530
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2338