Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
ml_dataloader_Higgs_Classification.py File Reference

Detailed Description

The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.

This tutorial is a continuation of the HiggsToFourLeptons tutorial. We will build a model to classify the data as Higgs or not Higgs.

import matplotlib.pyplot as plt
import ROOT
import sklearn.metrics as skl
import torch
from matplotlib import use
from torch import nn
# Open the train / validation / test trees found under the ROOT tutorial directory.
print("Loading dataframes...")
data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
_split_files = (
    "ml_dataloader_Higgs_Classification_train.root",
    "ml_dataloader_Higgs_Classification_val.root",
    "ml_dataloader_Higgs_Classification_test.root",
)
# One RDataFrame per split, all reading the tree named "tree".
df_train, df_val, df_test = (ROOT.RDataFrame("tree", data_dir + file_name) for file_name in _split_files)
# Classifier model with adjustable hidden layers
# Constructor of the classifier: creates an nn.Linear for each entry of
# `hidden_layers` and a final nn.Linear with a single output.
# NOTE(review): in this copy the enclosing `class` statement is not visible and
# the indentation has been lost — confirm against the original tutorial file.
def __init__(
self,
num_features: int,  # input width of the first nn.Linear
hidden_layers: list[int],  # output width of each hidden nn.Linear, in order
p: float = 0.2,  # unused in the visible lines; presumably the dropout probability — TODO confirm
use_dropout: bool = False,
use_batchnorm: bool = True,
):
super().__init__()
layers = []
in_dim = num_features
for out_dim in hidden_layers:
block = [nn.Linear(in_dim, out_dim)]
# NOTE(review): no statement follows either `if` below in this copy, and
# `block` is never added to `layers`; the lines doing so appear to be missing.
if use_batchnorm:
if use_dropout:
in_dim = out_dim  # the next layer's input width is this layer's output width
self.hidden = nn.Sequential(*layers)
self.output_layer = nn.Linear(in_dim, 1)  # single output unit
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``x`` through the hidden stack and the output layer.

    Args:
        x: Input tensor accepted by ``self.hidden``.

    Returns:
        The output of ``self.output_layer`` passed through a sigmoid, so every
        value lies in (0, 1).
    """
    hidden_out = self.hidden(x)
    logits = self.output_layer(hidden_out)
    # The script pairs this model with nn.BCELoss, which takes probabilities,
    # hence the sigmoid is applied here rather than in the loss.
    return torch.sigmoid(logits)
# Batching / shuffling settings shared by the three dataloaders.
batch_size = 1000
batches_in_memory = 1000
drop_remainder = True
shuffle = True
set_seed = 42

# Vector-valued columns; each one is given a maximum size of 4 below.
_vector_columns = ["good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type"]
target = "isHiggsRef"
# Column order: the scalar "m4l", then the vector columns, then the target last.
columns = ["m4l", *_vector_columns, target]
max_vec_sizes = dict.fromkeys(_vector_columns, 4)
# Normalize the data!
# Every feature column is shifted and scaled with the mean and standard
# deviation measured on the training dataframe.
print("Normalizing data...")
for var in columns[:-1]:  # every column except the trailing target
    if var == "m4l":  # The only non-vector column
        # Book both actions before reading either one, so that RDataFrame can
        # fill them in a single event loop instead of two.
        mean_result = df_train.Mean(var)
        stddev_result = df_train.StdDev(var)
        mean = mean_result.GetValue()
        stddev = stddev_result.GetValue()
        normalize_expr = f"({var} - {mean}) / {stddev}"
    else:
        # Each vector event has 4 columns, and we need to take a column-wise mean and stddev
        mean_results = []
        stddev_results = []
        for i in range(max_vec_sizes[var]):
            # Expose element i as its own scalar column so Mean/StdDev can run on it.
            scalar_column = f"{var}_{i}"
            df_train = df_train.Define(scalar_column, f"{var}[{i}]")
            mean_results.append(df_train.Mean(scalar_column))
            stddev_results.append(df_train.StdDev(scalar_column))
        # Reading the first result triggers the event loop for all booked actions.
        means = [result.GetValue() for result in mean_results]
        # A zero standard deviation is replaced by 0.01 to avoid division by 0.
        stddevs = [value if value != 0 else 0.01 for value in (result.GetValue() for result in stddev_results)]
        expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var]))
        normalize_expr = f"ROOT::RVec<double>{{{expr}}}"
    df_train = df_train.Redefine(var, normalize_expr)
    # The validation and testing data should be normalized based on the
    # mean and standard deviation calculated from the training data.
    df_val = df_val.Redefine(var, normalize_expr)
    df_test = df_test.Redefine(var, normalize_expr)
print("Creating dataloaders...")
# NOTE(review): the line that opens each of the three calls below (the callee
# and the name the result is bound to) is not present in this copy. Later code
# reads `train`, `val` and `test`, so presumably one loader per dataframe is
# bound to those names — confirm against the original tutorial file.
# Arguments of the call that receives the training dataframe.
df_train,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
# Arguments of the call that receives the validation dataframe (same settings).
df_val,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
# Arguments of the call that receives the testing dataframe (same settings).
df_test,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
# num_features must be calculated manually since train.train_columns includes condensed vector columns.
# Vector columns are lazily expanded while receiving batches, unless eager_loading is enabled.
# Each vector column contributes max_vec_sizes[name] features; every other training column contributes one.
vector_feature_count = sum(max_vec_sizes.values())
scalar_feature_count = sum(1 for column in train.train_columns if column not in max_vec_sizes)
num_features = vector_feature_count + scalar_feature_count

# Two hidden layers of 60 units each, binary cross-entropy loss, SGD with momentum.
hidden_layers = [60, 60]
model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False)
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
def print_epoch_summary(epoch: int, val_loss: float, val_accuracy: float) -> None:
    """Print a one-line validation summary for an epoch.

    Args:
        epoch: Epoch number shown in the message.
        val_loss: Validation loss to report, printed with two decimals.
        val_accuracy: Validation accuracy to report, printed with two decimals.
    """
    print(f"Epoch {epoch} summary ==> Validation loss: {val_loss:.2f}; Validation accuracy: {val_accuracy:.2f}")
epochs = 1000
# Sliding window over the six most recent average validation losses.
# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
last_val_losses = [float("inf")] * 6
# Average validation loss of every completed epoch, kept for the loss curve.
avg_val_losses = []
print("Starting training...")
for epoch in range(epochs):
    # training
    model.train()  # the model is built with use_batchnorm left at its default of True
    for x_train, y_train in train.as_torch():
        optimizer.zero_grad()  # gradients accumulate unless cleared before each step
        outputs = model(x_train)
        loss = loss_fn(outputs, y_train)
        loss.backward()
        optimizer.step()
    # validation
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    num_val_batches = 0
    with torch.no_grad():  # no gradients are needed while scoring
        for x_val, y_val in val.as_torch():
            outputs = model(x_val)
            loss = loss_fn(outputs, y_val)
            val_loss += loss.item()
            preds = (outputs > 0.5).float()
            val_correct += (preds == y_val).sum().item()
            val_total += y_val.size(0)
            num_val_batches += 1
    if num_val_batches == 0:
        # Fail with a clear message instead of a NameError / ZeroDivisionError below.
        raise RuntimeError("The validation dataloader produced no batches.")
    avg_val_loss = val_loss / num_val_batches
    avg_val_losses.append(avg_val_loss)
    val_accuracy = val_correct / val_total
    if epoch % 10 == 9:
        # Report the per-batch average: the same quantity that is plotted and
        # used for early stopping, and independent of the number of batches.
        print_epoch_summary(epoch + 1, avg_val_loss, val_accuracy)
    del last_val_losses[0]
    last_val_losses.append(avg_val_loss)
    # Early stopping check
    if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
        print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
        epochs = epoch + 1  # number of epochs actually run; used for the loss-curve x axis
        break
# Testing
# Score the held-out test set: accumulate loss, accuracy, and the raw
# predictions / labels needed later for the ROC curve.
model.eval()  # the model is built with use_batchnorm left at its default of True
test_loss = 0
test_correct = 0
test_total = 0
test_preds = []
test_true = []
num_test_batches = 0
with torch.no_grad():  # no gradients are needed while scoring
    for x_test, y_test in test.as_torch():
        outputs = model(x_test)
        loss = loss_fn(outputs, y_test)
        test_loss += loss.item()
        test_preds += outputs.tolist()
        test_true += y_test.tolist()
        preds = (outputs > 0.5).float()  # 0.5 decision threshold on the sigmoid output
        test_correct += (preds == y_test).sum().item()
        test_total += y_test.size(0)
        num_test_batches += 1
if num_test_batches == 0:
    # Fail with a clear message instead of a NameError / ZeroDivisionError below.
    raise RuntimeError("The testing dataloader produced no batches.")
avg_test_loss = test_loss / num_test_batches
test_accuracy = test_correct / test_total
print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")
# Analysis
use("Agg")  # Non-interactive backend for writing to files

# Average validation loss per epoch.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(list(range(epochs)), avg_val_losses)
loss_ax.set_title("Loss curve")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Validation loss")
loss_fig.savefig("loss_curve")
print("Loss curve saved to loss_curve.png")

# ROC curve on the test predictions; the final (fpr, tpr) point is left out of the plot.
fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr[:-1], tpr[:-1])
roc_ax.set_title("ROC curve")
roc_ax.set_xlabel("False positive rate")
roc_ax.set_ylabel("True positive rate")
roc_fig.savefig("ROC_curve")
print("ROC curve saved to ROC_curve.png")
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t UChar_t len
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1530
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2338
Loading dataframes...
Normalizing data...
Creating dataloaders...
Starting training...
Epoch 10 summary ==> Validation loss: 0.45; Validation accuracy: 0.81
Epoch 20 summary ==> Validation loss: 0.39; Validation accuracy: 0.83
Epoch 30 summary ==> Validation loss: 0.35; Validation accuracy: 0.85
Epoch 40 summary ==> Validation loss: 0.32; Validation accuracy: 0.86
Epoch 50 summary ==> Validation loss: 0.30; Validation accuracy: 0.87
Epoch 60 summary ==> Validation loss: 0.28; Validation accuracy: 0.88
Epoch 70 summary ==> Validation loss: 0.26; Validation accuracy: 0.89
Epoch 80 summary ==> Validation loss: 0.24; Validation accuracy: 0.90
Epoch 90 summary ==> Validation loss: 0.22; Validation accuracy: 0.92
Epoch 100 summary ==> Validation loss: 0.20; Validation accuracy: 0.93
Epoch 110 summary ==> Validation loss: 0.18; Validation accuracy: 0.94
Epoch 120 summary ==> Validation loss: 0.16; Validation accuracy: 0.95
Epoch 130 summary ==> Validation loss: 0.15; Validation accuracy: 0.96
Epoch 140 summary ==> Validation loss: 0.14; Validation accuracy: 0.96
Epoch 150 summary ==> Validation loss: 0.13; Validation accuracy: 0.96
Epoch 160 summary ==> Validation loss: 0.13; Validation accuracy: 0.96
Epoch 170 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 180 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 190 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 200 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 210 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Validation loss has not improved for 6 epochs, stopping training after 216 epochs.
Testing Loss: 0.1254 Accuracy: 0.9630
Loss curve saved to loss_curve.png
ROC curve saved to ROC_curve.png
Date
June 2026
Authors
Jonah Ascoli (CERN), Martin Foll (CERN, University of Oslo (UiO)), Silia Taider (CERN)

Definition in file ml_dataloader_Higgs_Classification.py.