The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.
This tutorial is a continuation of the HiggsToFourLeptons tutorial. We will build a neural-network model with PyTorch that classifies each event as Higgs or not Higgs.
import ROOT
import torch
from matplotlib import use
from torch import nn
print("Loading dataframes...")
df_train =
ROOT.RDataFrame(
"tree", data_dir +
"ml_dataloader_Higgs_Classification_train.root")
df_val =
ROOT.RDataFrame(
"tree", data_dir +
"ml_dataloader_Higgs_Classification_val.root")
df_test =
ROOT.RDataFrame(
"tree", data_dir +
"ml_dataloader_Higgs_Classification_test.root")
def __init__(
self,
num_features: int,
hidden_layers: list[int],
p: float = 0.2,
use_dropout: bool = False,
use_batchnorm: bool = True,
):
layers = []
in_dim = num_features
for out_dim in hidden_layers:
if use_batchnorm:
if use_dropout:
in_dim = out_dim
def forward(self, x):
x = self.hidden(x)
x = self.output_layer(x)
# Settings shared by the train, validation and test dataloaders: each of the
# names below is forwarded as a keyword argument of the same name when the
# dataloaders are created further down in this script.
batch_size = 1000
batches_in_memory = 1000
drop_remainder = True
# Columns handed to the dataloaders. The last entry is the target column; the
# normalization loop skips it by iterating over columns[:-1].
columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"]
target = "isHiggsRef"
# Number of elements used per vector-valued column; the normalization loop
# iterates range(max_vec_sizes[var]) for every non-"m4l" input column.
# NOTE(review): presumably 4 because each event has four selected leptons - confirm.
max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4}
shuffle = True
set_seed = 42
print("Normalizing data...")
for var in columns[:-1]:
if var == "m4l":
else:
means = []
stddevs = []
for i
in range(max_vec_sizes[var]):
scalar_column = f"{var}_{i}"
if stddevs[i] == 0:
stddevs[i] = 0.01
expr =
", ".join(f
"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i
in range(max_vec_sizes[var]))
print("Creating dataloaders...")
df_train,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
df_val,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
df_test,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
hidden_layers = [60, 60]
model =
Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=
False)
print(f"Epoch {epoch} summary ==> Validation loss: {val_loss:.2f}; Validation accuracy: {val_accuracy:.2f}")
# Upper bound on the number of training epochs; the training loop overwrites
# it with the number of epochs actually run when it stops early.
epochs = 1000
# Window of six validation losses, seeded with +inf. The training loop drops
# the oldest entry and stops once the smallest of the newest three values is
# larger than the largest of the oldest three.
last_val_losses = [float("inf")] * 6
# NOTE(review): presumably collects the per-epoch average validation loss for
# the loss-curve plot - its use is not visible here, confirm.
avg_val_losses = []
print("Starting training...")
for epoch
in range(epochs):
outputs = model(x_train)
val_loss = 0
val_correct = 0
val_total = 0
outputs = model(x_val)
preds = (outputs > 0.5).float()
val_correct += (preds == y_val).
sum().
item()
avg_val_loss = val_loss / (j + 1)
val_accuracy = val_correct / val_total
if epoch % 10 == 9:
del last_val_losses[0]
if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
epochs = epoch + 1
break
test_loss = 0
test_correct = 0
test_total = 0
test_preds = []
test_true = []
outputs = model(x_test)
preds = (outputs > 0.5).float()
test_correct += (preds == y_test).
sum().
item()
avg_test_loss = test_loss / (j + 1)
test_accuracy = test_correct / test_total
print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")
print("Loss curve saved to loss_curve.png")
print("ROC curve saved to ROC_curve.png")
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t UChar_t len
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, CSV and other data formats, in C++ or Python.
A "std::vector"-like collection of values implementing handy operations to analyse them.
static uint64_t sum(uint64_t i)
Loading dataframes...
Normalizing data...
Creating dataloaders...
Starting training...
Epoch 10 summary ==> Validation loss: 0.45; Validation accuracy: 0.81
Epoch 20 summary ==> Validation loss: 0.39; Validation accuracy: 0.83
Epoch 30 summary ==> Validation loss: 0.35; Validation accuracy: 0.85
Epoch 40 summary ==> Validation loss: 0.32; Validation accuracy: 0.86
Epoch 50 summary ==> Validation loss: 0.30; Validation accuracy: 0.87
Epoch 60 summary ==> Validation loss: 0.28; Validation accuracy: 0.88
Epoch 70 summary ==> Validation loss: 0.26; Validation accuracy: 0.89
Epoch 80 summary ==> Validation loss: 0.24; Validation accuracy: 0.90
Epoch 90 summary ==> Validation loss: 0.22; Validation accuracy: 0.92
Epoch 100 summary ==> Validation loss: 0.20; Validation accuracy: 0.93
Epoch 110 summary ==> Validation loss: 0.18; Validation accuracy: 0.94
Epoch 120 summary ==> Validation loss: 0.16; Validation accuracy: 0.95
Epoch 130 summary ==> Validation loss: 0.15; Validation accuracy: 0.96
Epoch 140 summary ==> Validation loss: 0.14; Validation accuracy: 0.96
Epoch 150 summary ==> Validation loss: 0.13; Validation accuracy: 0.96
Epoch 160 summary ==> Validation loss: 0.13; Validation accuracy: 0.96
Epoch 170 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 180 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 190 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 200 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Epoch 210 summary ==> Validation loss: 0.12; Validation accuracy: 0.96
Validation loss has not improved for 6 epochs, stopping training after 216 epochs.
Testing Loss: 0.1254 Accuracy: 0.9630
Loss curve saved to loss_curve.png
ROC curve saved to ROC_curve.png