# Source page: doc/master/ml__dataloader__Higgs__Classification_8py.html

import matplotlib.pyplot as plt

import ROOT

import sklearn.metrics as skl

import torch

from matplotlib import use

from torch import nn


print("Loading dataframes...")

# All three splits live in the tutorial data directory and share one tree name.
data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
train_file = data_dir + "ml_dataloader_Higgs_Classification_train.root"
val_file = data_dir + "ml_dataloader_Higgs_Classification_val.root"
test_file = data_dir + "ml_dataloader_Higgs_Classification_test.root"

df_train = ROOT.RDataFrame("tree", train_file)
df_val = ROOT.RDataFrame("tree", val_file)
df_test = ROOT.RDataFrame("tree", test_file)


# Classifier model with adjustable hidden layers

class Classifier(nn.Module):
    """Fully connected binary classifier with a configurable stack of hidden blocks.

    Each hidden block is ``Linear -> [BatchNorm1d] -> ReLU -> [Dropout]``.
    A final linear layer maps to a single value that is passed through a
    sigmoid in ``forward``.

    Args:
        num_features: Width of the input layer.
        hidden_layers: Output width of each hidden block, in order.
        p: Dropout probability, used only when ``use_dropout`` is true.
        use_dropout: Append a ``Dropout`` layer to every hidden block.
        use_batchnorm: Insert a ``BatchNorm1d`` layer after every hidden linear layer.
    """

    def __init__(
        self,
        num_features: int,
        hidden_layers: list[int],
        p: float = 0.2,
        use_dropout: bool = False,
        use_batchnorm: bool = True,
    ):
        super().__init__()

        # Consecutive pairs of widths give the (input, output) size of each block.
        widths = [num_features, *hidden_layers]

        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages = [nn.Linear(fan_in, fan_out)]
            if use_batchnorm:
                stages.append(nn.BatchNorm1d(fan_out))
            stages.append(nn.ReLU())
            if use_dropout:
                stages.append(nn.Dropout(p))
            blocks.append(nn.Sequential(*stages))

        self.hidden = nn.Sequential(*blocks)
        # With no hidden blocks the output layer reads the raw features directly.
        self.output_layer = nn.Linear(widths[-1], 1)

    def forward(self, x):
        """Return the sigmoid of the output layer applied to the hidden stack."""
        return torch.sigmoid(self.output_layer(self.hidden(x)))


# Batching configuration shared by the training, validation and testing loaders.
batch_size = 1000
batches_in_memory = 1000
drop_remainder = True

# Label column; it is listed last in `columns`.
target = "isHiggsRef"

# Per-lepton vector columns, each read with a fixed size of 4 entries per event.
vector_columns = ["good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type"]
max_vec_sizes = {name: 4 for name in vector_columns}

# Scalar feature first, then the vector features, then the target.
columns = ["m4l", *vector_columns, target]

shuffle = True
set_seed = 42


# Normalize the data!
#
# Every feature column is standardized as (value - mean) / stddev. The mean and
# standard deviation are always computed on the training data and then applied
# unchanged to the validation and testing data.
print("Normalizing data...")

# Replacement for a standard deviation of exactly 0, to avoid division by 0.
zero_stddev_fallback = 0.01

for var in columns:
    if var == target:
        continue  # The label is not a feature and must not be rescaled.

    if var not in max_vec_sizes:
        # Scalar column: book both statistics before reading either value so
        # that they are filled by the same event loop.
        mean_result = df_train.Mean(var)
        stddev_result = df_train.StdDev(var)
        mean = mean_result.GetValue()
        stddev = stddev_result.GetValue()
        if stddev == 0:
            stddev = zero_stddev_fallback

        normalized = f"({var} - {mean}) / {stddev}"
    else:
        # Vector column: each event holds max_vec_sizes[var] entries, and each
        # entry position is normalized with its own mean and stddev.
        scalar_columns = [f"{var}_{i}" for i in range(max_vec_sizes[var])]
        for i, scalar_column in enumerate(scalar_columns):
            df_train = df_train.Define(scalar_column, f"{var}[{i}]")

        # Book every statistic first, then read the values: the first
        # GetValue() runs one event loop that fills all booked results.
        mean_results = [df_train.Mean(scalar_column) for scalar_column in scalar_columns]
        stddev_results = [df_train.StdDev(scalar_column) for scalar_column in scalar_columns]
        means = [result.GetValue() for result in mean_results]
        stddevs = [result.GetValue() for result in stddev_results]
        stddevs = [zero_stddev_fallback if stddev == 0 else stddev for stddev in stddevs]

        expr = ", ".join(
            f"(({var}[{i}] - {mean}) / {stddev})" for i, (mean, stddev) in enumerate(zip(means, stddevs))
        )
        normalized = f"ROOT::RVec<double>{{{expr}}}"

    df_train = df_train.Redefine(var, normalized)
    # The validation and testing data should be normalized based on the
    # mean and standard deviation calculated from the training data.
    df_val = df_val.Redefine(var, normalized)
    df_test = df_test.Redefine(var, normalized)


print("Creating dataloaders...")

# The three loaders differ only in the dataframe they read from, so the
# keyword arguments are collected once and reused for every split.
loader_options = {
    "batch_size": batch_size,
    "batches_in_memory": batches_in_memory,
    "drop_remainder": drop_remainder,
    "columns": columns,
    "target": target,
    "max_vec_sizes": max_vec_sizes,
    "shuffle": shuffle,
    "set_seed": set_seed,
}

train = ROOT.Experimental.ML.RDataLoader(df_train, **loader_options)
val = ROOT.Experimental.ML.RDataLoader(df_val, **loader_options)
test = ROOT.Experimental.ML.RDataLoader(df_test, **loader_options)


# num_features must be calculated manually since train.train_columns lists each vector column once.
# Vector columns are lazily expanded while receiving batches, unless eager_loading is enabled.
# Each vector column contributes max_vec_sizes[name] inputs; every other training column contributes one.
scalar_feature_count = sum(1 for name in train.train_columns if name not in max_vec_sizes)
num_features = sum(max_vec_sizes.values()) + scalar_feature_count

# Seed before building the model so the weight initialization is reproducible.
torch.manual_seed(set_seed)

hidden_layers = [60, 60]
model = Classifier(
    num_features=num_features,
    hidden_layers=hidden_layers,
    p=0.2,
    use_dropout=False,
)

# The model already applies a sigmoid, so the loss takes probabilities.
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)


def print_epoch_summary(epoch: int, val_loss: float, val_accuracy: float) -> None:
    """Print a one-line validation summary for the given epoch.

    Args:
        epoch: Epoch number to show in the message.
        val_loss: Validation loss, printed with two decimals.
        val_accuracy: Validation accuracy, printed with two decimals.
    """
    summary = f"Epoch {epoch} summary ==> Validation loss: {val_loss:.2f}; Validation accuracy: {val_accuracy:.2f}"
    print(summary)


# Upper bound on the number of epochs; lowered to the number actually run
# if early stopping triggers.
epochs = 1000

# Sliding window of the 6 most recent average validation losses.
# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
last_val_losses = [float("inf")] * 6

# Average validation loss of every completed epoch, kept for the loss curve.
avg_val_losses = []

print("Starting training...")
for epoch in range(epochs):
    # training
    model.train()

    for x_train, y_train in train.as_torch():
        outputs = model(x_train)
        loss = loss_fn(outputs, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    val_batches = 0

    with torch.no_grad():
        for x_val, y_val in val.as_torch():
            outputs = model(x_val)
            loss = loss_fn(outputs, y_val)
            val_loss += loss.item()
            val_batches += 1

            preds = (outputs > 0.5).float()
            val_correct += (preds == y_val).sum().item()
            val_total += y_val.size(0)

    # Count batches explicitly instead of relying on a leftover loop index,
    # and fail with a clear message when there is nothing to average.
    if val_batches == 0:
        raise RuntimeError("The validation dataloader produced no batches; cannot compute the validation loss.")

    avg_val_loss = val_loss / val_batches
    avg_val_losses.append(avg_val_loss)
    val_accuracy = val_correct / val_total

    if epoch % 10 == 9:
        # Report the per-batch average, which is the value tracked for the
        # loss curve and for early stopping, not the sum over batches.
        print_epoch_summary(epoch + 1, avg_val_loss, val_accuracy)

    # Slide the window: drop the oldest loss and append the newest.
    del last_val_losses[0]
    last_val_losses.append(avg_val_loss)

    # Early stopping check. The window starts filled with inf, so this cannot
    # trigger before 6 epochs have been recorded.
    if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
        print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
        epochs = epoch + 1
        break


# Testing
#
# Evaluate the trained model on the test set and keep the raw predictions and
# labels for the ROC curve.
model.eval()
test_loss = 0.0
test_correct = 0
test_total = 0
test_batches = 0

test_preds = []
test_true = []

with torch.no_grad():
    for x_test, y_test in test.as_torch():
        outputs = model(x_test)
        loss = loss_fn(outputs, y_test)
        test_loss += loss.item()
        test_batches += 1

        # Store flat lists of scores and labels (one value per event).
        test_preds += outputs.flatten().tolist()
        test_true += y_test.flatten().tolist()

        preds = (outputs > 0.5).float()
        test_correct += (preds == y_test).sum().item()
        test_total += y_test.size(0)

# Count batches explicitly instead of relying on a leftover loop index,
# and fail with a clear message when there is nothing to average.
if test_batches == 0:
    raise RuntimeError("The testing dataloader produced no batches; cannot compute the testing loss.")

avg_test_loss = test_loss / test_batches
test_accuracy = test_correct / test_total

print(f"Testing Loss: {avg_test_loss:.4f}  Accuracy: {test_accuracy:.4f}\n")


# Analysis
use("Agg")  # Non-interactive backend for writing to files

# Validation loss per epoch; the x axis is the zero-based epoch index.
fig = plt.figure()
ax = plt.axes()
epoch_axis = list(range(epochs))
ax.plot(epoch_axis, avg_val_losses)
ax.set_title("Loss curve")
ax.set_xlabel("Epoch")
ax.set_ylabel("Validation loss")
plt.savefig("loss_curve")
print("Loss curve saved to loss_curve.png")

# ROC curve from the test-set scores; the last point returned by roc_curve is not drawn.
fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
fig = plt.figure()
ax = plt.axes()
ax.plot(fpr[:-1], tpr[:-1])
ax.set_title("ROC curve")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
plt.savefig("ROC_curve")
print("ROC curve saved to ROC_curve.png")

# ---------------------------------------------------------------------------
# Doxygen cross-references from the documentation page (not part of the script)
# ---------------------------------------------------------------------------
# TRangeDynCast
# ROOT::Detail::TRangeCast< T, true > TRangeDynCast
# TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
# Definition TCollection.h:359
#
# len
# Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t UChar_t len
# Definition TGWin32VirtualXProxy.cxx:249
#
# ROOT::Detail::TRangeCast
# Definition TCollection.h:312
#
# ROOT::RDataFrame
# ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
# Definition RDataFrame.hxx:50
#
# ROOT::VecOps::RVec
# A "std::vector"-like collection of values implementing handy operation to analyse them.
# Definition RVec.hxx:1533
#
# sum
# static uint64_t sum(uint64_t i)
# Definition Factory.cxx:2338