Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
rf618_mixture_models.py
Go to the documentation of this file.
1## \file
2## \ingroup tutorial_roofit
3## \notebook
4## Use of mixture models in RooFit.
5##
6## This tutorial shows, how to use mixture models for Likelihood Calculation in ROOT. Instead of directly
7## calculating the likelihood we use simulation based inference (SBI) as shown in tutorial 'rf615_simulation_based_inference.py'.
8## We train the classifier to discriminate between samples from an background hypothesis here the zz samples and a target
9## hypothesis, here the higgs samples. The data preparation is based on the tutorial 'df106_HiggsToFourLeptons.py'.
10##
11## An introduction to mixture models can be found here https://arxiv.org/pdf/1506.02169.
12##
13## A short summary:
14## We assume the whole probability distribution can be written as a mixture of several components, i.e.
15## $$p(x|\theta)= \sum_{c}w_{c}(\theta)p_{c}(x|\theta)$$
16## We can write the likelihood ratio in terms of pairwise classification problems
17## \begin{align*}
18## \frac{p(x|\mu)}{p(x|0)}&= \frac{\sum_{c}w_{c}(\mu)p_{c}(x|\mu)}{\sum_{c'}w_{c'}(0)p_{c'}(x|0)}\\
19## &=\sum_{c}\Bigg[\sum_{c'}\frac{w_{c'}(0)}{w_{c}(\mu)}\frac{p_{c'}(x|0)}{p_{c}(x|\mu)}\Bigg]^{-1},
20## \end{align*}
21## where mu is the signal strength, and a value of 0 corresponds to the background hypothesis. Using this decomposition, one is able to use the pairwise likelihood ratios.
22##
23## Since the only free parameter in our case is mu, the distributions are independent of this parameter and the dependence on the signal strength can be encoded into the weights.
24## Thus, the subratios simplify dramatically since they are independent of theta and these ratios can be pre-computed and the classifier does
25## not need to be parametrized.
26##
27## If you wish to see an analysis done with template histograms see 'hf001_example.py'.
28##
29## \macro_image
30## \macro_code
31## \macro_output
32##
33## \date September 2024
34## \author Robin Syring
35
36import ROOT
37import os
38import numpy as np
39import xgboost as xgb
40
41# Get Dataframe from tutorial df106_HiggsToFourLeptons.py
42# Adjust the path if running locally
43df = ROOT.RDataFrame("tree", ROOT.gROOT.GetTutorialDir().Data() + "/dataframe/df106_HiggsToFourLeptons.root")
44
45# Initialize a dictionary to store counts and weight sums for each category
46results = {}
47
48
49# Extract the relevant columns once and avoid repeated calls
50data_dict = df.AsNumpy(columns=["m4l", "sample_category", "weight"])
51
52
53weights_dict = {
54 name: data_dict["weight"][data_dict["sample_category"] == [name]].sum() for name in ("data", "zz", "other", "higgs")
55}
56
57# Loop over each sample category
58for sample_category in ["data", "higgs", "zz", "other"]:
59
60 weight_sum = weights_dict[sample_category]
61
62 mask = data_dict["sample_category"] == sample_category
63 # Normalize each weight
64 weights = data_dict["weight"][mask]
65 # Extract the weight_modified
66 weight_modified = weights / weight_sum
67
68 count = np.sum(mask)
69
70 # Store the count and weight sum in the dictionary
71 results[sample_category] = {
72 "weight_sum": weight_sum,
73 "weight_modified": weight_modified,
74 "count": count,
75 "weight": weights,
76 }
77
78
79# Extract the mass for higgs and zz
80higgs_data = data_dict["m4l"][data_dict["sample_category"] == ["higgs"]]
81zz_data = data_dict["m4l"][data_dict["sample_category"] == ["zz"]]
82
83
84# Prepare sample weights
85sample_weight_higgs = np.array([results["higgs"]["weight_modified"]]).flatten()
86sample_weight_zz = np.array([results["zz"]["weight_modified"]]).flatten()
87
88# Putting sample weights together in the same manner as the training data
89sample_weight = np.concatenate([sample_weight_higgs, sample_weight_zz])
90
91# For Training purposes we have to get rid of the negative weights, since xgb can't handle them
92sample_weight[sample_weight < 0] = 1e-6
93
94# Prepare the features and labels
95X = np.concatenate((higgs_data, zz_data), axis=0).reshape(-1, 1)
96y = np.concatenate([np.ones(len(higgs_data)), np.zeros(len(zz_data))])
97
98# Train the Classifier to discriminate between higgs and zz
99model_xgb = xgb.XGBClassifier(n_estimators=1000, max_depth=5, eta=0.2, min_child_weight=1e-6, nthread=1)
100model_xgb.fit(X, y, sample_weight=sample_weight)
101
102
103# Building a RooRealVar based on the observed data
104m4l = ROOT.RooRealVar("m4l", "Four Lepton Invariant Mass", 0.0)
105
106
107# Define functions to compute the learned likelihood.
108def calculate_likelihood_xgb(m4l_arr: np.ndarray) -> np.ndarray:
109 prob = model_xgb.predict_proba(m4l_arr.T)[:, 0]
110 return (1 - prob) / prob
111
112
113llh = ROOT.RooFit.bindFunction(f"llh", calculate_likelihood_xgb, m4l)
114
115# Number of signals and background
116n_signal = results["higgs"]["weight"].sum()
117n_back = results["zz"]["weight"].sum()
118
119
120# Define weight functions
121def weight_back(mu):
122 return n_back / (n_back + mu * n_signal)
123
124
125def weight_signal(mu):
126 return 1 - weight_back(mu)
127
128
129# Define the likelihood ratio accordingly to mixture models
130def likelihood_ratio(llr: np.ndarray, mu: np.ndarray) -> np.ndarray:
131
132 m = 2
133
134 w_0 = np.array([weight_back(0), weight_signal(0)])
135 w_1 = np.array([weight_back(mu[0]), weight_signal(mu[0])])
136
137 w = np.outer(w_1, 1.0 / w_0)
138
139 p = np.ones((m, m, len(llr)))
140 p[1, 0] = llr
141 for i in range(m):
142 for j in range(i):
143 p[j, i] = 1.0 / p[i, j]
144
145 return 1.0 / np.sum(1.0 / np.sum(np.expand_dims(w, axis=2) * p, axis=0), axis=0)
146
147
148mu_var = ROOT.RooRealVar("mu", "mu", 0.1, 5)
149
150nll_ratio = ROOT.RooFit.bindFunction(f"nll", likelihood_ratio, llh, mu_var)
151pdf_learned = ROOT.RooWrapperPdf("learned_pdf", "learned_pdf", nll_ratio, selfNormalized=True)
152
153# Plot the likelihood
154frame1 = m4l.frame(Title="Likelihood ratio r(m_{4l}|#mu=1);m_{4l};p(#mu=1)/p(#mu=0)", Range=(80, 170))
155# llh.plotOn(frame1, ShiftToZero=False, LineColor="kP6Blue")
156nll_ratio.plotOn(frame1, ShiftToZero=False, LineColor="kP6Blue")
157
158n_pred = ROOT.RooFormulaVar("n_pred", f"{n_back} + mu * {n_signal}", [mu_var])
159pdf_learned_extended = ROOT.RooExtendPdf("final_pdf", "final_pdf", pdf_learned, n_pred)
160
161# Prepare the observed data set and NLL
162data = ROOT.RooDataSet.from_numpy({"m4l": data_dict["m4l"][data_dict["sample_category"] == ["data"]]}, [m4l])
163nll = pdf_learned_extended.createNLL(data, Extended=True)
164
165# Plot the nll computet by the mixture model
166frame2 = mu_var.frame(Title="NLL sum;#mu (signal strength);#Delta NLL", Range=(0.5, 4))
167nll.plotOn(frame2, ShiftToZero=True, LineColor="kP6Blue")
168
169# Write the plots into one canvas to show, or into separate canvases for saving.
170single_canvas = True
171
172c = ROOT.TCanvas("", "", 1200 if single_canvas else 600, 600)
173if single_canvas:
174 c.Divide(2)
175 c.cd(1)
176 ROOT.gPad.SetLeftMargin(0.15)
177 frame1.GetYaxis().SetTitleOffset(1.8)
178frame1.Draw()
179
180if single_canvas:
181 c.cd(2)
182 ROOT.gPad.SetLeftMargin(0.15)
183 frame2.GetYaxis().SetTitleOffset(1.8)
184else:
185 c.SaveAs("rf618_plot_1.png")
186 c = ROOT.TCanvas("", "", 600, 600)
187
188frame2.Draw()
189
190if not single_canvas:
191 c.SaveAs("rf618_plot_2.png")
192
193# Compute the minimum via minuit and display the results
194minimizer = ROOT.RooMinimizer(nll)
195minimizer.setErrorLevel(0.5) # Adjust the error level in the minimization to work with likelihoods
196minimizer.setPrintLevel(-1)
197minimizer.minimize("Minuit2")
198result = minimizer.save()
199ROOT.SetOwnership(result, True)
200result.Print()
201
202del minimizer
203del nll
204del pdf_learned_extended
205del n_pred
206del llh
207del nll_ratio
208
209import sys
210
211# Hack to bypass ClearProxiedObjects()
212del sys.modules["libROOTPythonizations"]
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t UChar_t len
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345