Logo ROOT   6.21/01
Reference Guide
df103_NanoAODHiggsAnalysis.py
Go to the documentation of this file.
1 ## \file
2 ## \ingroup tutorial_dataframe
3 ## \notebook -draw
4 ## This tutorial is a simplified but yet complex example of an analysis reconstructing the Higgs boson decaying to two Z
5 ## bosons from events with four leptons. The data and simulated events are taken from CERN OpenData representing a
6 ## subset of the data recorded in 2012 with the CMS detector at the LHC. The tutorials follows the Higgs to four leptons
7 ## analysis published on CERN Open Data portal ([10.7483/OPENDATA.CMS.JKB8.RR42](http://opendata.cern.ch/record/5500)).
8 ## The resulting plots show the invariant mass of the selected four lepton systems in different decay modes (four muons,
9 ## four electrons and two of each kind) and in a combined plot indicating the decay of the Higgs boson with a mass of
10 ## about 125 GeV.
11 ##
12 ## The following steps are performed for each sample with data and simulated events in order to reconstruct the Higgs
13 ## boson from the selected muons and electrons:
14 ## 1. Select interesting events with multiple cuts on event properties, e.g., number of leptons, kinematics of the
15 ## leptons and quality of the tracks.
16 ## 2. Reconstruct two Z bosons of which only one on the mass shell from the selected events and apply additional cuts on
17 ## the reconstructed objects.
18 ## 3. Reconstruct the Higgs boson from the remaining Z boson candidates and calculate its invariant mass.
19 ##
20 ## Another aim of this version of the tutorial is to show a way to blend C++ and Python code. All the functions that
21 ## make computations on data to define new columns or filter existing ones in a precise way, better suited to be written
22 ## in C++, have been moved to a header that is then declared to the ROOT C++ interpreter. The functions that instead
23 ## create nodes of the computational graph (e.g. Filter, Define) remain inside the main Python script.
24 ##
25 ## \macro_image
26 ## \macro_code
27 ## \macro_output
28 ##
29 ## \date July 2019
30 ## \author Stefan Wunsch (KIT, CERN), Vincenzo Eduardo Padulano (UniMiB, CERN)
31 
32 import ROOT
33 import os
34 
35 # Enable multi-threading
36 ROOT.ROOT.EnableImplicitMT()
37 
38 # Include necessary header
39 higgs_header_path = os.path.join(os.sep, str(ROOT.gROOT.GetTutorialDir()) + os.sep, "dataframe" + os.sep,
40  "df103_NanoAODHiggsAnalysis_python.h")
41 
42 ROOT.gInterpreter.Declare('#include "{}"'.format(higgs_header_path))
43 
44 
45 # Python functions
46 def reco_higgs_to_2el2mu(df):
47  """Reconstruct Higgs from two electrons and two muons"""
48  # Filter interesting events
49  df_base = selection_2el2mu(df)
50  # Compute masses of Z systems
51  df_z_mass = df_base.Define("Z_mass", "compute_z_masses_2el2mu(Electron_pt, Electron_eta, Electron_phi,"
52  " Electron_mass, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
53  # Cut on mass of Z candidates
54  df_z_cut = filter_z_candidates(df_z_mass)
55  # Reconstruct H mass
56  df_h_mass = df_z_cut.Define("H_mass", "compute_higgs_mass_2el2mu(Electron_pt, Electron_eta, Electron_phi,"
57  " Electron_mass, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
58 
59  return df_h_mass
60 
61 
62 def selection_2el2mu(df):
63  """Select interesting events with two electrons and two muons"""
64  df_ge2el2mu = df.Filter("nElectron>=2 && nMuon>=2", "At least two electrons and two muons")
65  df_eta = df_ge2el2mu.Filter("All(abs(Electron_eta)<2.5) && All(abs(Muon_eta)<2.4)", "Eta cuts")
66 
67  df_pt = df_eta.Filter("pt_cuts(Muon_pt, Electron_pt)", "Pt cuts")
68 
69  df_dr = df_pt.Filter("dr_cuts(Muon_eta, Muon_phi, Electron_eta, Electron_phi)", "Dr cuts")
70 
71  df_iso = df_dr.Filter("All(abs(Electron_pfRelIso03_all)<0.40) && All(abs(Muon_pfRelIso04_all)<0.40)",
72  "Require good isolation")
73  df_el_ip3d = df_iso.Define("Electron_ip3d_el", "sqrt(Electron_dxy*Electron_dxy + Electron_dz*Electron_dz)")
74  df_el_sip3d = df_el_ip3d.Define("Electron_sip3d_el",
75  "Electron_ip3d_el/sqrt(Electron_dxyErr*Electron_dxyErr + "
76  "Electron_dzErr*Electron_dzErr)")
77  df_el_track = df_el_sip3d.Filter("All(Electron_sip3d_el<4) && All(abs(Electron_dxy)<0.5) &&"
78  " All(abs(Electron_dz)<1.0)",
79  "Electron track close to primary vertex with small uncertainty")
80  df_mu_ip3d = df_el_track.Define("Muon_ip3d_mu", "sqrt(Muon_dxy*Muon_dxy + Muon_dz*Muon_dz)")
81 
82  df_mu_sip3d = df_mu_ip3d.Define("Muon_sip3d_mu",
83  "Muon_ip3d_mu/sqrt(Muon_dxyErr*Muon_dxyErr + Muon_dzErr*Muon_dzErr)")
84  df_mu_track = df_mu_sip3d.Filter("All(Muon_sip3d_mu<4) && All(abs(Muon_dxy)<0.5) && All(abs(Muon_dz)<1.0)",
85  "Muon track close to primary vertex with small uncertainty")
86  df_2p2n = df_mu_track.Filter("Sum(Electron_charge)==0 && Sum(Muon_charge)==0",
87  "Two opposite charged electron and muon pairs")
88 
89  return df_2p2n
90 
91 
92 def reco_higgs_to_4mu(df):
93  """Reconstruct Higgs from four muons"""
94  # Filter interesting events
95  df_base = selection_4mu(df)
96 
97  # Reconstruct Z systems
98  df_z_idx = df_base.Define("Z_idx", "reco_zz_to_4l(Muon_pt, Muon_eta, Muon_phi, Muon_mass, Muon_charge)")
99 
100  # Cut on distance between muons building Z systems
101  df_z_dr = df_z_idx.Filter("filter_z_dr(Z_idx, Muon_eta, Muon_phi)", "Delta R separation of muons building Z system")
102 
103  # Compute masses of Z systems
104  df_z_mass = df_z_dr.Define("Z_mass", "compute_z_masses_4l(Z_idx, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
105 
106  # Cut on mass of Z candidates
107  df_z_cut = filter_z_candidates(df_z_mass)
108 
109  # Reconstruct H mass
110  df_h_mass = df_z_cut.Define("H_mass", "compute_higgs_mass_4l(Z_idx, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
111 
112  return df_h_mass
113 
114 
115 def selection_4mu(df):
116  """Select interesting events with four muons"""
117  df_ge4m = df.Filter("nMuon>=4", "At least four muons")
118 
119  df_iso = df_ge4m.Filter("All(abs(Muon_pfRelIso04_all)<0.40)", "Require good isolation")
120  df_kin = df_iso.Filter("All(Muon_pt>5) && All(abs(Muon_eta)<2.4)", "Good muon kinematics")
121  df_ip3d = df_kin.Define("Muon_ip3d", "sqrt(Muon_dxy*Muon_dxy + Muon_dz*Muon_dz)")
122  df_sip3d = df_ip3d.Define("Muon_sip3d", "Muon_ip3d/sqrt(Muon_dxyErr*Muon_dxyErr + Muon_dzErr*Muon_dzErr)")
123  df_pv = df_sip3d.Filter("All(Muon_sip3d<4) && All(abs(Muon_dxy)<0.5) && All(abs(Muon_dz)<1.0)",
124  "Track close to primary vertex with small uncertainty")
125  df_2p2n = df_pv.Filter("nMuon==4 && Sum(Muon_charge==1)==2 && Sum(Muon_charge==-1)==2",
126  "Two positive and two negative muons")
127 
128  return df_2p2n
129 
130 
131 def filter_z_candidates(df):
132  """Apply selection on reconstructed Z candidates"""
133  df_z1_cut = df.Filter("Z_mass[0] > 40 && Z_mass[0] < 120", "Mass of first Z candidate in [40, 120]")
134  df_z2_cut = df_z1_cut.Filter("Z_mass[1] > 12 && Z_mass[1] < 120", "Mass of second Z candidate in [12, 120]")
135 
136  return df_z2_cut
137 
138 
139 def reco_higgs_to_4el(df):
140  """Reconstruct Higgs from four electrons"""
141  # Filter interesting events
142  df_base = selection_4el(df)
143 
144  # Reconstruct Z systems
145  df_z_idx = df_base.Define("Z_idx",
146  "reco_zz_to_4l(Electron_pt, Electron_eta, Electron_phi, Electron_mass, Electron_charge)")
147 
148  # Cut on distance between Electrons building Z systems
149  df_z_dr = df_z_idx.Filter("filter_z_dr(Z_idx, Electron_eta, Electron_phi)",
150  "Delta R separation of Electrons building Z system")
151 
152  # Compute masses of Z systems
153  df_z_mass = df_z_dr.Define("Z_mass",
154  "compute_z_masses_4l(Z_idx, Electron_pt, Electron_eta, Electron_phi, Electron_mass)")
155 
156  # Cut on mass of Z candidates
157  df_z_cut = filter_z_candidates(df_z_mass)
158 
159  # Reconstruct H mass
160  df_h_mass = df_z_cut.Define("H_mass",
161  "compute_higgs_mass_4l(Z_idx, Electron_pt, Electron_eta, Electron_phi, Electron_mass)")
162 
163  return df_h_mass
164 
165 
166 def selection_4el(df):
167  """Select interesting events with four electrons"""
168  df_ge4el = df.Filter("nElectron>=4", "At least our electrons")
169  df_iso = df_ge4el.Filter("All(abs(Electron_pfRelIso03_all)<0.40)", "Require good isolation")
170  df_kin = df_iso.Filter("All(Electron_pt>7) && All(abs(Electron_eta)<2.5)", "Good Electron kinematics")
171  df_ip3d = df_kin.Define("Electron_ip3d", "sqrt(Electron_dxy*Electron_dxy + Electron_dz*Electron_dz)")
172  df_sip3d = df_ip3d.Define("Electron_sip3d",
173  "Electron_ip3d/sqrt(Electron_dxyErr*Electron_dxyErr + Electron_dzErr*Electron_dzErr)")
174  df_pv = df_sip3d.Filter("All(Electron_sip3d<4) && All(abs(Electron_dxy)<0.5) && All(abs(Electron_dz)<1.0)",
175  "Track close to primary vertex with small uncertainty")
176  df_2p2n = df_pv.Filter("nElectron==4 && Sum(Electron_charge==1)==2 && Sum(Electron_charge==-1)==2",
177  "Two positive and two negative electrons")
178 
179  return df_2p2n
180 
181 
182 def plot(sig, bkg, data, x_label, filename):
183  """
184  Plot invariant mass for signal and background processes from simulated
185  events overlay the measured data.
186  """
187  # Canvas and general style options
188  ROOT.gStyle.SetOptStat(0)
189  ROOT.gStyle.SetTextFont(42)
190  d = ROOT.TCanvas("d", "", 800, 700)
191  d.SetLeftMargin(0.15)
192 
193  # Get signal and background histograms and stack them to show Higgs signal
194  # on top of the background process
195  h_bkg = bkg
196  h_cmb = sig.Clone()
197 
198  h_cmb.Add(h_bkg)
199  h_cmb.SetTitle("")
200  h_cmb.GetXaxis().SetTitle(x_label)
201  h_cmb.GetXaxis().SetTitleSize(0.04)
202  h_cmb.GetYaxis().SetTitle("N_{Events}")
203  h_cmb.GetYaxis().SetTitleSize(0.04)
204  h_cmb.SetLineColor(ROOT.kRed)
205  h_cmb.SetLineWidth(2)
206  h_cmb.SetMaximum(18)
207  h_bkg.SetLineWidth(2)
208  h_bkg.SetFillStyle(1001)
209  h_bkg.SetLineColor(ROOT.kBlack)
210  h_bkg.SetFillColor(ROOT.kAzure - 9)
211 
212  # Get histogram of data points
213  h_data = data
214  h_data.SetLineWidth(1)
215  h_data.SetMarkerStyle(20)
216  h_data.SetMarkerSize(1.0)
217  h_data.SetMarkerColor(ROOT.kBlack)
218  h_data.SetLineColor(ROOT.kBlack)
219 
220  # Draw histograms
221  h_cmb.DrawCopy("HIST")
222  h_bkg.DrawCopy("HIST SAME")
223  h_data.DrawCopy("PE1 SAME")
224 
225  # Add legend
226  legend = ROOT.TLegend(0.62, 0.70, 0.82, 0.88)
227  legend.SetFillColor(0)
228  legend.SetBorderSize(0)
229  legend.SetTextSize(0.03)
230  legend.AddEntry(h_data, "Data", "PE1")
231  legend.AddEntry(h_bkg, "ZZ", "f")
232  legend.AddEntry(h_cmb, "m_{H} = 125 GeV", "f")
233  legend.Draw()
234 
235  # Add header
236  cms_label = ROOT.TLatex()
237  cms_label.SetTextSize(0.04)
238  cms_label.DrawLatexNDC(0.16, 0.92, "#bf{CMS Open Data}")
239  header = ROOT.TLatex()
240  header.SetTextSize(0.03)
241  header.DrawLatexNDC(0.63, 0.92, "#sqrt{s} = 8 TeV, L_{int} = 11.6 fb^{-1}")
242 
243  # Save plot
244  d.SaveAs(filename)
245 
246 
248  # Create dataframes for signal, background and data samples
249 
250  # Signal: Higgs -> 4 leptons
251  df_sig_4l = ROOT.RDataFrame("Events",
252  "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/SMHiggsToZZTo4L.root")
253 
254  # Background: ZZ -> 4 leptons
255  # Note that additional background processes from the original paper
256  # with minor contribution were left out for this
257  # tutorial.
258  df_bkg_4mu = ROOT.RDataFrame("Events",
259  "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/ZZTo4mu.root")
260 
261  df_bkg_4el = ROOT.RDataFrame("Events",
262  "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/ZZTo4e.root")
263 
264  df_bkg_2el2mu = ROOT.RDataFrame("Events",
265  "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/ZZTo2e2mu.root")
266 
267  # CMS data taken in 2012 (11.6 fb^-1 integrated luminosity)
268  doublemu_files = ROOT.std.vector("string")(2)
269  doublemu_files[0] = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root"
270  doublemu_files[1] = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012C_DoubleMuParked.root"
271 
272  df_data_doublemu = ROOT.RDataFrame("Events", doublemu_files)
273 
274  doubleel_files = ROOT.std.vector("string")(2)
275  doubleel_files[0] = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleElectron.root"
276  doubleel_files[1] = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012C_DoubleElectron.root"
277 
278  df_data_doubleel = ROOT.RDataFrame("Events", doubleel_files)
279 
280  # Number of bins for all histograms
281  nbins = 36
282 
283  # Weights
284  luminosity = 11580.0 # Integrated luminosity of the data samples
285 
286  xsec_ZZTo4mu = 0.077 # ZZ->4mu: Standard Model cross-section
287  nevt_ZZTo4mu = 1499064.0 # ZZ->4mu: Number of simulated events
288 
289  xsec_ZZTo4el = 0.077 # ZZ->4el: Standard Model cross-section
290  nevt_ZZTo4el = 1499093.0 # ZZ->4el: Number of simulated events
291 
292  xsec_ZZTo2el2mu = 0.18 # ZZ->2el2mu: Standard Model cross-section
293  nevt_ZZTo2el2mu = 1497445.0 # ZZ->2el2mu: Number of simulated events
294 
295  xsec_SMHiggsToZZTo4L = 0.0065 # H->4l: Standard Model cross-section
296  nevt_SMHiggsToZZTo4L = 299973.0 # H->4l: Number of simulated events
297  scale_ZZTo4l = 1.386 # ZZ->4l: Scale factor for ZZ to four leptons
298 
299  weight_sig_4mu = luminosity * xsec_SMHiggsToZZTo4L / nevt_SMHiggsToZZTo4L
300  weight_bkg_4mu = luminosity * xsec_ZZTo4mu * scale_ZZTo4l / nevt_ZZTo4mu
301 
302  weight_sig_4el = luminosity * xsec_SMHiggsToZZTo4L / nevt_SMHiggsToZZTo4L
303  weight_bkg_4el = luminosity * xsec_ZZTo4el * scale_ZZTo4l / nevt_ZZTo4el
304 
305  weight_sig_2el2mu = luminosity * xsec_SMHiggsToZZTo4L / nevt_SMHiggsToZZTo4L
306  weight_bkg_2el2mu = luminosity * xsec_ZZTo2el2mu * scale_ZZTo4l / nevt_ZZTo2el2mu
307 
308  # Reconstruct Higgs to 4 muons
309  df_sig_4mu_reco = reco_higgs_to_4mu(df_sig_4l)
310 
311  df_h_sig_4mu = df_sig_4mu_reco.Define("weight", "{}".format(weight_sig_4mu))\
312  .Histo1D(("h_sig_4mu", "", nbins, 70, 180), "H_mass", "weight")
313 
314  df_bkg_4mu_reco = reco_higgs_to_4mu(df_bkg_4mu)
315 
316  df_h_bkg_4mu = df_bkg_4mu_reco.Define("weight", "{}".format(weight_bkg_4mu))\
317  .Histo1D(("h_bkg_4mu", "", nbins, 70, 180), "H_mass", "weight")
318 
319  df_data_4mu_reco = reco_higgs_to_4mu(df_data_doublemu)
320 
321  df_h_data_4mu = df_data_4mu_reco.Define("weight", "1.0")\
322  .Histo1D(("h_data_4mu", "", nbins, 70, 180), "H_mass", "weight")
323 
324  # Reconstruct Higgs to 4 electrons
325  df_sig_4el_reco = reco_higgs_to_4el(df_sig_4l)
326 
327  df_h_sig_4el = df_sig_4el_reco.Define("weight", "{}".format(weight_sig_4el))\
328  .Histo1D(("h_sig_4el", "", nbins, 70, 180), "H_mass", "weight")
329 
330  df_bkg_4el_reco = reco_higgs_to_4el(df_bkg_4el)
331 
332  df_h_bkg_4el = df_bkg_4el_reco.Define("weight", "{}".format(weight_bkg_4el))\
333  .Histo1D(("h_bkg_4el", "", nbins, 70, 180), "H_mass", "weight")
334 
335  df_data_4el_reco = reco_higgs_to_4el(df_data_doubleel)
336 
337  df_h_data_4el = df_data_4el_reco.Define("weight", "1.0")\
338  .Histo1D(("h_data_4el", "", nbins, 70, 180), "H_mass", "weight")
339 
340  # Reconstruct Higgs to 2 electrons and 2 muons
341  df_sig_2el2mu_reco = reco_higgs_to_2el2mu(df_sig_4l)
342 
343  df_h_sig_2el2mu = df_sig_2el2mu_reco.Define("weight", "{}".format(weight_sig_2el2mu))\
344  .Histo1D(("h_sig_2el2mu", "", nbins, 70, 180), "H_mass", "weight")
345 
346  df_bkg_2el2mu_reco = reco_higgs_to_2el2mu(df_bkg_2el2mu)
347 
348  df_h_bkg_2el2mu = df_bkg_2el2mu_reco.Define("weight", "{}".format(weight_bkg_2el2mu))\
349  .Histo1D(("h_bkg_2el2mu", "", nbins, 70, 180), "H_mass", "weight")
350 
351  df_data_2el2mu_reco = reco_higgs_to_2el2mu(df_data_doublemu)
352 
353  df_h_data_2el2mu = df_data_2el2mu_reco.Define("weight", "1.0")\
354  .Histo1D(("h_data_2el2mu_doublemu", "", nbins, 70, 180), "H_mass", "weight")
355 
356  # Trigger event loops and retrieve histograms
357  signal_4mu = df_h_sig_4mu.GetValue()
358  background_4mu = df_h_bkg_4mu.GetValue()
359  data_4mu = df_h_data_4mu.GetValue()
360 
361  signal_4el = df_h_sig_4el.GetValue()
362  background_4el = df_h_bkg_4el.GetValue()
363  data_4el = df_h_data_4el.GetValue()
364 
365  signal_2el2mu = df_h_sig_2el2mu.GetValue()
366  background_2el2mu = df_h_bkg_2el2mu.GetValue()
367  data_2el2mu = df_h_data_2el2mu.GetValue()
368 
369  # Make plots
370  plot(signal_4mu, background_4mu, data_4mu, "m_{4#mu} (GeV)", "higgs_4mu.pdf")
371  plot(signal_4el, background_4el, data_4el, "m_{4e} (GeV)", "higgs_4el.pdf")
372  plot(signal_2el2mu, background_2el2mu, data_2el2mu, "m_{2e2#mu} (GeV)", "higgs_2el2mu.pdf")
373 
374  # Combined plots
375  # If this was done before plotting the others, calling the `Add` function
376  # on the `signal_4mu` histogram would modify the underlying `TH1D` object.
377  # Thus, the histogram with the 4 muons reconstruction would be lost,
378  # instead resulting in the same plot as the aggregated histograms.
379  h_sig_4l = signal_4mu
380  h_sig_4l.Add(signal_4el)
381  h_sig_4l.Add(signal_2el2mu)
382 
383  h_bkg_4l = background_4mu
384  h_bkg_4l.Add(background_4el)
385  h_bkg_4l.Add(background_2el2mu)
386 
387  h_data_4l = data_4mu
388  h_data_4l.Add(data_4el)
389  h_data_4l.Add(data_2el2mu)
390 
391  # Plot aggregated histograms
392  plot(h_sig_4l, h_bkg_4l, h_data_4l, "m_{4l} (GeV)", "higgs_4l.pdf")
393 
394 
395 if __name__ == "__main__":
ROOT&#39;s RDataFrame offers a high level interface for analyses of data stored in TTrees, CSV&#39;s and other data formats.
Definition: RDataFrame.hxx:42