{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "28e79fe7",
   "metadata": {},
   "source": [
    "# tmva100_DataPreparation\n",
    "This tutorial illustrates how to prepare ROOT datasets to be nicely readable\n",
    "by most machine learning methods. This requires filtering the initial complex\n",
    "datasets and writing the data in a flat format.\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "**Author:** Stefan Wunsch  \n",
    "<i><small>This notebook tutorial was automatically generated with <a href= \"https://github.com/root-project/root/blob/master/documentation/doxygen/converttonotebook.py\">ROOTBOOK-izer</a> from the macro found in the ROOT repository  on Tuesday, May 19, 2026 at 08:22 PM.</small></i>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f751af74",
   "metadata": {
    "collapsed": false,
    "execution": {
     "iopub.execute_input": "2026-05-19T20:22:37.982308Z",
     "iopub.status.busy": "2026-05-19T20:22:37.982191Z",
     "iopub.status.idle": "2026-05-19T20:22:43.099210Z",
     "shell.execute_reply": "2026-05-19T20:22:43.098407Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>> Extract the training and testing events for signal from the SMHiggsToZZTo4L.root dataset.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>> Extract the training and testing events for background from the ZZTo2e2mu.root dataset.At least two electrons and two muons: pass=45352      all=299973     -- eff=15.12 % cumulative eff=15.12 %\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "At least two electrons and two muons: pass=262776     all=1497445    -- eff=17.55 % cumulative eff=17.55 %\n"
     ]
    }
   ],
   "source": [
    "import ROOT\n",
    "\n",
    "\n",
    "def filter_events(df):\n",
    "    \"\"\"\n",
    "    Reduce initial dataset to only events which shall be used for training\n",
    "    \"\"\"\n",
    "    return df.Filter(\"nElectron>=2 && nMuon>=2\", \"At least two electrons and two muons\")\n",
    "\n",
    "\n",
    "def define_variables(df):\n",
    "    \"\"\"\n",
    "    Define the variables which shall be used for training\n",
    "    \"\"\"\n",
    "    return df.Define(\"Muon_pt_1\", \"Muon_pt[0]\")\\\n",
    "             .Define(\"Muon_pt_2\", \"Muon_pt[1]\")\\\n",
    "             .Define(\"Electron_pt_1\", \"Electron_pt[0]\")\\\n",
    "             .Define(\"Electron_pt_2\", \"Electron_pt[1]\")\n",
    "\n",
    "\n",
    "variables = [\"Muon_pt_1\", \"Muon_pt_2\", \"Electron_pt_1\", \"Electron_pt_2\"]\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    for filename, label in [[\"SMHiggsToZZTo4L.root\", \"signal\"], [\"ZZTo2e2mu.root\", \"background\"]]:\n",
    "        print(\">>> Extract the training and testing events for {} from the {} dataset.\".format(\n",
    "            label, filename))\n",
    "\n",
    "        # Load dataset, filter the required events and define the training variables\n",
    "        filepath = \"root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/\" + filename\n",
    "        df = ROOT.RDataFrame(\"Events\", filepath)\n",
    "        df = filter_events(df)\n",
    "        df = define_variables(df)\n",
    "\n",
    "        # Book cutflow report\n",
    "        report = df.Report()\n",
    "\n",
    "        # Split dataset by event number for training and testing\n",
    "        columns = ROOT.std.vector[\"string\"](variables)\n",
    "        df.Filter(\"event % 2 == 0\", \"Select events with even event number for training\")\\\n",
    "          .Snapshot(\"Events\", \"train_\" + label + \".root\", columns)\n",
    "        df.Filter(\"event % 2 == 1\", \"Select events with odd event number for training\")\\\n",
    "          .Snapshot(\"Events\", \"test_\" + label + \".root\", columns)\n",
    "\n",
    "        # Print cutflow report\n",
    "        report.Print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
