This example shows how to process a dataset where entries might be incomplete due to one or more missing branches in one or more of the files in the dataset. It shows usage of the FilterAvailable, FilterMissing and DefaultValueFor RDataFrame functionalities to act upon the missing entries.
#include <ROOT/RDataFrame.hxx>
#include <TChain.h>
#include <TFile.h>
#include <TTree.h>

#include <array>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <limits>
#include <numeric>
#include <string>
/// Helper that owns the on-disk example dataset for the lifetime of the object.
///
/// Construction writes three ROOT files, each holding one TTree with
/// `fTreeEntries` entries, where not every tree has every branch:
///   - file 1 / tree_1: branches x and y (x = i,     y = 2 * i)
///   - file 2 / tree_2: branch y only    (y = 3 * i)
///   - file 3 / tree_3: branch x only    (x = 4 * i)
/// with i running from 1 to `fTreeEntries`. Destruction removes the files again.
struct Dataset {
   constexpr static std::array<const char *, 3> fFileNames{"df036_missingBranches_C_file_1.root",
                                                           "df036_missingBranches_C_file_2.root",
                                                           "df036_missingBranches_C_file_3.root"};
   constexpr static std::array<const char *, 3> fTreeNames{"tree_1", "tree_2", "tree_3"};
   constexpr static auto fTreeEntries{5};

   /// Write the three files. Each file is produced in its own scope so that
   /// the tree and the file go out of scope before the next file is created.
   Dataset()
   {
      {
         // First file: both branches are present.
         TFile f(fFileNames[0], "RECREATE");
         TTree t(fTreeNames[0], fTreeNames[0]);
         // Buffers the branches read from on every Fill(); they must outlive the fill loop.
         int x{};
         int y{};
         t.Branch("x", &x, "x/I");
         t.Branch("y", &y, "y/I");
         for (int i = 1; i <= fTreeEntries; i++) {
            x = i;
            y = 2 * i;
            t.Fill();
         }
         t.Write();
      }
      {
         // Second file: branch x is missing.
         TFile f(fFileNames[1], "RECREATE");
         TTree t(fTreeNames[1], fTreeNames[1]);
         int y{};
         t.Branch("y", &y, "y/I");
         for (int i = 1; i <= fTreeEntries; i++) {
            y = 3 * i;
            t.Fill();
         }
         t.Write();
      }
      {
         // Third file: branch y is missing.
         TFile f(fFileNames[2], "RECREATE");
         TTree t(fTreeNames[2], fTreeNames[2]);
         int x{};
         t.Branch("x", &x, "x/I");
         for (int i = 1; i <= fTreeEntries; i++) {
            x = 4 * i;
            t.Fill();
         }
         t.Write();
      }
   }

   /// Delete the files created by the constructor. The return value of
   /// std::remove is ignored: cleanup is best-effort.
   ~Dataset()
   {
      for (auto &&fileName : fFileNames)
         std::remove(fileName);
   }
};
{
Dataset trees{};
for (auto i = 0; i < trees.fFileNames.size(); i++) {
const auto fullPath = std::string(trees.fFileNames[i]) + "?#" + trees.fTreeNames[i];
}
constexpr static auto defaultValue = std::numeric_limits<int>::min();
auto display1 = df.DefaultValueFor("x", defaultValue)
.DefaultValueFor("y", defaultValue)
.Display<
int,
int>({
"x",
"y"}, 15);
auto display2 =
df.DefaultValueFor(
"y", defaultValue).FilterAvailable(
"x").Display<
int,
int>({
"x",
"y"}, 15);
auto display3 = df.FilterMissing("y").Display<int>({"x"}, 15);
std::cout << "Example 1: provide a default value for all missing branches\n";
display1->Print();
std::cout << "Example 2: provide a default value for branch y, but skip events where branch x is missing\n";
display2->Print();
std::cout << "Example 3: only keep events where branch y is missing and display values for branch x\n";
display3->Print();
}
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, CSV and other data formats, in C++ or Python.
A chain is a collection of files containing TTree objects.
virtual Int_t Add(TChain *chain)
Add all files referenced by the passed chain to this chain.
A ROOT file is an on-disk file, usually with extension .root, that stores objects in a file-system-like logical structure, possibly including subdirectory hierarchies.
A TTree represents a columnar dataset.
Example 1: provide a default value for all missing branches
+-----+-------------+-------------+
| Row | x | y |
+-----+-------------+-------------+
| 0 | 1 | 2 |
+-----+-------------+-------------+
| 1 | 2 | 4 |
+-----+-------------+-------------+
| 2 | 3 | 6 |
+-----+-------------+-------------+
| 3 | 4 | 8 |
+-----+-------------+-------------+
| 4 | 5 | 10 |
+-----+-------------+-------------+
| 5 | -2147483648 | 3 |
+-----+-------------+-------------+
| 6 | -2147483648 | 6 |
+-----+-------------+-------------+
| 7 | -2147483648 | 9 |
+-----+-------------+-------------+
| 8 | -2147483648 | 12 |
+-----+-------------+-------------+
| 9 | -2147483648 | 15 |
+-----+-------------+-------------+
| 10 | 4 | -2147483648 |
+-----+-------------+-------------+
| 11 | 8 | -2147483648 |
+-----+-------------+-------------+
| 12 | 12 | -2147483648 |
+-----+-------------+-------------+
| 13 | 16 | -2147483648 |
+-----+-------------+-------------+
| 14 | 20 | -2147483648 |
+-----+-------------+-------------+
Example 2: provide a default value for branch y, but skip events where branch x is missing
+-----+----+-------------+
| Row | x | y |
+-----+----+-------------+
| 0 | 1 | 2 |
+-----+----+-------------+
| 1 | 2 | 4 |
+-----+----+-------------+
| 2 | 3 | 6 |
+-----+----+-------------+
| 3 | 4 | 8 |
+-----+----+-------------+
| 4 | 5 | 10 |
+-----+----+-------------+
| 10 | 4 | -2147483648 |
+-----+----+-------------+
| 11 | 8 | -2147483648 |
+-----+----+-------------+
| 12 | 12 | -2147483648 |
+-----+----+-------------+
| 13 | 16 | -2147483648 |
+-----+----+-------------+
| 14 | 20 | -2147483648 |
+-----+----+-------------+
Example 3: only keep events where branch y is missing and display values for branch x
+-----+----+
| Row | x |
+-----+----+
| 10 | 4 |
+-----+----+
| 11 | 8 |
+-----+----+
| 12 | 12 |
+-----+----+
| 13 | 16 |
+-----+----+
| 14 | 20 |
+-----+----+
- Date
- September 2024
- Author
- Vincenzo Eduardo Padulano (CERN)
Definition in file df036_missingBranches.C.