Logo ROOT   6.21/01
Reference Guide
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
12 #include <ROOT/RDataFrame.hxx>
13 #include <ROOT/RDF/RInterface.hxx>
14 #include <ROOT/RStringView.hxx>
15 #include <ROOT/TSeq.hxx>
16 #include <RtypesCore.h>
17 #include <TDirectory.h>
18 #include <TChain.h>
19 #include <TClass.h>
20 #include <TClassEdit.h>
21 #include <TFriendElement.h>
22 #include <TInterpreter.h>
23 #include <TObject.h>
24 #include <TRegexp.h>
25 #include <TPRegexp.h>
26 #include <TString.h>
27 #include <TTree.h>
28 
29 // Silence GCC warnings while including lexertk.hpp,
30 // which is otherwise very noisy to compile
31 #if defined(__GNUC__)
32 #pragma GCC diagnostic push
33 #pragma GCC diagnostic ignored "-Woverloaded-virtual"
34 #pragma GCC diagnostic ignored "-Wshadow"
35 #endif
36 #include "lexertk.hpp"
37 #if defined(__GNUC__)
38 #pragma GCC diagnostic pop
39 #endif
40 
41 #include <iosfwd>
42 #include <set>
43 #include <stdexcept>
44 #include <string>
45 #include <typeinfo>
46 
47 namespace ROOT {
48 namespace Detail {
49 namespace RDF {
50 class RCustomColumnBase;
51 class RFilterBase;
52 class RLoopManager;
53 class RRangeBase;
54 } // namespace RDF
55 } // namespace Detail
56 
57 namespace RDF {
58 class RDataSource;
59 } // namespace RDF
60 
61 } // namespace ROOT
62 
63 namespace ROOT {
64 namespace Internal {
65 namespace RDF {
66 
67 /// A tokeniser for the expression which is in C++
68 /// The goal is to extract all names which are potentially
69 /// columns. The difficulty is to catch also the names containing dots.
70 std::set<std::string> GetPotentialColumnNames(const std::string &expr)
71 {
72  lexertk::generator generator;
73  const auto ok = generator.process(expr);
74  if (!ok) {
75  const auto msg = "Failed to tokenize expression:\n" + expr + "\n\nMake sure it is valid C++.";
76  throw std::runtime_error(msg);
77  }
78 
79  std::set<std::string> potCols;
80  const auto nToks = generator.size();
81  std::string potColWithDots;
82 
83  auto IsSymbol = [](const lexertk::token &t) { return t.type == lexertk::token::e_symbol; };
84  auto IsDot = [](const lexertk::token &t) { return t.value == "."; };
85 
86  // Now we start iterating over the tokens
87  for (auto i = 0ULL; i < nToks; ++i) {
88  auto &tok = generator[i];
89  if (!IsSymbol(tok))
90  continue;
91 
92  if (i == 0 || (i > 0 && !IsDot(generator[i - 1])))
93  potCols.insert(tok.value);
94 
95  // after the current token we may have a chain of .<symbol>.<symbol>...
96  // we need to build a set of potential columns incrementally
97  // and stop at the right point. All this advancing the token
98  // cursor.
99  potColWithDots = tok.value;
100  while (i < nToks) {
101  if (i + 2 == nToks)
102  break;
103  auto &nextTok = generator[i + 1];
104  auto &next2nextTok = generator[i + 2];
105  if (!IsDot(nextTok) || !IsSymbol(next2nextTok)) {
106  break;
107  }
108  potColWithDots += "." + next2nextTok.value;
109  potCols.insert(potColWithDots);
110  i += 2;
111  }
112  potColWithDots = "";
113  }
114  return potCols;
115 }
116 
117 // The set here is used as a registry, the real list, which keeps the order, is
118 // the one in the vector
119 class RActionBase;
120 
/// Create a new RDataFrame from (`treeName`, `fileName`, `validCols`) and wrap it in a
/// result pointer built from `loopManager` and `actionPtr` (ownership of which is moved in).
/// If `isLazy` is false the result pointer is dereferenced before being returned.
/// NOTE(review): listing line 129 is not visible in this excerpt -- confirm against the full source.
121 HeadNode_t CreateSnapshotRDF(const ColumnNames_t &validCols,
122  std::string_view treeName,
123  std::string_view fileName,
124  bool isLazy,
125  RLoopManager &loopManager,
126  std::unique_ptr<RDFInternal::RActionBase> actionPtr)
127 {
128  // create new RDF
130  auto snapshotRDF = std::make_shared<ROOT::RDataFrame>(treeName, fileName, validCols);
131  auto snapshotRDFResPtr = MakeResultPtr(snapshotRDF, loopManager, std::move(actionPtr));
132 
133  if (!isLazy) {
 // the dereference is done for its side effect only -- presumably it triggers the computation (TODO confirm)
134  *snapshotRDFResPtr;
135  }
136  return snapshotRDFResPtr;
137 }
138 
139 std::string DemangleTypeIdName(const std::type_info &typeInfo)
140 {
141  int dummy(0);
142  return TClassEdit::DemangleTypeIdName(typeInfo, dummy);
143 }
144 
146  TTree *tree,
147  ROOT::RDF::RDataSource *dataSource,
148  std::string_view columnNameRegexp,
149  std::string_view callerName)
150 {
151  const auto theRegexSize = columnNameRegexp.size();
152  std::string theRegex(columnNameRegexp);
153 
154  const auto isEmptyRegex = 0 == theRegexSize;
155  // This is to avoid cases where branches called b1, b2, b3 are all matched by expression "b"
156  if (theRegexSize > 0 && theRegex[0] != '^')
157  theRegex = "^" + theRegex;
158  if (theRegexSize > 0 && theRegex[theRegexSize - 1] != '$')
159  theRegex = theRegex + "$";
160 
161  ColumnNames_t selectedColumns;
162  selectedColumns.reserve(32);
163 
164  // Since we support gcc48 and it does not provide in its stl std::regex,
165  // we need to use TRegexp
166  TPRegexp regexp(theRegex);
167  for (auto &&branchName : customColumns.GetNames()) {
168  if ((isEmptyRegex || 0 != regexp.Match(branchName.c_str())) &&
169  !RDFInternal::IsInternalColumn(branchName)) {
170  selectedColumns.emplace_back(branchName);
171  }
172  }
173 
174  if (tree) {
175  auto branchNames = RDFInternal::GetTopLevelBranchNames(*tree);
176  for (auto &branchName : branchNames) {
177  if (isEmptyRegex || 0 != regexp.Match(branchName.c_str())) {
178  selectedColumns.emplace_back(branchName);
179  }
180  }
181  }
182 
183  if (dataSource) {
184  auto &dsColNames = dataSource->GetColumnNames();
185  for (auto &dsColName : dsColNames) {
186  if ((isEmptyRegex || 0 != regexp.Match(dsColName.c_str())) &&
187  !RDFInternal::IsInternalColumn(dsColName)) {
188  selectedColumns.emplace_back(dsColName);
189  }
190  }
191  }
192 
193  if (selectedColumns.empty()) {
194  std::string text(callerName);
195  if (columnNameRegexp.empty()) {
196  text = ": there is no column available to match.";
197  } else {
198  text = ": regex \"" + std::string(columnNameRegexp) + "\" did not match any column.";
199  }
200  throw std::runtime_error(text);
201  }
202  return selectedColumns;
203 }
204 
205 void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
206  std::set<TTree *> &analysedTrees)
207 {
208 
209  if (!analysedTrees.insert(&t).second) {
210  return;
211  }
212 
213  auto branches = t.GetListOfBranches();
214  if (branches) {
215  for (auto branchObj : *branches) {
216  auto name = branchObj->GetName();
217  if (bNamesReg.insert(name).second) {
218  bNames.emplace_back(name);
219  }
220  }
221  }
222 
223  auto friendTrees = t.GetListOfFriends();
224 
225  if (!friendTrees)
226  return;
227 
228  for (auto friendTreeObj : *friendTrees) {
229  auto friendTree = ((TFriendElement *)friendTreeObj)->GetTree();
230  GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees);
231  }
232 }
233 
234 ///////////////////////////////////////////////////////////////////////////////
235 /// Get all the top-level branches names, including the ones of the friend trees
237 {
 // bNamesSet only de-duplicates names, bNames is the ordered result,
 // analysedTrees keeps track of the trees already visited by the recursive helper
238  std::set<std::string> bNamesSet;
239  ColumnNames_t bNames;
240  std::set<TTree *> analysedTrees;
241  GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
242  return bNames;
243 }
244 
/// Tell whether `var` is non-empty, starts with an ASCII letter or an underscore and
/// contains only ASCII letters, digits and underscores.
bool IsValidCppVarName(const std::string &var)
{
   static const std::string kFirstCharSet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_";
   static const std::string kAnyCharSet = kFirstCharSet + "0123456789";

   if (var.empty())
      return false;

   // a leading digit (or any other character outside the first-char set) disqualifies the name
   if (kFirstCharSet.find(var.front()) == std::string::npos)
      return false;

   return var.find_first_not_of(kAnyCharSet) == std::string::npos;
}
266 
267 void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols,
268  const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &dataSourceColumns)
269 {
270  const std::string definedColStr(definedCol);
271 
272  if (!IsValidCppVarName(definedColStr)) {
273  const auto msg = "Cannot define column \"" + definedColStr + "\": not a valid C++ variable name.";
274  throw std::runtime_error(msg);
275  }
276 
277  if (treePtr != nullptr) {
278  // check if definedCol is already present in TTree
279  const auto branch = treePtr->GetBranch(definedColStr.c_str());
280  if (branch != nullptr) {
281  const auto msg = "branch \"" + definedColStr + "\" already present in TTree";
282  throw std::runtime_error(msg);
283  }
284  }
285  // check if definedCol has already been `Define`d in the functional graph
286  if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end()) {
287  const auto msg = "Redefinition of column \"" + definedColStr + "\"";
288  throw std::runtime_error(msg);
289  }
290 
291  // Check if the definedCol is an alias
292  const auto aliasColNameIt = aliasMap.find(definedColStr);
293  if (aliasColNameIt != aliasMap.end()) {
294  const auto msg = "An alias with name " + definedColStr + " pointing to column " +
295  aliasColNameIt->second + " is already existing.";
296  throw std::runtime_error(msg);
297  }
298 
299  // check if definedCol is already present in the DataSource (but has not yet been `Define`d)
300  if (!dataSourceColumns.empty()) {
301  if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end()) {
302  const auto msg = "Redefinition of column \"" + definedColStr + "\" already present in the data-source";
303  throw std::runtime_error(msg);
304  }
305  }
306 }
307 
/// Throw std::runtime_error unless the number of template parameters equals the number of column names.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   throw std::runtime_error("The number of template parameters specified is " + std::to_string(nTemplateParams) +
                            " while " + std::to_string(nColumnNames) + " columns have been specified.");
}
319 
320 /// Choose between local column names or default column names, throw in case of errors.
321 const ColumnNames_t
322 SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
323 {
324  if (names.empty()) {
325  // use default column names
326  if (defaultNames.size() < nRequiredNames)
327  throw std::runtime_error(
328  std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
329  " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
330  // return first nRequiredNames default column names
331  return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
332  } else {
333  // use column names provided by the user to this particular transformation/action
334  if (names.size() != nRequiredNames) {
335  auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
336  " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
337  " provided:";
338  for (const auto &name : names)
339  msg += " \"" + name + "\",";
340  msg.back() = '.';
341  throw std::runtime_error(msg);
342  }
343  return names;
344  }
345 }
346 
347 ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
348  const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
349 {
350  ColumnNames_t unknownColumns;
351  for (auto &column : requiredCols) {
352  const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
353  if (isBranch)
354  continue;
355  const auto isCustomColumn = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
356  if (isCustomColumn)
357  continue;
358  const auto isDataSourceColumn =
359  std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
360  if (isDataSourceColumn)
361  continue;
362  unknownColumns.emplace_back(column);
363  }
364  return unknownColumns;
365 }
366 
/// Tell whether `colName` has the shape of an internal column name: longer than three
/// characters, starting with "rdf" or "tdf" and ending with an underscore.
bool IsInternalColumn(std::string_view colName)
{
   // needs more characters than just the {r,t}df prefix
   if (colName.size() <= 3)
      return false;

   const bool startsWithRorT = colName[0] == 'r' || colName[0] == 't';
   const bool continuesWithDf = colName[1] == 'd' && colName[2] == 'f';
   return startsWithRorT && continuesWithDf && colName.back() == '_';
}
375 
/// Return the result of calling GetFiltersNames() on `loopManager`.
/// The pointer is dereferenced unconditionally, so it must not be null.
376 std::vector<std::string> GetFilterNames(const std::shared_ptr<RLoopManager> &loopManager)
377 {
378  return loopManager->GetFiltersNames();
379 }
380 
/// Replace, in place, all the non-overlapping occurrences of `what` in `s` by `withWhat`.
/// The search resumes right after each inserted replacement, so text introduced by a
/// replacement is never itself replaced.
/// \return the number of replacements performed (0 if `what` is empty).
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
{
   // An empty pattern is found at every position: without this guard the loop below
   // would never terminate (and `s` would grow without bound when `withWhat` is not empty).
   if (what.empty())
      return 0U;

   size_t idx = 0;
   auto numReplacements = 0U;
   while ((idx = s.find(what, idx)) != std::string::npos) {
      s.replace(idx, what.size(), withWhat);
      idx += withWhat.size(); // skip what was just inserted
      ++numReplacements;
   }
   return numReplacements;
}
393 
394 // Match expression against names of branches passed as parameter
395 // Return vector of names of the branches used in the expression
396 std::vector<std::string> FindUsedColumnNames(std::string_view expression, ColumnNames_t branches,
397  const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns,
398  const std::map<std::string, std::string> &aliasMap)
399 {
400  // To help matching the regex
401  const auto potCols = GetPotentialColumnNames(std::string(expression));
402 
403  if (potCols.size() == 0) return {};
404 
405  std::set<std::string> usedBranches;
406 
407  // Check which custom columns match
408  for (auto &brName : customColumns) {
409  if (potCols.find(brName) != potCols.end()) {
410  usedBranches.insert(brName);
411  }
412  }
413 
414  // Check which tree branches match
415  // We need to match the longest
416 
417  // First: reverse sort to have longer branches before, e.g.
418  // a.b.c
419  // a.b
420  // a
421  // We want that the longest branch ends up in usedBranches before.
422  std::sort(branches.begin(), branches.end(),
423  [](const std::string &s0, const std::string &s1) {return s0 > s1;});
424 
425  for (auto &brName : branches) {
426  // If the branch is not in the potential columns, we simply move on
427  if (potCols.find(brName) == potCols.end()) {
428  continue;
429  }
430  // If not, we check if the branch name is contained in one of the branch
431  // names which we already added to the usedBranches.
432  auto isContained = [&brName](const std::string &usedBr) {
433  // We check two things:
434  // 1. That the string is contained, e.g. a.b is contained in a.b.c.d
435  // 2. That the number of '.'s is greater, otherwise in situations where
436  // 2 branches have names like br0 and br01, br0 is not matched (ROOT-9929)
437  return usedBr.find(brName) != std::string::npos &&
438  std::count(usedBr.begin(), usedBr.end(), '.') > std::count(brName.begin(), brName.end(), '.');
439  };
440  auto it = std::find_if(usedBranches.begin(), usedBranches.end(), isContained);
441  if (it == usedBranches.end()) {
442  usedBranches.insert(brName);
443  }
444  }
445 
446  // Check which data-source columns match
447  for (auto &col : dsColumns) {
448  if (potCols.find(col) != potCols.end()) {
449  usedBranches.insert(col);
450  }
451  }
452 
453  // Check which aliases match
454  for (auto &alias_colName : aliasMap) {
455  auto &alias = alias_colName.first;
456  if (potCols.find(alias) != potCols.end()) {
457  usedBranches.insert(alias);
458  }
459  }
460 
461  return std::vector<std::string>(usedBranches.begin(), usedBranches.end());
462 }
463 
464 // TODO we should also replace other invalid chars, like '[],' and spaces
465 std::vector<std::string> ReplaceDots(const ColumnNames_t &colNames)
466 {
467  std::vector<std::string> dotlessNames = colNames;
468  for (auto &c : dotlessNames) {
469  const bool hasDot = c.find_first_of('.') != std::string::npos;
470  if (hasDot) {
471  std::replace(c.begin(), c.end(), '.', '_');
472  c.insert(0u, "__rdf_arg_");
473  }
474  }
475  return dotlessNames;
476 }
477 
478 // TODO comment well -- there is a lot going on in this function in terms of side-effects
// Walk `colNames` and `varNames` in lockstep (they are expected to have matching entries)
// and return the type name of every column that is really used in `expr`.
// Side effects on the arguments:
//  * expr: each occurrence of a column name containing dots is replaced by the matching entry of varNames
//  * colNames, varNames: entries for columns that turn out not to be used in expr are erased from both
// The returned vector has one entry per column left in colNames, in the same order.
479 std::vector<std::string> ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames,
480  const std::map<std::string, std::string> &aliasMap, TTree *tree,
481  RDataSource *ds, std::string &expr, unsigned int namespaceID,
482  const RDFInternal::RBookedCustomColumns &customCols)
483 {
484  std::vector<std::string> colTypes;
485  colTypes.reserve(colNames.size());
486  const auto aliasMapEnd = aliasMap.end();
487 
 // no increment in the for statement: iterators advance either via erase() or at the end of the body
488  for (auto c = colNames.begin(), v = varNames.begin(); c != colNames.end();) {
489  const auto &colName = *c;
490 
491  if (colName.find('.') != std::string::npos) {
492  // If the column name contains dots, replace its name in the expression with the corresponding varName
493  auto numRepl = Replace(expr, colName, *v);
494  if (numRepl == 0) {
495  // Discard this column: we could not replace it, although we matched it previously
496  // This is because it is a substring of a column we already replaced in the expression
497  // e.g. "a.b" is a substring column of "a.b.c"
498  c = colNames.erase(c);
499  v = varNames.erase(v);
500  continue;
501  }
502  } else {
503  // Column name with no dots: check the name is still there
504  // it might have only been there as part of a column name with dots, e.g. "a" inside "a.b.c"
 // the expression is padded with spaces so that a name at its very start or end
 // still has a non-word character on both sides
505  const auto paddedExpr = " " + expr + " ";
506  static const std::string noWordChars("[^a-zA-Z0-9_]");
 // NOTE(review): colName is inserted in the pattern as-is, without escaping regex metacharacters
507  const auto colNameRxBody = noWordChars + colName + noWordChars;
508  TRegexp colNameRegex(colNameRxBody.c_str());
509  Ssiz_t matchedLen;
510  const auto colStillThere = colNameRegex.Index(paddedExpr.c_str(), &matchedLen) != -1;
511  if (!colStillThere) {
512  c = colNames.erase(c);
513  v = varNames.erase(v);
514  continue;
515  }
516  }
517 
518  // Replace the colName with the real one in case colName it's an alias
519  // The real name is used to get the type, but the variable name will still be colName
520  const auto aliasMapIt = aliasMap.find(colName);
521  const auto &realColName = aliasMapEnd == aliasMapIt ? colName : aliasMapIt->second;
522  // The map is a const reference, so no operator[]
523  const auto isCustomCol = customCols.HasName(realColName);
524  const auto customColID = isCustomCol ? customCols.GetColumns().at(realColName)->GetID() : 0;
525  const auto colTypeName =
526  ColumnName2ColumnTypeName(realColName, namespaceID, tree, ds, isCustomCol, /*vector2rvec=*/true, customColID);
527  colTypes.emplace_back(colTypeName);
528  ++c, ++v;
529  }
530 
531  return colTypes;
532 }
533 
534 // Jit expression "in the vacuum", throw if cling exits with an error
535 // This is to make sure that column names, types and expression string are proper C++
536 void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames,
537  const std::vector<std::string> &colTypes, bool hasReturnStmt)
538 {
539  R__ASSERT(colNames.size() == colTypes.size());
540 
541  static unsigned int iNs = 0U;
542  std::stringstream dummyDecl;
543  dummyDecl << "namespace __rdf_" << std::to_string(iNs++) << "{ auto rdf_f = []() {";
544 
545  for (auto col = colNames.begin(), type = colTypes.begin(); col != colNames.end(); ++col, ++type) {
546  dummyDecl << *type << " " << *col << ";\n";
547  }
548 
549  // Now that branches are declared as variables, put the body of the
550  // lambda in dummyDecl and close scopes of f and namespace __rdf_N
551  if (hasReturnStmt)
552  dummyDecl << expression << "\n;};}";
553  else
554  dummyDecl << "return " << expression << "\n;};}";
555 
556  // Try to declare the dummy lambda, error out if it does not compile
557  if (!gInterpreter->Declare(dummyDecl.str().c_str())) {
558  auto msg =
559  "Cannot interpret the following expression:\n" + std::string(expression) + "\n\nMake sure it is valid C++.";
560  throw std::runtime_error(msg);
561  }
562 }
563 
564 std::string
565 BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
566 {
567  R__ASSERT(vars.size() == varTypes.size());
568 
569  std::stringstream ss;
570  ss << "[](";
571  for (auto i = 0u; i < vars.size(); ++i) {
572  // We pass by reference to avoid expensive copies
573  // It can't be const reference in general, as users might want/need to call non-const methods on the values
574  ss << varTypes[i] << "& " << vars[i] << ", ";
575  }
576  if (!vars.empty())
577  ss.seekp(-2, ss.cur);
578 
579  if (hasReturnStmt)
580  ss << "){";
581  else
582  ss << "){return ";
583  ss << expr << "\n;}";
584 
585  return ss.str();
586 }
587 
/// Return `addr` formatted as a hexadecimal number with base prefix.
std::string PrettyPrintAddr(const void *const addr)
{
   // Windows-friendly: the address is streamed as an integer rather than as a pointer
   const auto addrAsInt = reinterpret_cast<size_t>(addr);

   std::stringstream formatter;
   formatter.setf(std::ios_base::hex, std::ios_base::basefield);
   formatter.setf(std::ios_base::showbase);
   formatter << addrAsInt;
   return formatter.str();
}
595 
596 // Jit a string filter expression and jit-and-call this->Filter with the appropriate arguments
597 // Nothing is returned: the generated JitFilterHelper call is handed to the loop manager via ToJitExec
598 
// Steps: find the columns used in `expression`, make the expression dot-free, check that it
// compiles (TryToJitExpression), build the lambda source and finally the text of the
// ROOT::Internal::RDF::JitFilterHelper invocation, in which the addresses of `jittedFilter`,
// `prevNodeOnHeap` and `columnsOnHeap` are embedded as hexadecimal literals.
599 void BookFilterJit(RJittedFilter *jittedFilter, void *prevNodeOnHeap, std::string_view name,
600  std::string_view expression, const std::map<std::string, std::string> &aliasMap,
601  const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree,
602  RDataSource *ds, unsigned int namespaceID)
603 {
604  const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
605 
606  // not const because `ColumnTypesAsString` might delete redundant matches and replace variable names
607  auto usedBranches = FindUsedColumnNames(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
608  auto varNames = ReplaceDots(usedBranches);
609  auto dotlessExpr = std::string(expression);
610  const auto usedColTypes =
611  ColumnTypesAsString(usedBranches, varNames, aliasMap, tree, ds, dotlessExpr, namespaceID, customCols);
612 
 // look for the word "return" in the expression: if absent, "return " is prepended to it later on
613  TRegexp re("[^a-zA-Z0-9_]?return[^a-zA-Z0-9_]");
614  Ssiz_t matchedLen;
615  const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
616 
617  auto lm = jittedFilter->GetLoopManagerUnchecked();
618  lm->JitDeclarations(); // TryToJitExpression might need some of the Define'd column type aliases
619  TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
620 
621  const auto filterLambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
622 
623  const auto jittedFilterAddr = PrettyPrintAddr(jittedFilter);
624  const auto prevNodeAddr = PrettyPrintAddr(prevNodeOnHeap);
625 
626  // columnsOnHeap is deleted by the jitted call to JitFilterHelper
 // NOTE(review): the definition of columnsOnHeap (listing line 627) is not visible in this excerpt
628  const auto columnsOnHeapAddr = PrettyPrintAddr(columnsOnHeap);
629 
630  // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
631  // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
632  std::stringstream filterInvocation;
633  filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << filterLambda << ", {";
634  for (const auto &brName : usedBranches) {
635  // Here we selectively replace the brName with the real column name if it's necessary.
636  const auto aliasMapIt = aliasMap.find(brName);
637  auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
638  filterInvocation << "\"" << realBrName << "\", ";
639  }
640  if (!usedBranches.empty())
641  filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
642  filterInvocation << "}, \"" << name << "\", "
643  << "reinterpret_cast<ROOT::Detail::RDF::RJittedFilter*>(" << jittedFilterAddr << "), "
644  << "reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>(" << prevNodeAddr << "),"
645  << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << columnsOnHeapAddr << ")"
646  << ");";
647 
648  lm->ToJitExec(filterInvocation.str());
649 }
650 
651 // Jit a Define call
652 void BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds,
653  const std::shared_ptr<RJittedCustomColumn> &jittedCustomColumn,
654  const RDFInternal::RBookedCustomColumns &customCols, const ColumnNames_t &branches)
655 {
656  const auto &aliasMap = lm.GetAliasMap();
657  auto *const tree = lm.GetTree();
658  const auto namespaceID = lm.GetID();
659  const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
660 
661  // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
662  auto usedBranches = FindUsedColumnNames(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
663  auto varNames = ReplaceDots(usedBranches);
664  auto dotlessExpr = std::string(expression);
665  const auto usedColTypes =
666  ColumnTypesAsString(usedBranches, varNames, aliasMap, tree, ds, dotlessExpr, namespaceID, customCols);
667 
668  TRegexp re("[^a-zA-Z0-9_]?return[^a-zA-Z0-9_]");
669  Ssiz_t matchedLen;
670  const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
671 
672  lm.JitDeclarations(); // TryToJitExpression might need some of the Define'd column type aliases
673  TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
674 
675  const auto definelambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
676  const auto customColID = std::to_string(jittedCustomColumn->GetID());
677  const auto lambdaName = "eval_" + std::string(name) + customColID;
678  const auto ns = "__rdf" + std::to_string(namespaceID);
679 
680  auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols);
681  auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
682 
683  // Declare the lambda variable and an alias for the type of the defined column in namespace __rdf
684  // This assumes that a given variable is Define'd once per RDataFrame -- we might want to relax this requirement
685  // to let python users execute a Define cell multiple times
686  const auto defineDeclaration =
687  "namespace " + ns + " { auto " + lambdaName + " = " + definelambda + ";\n" + "using " + std::string(name) +
688  customColID + "_type = typename ROOT::TypeTraits::CallableTraits<decltype(" + lambdaName + " )>::ret_type; }\n";
689  lm.ToJitDeclare(defineDeclaration);
690 
691  std::stringstream defineInvocation;
692  defineInvocation << "ROOT::Internal::RDF::JitDefineHelper(" << definelambda << ", {";
693  for (auto brName : usedBranches) {
694  // Here we selectively replace the brName with the real column name if it's necessary.
695  auto aliasMapIt = aliasMap.find(brName);
696  auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
697  defineInvocation << "\"" << realBrName << "\", ";
698  }
699  if (!usedBranches.empty())
700  defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
701  defineInvocation << "}, \"" << name << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>("
702  << PrettyPrintAddr(&lm) << "), *reinterpret_cast<ROOT::Detail::RDF::RJittedCustomColumn*>("
703  << PrettyPrintAddr(jittedCustomColumn.get()) << "),"
704  << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << ")"
705  << ");";
706 
707  lm.ToJitExec(defineInvocation.str());
708 }
709 
710 // Jit and call something equivalent to "this->BuildAndBook<BranchTypes...>(params...)"
711 // (see comments in the body for actual jitted code)
712 std::string JitBuildAction(const ColumnNames_t &bl, void *prevNode, const std::type_info &art, const std::type_info &at,
713  void *rOnHeap, TTree *tree, const unsigned int nSlots,
714  const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds,
715  std::shared_ptr<RJittedAction> *jittedActionOnHeap, unsigned int namespaceID)
716 {
717  auto nBranches = bl.size();
718 
719  // retrieve branch type names as strings
720  std::vector<std::string> columnTypeNames(nBranches);
721  for (auto i = 0u; i < nBranches; ++i) {
722  const auto isCustomCol = customCols.HasName(bl[i]);
723  const auto customColID = isCustomCol ? customCols.GetColumns().at(bl[i])->GetID() : 0;
724  const auto columnTypeName =
725  ColumnName2ColumnTypeName(bl[i], namespaceID, tree, ds, isCustomCol, /*vector2rvec=*/true, customColID);
726  if (columnTypeName.empty()) {
727  std::string exceptionText = "The type of column ";
728  exceptionText += bl[i];
729  exceptionText += " could not be guessed. Please specify one.";
730  throw std::runtime_error(exceptionText.c_str());
731  }
732  columnTypeNames[i] = columnTypeName;
733  }
734 
735  // retrieve type of result of the action as a string
736  auto actionResultTypeClass = TClass::GetClass(art);
737  if (!actionResultTypeClass) {
738  std::string exceptionText = "An error occurred while inferring the result type of an operation.";
739  throw std::runtime_error(exceptionText.c_str());
740  }
741  const auto actionResultTypeName = actionResultTypeClass->GetName();
742 
743  // retrieve type of action as a string
744  auto actionTypeClass = TClass::GetClass(at);
745  if (!actionTypeClass) {
746  std::string exceptionText = "An error occurred while inferring the action type of the operation.";
747  throw std::runtime_error(exceptionText.c_str());
748  }
749  const auto actionTypeName = actionTypeClass->GetName();
750 
751  auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols); // deleted in jitted CallBuildAction
752  auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
753 
754  // Build a call to CallBuildAction with the appropriate argument. When run through the interpreter, this code will
755  // just-in-time create an RAction object and it will assign it to its corresponding RJittedAction.
756  std::stringstream createAction_str;
757  createAction_str << "ROOT::Internal::RDF::CallBuildAction"
758  << "<" << actionTypeName;
759  for (auto &colType : columnTypeNames)
760  createAction_str << ", " << colType;
761  // on Windows, to prefix the hexadecimal value of a pointer with '0x',
762  // one need to write: std::hex << std::showbase << (size_t)pointer
763  createAction_str << ">(reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
764  << PrettyPrintAddr(prevNode) << "), {";
765  for (auto i = 0u; i < bl.size(); ++i) {
766  if (i != 0u)
767  createAction_str << ", ";
768  createAction_str << '"' << bl[i] << '"';
769  }
770  createAction_str << "}, " << std::dec << std::noshowbase << nSlots << ", reinterpret_cast<" << actionResultTypeName
771  << "*>(" << PrettyPrintAddr(rOnHeap) << ")"
772  << ", reinterpret_cast<std::shared_ptr<ROOT::Internal::RDF::RJittedAction>*>("
773  << PrettyPrintAddr(jittedActionOnHeap) << "),"
774  << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << ")"
775  << ");";
776  return createAction_str.str();
777 }
778 
/// Return true if at least one element of `strings` is empty, false otherwise
/// (in particular, false for an empty vector).
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   return std::any_of(strings.begin(), strings.end(), [](const std::string_view &s) { return s.empty(); });
}
787 
/// Return the argument unchanged. Since the parameter type is std::shared_ptr<RNodeBase>,
/// any conversion from another shared_ptr type happens at the call site.
788 std::shared_ptr<RNodeBase> UpcastNode(std::shared_ptr<RNodeBase> ptr)
789 {
790  return ptr;
791 }
792 
793 /// Given the desired number of columns and the user-provided list of columns:
794 /// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
795 /// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
796 /// * replace column names from aliases by the actual column name
797 /// Return the list of selected column names.
798 ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
799  const ColumnNames_t &validCustomColumns, RDataSource *ds)
800 {
801  const auto &defaultColumns = lm.GetDefaultColumnNames();
802  auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
803  const auto &validBranchNames = lm.GetBranchNames();
804  const auto unknownColumns = FindUnknownColumns(selectedColumns, validBranchNames, validCustomColumns,
805  ds ? ds->GetColumnNames() : ColumnNames_t{});
806 
807  if (!unknownColumns.empty()) {
808  // throw
809  std::stringstream unknowns;
810  std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
811  for (auto &unknownColumn : unknownColumns) {
812  unknowns << delim << unknownColumn;
813  delim = ',';
814  }
815  throw std::runtime_error("Unknown column" + unknowns.str());
816  }
817 
818  // Now we need to check within the aliases if some of the yet unknown names can be recovered
819  auto &aliasMap = lm.GetAliasMap();
820  auto aliasMapEnd = aliasMap.end();
821 
822  for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
823  const auto &colName = selectedColumns[idx];
824  const auto aliasColumnNameIt = aliasMap.find(colName);
825  if (aliasMapEnd != aliasColumnNameIt) {
826  selectedColumns[idx] = aliasColumnNameIt->second;
827  }
828  }
829 
830  return selectedColumns;
831 }
832 
833 /// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
834 /// name of a column that must be defined via datasource. All elements of the returned vector are false if no
835 /// data-source is present.
836 std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
837 {
838  const auto nColumns = requestedCols.size();
839  std::vector<bool> mustBeDefined(nColumns, false);
840  for (auto i = 0u; i < nColumns; ++i)
841  mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
842  return mustBeDefined;
843 }
844 
845 } // namespace RDF
846 } // namespace Internal
847 } // namespace ROOT
void JitDeclarations()
Declare to the interpreter type aliases and other entities required by RDF jitted nodes...
void GetTopLevelBranchNamesImpl(TTree &t, std::set< std::string > &bNamesReg, ColumnNames_t &bNames, std::set< TTree *> &analysedTrees)
The head node of a RDF computation graph.
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
std::vector< std::string > ReplaceDots(const ColumnNames_t &colNames)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
Returns the available number of logical cores.
Definition: StringConv.hxx:21
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
bool IsInternalColumn(std::string_view colName)
#define s0(x)
Definition: RSha256.hxx:90
std::string BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
void BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const std::shared_ptr< RJittedCustomColumn > &jittedCustomColumn, const RDFInternal::RBookedCustomColumns &customCols, const ColumnNames_t &branches)
std::vector< std::string > FindUsedColumnNames(std::string_view expression, ColumnNames_t branches, const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns, const std::map< std::string, std::string > &aliasMap)
virtual TList * GetListOfFriends() const
Definition: TTree.h:469
Regular expression class.
Definition: TRegexp.h:31
#define R__ASSERT(e)
Definition: TError.h:96
std::string ColumnName2ColumnTypeName(const std::string &colName, unsigned int namespaceID, TTree *tree, RDataSource *ds, bool isCustomColumn, bool vector2rvec, unsigned int customColID)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:197
#define gInterpreter
Definition: TInterpreter.h:555
void ToJitDeclare(const std::string &s)
HeadNode_t CreateSnapshotRDF(const ColumnNames_t &validCols, std::string_view treeName, std::string_view fileName, bool isLazy, RLoopManager &loopManager, std::unique_ptr< RDFInternal::RActionBase > actionPtr)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
void ToJitExec(const std::string &s)
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
virtual TObjArray * GetListOfBranches()
Definition: TTree.h:467
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame...
bool IsValidCppVarName(const std::string &var)
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
char * DemangleTypeIdName(const std::type_info &ti, int &errorCode)
Demangle in a portable way the type id name.
A wrapper around a concrete RFilter, which forwards all calls to it RJittedFilter is the type of the ...
static constexpr double s
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition: TTree.cxx:5096
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
std::string DemangleTypeIdName(const std::type_info &typeInfo)
void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &dataSourceColumns)
void BookFilterJit(RJittedFilter *jittedFilter, void *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree, RDataSource *ds, unsigned int namespaceID)
void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames, const std::vector< std::string > &colTypes, bool hasReturnStmt)
RResultPtr< T > MakeResultPtr(const std::shared_ptr< T > &r, RLoopManager &df, std::shared_ptr< ROOT::Internal::RDF::RActionBase > actionPtr)
std::set< std::string > GetPotentialColumnNames(const std::string &expr)
A tokeniser for the expression which is in C++. The goal is to extract all names which are potentially...
#define s1(x)
Definition: RSha256.hxx:91
virtual RLoopManager * GetLoopManagerUnchecked()
Definition: RNodeBase.hxx:64
int Ssiz_t
Definition: RtypesCore.h:63
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validCustomColumns, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TText * text
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
int type
Definition: TGX11.cxx:120
static RooMathCoreReg dummy
const RCustomColumnBasePtrMap_t & GetColumns() const
Returns the list of the pointers to the defined columns.
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2906
Small helper to keep current directory context.
Definition: TDirectory.h:41
std::string PrettyPrintAddr(const void *const addr)
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
A TFriendElement TF describes a TTree object TF in a file.
static constexpr double ns
bool HasName(std::string_view name) const
Check if the provided name is tracked in the names list.
#define c(i)
Definition: RSha256.hxx:101
Definition: tree.py:1
Encapsulates the columns defined by the user.
A TTree represents a columnar dataset.
Definition: TTree.h:72
const std::map< std::string, std::string > & GetAliasMap() const
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
The number of matches is returned, this equals the full match + sub-pattern matches.
Definition: TPRegexp.cxx:339
std::string JitBuildAction(const ColumnNames_t &bl, void *prevNode, const std::type_info &art, const std::type_info &at, void *rOnHeap, TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds, std::shared_ptr< RJittedAction > *jittedActionOnHeap, unsigned int namespaceID)
std::vector< std::string > ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames, const std::map< std::string, std::string > &aliasMap, TTree *tree, RDataSource *ds, std::string &expr, unsigned int namespaceID, const RDFInternal::RBookedCustomColumns &customCols)
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
char name[80]
Definition: TGX11.cxx:109
ROOT::Detail::RDF::ColumnNames_t ColumnNames_t
Definition: RDataFrame.cxx:797
ColumnNames_t ConvertRegexToColumns(const RDFInternal::RBookedCustomColumns &customColumns, TTree *tree, ROOT::RDF::RDataSource *dataSource, std::string_view columnNameRegexp, std::string_view callerName)