Logo ROOT  
Reference Guide
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
12#include <ROOT/RDataFrame.hxx>
13#include <ROOT/RStringView.hxx>
14#include <ROOT/TSeq.hxx>
15#include <RtypesCore.h>
16#include <TDirectory.h>
17#include <TChain.h>
18#include <TClass.h>
19#include <TClassEdit.h>
20#include <TFriendElement.h>
21#include <TInterpreter.h>
22#include <TObject.h>
23#include <TPRegexp.h>
24#include <TString.h>
25#include <TTree.h>
26
27// Suppress the warnings triggered by the lexertk header included below,
28// which is very noisy to compile
29#if defined(__GNUC__)
30#pragma GCC diagnostic push
31#pragma GCC diagnostic ignored "-Woverloaded-virtual"
32#pragma GCC diagnostic ignored "-Wshadow"
33#endif
34#include "lexertk.hpp"
35#if defined(__GNUC__)
36#pragma GCC diagnostic pop
37#endif
38
39#include <algorithm>
40#include <unordered_set>
41#include <stdexcept>
42#include <string>
43#include <sstream>
44#include <typeinfo>
45
46namespace ROOT {
47namespace Detail {
48namespace RDF {
49class RDefineBase;
50class RFilterBase;
51class RLoopManager;
52class RRangeBase;
53} // namespace RDF
54} // namespace Detail
55
56namespace RDF {
57class RDataSource;
58} // namespace RDF
59
60} // namespace ROOT
61
62namespace {
64
65/// A string expression such as those passed to Filter and Define, digested to a standardized form.
/// ParseRDFExpression fills this aggregate positionally (fExpr, fUsedCols, fVarNames), so the
/// order of the members below must not be changed.
66struct ParsedExpression {
67 /// The string expression with the dummy variable names in fVarNames in place of the original column names
68 std::string fExpr;
69 /// The list of valid column names that were used in the original string expression.
70 /// Duplicates are removed and column aliases (created with Alias calls) are resolved.
71 ColumnNames_t fUsedCols;
72 /// The list of variable names used in fExpr, with same ordering and size as fUsedCols
73 ColumnNames_t fVarNames;
74};
75
/// Return true if at least one element of `vec` compares equal to `str`.
static bool IsStrInVec(const std::string &str, const std::vector<std::string> &vec)
{
   for (const auto &candidate : vec) {
      if (candidate == str)
         return true;
   }
   return false;
}
80
81// look at expression `expr` and return a list of column names used, including aliases
82static ColumnNames_t FindUsedColumns(const std::string &expr, const ColumnNames_t &treeBranchNames,
83 const ColumnNames_t &customColNames, const ColumnNames_t &dataSourceColNames,
84 const std::map<std::string, std::string> &aliasMap)
85{
86 ColumnNames_t usedCols;
87
88 lexertk::generator tokens;
89 const auto tokensOk = tokens.process(expr);
90 if (!tokensOk) {
91 const auto msg = "Failed to tokenize expression:\n" + expr + "\n\nMake sure it is valid C++.";
92 throw std::runtime_error(msg);
93 }
94
95 // iterate over tokens in expression and fill usedCols, varNames and exprWithVars
96 const auto nTokens = tokens.size();
97 const auto kSymbol = lexertk::token::e_symbol;
98 for (auto i = 0u; i < nTokens; ++i) {
99 const auto &tok = tokens[i];
100 // lexertk classifies '&' as e_symbol for some reason
101 if (tok.type != kSymbol || tok.value == "&" || tok.value == "|") {
102 // token is not a potential variable name, skip it
103 continue;
104 }
105
106 ColumnNames_t potentialColNames({tok.value});
107
108 // if token is the start of a dot chain (a.b.c...), a.b, a.b.c etc. are also potential column names
109 auto dotChainKeepsGoing = [&](unsigned int _i) {
110 return _i + 2 <= nTokens && tokens[_i + 1].value == "." && tokens[_i + 2].type == kSymbol;
111 };
112 while (dotChainKeepsGoing(i)) {
113 potentialColNames.emplace_back(potentialColNames.back() + "." + tokens[i + 2].value);
114 i += 2; // consume the tokens we looked at
115 }
116
117 // find the longest potential column name that is an actual column name
118 // if it's a new match, also add it to usedCols and update varNames
119 // potential columns are sorted by length, so we search from the end
120 auto isRDFColumn = [&](const std::string &columnOrAlias) {
121 const auto &col = ROOT::Internal::RDF::ResolveAlias(columnOrAlias, aliasMap);
122 if (IsStrInVec(col, customColNames) || IsStrInVec(col, treeBranchNames) || IsStrInVec(col, dataSourceColNames))
123 return true;
124 return false;
125 };
126 const auto longestRDFColMatch = std::find_if(potentialColNames.crbegin(), potentialColNames.crend(), isRDFColumn);
127
128 if (longestRDFColMatch != potentialColNames.crend() && !IsStrInVec(*longestRDFColMatch, usedCols)) {
129 // found a new RDF column in the expression (potentially an alias)
130 usedCols.emplace_back(*longestRDFColMatch);
131 }
132 }
133
134 return usedCols;
135}
136
137static ParsedExpression ParseRDFExpression(std::string_view expr, const ColumnNames_t &treeBranchNames,
138 const ColumnNames_t &customColNames, const ColumnNames_t &dataSourceColNames,
139 const std::map<std::string, std::string> &aliasMap)
140{
141 // transform `#var` into `R_rdf_sizeof_var`
142 TString preProcessedExpr(expr);
143 // match #varname at beginning of the sentence or after not-a-word, but exclude preprocessor directives like #ifdef
144 TPRegexp colSizeReplacer(
145 "(^|\\W)#(?!(ifdef|ifndef|if|else|elif|endif|pragma|define|undef|include|line))([a-zA-Z_][a-zA-Z0-9_]*)");
146 colSizeReplacer.Substitute(preProcessedExpr, "$1R_rdf_sizeof_$3", "g");
147
148 const auto usedColsAndAliases =
149 FindUsedColumns(std::string(preProcessedExpr), treeBranchNames, customColNames, dataSourceColNames, aliasMap);
150
151 auto escapeDots = [](const std::string &s) {
152 TString ss(s);
153 TPRegexp dot("\\.");
154 dot.Substitute(ss, "\\.", "g");
155 return std::string(std::move(ss));
156 };
157
158 ColumnNames_t varNames;
159 ColumnNames_t usedCols;
160 // when we are done, exprWithVars willl be the same as preProcessedExpr but column names will be substituted with
161 // the dummy variable names in varNames
162 TString exprWithVars(preProcessedExpr);
163 for (const auto &colOrAlias : usedColsAndAliases) {
164 const auto col = ROOT::Internal::RDF::ResolveAlias(colOrAlias, aliasMap);
165 unsigned int varIdx; // index of the variable in varName corresponding to col
166 if (!IsStrInVec(col, usedCols)) {
167 usedCols.emplace_back(col);
168 varIdx = varNames.size();
169 varNames.emplace_back("var" + std::to_string(varIdx));
170 } else {
171 // colOrAlias must be an alias that resolves to a column we have already seen.
172 // Find back the corresponding varName
173 varIdx = std::distance(usedCols.begin(), std::find(usedCols.begin(), usedCols.end(), col));
174 }
175 TPRegexp replacer("\\b" + escapeDots(colOrAlias) + "\\b"); // watch out: need to replace colOrAlias, not col
176 replacer.Substitute(exprWithVars, varNames[varIdx], "g");
177 }
178
179 return ParsedExpression{std::string(std::move(exprWithVars)), std::move(usedCols), std::move(varNames)};
180}
181
/// Access the process-wide registry of Filter/Define lambda expressions that were already jitted.
/// The key is the full text of the lambda expression, the value is the name of the jitted variable
/// holding it: for `auto lambda1 = [] { return 42; };` the key is "[] { return 42; }" and the value
/// is "lambda1". The registry is used to avoid declaring the same expression twice.
static std::unordered_map<std::string, std::string> &GetJittedExprs()
{
   static std::unordered_map<std::string, std::string> sLambdaNameByExpr;
   return sLambdaNameByExpr;
}
193
194static std::string
195BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes)
196{
197 assert(vars.size() == varTypes.size());
198
199 TPRegexp re(R"(\breturn\b)");
200 const bool hasReturnStmt = re.MatchB(expr);
201
202 static const std::vector<std::string> fundamentalTypes = {
203 "int",
204 "signed",
205 "signed int",
206 "Int_t",
207 "unsigned",
208 "unsigned int",
209 "UInt_t",
210 "double",
211 "Double_t",
212 "float",
213 "Float_t",
214 "char",
215 "Char_t",
216 "unsigned char",
217 "UChar_t",
218 "bool",
219 "Bool_t",
220 "short",
221 "short int",
222 "Short_t",
223 "long",
224 "long int",
225 "long long int",
226 "Long64_t",
227 "unsigned long",
228 "unsigned long int",
229 "ULong64_t",
230 "std::size_t",
231 "size_t",
232 "Ssiz_t"
233 };
234
235 std::stringstream ss;
236 ss << "[](";
237 for (auto i = 0u; i < vars.size(); ++i) {
238 std::string fullType;
239 const auto &type = varTypes[i];
240 if (std::find(fundamentalTypes.begin(), fundamentalTypes.end(), type) != fundamentalTypes.end()) {
241 // pass it by const value to help detect common mistakes such as if(x = 3)
242 fullType = "const " + type + " ";
243 } else {
244 // We pass by reference to avoid expensive copies
245 // It can't be const reference in general, as users might want/need to call non-const methods on the values
246 fullType = type + "& ";
247 }
248 ss << fullType << vars[i] << ", ";
249 }
250 if (!vars.empty())
251 ss.seekp(-2, ss.cur);
252
253 if (hasReturnStmt)
254 ss << "){";
255 else
256 ss << "){return ";
257 ss << expr << "\n;}";
258
259 return ss.str();
260}
261
262/// Declare a lambda expression to the interpreter in namespace R_rdf, return the name of the jitted lambda.
263/// If the lambda expression is already in GetJittedExprs, return the name for the lambda that has already been jitted.
264static std::string DeclareLambda(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes)
265{
267
268 const auto lambdaExpr = BuildLambdaString(expr, vars, varTypes);
269 auto &exprMap = GetJittedExprs();
270 const auto exprIt = exprMap.find(lambdaExpr);
271 if (exprIt != exprMap.end()) {
272 // expression already there
273 const auto lambdaName = exprIt->second;
274 return lambdaName;
275 }
276
277 // new expression
278 const auto lambdaBaseName = "lambda" + std::to_string(exprMap.size());
279 const auto lambdaFullName = "R_rdf::" + lambdaBaseName;
280
281 const auto toDeclare = "namespace R_rdf {\nauto " + lambdaBaseName + " = " + lambdaExpr + ";\nusing " +
282 lambdaBaseName + "_ret_t = typename ROOT::TypeTraits::CallableTraits<decltype(" +
283 lambdaBaseName + ")>::ret_type;\n}";
285
286 // InterpreterDeclare could throw. If it doesn't, mark the lambda as already jitted
287 exprMap.insert({lambdaExpr, lambdaFullName});
288
289 return lambdaFullName;
290}
291
292/// Each jitted lambda comes with a lambda_ret_t type alias for its return type.
293/// Resolve that alias and return the true type as string.
294static std::string RetTypeOfLambda(const std::string &lambdaName)
295{
296 const auto dt = gROOT->GetType((lambdaName + "_ret_t").c_str());
297 R__ASSERT(dt != nullptr);
298 const auto type = dt->GetFullTypeName();
299 return type;
300}
301
302static void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
303 std::set<TTree *> &analysedTrees, const std::string friendName = "")
304{
305 if (!analysedTrees.insert(&t).second) {
306 return;
307 }
308
309 auto branches = t.GetListOfBranches();
310 if (branches) {
311 for (auto branchObj : *branches) {
312 const auto name = branchObj->GetName();
313 if (bNamesReg.insert(name).second) {
314 bNames.emplace_back(name);
315 } else if (!friendName.empty()) {
316 // If this is a friend and the branch name has already been inserted, it might be because the friend
317 // has a branch with the same name as a branch in the main tree. Let's add it as <friendname>.<branchname>.
318 // If used for a Snapshot, this name will become <friendname>_<branchname> (with an underscore).
319 const auto longName = friendName + "." + name;
320 if (bNamesReg.insert(longName).second)
321 bNames.emplace_back(longName);
322 }
323 }
324 }
325
326 auto friendTrees = t.GetListOfFriends();
327
328 if (!friendTrees)
329 return;
330
331 for (auto friendTreeObj : *friendTrees) {
332 auto friendElement = static_cast<TFriendElement *>(friendTreeObj);
333 auto friendTree = friendElement->GetTree();
334 const std::string frName(friendElement->GetName()); // this gets us the TTree name or the friend alias if any
335 GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees, frName);
336 }
337}
338
339} // anonymous namespace
340
341namespace ROOT {
342namespace Internal {
343namespace RDF {
344
345/// Take a list of column names, return that list with entries starting by '#' filtered out.
346/// The function throws when filtering out a column this way.
347ColumnNames_t FilterArraySizeColNames(const ColumnNames_t &columnNames, const std::string &action)
348{
349 ColumnNames_t columnListWithoutSizeColumns;
350 ColumnNames_t filteredColumns;
351 std::copy_if(columnNames.begin(), columnNames.end(), std::back_inserter(columnListWithoutSizeColumns),
352 [&](const std::string &name) {
353 if (name[0] == '#') {
354 filteredColumns.emplace_back(name);
355 return false;
356 } else {
357 return true;
358 }
359 });
360
361 if (!filteredColumns.empty()) {
362 std::string msg = "Column name(s) {";
363 for (auto &c : filteredColumns)
364 msg += c + ", ";
365 msg[msg.size() - 2] = '}';
366 msg += "will be ignored. Please go through a valid Alias to " + action + " an array size column";
367 throw std::runtime_error(msg);
368 }
369
370 return columnListWithoutSizeColumns;
371}
372
/// Translate `col` to the column name it stands for.
/// An entry in `aliasMap` takes precedence; otherwise `#var` is shorthand for `R_rdf_sizeof_var`;
/// any other name (including a lone "#") is returned unchanged.
std::string ResolveAlias(const std::string &col, const std::map<std::string, std::string> &aliasMap)
{
   const auto aliasIt = aliasMap.find(col);
   if (aliasIt != aliasMap.end())
      return aliasIt->second;

   const bool isSizeShorthand = col.size() > 1 && col[0] == '#';
   return isSizeShorthand ? "R_rdf_sizeof_" + col.substr(1) : col;
}
385
/// Throw if `var` is not usable as a C++ variable name.
/// A valid name is non-empty, starts with an ASCII letter or an underscore and contains only
/// ASCII letters, digits and underscores.
/// \param var the candidate name
/// \param where name of the calling operation, used as prefix of the error message
/// \throws std::runtime_error if the name is not valid
void CheckValidCppVarName(std::string_view var, const std::string &where)
{
   const auto isALetter = [](char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); };
   const auto isANumber = [](char c) { return c >= '0' && c <= '9'; };
   const auto isValidTok = [&isALetter, &isANumber](char c) { return c == '_' || isALetter(c) || isANumber(c); };

   // The emptiness check must come first: indexing an empty string_view is undefined behaviour.
   const bool isValid = !var.empty() && (var[0] == '_' || isALetter(var[0])) &&
                        std::all_of(var.begin(), var.end(), isValidTok);

   if (!isValid) {
      const auto error =
         "RDataFrame::" + where + ": cannot define column \"" + std::string(var) + "\". Not a valid C++ variable name.";
      throw std::runtime_error(error);
   }
}
413
414///////////////////////////////////////////////////////////////////////////////
415/// Get all the top-level branches names, including the ones of the friend trees
417{
418 std::set<std::string> bNamesSet;
419 ColumnNames_t bNames;
420 std::set<TTree *> analysedTrees;
421 GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
422 return bNames;
423}
424
425std::string DemangleTypeIdName(const std::type_info &typeInfo)
426{
427 int dummy(0);
428 char *tn = TClassEdit::DemangleTypeIdName(typeInfo, dummy);
429 std::string tname(tn);
430 free(tn);
431 return tname;
432}
433
435ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
436{
437 const auto theRegexSize = columnNameRegexp.size();
438 std::string theRegex(columnNameRegexp);
439
440 const auto isEmptyRegex = 0 == theRegexSize;
441 // This is to avoid cases where branches called b1, b2, b3 are all matched by expression "b"
442 if (theRegexSize > 0 && theRegex[0] != '^')
443 theRegex = "^" + theRegex;
444 if (theRegexSize > 0 && theRegex[theRegexSize - 1] != '$')
445 theRegex = theRegex + "$";
446
447 ColumnNames_t selectedColumns;
448
449 // Since we support gcc48 and it does not provide in its stl std::regex,
450 // we need to use TPRegexp
451 TPRegexp regexp(theRegex);
452 for (auto &&colName : colNames) {
453 if ((isEmptyRegex || regexp.MatchB(colName.c_str())) && !IsInternalColumn(colName)) {
454 selectedColumns.emplace_back(colName);
455 }
456 }
457
458 if (selectedColumns.empty()) {
459 std::string text(callerName);
460 if (columnNameRegexp.empty()) {
461 text = ": there is no column available to match.";
462 } else {
463 text = ": regex \"" + std::string(columnNameRegexp) + "\" did not match any column.";
464 }
465 throw std::runtime_error(text);
466 }
467 return selectedColumns;
468}
469
470/// Throw if column `definedColView` is already there.
471void CheckForRedefinition(const std::string &where, std::string_view definedColView, const ColumnNames_t &customCols,
472 const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &treeColumns,
473 const ColumnNames_t &dataSourceColumns)
474{
475 const std::string definedCol(definedColView); // convert to std::string
476 std::string error;
477
478 const auto aliasColNameIt = aliasMap.find(definedCol);
479 const bool isAnAlias = aliasColNameIt != aliasMap.end();
480 if (isAnAlias) {
481 error = "An alias with that name, pointing to column \"" + aliasColNameIt->second + "\", already exists.";
482 }
483
484 if (error.empty()) {
485 if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end())
486 error = "A column with that name has already been Define'd. Use Redefine to force redefinition.";
487 // else, check if definedCol is in the list of tree branches. This is a bit better than interrogating the TTree
488 // directly because correct usage of GetBranch, FindBranch, GetLeaf and FindLeaf can be tricky; so let's assume we
489 // got it right when we collected the list of available branches.
490 else if (std::find(treeColumns.begin(), treeColumns.end(), definedCol) != treeColumns.end())
491 error =
492 "A branch with that name is already present in the input TTree/TChain. Use Redefine to force redefinition.";
493 else if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end())
494 error =
495 "A column with that name is already present in the input data source. Use Redefine to force redefinition.";
496 }
497
498 if (!error.empty()) {
499 error = "RDataFrame::" + where + ": cannot define column \"" + definedCol + "\". " + error;
500 throw std::runtime_error(error);
501 }
502}
503
504/// Throw if column `definedColView` is _not_ already there.
505void CheckForDefinition(const std::string &where, std::string_view definedColView, const ColumnNames_t &customCols,
506 const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &treeColumns,
507 const ColumnNames_t &dataSourceColumns)
508{
509 const std::string definedCol(definedColView); // convert to std::string
510 std::string error;
511
512 const auto aliasColNameIt = aliasMap.find(definedCol);
513 const bool isAnAlias = aliasColNameIt != aliasMap.end();
514 if (isAnAlias) {
515 error = "An alias with that name, pointing to column \"" + aliasColNameIt->second +
516 "\", already exists. Aliases cannot be Redefined.";
517 }
518
519 if (error.empty()) {
520 const bool isAlreadyDefined = std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end();
521 // check if definedCol is in the list of tree branches. This is a bit better than interrogating the TTree
522 // directly because correct usage of GetBranch, FindBranch, GetLeaf and FindLeaf can be tricky; so let's assume we
523 // got it right when we collected the list of available branches.
524 const bool isABranch = std::find(treeColumns.begin(), treeColumns.end(), definedCol) != treeColumns.end();
525 const bool isADSColumn =
526 std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end();
527
528 if (!isAlreadyDefined && !isABranch && !isADSColumn)
529 error = "No column with that name was found in the dataset. Use Define to create a new column.";
530 }
531
532 if (!error.empty()) {
533 error = "RDataFrame::" + where + ": cannot redefine column \"" + definedCol + "\". " + error;
534 throw std::runtime_error(error);
535 }
536}
537
/// Throw std::runtime_error if the number of template parameters differs from the number of column names.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   throw std::runtime_error("The number of template parameters specified is " + std::to_string(nTemplateParams) +
                            " while " + std::to_string(nColumnNames) + " columns have been specified.");
}
549
550/// Choose between local column names or default column names, throw in case of errors.
551const ColumnNames_t
552SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
553{
554 if (names.empty()) {
555 // use default column names
556 if (defaultNames.size() < nRequiredNames)
557 throw std::runtime_error(
558 std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
559 " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
560 // return first nRequiredNames default column names
561 return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
562 } else {
563 // use column names provided by the user to this particular transformation/action
564 if (names.size() != nRequiredNames) {
565 auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
566 " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
567 " provided:";
568 for (const auto &name : names)
569 msg += " \"" + name + "\",";
570 msg.back() = '.';
571 throw std::runtime_error(msg);
572 }
573 return names;
574 }
575}
576
577ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
578 const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
579{
580 ColumnNames_t unknownColumns;
581 for (auto &column : requiredCols) {
582 const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
583 if (isBranch)
584 continue;
585 const auto isDefine = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
586 if (isDefine)
587 continue;
588 const auto isDataSourceColumn =
589 std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
590 if (isDataSourceColumn)
591 continue;
592 unknownColumns.emplace_back(column);
593 }
594 return unknownColumns;
595}
596
/// Return the filter names reported by `loopManager` through RLoopManager::GetFiltersNames().
/// The pointer is dereferenced without a check, so `loopManager` must not be null.
597std::vector<std::string> GetFilterNames(const std::shared_ptr<RLoopManager> &loopManager)
598{
599 return loopManager->GetFiltersNames();
600}
601
602ParsedTreePath ParseTreePath(std::string_view fullTreeName)
603{
604 // split name into directory and treename if needed
605 std::string_view dirName = "";
606 std::string_view treeName = fullTreeName;
607 const auto lastSlash = fullTreeName.rfind('/');
608 if (std::string_view::npos != lastSlash) {
609 dirName = treeName.substr(0, lastSlash);
610 treeName = treeName.substr(lastSlash + 1, treeName.size());
611 }
612 return {std::string(treeName), std::string(dirName)};
613}
614
/// Format the numeric value of `addr` in hexadecimal with a base prefix, e.g. "0x1234".
/// The pointer is converted to an integer before printing, which gives the same notation on Windows too.
std::string PrettyPrintAddr(const void *const addr)
{
   const auto addrAsInteger = reinterpret_cast<size_t>(addr);
   std::ostringstream formatted;
   formatted.setf(std::ios_base::hex, std::ios_base::basefield);
   formatted.setf(std::ios_base::showbase);
   formatted << addrAsInteger;
   return formatted.str();
}
622
623/// Book the jitting of a Filter call
624void BookFilterJit(const std::shared_ptr<RJittedFilter> &jittedFilter,
625 std::shared_ptr<RDFDetail::RNodeBase> *prevNodeOnHeap, std::string_view name,
626 std::string_view expression, const std::map<std::string, std::string> &aliasMap,
627 const ColumnNames_t &branches, const RBookedDefines &customCols, TTree *tree, RDataSource *ds)
628{
629 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
630
631 const auto parsedExpr =
632 ParseRDFExpression(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
633 const auto exprVarTypes =
634 GetValidatedArgTypes(parsedExpr.fUsedCols, customCols, tree, ds, "Filter", /*vector2rvec=*/true);
635 const auto lambdaName = DeclareLambda(parsedExpr.fExpr, parsedExpr.fVarNames, exprVarTypes);
636 const auto type = RetTypeOfLambda(lambdaName);
637 if (type != "bool")
638 std::runtime_error("Filter: the following expression does not evaluate to bool:\n" + std::string(expression));
639
640 // definesOnHeap is deleted by the jitted call to JitFilterHelper
642 const auto definesOnHeapAddr = PrettyPrintAddr(definesOnHeap);
643 const auto prevNodeAddr = PrettyPrintAddr(prevNodeOnHeap);
644
645 // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
646 // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
647 std::stringstream filterInvocation;
648 filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << lambdaName << ", new const char*["
649 << parsedExpr.fUsedCols.size() << "]{";
650 for (const auto &col : parsedExpr.fUsedCols)
651 filterInvocation << "\"" << col << "\", ";
652 if (!parsedExpr.fUsedCols.empty())
653 filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
654 // lifetime of pointees:
655 // - jittedFilter: heap-allocated weak_ptr to the actual jittedFilter that will be deleted by JitFilterHelper
656 // - prevNodeOnHeap: heap-allocated shared_ptr to the actual previous node that will be deleted by JitFilterHelper
657 // - definesOnHeap: heap-allocated, will be deleted by JitFilterHelper
658 filterInvocation << "}, " << parsedExpr.fUsedCols.size() << ", \"" << name << "\", "
659 << "reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedFilter>*>("
660 << PrettyPrintAddr(MakeWeakOnHeap(jittedFilter)) << "), "
661 << "reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>(" << prevNodeAddr << "),"
662 << "reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesOnHeapAddr << ")"
663 << ");\n";
664
665 auto lm = jittedFilter->GetLoopManagerUnchecked();
666 lm->ToJitExec(filterInvocation.str());
667}
668
669/// Book the jitting of a Define call
/// The string expression is parsed and declared to the interpreter as a lambda, an RJittedDefine
/// placeholder for column `name` is created, and a call to JitDefineHelper is queued on `lm`.
/// \return the newly created RJittedDefine placeholder
670std::shared_ptr<RJittedDefine> BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm,
671 RDataSource *ds, const RBookedDefines &customCols,
672 const ColumnNames_t &branches,
673 std::shared_ptr<RNodeBase> *upcastNodeOnHeap)
674{
675 const auto &aliasMap = lm.GetAliasMap();
676 auto *const tree = lm.GetTree();
 // without a data source there are no data-source columns
677 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
678
 // digest the expression, find the types of the columns it uses and declare the corresponding lambda
679 const auto parsedExpr =
680 ParseRDFExpression(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
681 const auto exprVarTypes =
682 GetValidatedArgTypes(parsedExpr.fUsedCols, customCols, tree, ds, "Define", /*vector2rvec=*/true);
683 const auto lambdaName = DeclareLambda(parsedExpr.fExpr, parsedExpr.fVarNames, exprVarTypes);
 // the type of the new column is the return type of the lambda
684 const auto type = RetTypeOfLambda(lambdaName);
685
 // the copy of the booked defines is handed to the jitted code by address (see the lifetime notes below)
686 auto definesCopy = new RBookedDefines(customCols);
687 auto definesAddr = PrettyPrintAddr(definesCopy);
688 auto jittedDefine = std::make_shared<RDFDetail::RJittedDefine>(name, type, lm.GetNSlots(), lm.GetDSValuePtrs());
689
 // compose the interpreter code: the lambda, the array of column names, then the objects passed by address
690 std::stringstream defineInvocation;
691 defineInvocation << "ROOT::Internal::RDF::JitDefineHelper<ROOT::Internal::RDF::DefineTypes::RDefineTag>("
692 << lambdaName << ", new const char*[" << parsedExpr.fUsedCols.size() << "]{";
693 for (const auto &col : parsedExpr.fUsedCols) {
694 defineInvocation << "\"" << col << "\", ";
695 }
 // step back over the trailing ", " so that the next insertion overwrites it
696 if (!parsedExpr.fUsedCols.empty())
697 defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
698 // lifetime of pointees:
699 // - lm is the loop manager, and if that goes out of scope jitting does not happen at all (i.e. will always be valid)
700 // - jittedDefine: heap-allocated weak_ptr that will be deleted by JitDefineHelper after usage
701 // - definesAddr: heap-allocated, will be deleted by JitDefineHelper after usage
702 defineInvocation << "}, " << parsedExpr.fUsedCols.size() << ", \"" << name
703 << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>(" << PrettyPrintAddr(&lm)
704 << "), reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedDefine>*>("
705 << PrettyPrintAddr(MakeWeakOnHeap(jittedDefine))
706 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesAddr
707 << "), reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
708 << PrettyPrintAddr(upcastNodeOnHeap) << "));\n";
709
 // queue the code on the loop manager and hand the placeholder back to the caller
710 lm.ToJitExec(defineInvocation.str());
711 return jittedDefine;
712}
713
714/// Book the jitting of a DefinePerSample call
/// The expression is declared to the interpreter as a lambda taking `rdfslot_` and `rdfsampleinfo_`,
/// an RJittedDefine placeholder for column `name` is created, and a call to JitDefineHelper is queued on `lm`.
/// Unlike BookDefineJit, the expression is used verbatim and no column names are passed to the helper.
/// \return the newly created RJittedDefine placeholder
715std::shared_ptr<RJittedDefine> BookDefinePerSampleJit(std::string_view name, std::string_view expression,
716 RLoopManager &lm, const RBookedDefines &customCols,
717 std::shared_ptr<RNodeBase> *upcastNodeOnHeap)
718{
 // the lambda parameters are fixed: the slot number and the sample information
719 const auto lambdaName = DeclareLambda(std::string(expression), {"rdfslot_", "rdfsampleinfo_"},
720 {"unsigned int", "const ROOT::RDF::RSampleInfo"});
 // the type of the new column is the return type of the lambda
721 const auto retType = RetTypeOfLambda(lambdaName);
722
 // the copy of the booked defines is handed to the jitted code by address (see the lifetime notes below)
723 auto definesCopy = new RBookedDefines(customCols);
724 auto definesAddr = PrettyPrintAddr(definesCopy);
725 auto jittedDefine = std::make_shared<RDFDetail::RJittedDefine>(name, retType, lm.GetNSlots(), lm.GetDSValuePtrs());
726
 // compose the interpreter code; the column-name array is nullptr and the column count is 0
727 std::stringstream defineInvocation;
728 defineInvocation << "ROOT::Internal::RDF::JitDefineHelper<ROOT::Internal::RDF::DefineTypes::RDefinePerSampleTag>("
729 << lambdaName << ", nullptr, 0, ";
730 // lifetime of pointees:
731 // - lm is the loop manager, and if that goes out of scope jitting does not happen at all (i.e. will always be valid)
732 // - jittedDefine: heap-allocated weak_ptr that will be deleted by JitDefineHelper after usage
733 // - definesAddr: heap-allocated, will be deleted by JitDefineHelper after usage
734 defineInvocation << "\"" << name
735 << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>(" << PrettyPrintAddr(&lm)
736 << "), reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedDefine>*>("
737 << PrettyPrintAddr(MakeWeakOnHeap(jittedDefine))
738 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesAddr
739 << "), reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
740 << PrettyPrintAddr(upcastNodeOnHeap) << "));\n";
741
 // queue the code on the loop manager and hand the placeholder back to the caller
742 lm.ToJitExec(defineInvocation.str());
743 return jittedDefine;
744}
745
746// Jit and call something equivalent to "this->BuildAndBook<ColTypes...>(params...)"
747// (see comments in the body for actual jitted code)
// Returns the text of a call to ROOT::Internal::RDF::CallBuildAction; this function only composes the
// string and does not pass it to the interpreter itself.
// Throws std::runtime_error if no TClass is found for the helper argument type or for the action type.
748std::string JitBuildAction(const ColumnNames_t &cols, std::shared_ptr<RDFDetail::RNodeBase> *prevNode,
749 const std::type_info &helperArgType, const std::type_info &at, void *helperArgOnHeap,
750 TTree *tree, const unsigned int nSlots, const RBookedDefines &customCols, RDataSource *ds,
751 std::weak_ptr<RJittedAction> *jittedActionOnHeap)
752{
753 // retrieve type of result of the action as a string
754 auto helperArgClass = TClass::GetClass(helperArgType);
755 if (!helperArgClass) {
756 std::string exceptionText = "An error occurred while inferring the result type of an operation.";
757 throw std::runtime_error(exceptionText.c_str());
758 }
759 const auto helperArgClassName = helperArgClass->GetName();
760
761 // retrieve type of action as a string
762 auto actionTypeClass = TClass::GetClass(at);
763 if (!actionTypeClass) {
764 std::string exceptionText = "An error occurred while inferring the action type of the operation.";
765 throw std::runtime_error(exceptionText.c_str());
766 }
767 const std::string actionTypeName = actionTypeClass->GetName();
 // keep only what follows the last ':' of the class name, i.e. the name without its scope qualifiers
768 const std::string actionTypeNameBase = actionTypeName.substr(actionTypeName.rfind(':') + 1);
769
770 auto definesCopy = new RBookedDefines(customCols); // deleted in jitted CallBuildAction
771 auto definesAddr = PrettyPrintAddr(definesCopy);
772
773 // Build a call to CallBuildAction with the appropriate argument. When run through the interpreter, this code will
774 // just-in-time create an RAction object and it will assign it to its corresponding RJittedAction.
775 std::stringstream createAction_str;
776 createAction_str << "ROOT::Internal::RDF::CallBuildAction<" << actionTypeName;
 // the template arguments are the action type followed by the type of each column
777 const auto columnTypeNames =
778 GetValidatedArgTypes(cols, customCols, tree, ds, actionTypeNameBase, /*vector2rvec=*/true);
779 for (auto &colType : columnTypeNames)
780 createAction_str << ", " << colType;
781 // on Windows, to prefix the hexadecimal value of a pointer with '0x',
782 // one needs to write: std::hex << std::showbase << (size_t)pointer
783 createAction_str << ">(reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
784 << PrettyPrintAddr(prevNode) << "), new const char*[" << cols.size() << "]{";
 // comma-separated list of the quoted column names
785 for (auto i = 0u; i < cols.size(); ++i) {
786 if (i != 0u)
787 createAction_str << ", ";
788 createAction_str << '"' << cols[i] << '"';
789 }
790 createAction_str << "}, " << cols.size() << ", " << nSlots << ", reinterpret_cast<" << helperArgClassName << "*>("
791 << PrettyPrintAddr(helperArgOnHeap)
792 << "), reinterpret_cast<std::weak_ptr<ROOT::Internal::RDF::RJittedAction>*>("
793 << PrettyPrintAddr(jittedActionOnHeap)
794 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesAddr << "));";
795 return createAction_str.str();
796}
797
/// Tell whether any of the given strings is empty.
/// \param[in] strings The string views to inspect.
/// \return true if at least one element has zero length, false otherwise (including for an empty list).
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   return std::any_of(strings.begin(), strings.end(), [](const std::string_view &str) { return str.empty(); });
}
806
807std::shared_ptr<RNodeBase> UpcastNode(std::shared_ptr<RNodeBase> ptr)
808{
809 return ptr;
810}
811
812/// Given the desired number of columns and the user-provided list of columns:
813/// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
814/// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
815/// * replace column names from aliases by the actual column name
816/// Return the list of selected column names.
817ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
818 const ColumnNames_t &validDefines, RDataSource *ds)
819{
820 auto selectedColumns = SelectColumns(nColumns, columns, lm.GetDefaultColumnNames());
821
822 // Resolve aliases and expand `#var` to `R_rdf_sizeof_var`
823 const auto &aliasMap = lm.GetAliasMap();
824
825 for (auto &col : selectedColumns) {
826 col = ResolveAlias(col, aliasMap);
827 }
828
829 // Complain if there are still unknown columns at this point
830 const auto unknownColumns = FindUnknownColumns(selectedColumns, lm.GetBranchNames(), validDefines,
831 ds ? ds->GetColumnNames() : ColumnNames_t{});
832
833 if (!unknownColumns.empty()) {
834 std::stringstream unknowns;
835 std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
836 for (auto &unknownColumn : unknownColumns) {
837 unknowns << delim << unknownColumn;
838 delim = ',';
839 }
840 throw std::runtime_error("Unknown column" + unknowns.str());
841 }
842
843 return selectedColumns;
844}
845
846std::vector<std::string> GetValidatedArgTypes(const ColumnNames_t &colNames, const RBookedDefines &defines, TTree *tree,
847 RDataSource *ds, const std::string &context, bool vector2rvec)
848{
849 auto toCheckedArgType = [&](const std::string &c) {
850 RDFDetail::RDefineBase *define = defines.HasName(c) ? defines.GetColumns().at(c).get() : nullptr;
851 const auto colType = ColumnName2ColumnTypeName(c, tree, ds, define, vector2rvec);
852 if (colType.rfind("CLING_UNKNOWN_TYPE", 0) == 0) { // the interpreter does not know this type
853 const auto msg =
854 "The type of custom column \"" + c + "\" (" + colType.substr(19) +
855 ") is not known to the interpreter, but a just-in-time-compiled " + context +
856 " call requires this column. Make sure to create and load ROOT dictionaries for this column's class.";
857 throw std::runtime_error(msg);
858 }
859 return colType;
860 };
861 std::vector<std::string> colTypes;
862 colTypes.reserve(colNames.size());
863 std::transform(colNames.begin(), colNames.end(), std::back_inserter(colTypes), toCheckedArgType);
864 return colTypes;
865}
866
867/// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
868/// name of a column that must be defined via datasource. All elements of the returned vector are false if no
869/// data-source is present.
870std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
871{
872 const auto nColumns = requestedCols.size();
873 std::vector<bool> mustBeDefined(nColumns, false);
874 for (auto i = 0u; i < nColumns; ++i)
875 mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
876 return mustBeDefined;
877}
878
880{
881 std::unordered_set<std::string> uniqueCols;
882 for (auto &col : cols) {
883 if (!uniqueCols.insert(col).second) {
884 const auto msg = "Error: column \"" + col +
885 "\" was passed to Snapshot twice. This is not supported: only one of the columns would be "
886 "readable with RDataFrame.";
887 throw std::logic_error(msg);
888 }
889 }
890}
891
892////////////////////////////////////////////////////////////////////////////////
893/// \brief Trigger the execution of an RDataFrame computation graph.
894/// \param[in] node A node of the computation graph (not a result).
895///
896/// This function calls the RLoopManager::Run method on the \p fLoopManager data
897/// member of the input argument. It is intended for internal use only.
899 node.fLoopManager->Run();
900}
901
902} // namespace RDF
903} // namespace Internal
904} // namespace ROOT
#define c(i)
Definition: RSha256.hxx:101
#define R__ASSERT(e)
Definition: TError.h:118
char name[80]
Definition: TGX11.cxx:110
int type
Definition: TGX11.cxx:121
R__EXTERN TVirtualMutex * gROOTMutex
Definition: TROOT.h:61
#define gROOT
Definition: TROOT.h:404
#define R__LOCKGUARD(mutex)
#define free
Definition: civetweb.c:1539
The head node of a RDF computation graph.
const std::map< std::string, std::string > & GetAliasMap() const
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
void ToJitExec(const std::string &) const
void Run()
Start the event loop with a different mechanism depending on IMT/no IMT, data source/no data source.
const std::map< std::string, std::vector< void * > > & GetDSValuePtrs() const
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame.
unsigned int GetNSlots() const
Encapsulates the columns defined by the user.
bool HasName(std::string_view name) const
Check if the provided name is tracked in the names list.
const RDefineBasePtrMap_t & GetColumns() const
Returns the list of the pointers to the defined columns.
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
The public interface to the RDataFrame federation of classes.
Definition: RInterface.hxx:98
RLoopManager * fLoopManager
Definition: RInterface.hxx:113
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2955
A TFriendElement TF describes a TTree object TF in a file.
virtual TTree * GetTree()
Return pointer to friend TTree.
Bool_t MatchB(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10)
Definition: TPRegexp.h:78
Basic string class.
Definition: TString.h:136
A TTree represents a columnar dataset.
Definition: TTree.h:79
virtual TObjArray * GetListOfBranches()
Definition: TTree.h:484
virtual TList * GetListOfFriends() const
Definition: TTree.h:486
TText * text
basic_string_view< char > string_view
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
void CheckValidCppVarName(std::string_view var, const std::string &where)
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2rvec=true)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:224
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::string ResolveAlias(const std::string &col, const std::map< std::string, std::string > &aliasMap)
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string PrettyPrintAddr(const void *const addr)
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::shared_ptr< RJittedDefine > BookDefinePerSampleJit(std::string_view name, std::string_view expression, RLoopManager &lm, const RBookedDefines &customCols, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
Book the jitting of a DefinePerSample call.
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RBookedDefines &customCols, const ColumnNames_t &branches, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
Book the jitting of a Define call.
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validDefines, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
std::string JitBuildAction(const ColumnNames_t &cols, std::shared_ptr< RDFDetail::RNodeBase > *prevNode, const std::type_info &helperArgType, const std::type_info &at, void *helperArgOnHeap, TTree *tree, const unsigned int nSlots, const RBookedDefines &customCols, RDataSource *ds, std::weak_ptr< RJittedAction > *jittedActionOnHeap)
bool IsInternalColumn(std::string_view colName)
Whether custom column with name colName is an "internal" column such as rdfentry_ or rdfslot_.
Definition: RDFUtils.cxx:368
void BookFilterJit(const std::shared_ptr< RJittedFilter > &jittedFilter, std::shared_ptr< RDFDetail::RNodeBase > *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RBookedDefines &customCols, TTree *tree, RDataSource *ds)
Book the jitting of a Filter call.
ColumnNames_t FilterArraySizeColNames(const ColumnNames_t &columnNames, const std::string &action)
Take a list of column names, return that list with entries starting by '#' filtered out.
void InterpreterDeclare(const std::string &code)
Declare code in the interpreter via the TInterpreter::Declare method, throw in case of errors.
Definition: RDFUtils.cxx:320
void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
std::vector< std::string > GetValidatedArgTypes(const ColumnNames_t &colNames, const RBookedDefines &defines, TTree *tree, RDataSource *ds, const std::string &context, bool vector2rvec)
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
void CheckForRedefinition(const std::string &where, std::string_view definedColView, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &treeColumns, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is already there.
void TriggerRun(ROOT::RDF::RNode &node)
Trigger the execution of an RDataFrame computation graph.
void CheckForDefinition(const std::string &where, std::string_view definedColView, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &treeColumns, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is not already there.
std::vector< std::string > ColumnNames_t
Definition: Utils.hxx:35
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
char * DemangleTypeIdName(const std::type_info &ti, int &errorCode)
Demangle in a portable way the type id name.
static constexpr double s
Definition: tree.py:1
Definition: civetweb.c:2228