Logo ROOT  
Reference Guide
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
12#include <ROOT/RDataFrame.hxx>
14#include <ROOT/RStringView.hxx>
15#include <ROOT/TSeq.hxx>
16#include <RtypesCore.h>
17#include <TDirectory.h>
18#include <TChain.h>
19#include <TClass.h>
20#include <TClassEdit.h>
21#include <TFriendElement.h>
22#include <TInterpreter.h>
23#include <TObject.h>
24#include <TRegexp.h>
25#include <TPRegexp.h>
26#include <TString.h>
27#include <TTree.h>
28
// Silence the GCC warnings triggered by lexertk.hpp,
// which is very noisy to compile
31#if defined(__GNUC__)
32#pragma GCC diagnostic push
33#pragma GCC diagnostic ignored "-Woverloaded-virtual"
34#pragma GCC diagnostic ignored "-Wshadow"
35#endif
36#include "lexertk.hpp"
37#if defined(__GNUC__)
38#pragma GCC diagnostic pop
39#endif
40
#include <cstdlib>
#include <iosfwd>
#include <set>
#include <stdexcept>
#include <string>
#include <typeinfo>
46
47namespace ROOT {
48namespace Detail {
49namespace RDF {
50class RCustomColumnBase;
51class RFilterBase;
52class RLoopManager;
53class RRangeBase;
54} // namespace RDF
55} // namespace Detail
56
57namespace RDF {
58class RDataSource;
59} // namespace RDF
60
61} // namespace ROOT
62
63namespace ROOT {
64namespace Internal {
65namespace RDF {
66
67/// A tokeniser for the expression which is in C++
68/// The goal is to extract all names which are potentially
69/// columns. The difficulty is to catch also the names containing dots.
70std::set<std::string> GetPotentialColumnNames(const std::string &expr)
71{
72 lexertk::generator generator;
73 const auto ok = generator.process(expr);
74 if (!ok) {
75 const auto msg = "Failed to tokenize expression:\n" + expr + "\n\nMake sure it is valid C++.";
76 throw std::runtime_error(msg);
77 }
78
79 std::set<std::string> potCols;
80 const auto nToks = generator.size();
81 std::string potColWithDots;
82
83 auto IsSymbol = [](const lexertk::token &t) { return t.type == lexertk::token::e_symbol; };
84 auto IsDot = [](const lexertk::token &t) { return t.value == "."; };
85
86 // Now we start iterating over the tokens
87 for (auto i = 0ULL; i < nToks; ++i) {
88 auto &tok = generator[i];
89 if (!IsSymbol(tok))
90 continue;
91
92 if (i == 0 || (i > 0 && !IsDot(generator[i - 1])))
93 potCols.insert(tok.value);
94
95 // after the current token we may have a chain of .<symbol>.<symbol>...
96 // we need to build a set of potential columns incrementally
97 // and stop at the right point. All this advancing the token
98 // cursor.
99 potColWithDots = tok.value;
100 while (i < nToks) {
101 if (i + 2 == nToks)
102 break;
103 auto &nextTok = generator[i + 1];
104 auto &next2nextTok = generator[i + 2];
105 if (!IsDot(nextTok) || !IsSymbol(next2nextTok)) {
106 break;
107 }
108 potColWithDots += "." + next2nextTok.value;
109 potCols.insert(potColWithDots);
110 i += 2;
111 }
112 potColWithDots = "";
113 }
114 return potCols;
115}
116
117// The set here is used as a registry, the real list, which keeps the order, is
118// the one in the vector
119class RActionBase;
120
121HeadNode_t CreateSnapshotRDF(const ColumnNames_t &validCols,
122 std::string_view treeName,
123 std::string_view fileName,
124 bool isLazy,
125 RLoopManager &loopManager,
126 std::unique_ptr<RDFInternal::RActionBase> actionPtr)
127{
128 // create new RDF
130 auto snapshotRDF = std::make_shared<ROOT::RDataFrame>(treeName, fileName, validCols);
131 auto snapshotRDFResPtr = MakeResultPtr(snapshotRDF, loopManager, std::move(actionPtr));
132
133 if (!isLazy) {
134 *snapshotRDFResPtr;
135 }
136 return snapshotRDFResPtr;
137}
138
139std::string DemangleTypeIdName(const std::type_info &typeInfo)
140{
141 int dummy(0);
142 return TClassEdit::DemangleTypeIdName(typeInfo, dummy);
143}
144
146 TTree *tree,
147 ROOT::RDF::RDataSource *dataSource,
148 std::string_view columnNameRegexp,
149 std::string_view callerName)
150{
151 const auto theRegexSize = columnNameRegexp.size();
152 std::string theRegex(columnNameRegexp);
153
154 const auto isEmptyRegex = 0 == theRegexSize;
155 // This is to avoid cases where branches called b1, b2, b3 are all matched by expression "b"
156 if (theRegexSize > 0 && theRegex[0] != '^')
157 theRegex = "^" + theRegex;
158 if (theRegexSize > 0 && theRegex[theRegexSize - 1] != '$')
159 theRegex = theRegex + "$";
160
161 ColumnNames_t selectedColumns;
162 selectedColumns.reserve(32);
163
164 // Since we support gcc48 and it does not provide in its stl std::regex,
165 // we need to use TRegexp
166 TPRegexp regexp(theRegex);
167 for (auto &&branchName : customColumns.GetNames()) {
168 if ((isEmptyRegex || 0 != regexp.Match(branchName.c_str())) &&
169 !RDFInternal::IsInternalColumn(branchName)) {
170 selectedColumns.emplace_back(branchName);
171 }
172 }
173
174 if (tree) {
175 auto branchNames = RDFInternal::GetTopLevelBranchNames(*tree);
176 for (auto &branchName : branchNames) {
177 if (isEmptyRegex || 0 != regexp.Match(branchName.c_str())) {
178 selectedColumns.emplace_back(branchName);
179 }
180 }
181 }
182
183 if (dataSource) {
184 auto &dsColNames = dataSource->GetColumnNames();
185 for (auto &dsColName : dsColNames) {
186 if ((isEmptyRegex || 0 != regexp.Match(dsColName.c_str())) &&
187 !RDFInternal::IsInternalColumn(dsColName)) {
188 selectedColumns.emplace_back(dsColName);
189 }
190 }
191 }
192
193 if (selectedColumns.empty()) {
194 std::string text(callerName);
195 if (columnNameRegexp.empty()) {
196 text = ": there is no column available to match.";
197 } else {
198 text = ": regex \"" + std::string(columnNameRegexp) + "\" did not match any column.";
199 }
200 throw std::runtime_error(text);
201 }
202 return selectedColumns;
203}
204
205void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
206 std::set<TTree *> &analysedTrees)
207{
208
209 if (!analysedTrees.insert(&t).second) {
210 return;
211 }
212
213 auto branches = t.GetListOfBranches();
214 if (branches) {
215 for (auto branchObj : *branches) {
216 auto name = branchObj->GetName();
217 if (bNamesReg.insert(name).second) {
218 bNames.emplace_back(name);
219 }
220 }
221 }
222
223 auto friendTrees = t.GetListOfFriends();
224
225 if (!friendTrees)
226 return;
227
228 for (auto friendTreeObj : *friendTrees) {
229 auto friendTree = ((TFriendElement *)friendTreeObj)->GetTree();
230 GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees);
231 }
232}
233
234///////////////////////////////////////////////////////////////////////////////
235/// Get all the top-level branches names, including the ones of the friend trees
237{
238 std::set<std::string> bNamesSet;
239 ColumnNames_t bNames;
240 std::set<TTree *> analysedTrees;
241 GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
242 return bNames;
243}
244
/// Check whether a string has the shape of a C++ identifier: non-empty, starting with an
/// ASCII letter or an underscore and continuing with ASCII letters, digits or underscores.
/// \param[in] var The candidate variable name.
/// \return true if `var` has that shape, false otherwise.
bool IsValidCppVarName(const std::string &var)
{
   const auto isLetter = [](char ch) { return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'); };
   const auto isDigit = [](char ch) { return '0' <= ch && ch <= '9'; };

   if (var.empty())
      return false;

   // the leading character cannot be a digit
   const char head = var.front();
   if (head != '_' && !isLetter(head))
      return false;

   // every following character must be a letter, a digit or an underscore
   return std::all_of(var.begin() + 1, var.end(),
                      [&](char ch) { return ch == '_' || isLetter(ch) || isDigit(ch); });
}
266
267void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols,
268 const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &dataSourceColumns)
269{
270 const std::string definedColStr(definedCol);
271
272 if (!IsValidCppVarName(definedColStr)) {
273 const auto msg = "Cannot define column \"" + definedColStr + "\": not a valid C++ variable name.";
274 throw std::runtime_error(msg);
275 }
276
277 if (treePtr != nullptr) {
278 // check if definedCol is already present in TTree
279 const auto branch = treePtr->GetBranch(definedColStr.c_str());
280 if (branch != nullptr) {
281 const auto msg = "branch \"" + definedColStr + "\" already present in TTree";
282 throw std::runtime_error(msg);
283 }
284 }
285 // check if definedCol has already been `Define`d in the functional graph
286 if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end()) {
287 const auto msg = "Redefinition of column \"" + definedColStr + "\"";
288 throw std::runtime_error(msg);
289 }
290
291 // Check if the definedCol is an alias
292 const auto aliasColNameIt = aliasMap.find(definedColStr);
293 if (aliasColNameIt != aliasMap.end()) {
294 const auto msg = "An alias with name " + definedColStr + " pointing to column " +
295 aliasColNameIt->second + " is already existing.";
296 throw std::runtime_error(msg);
297 }
298
299 // check if definedCol is already present in the DataSource (but has not yet been `Define`d)
300 if (!dataSourceColumns.empty()) {
301 if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end()) {
302 const auto msg = "Redefinition of column \"" + definedColStr + "\" already present in the data-source";
303 throw std::runtime_error(msg);
304 }
305 }
306}
307
/// Check that as many column names as template parameters were specified.
/// \param[in] nTemplateParams Number of template parameters.
/// \param[in] nColumnNames Number of column names.
/// \throws std::runtime_error if the two numbers differ.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   throw std::runtime_error("The number of template parameters specified is " + std::to_string(nTemplateParams) +
                            " while " + std::to_string(nColumnNames) + " columns have been specified.");
}
319
320/// Choose between local column names or default column names, throw in case of errors.
321const ColumnNames_t
322SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
323{
324 if (names.empty()) {
325 // use default column names
326 if (defaultNames.size() < nRequiredNames)
327 throw std::runtime_error(
328 std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
329 " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
330 // return first nRequiredNames default column names
331 return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
332 } else {
333 // use column names provided by the user to this particular transformation/action
334 if (names.size() != nRequiredNames) {
335 auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
336 " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
337 " provided:";
338 for (const auto &name : names)
339 msg += " \"" + name + "\",";
340 msg.back() = '.';
341 throw std::runtime_error(msg);
342 }
343 return names;
344 }
345}
346
347ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
348 const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
349{
350 ColumnNames_t unknownColumns;
351 for (auto &column : requiredCols) {
352 const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
353 if (isBranch)
354 continue;
355 const auto isCustomColumn = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
356 if (isCustomColumn)
357 continue;
358 const auto isDataSourceColumn =
359 std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
360 if (isDataSourceColumn)
361 continue;
362 unknownColumns.emplace_back(column);
363 }
364 return unknownColumns;
365}
366
/// Tell whether a column name has the shape of an internal column name:
/// it starts with "rdf" or "tdf", ends with '_' and is at least four characters long.
/// \param[in] colName The column name to check.
// NOTE(review): the signature line is absent from this listing; it is restored here from the
// calls `RDFInternal::IsInternalColumn(name)` and from how the body uses colName -- confirm against the header.
bool IsInternalColumn(std::string_view colName)
{
   const bool goodPrefix = colName.size() > 3 &&                      // has at least more characters than {r,t}df
                           ('r' == colName[0] || 't' == colName[0]) && // starts with r or t
                           0 == colName.compare(1, 2, "df");          // 2nd and 3rd letters are df
   return goodPrefix && '_' == colName.back();                        // also ends with '_'
}
375
376std::vector<std::string> GetFilterNames(const std::shared_ptr<RLoopManager> &loopManager)
377{
378 return loopManager->GetFiltersNames();
379}
380
// Replace all the occurrences of a string by another string.
// The search resumes right after each inserted replacement, so text introduced by a
// replacement is never matched again.
// \param[in,out] s The string to modify in place.
// \param[in] what The text to look for; if empty, nothing is replaced.
// \param[in] withWhat The text to put in place of each occurrence.
// \return The number of replacements performed.
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
{
   // An empty pattern is found at every position: without this guard the loop would
   // keep inserting withWhat forever.
   if (what.empty())
      return 0U;

   size_t idx = 0;
   auto numReplacements = 0U;
   while ((idx = s.find(what, idx)) != std::string::npos) {
      s.replace(idx, what.size(), withWhat);
      idx += withWhat.size();
      ++numReplacements;
   }
   return numReplacements;
}
393
394// Match expression against names of branches passed as parameter
395// Return vector of names of the branches used in the expression
396std::vector<std::string> FindUsedColumnNames(std::string_view expression, ColumnNames_t branches,
397 const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns,
398 const std::map<std::string, std::string> &aliasMap)
399{
400 // To help matching the regex
401 const auto potCols = GetPotentialColumnNames(std::string(expression));
402
403 if (potCols.size() == 0) return {};
404
405 std::set<std::string> usedBranches;
406
407 // Check which custom columns match
408 for (auto &brName : customColumns) {
409 if (potCols.find(brName) != potCols.end()) {
410 usedBranches.insert(brName);
411 }
412 }
413
414 // Check which tree branches match
415 // We need to match the longest
416
417 // First: reverse sort to have longer branches before, e.g.
418 // a.b.c
419 // a.b
420 // a
421 // We want that the longest branch ends up in usedBranches before.
422 std::sort(branches.begin(), branches.end(),
423 [](const std::string &s0, const std::string &s1) {return s0 > s1;});
424
425 for (auto &brName : branches) {
426 // If the branch is not in the potential columns, we simply move on
427 if (potCols.find(brName) == potCols.end()) {
428 continue;
429 }
430 // If not, we check if the branch name is contained in one of the branch
431 // names which we already added to the usedBranches.
432 auto isContained = [&brName](const std::string &usedBr) {
433 // We check two things:
434 // 1. That the string is contained, e.g. a.b is contained in a.b.c.d
435 // 2. That the number of '.'s is greater, otherwise in situations where
436 // 2 branches have names like br0 and br01, br0 is not matched (ROOT-9929)
437 return usedBr.find(brName) != std::string::npos &&
438 std::count(usedBr.begin(), usedBr.end(), '.') > std::count(brName.begin(), brName.end(), '.');
439 };
440 auto it = std::find_if(usedBranches.begin(), usedBranches.end(), isContained);
441 if (it == usedBranches.end()) {
442 usedBranches.insert(brName);
443 }
444 }
445
446 // Check which data-source columns match
447 for (auto &col : dsColumns) {
448 if (potCols.find(col) != potCols.end()) {
449 usedBranches.insert(col);
450 }
451 }
452
453 // Check which aliases match
454 for (auto &alias_colName : aliasMap) {
455 auto &alias = alias_colName.first;
456 if (potCols.find(alias) != potCols.end()) {
457 usedBranches.insert(alias);
458 }
459 }
460
461 return std::vector<std::string>(usedBranches.begin(), usedBranches.end());
462}
463
464// TODO we should also replace other invalid chars, like '[],' and spaces
465std::vector<std::string> ReplaceDots(const ColumnNames_t &colNames)
466{
467 std::vector<std::string> dotlessNames = colNames;
468 for (auto &c : dotlessNames) {
469 const bool hasDot = c.find_first_of('.') != std::string::npos;
470 if (hasDot) {
471 std::replace(c.begin(), c.end(), '.', '_');
472 c.insert(0u, "__rdf_arg_");
473 }
474 }
475 return dotlessNames;
476}
477
478// TODO comment well -- there is a lot going on in this function in terms of side-effects
479std::vector<std::string> ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames,
480 const std::map<std::string, std::string> &aliasMap, TTree *tree,
481 RDataSource *ds, std::string &expr, unsigned int namespaceID,
482 const RDFInternal::RBookedCustomColumns &customCols)
483{
484 std::vector<std::string> colTypes;
485 colTypes.reserve(colNames.size());
486 const auto aliasMapEnd = aliasMap.end();
487
488 for (auto c = colNames.begin(), v = varNames.begin(); c != colNames.end();) {
489 const auto &colName = *c;
490
491 if (colName.find('.') != std::string::npos) {
492 // If the column name contains dots, replace its name in the expression with the corresponding varName
493 auto numRepl = Replace(expr, colName, *v);
494 if (numRepl == 0) {
495 // Discard this column: we could not replace it, although we matched it previously
496 // This is because it is a substring of a column we already replaced in the expression
497 // e.g. "a.b" is a substring column of "a.b.c"
498 c = colNames.erase(c);
499 v = varNames.erase(v);
500 continue;
501 }
502 } else {
503 // Column name with no dots: check the name is still there
504 // it might have only been there as part of a column name with dots, e.g. "a" inside "a.b.c"
505 const auto paddedExpr = " " + expr + " ";
506 static const std::string noWordChars("[^a-zA-Z0-9_]");
507 const auto colNameRxBody = noWordChars + colName + noWordChars;
508 TRegexp colNameRegex(colNameRxBody.c_str());
509 Ssiz_t matchedLen;
510 const auto colStillThere = colNameRegex.Index(paddedExpr.c_str(), &matchedLen) != -1;
511 if (!colStillThere) {
512 c = colNames.erase(c);
513 v = varNames.erase(v);
514 continue;
515 }
516 }
517
518 // Replace the colName with the real one in case colName it's an alias
519 // The real name is used to get the type, but the variable name will still be colName
520 const auto aliasMapIt = aliasMap.find(colName);
521 const auto &realColName = aliasMapEnd == aliasMapIt ? colName : aliasMapIt->second;
522 // The map is a const reference, so no operator[]
523 const auto isCustomCol = customCols.HasName(realColName);
524 const auto customColID = isCustomCol ? customCols.GetColumns().at(realColName)->GetID() : 0;
525 const auto colTypeName =
526 ColumnName2ColumnTypeName(realColName, namespaceID, tree, ds, isCustomCol, /*vector2rvec=*/true, customColID);
527 colTypes.emplace_back(colTypeName);
528 ++c, ++v;
529 }
530
531 return colTypes;
532}
533
534// Jit expression "in the vacuum", throw if cling exits with an error
535// This is to make sure that column names, types and expression string are proper C++
536void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames,
537 const std::vector<std::string> &colTypes, bool hasReturnStmt)
538{
539 R__ASSERT(colNames.size() == colTypes.size());
540
541 static unsigned int iNs = 0U;
542 std::stringstream dummyDecl;
543 dummyDecl << "namespace __rdf_" << std::to_string(iNs++) << "{ auto rdf_f = []() {";
544
545 for (auto col = colNames.begin(), type = colTypes.begin(); col != colNames.end(); ++col, ++type) {
546 dummyDecl << *type << " " << *col << ";\n";
547 }
548
549 // Now that branches are declared as variables, put the body of the
550 // lambda in dummyDecl and close scopes of f and namespace __rdf_N
551 if (hasReturnStmt)
552 dummyDecl << expression << "\n;};}";
553 else
554 dummyDecl << "return " << expression << "\n;};}";
555
556 // Try to declare the dummy lambda, error out if it does not compile
557 if (!gInterpreter->Declare(dummyDecl.str().c_str())) {
558 auto msg =
559 "Cannot interpret the following expression:\n" + std::string(expression) + "\n\nMake sure it is valid C++.";
560 throw std::runtime_error(msg);
561 }
562}
563
564std::string
565BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
566{
567 R__ASSERT(vars.size() == varTypes.size());
568
569 std::stringstream ss;
570 ss << "[](";
571 for (auto i = 0u; i < vars.size(); ++i) {
572 // We pass by reference to avoid expensive copies
573 // It can't be const reference in general, as users might want/need to call non-const methods on the values
574 ss << varTypes[i] << "& " << vars[i] << ", ";
575 }
576 if (!vars.empty())
577 ss.seekp(-2, ss.cur);
578
579 if (hasReturnStmt)
580 ss << "){";
581 else
582 ss << "){return ";
583 ss << expr << "\n;}";
584
585 return ss.str();
586}
587
/// Format an address as a hexadecimal number.
/// The pointer is converted to an integer before being streamed, so that the format
/// (std::hex with std::showbase) does not depend on how the platform prints pointers.
/// \param[in] addr The address to format.
std::string PrettyPrintAddr(const void *const addr)
{
   const auto addrAsInteger = reinterpret_cast<size_t>(addr); // Windows-friendly
   std::stringstream formatted;
   formatted << std::hex << std::showbase << addrAsInteger;
   return formatted.str();
}
595
596// Jit a string filter expression and jit-and-call this->Filter with the appropriate arguments
597// Return pointer to the new functional chain node returned by the call, cast to Long_t
598
599void BookFilterJit(RJittedFilter *jittedFilter, void *prevNodeOnHeap, std::string_view name,
600 std::string_view expression, const std::map<std::string, std::string> &aliasMap,
601 const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree,
602 RDataSource *ds, unsigned int namespaceID)
603{
604 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
605
606 // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
607 auto usedBranches = FindUsedColumnNames(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
608 auto varNames = ReplaceDots(usedBranches);
609 auto dotlessExpr = std::string(expression);
610 const auto usedColTypes =
611 ColumnTypesAsString(usedBranches, varNames, aliasMap, tree, ds, dotlessExpr, namespaceID, customCols);
612
613 TRegexp re("[^a-zA-Z0-9_]?return[^a-zA-Z0-9_]");
614 Ssiz_t matchedLen;
615 const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
616
617 auto lm = jittedFilter->GetLoopManagerUnchecked();
618 lm->JitDeclarations(); // TryToJitExpression might need some of the Define'd column type aliases
619 TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
620
621 const auto filterLambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
622
623 const auto jittedFilterAddr = PrettyPrintAddr(jittedFilter);
624 const auto prevNodeAddr = PrettyPrintAddr(prevNodeOnHeap);
625
626 // columnsOnHeap is deleted by the jitted call to JitFilterHelper
628 const auto columnsOnHeapAddr = PrettyPrintAddr(columnsOnHeap);
629
630 // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
631 // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
632 std::stringstream filterInvocation;
633 filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << filterLambda << ", {";
634 for (const auto &brName : usedBranches) {
635 // Here we selectively replace the brName with the real column name if it's necessary.
636 const auto aliasMapIt = aliasMap.find(brName);
637 auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
638 filterInvocation << "\"" << realBrName << "\", ";
639 }
640 if (!usedBranches.empty())
641 filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
642 filterInvocation << "}, \"" << name << "\", "
643 << "reinterpret_cast<ROOT::Detail::RDF::RJittedFilter*>(" << jittedFilterAddr << "), "
644 << "reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>(" << prevNodeAddr << "),"
645 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << columnsOnHeapAddr << ")"
646 << ");";
647
648 lm->ToJitExec(filterInvocation.str());
649}
650
651// Jit a Define call
653 const std::shared_ptr<RJittedCustomColumn> &jittedCustomColumn,
654 const RDFInternal::RBookedCustomColumns &customCols, const ColumnNames_t &branches)
655{
656 const auto &aliasMap = lm.GetAliasMap();
657 auto *const tree = lm.GetTree();
658 const auto namespaceID = lm.GetID();
659 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
660
661 // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
662 auto usedBranches = FindUsedColumnNames(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
663 auto varNames = ReplaceDots(usedBranches);
664 auto dotlessExpr = std::string(expression);
665 const auto usedColTypes =
666 ColumnTypesAsString(usedBranches, varNames, aliasMap, tree, ds, dotlessExpr, namespaceID, customCols);
667
668 TRegexp re("[^a-zA-Z0-9_]?return[^a-zA-Z0-9_]");
669 Ssiz_t matchedLen;
670 const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
671
672 lm.JitDeclarations(); // TryToJitExpression might need some of the Define'd column type aliases
673 TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
674
675 const auto definelambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
676 const auto customColID = std::to_string(jittedCustomColumn->GetID());
677 const auto lambdaName = "eval_" + std::string(name) + customColID;
678 const auto ns = "__rdf" + std::to_string(namespaceID);
679
680 auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols);
681 auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
682
683 // Declare the lambda variable and an alias for the type of the defined column in namespace __rdf
684 // This assumes that a given variable is Define'd once per RDataFrame -- we might want to relax this requirement
685 // to let python users execute a Define cell multiple times
686 const auto defineDeclaration =
687 "namespace " + ns + " { auto " + lambdaName + " = " + definelambda + ";\n" + "using " + std::string(name) +
688 customColID + "_type = typename ROOT::TypeTraits::CallableTraits<decltype(" + lambdaName + " )>::ret_type; }\n";
689 lm.ToJitDeclare(defineDeclaration);
690
691 std::stringstream defineInvocation;
692 defineInvocation << "ROOT::Internal::RDF::JitDefineHelper(" << definelambda << ", {";
693 for (auto brName : usedBranches) {
694 // Here we selectively replace the brName with the real column name if it's necessary.
695 auto aliasMapIt = aliasMap.find(brName);
696 auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
697 defineInvocation << "\"" << realBrName << "\", ";
698 }
699 if (!usedBranches.empty())
700 defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
701 defineInvocation << "}, \"" << name << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>("
702 << PrettyPrintAddr(&lm) << "), *reinterpret_cast<ROOT::Detail::RDF::RJittedCustomColumn*>("
703 << PrettyPrintAddr(jittedCustomColumn.get()) << "),"
704 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << ")"
705 << ");";
706
707 lm.ToJitExec(defineInvocation.str());
708}
709
710// Jit and call something equivalent to "this->BuildAndBook<BranchTypes...>(params...)"
711// (see comments in the body for actual jitted code)
712std::string JitBuildAction(const ColumnNames_t &bl, void *prevNode, const std::type_info &art, const std::type_info &at,
713 void *rOnHeap, TTree *tree, const unsigned int nSlots,
714 const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds,
715 std::shared_ptr<RJittedAction> *jittedActionOnHeap, unsigned int namespaceID)
716{
717 auto nBranches = bl.size();
718
719 // retrieve branch type names as strings
720 std::vector<std::string> columnTypeNames(nBranches);
721 for (auto i = 0u; i < nBranches; ++i) {
722 const auto isCustomCol = customCols.HasName(bl[i]);
723 const auto customColID = isCustomCol ? customCols.GetColumns().at(bl[i])->GetID() : 0;
724 const auto columnTypeName =
725 ColumnName2ColumnTypeName(bl[i], namespaceID, tree, ds, isCustomCol, /*vector2rvec=*/true, customColID);
726 if (columnTypeName.empty()) {
727 std::string exceptionText = "The type of column ";
728 exceptionText += bl[i];
729 exceptionText += " could not be guessed. Please specify one.";
730 throw std::runtime_error(exceptionText.c_str());
731 }
732 columnTypeNames[i] = columnTypeName;
733 }
734
735 // retrieve type of result of the action as a string
736 auto actionResultTypeClass = TClass::GetClass(art);
737 if (!actionResultTypeClass) {
738 std::string exceptionText = "An error occurred while inferring the result type of an operation.";
739 throw std::runtime_error(exceptionText.c_str());
740 }
741 const auto actionResultTypeName = actionResultTypeClass->GetName();
742
743 // retrieve type of action as a string
744 auto actionTypeClass = TClass::GetClass(at);
745 if (!actionTypeClass) {
746 std::string exceptionText = "An error occurred while inferring the action type of the operation.";
747 throw std::runtime_error(exceptionText.c_str());
748 }
749 const auto actionTypeName = actionTypeClass->GetName();
750
751 auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols); // deleted in jitted CallBuildAction
752 auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
753
754 // Build a call to CallBuildAction with the appropriate argument. When run through the interpreter, this code will
755 // just-in-time create an RAction object and it will assign it to its corresponding RJittedAction.
756 std::stringstream createAction_str;
757 createAction_str << "ROOT::Internal::RDF::CallBuildAction"
758 << "<" << actionTypeName;
759 for (auto &colType : columnTypeNames)
760 createAction_str << ", " << colType;
761 // on Windows, to prefix the hexadecimal value of a pointer with '0x',
762 // one need to write: std::hex << std::showbase << (size_t)pointer
763 createAction_str << ">(reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
764 << PrettyPrintAddr(prevNode) << "), {";
765 for (auto i = 0u; i < bl.size(); ++i) {
766 if (i != 0u)
767 createAction_str << ", ";
768 createAction_str << '"' << bl[i] << '"';
769 }
770 createAction_str << "}, " << std::dec << std::noshowbase << nSlots << ", reinterpret_cast<" << actionResultTypeName
771 << "*>(" << PrettyPrintAddr(rOnHeap) << ")"
772 << ", reinterpret_cast<std::shared_ptr<ROOT::Internal::RDF::RJittedAction>*>("
773 << PrettyPrintAddr(jittedActionOnHeap) << "),"
774 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << ")"
775 << ");";
776 return createAction_str.str();
777}
778
/// Tell whether at least one of the given strings is empty.
/// \param[in] strings The strings to check.
/// \return true if any string is empty, false otherwise (in particular, if there are no strings).
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   return std::any_of(strings.begin(), strings.end(), [](std::string_view str) { return str.empty(); });
}
787
788std::shared_ptr<RNodeBase> UpcastNode(std::shared_ptr<RNodeBase> ptr)
789{
790 return ptr;
791}
792
793/// Given the desired number of columns and the user-provided list of columns:
794/// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
795/// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
796/// Return the list of selected column names.
797ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
798 const ColumnNames_t &validCustomColumns, RDataSource *ds)
799{
800 const auto &defaultColumns = lm.GetDefaultColumnNames();
801 auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
802 const auto &validBranchNames = lm.GetBranchNames();
803 const auto unknownColumns = FindUnknownColumns(selectedColumns, validBranchNames, validCustomColumns,
804 ds ? ds->GetColumnNames() : ColumnNames_t{});
805
806 if (!unknownColumns.empty()) {
807 // throw
808 std::stringstream unknowns;
809 std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
810 for (auto &unknownColumn : unknownColumns) {
811 unknowns << delim << unknownColumn;
812 delim = ',';
813 }
814 throw std::runtime_error("Unknown column" + unknowns.str());
815 }
816
817 // Now we need to check within the aliases if some of the yet unknown names can be recovered
818 auto &aliasMap = lm.GetAliasMap();
819 auto aliasMapEnd = aliasMap.end();
820
821 for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
822 const auto &colName = selectedColumns[idx];
823 const auto aliasColumnNameIt = aliasMap.find(colName);
824 if (aliasMapEnd != aliasColumnNameIt) {
825 selectedColumns[idx] = aliasColumnNameIt->second;
826 }
827 }
828
829 return selectedColumns;
830}
831
832/// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
833/// name of a column that must be defined via datasource. All elements of the returned vector are false if no
834/// data-source is present.
835std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
836{
837 const auto nColumns = requestedCols.size();
838 std::vector<bool> mustBeDefined(nColumns, false);
839 for (auto i = 0u; i < nColumns; ++i)
840 mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
841 return mustBeDefined;
842}
843
844} // namespace RDF
845} // namespace Internal
846} // namespace ROOT
#define c(i)
Definition: RSha256.hxx:101
#define s0(x)
Definition: RSha256.hxx:90
#define s1(x)
Definition: RSha256.hxx:91
static RooMathCoreReg dummy
int Ssiz_t
Definition: RtypesCore.h:63
#define R__ASSERT(e)
Definition: TError.h:96
char name[80]
Definition: TGX11.cxx:109
int type
Definition: TGX11.cxx:120
#define gInterpreter
Definition: TInterpreter.h:555
A wrapper around a concrete RFilter, which forwards all calls to it. RJittedFilter is the type of the ...
The head node of a RDF computation graph.
const std::map< std::string, std::string > & GetAliasMap() const
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
void ToJitDeclare(const std::string &s)
void ToJitExec(const std::string &s)
void JitDeclarations()
Declare to the interpreter type aliases and other entities required by RDF jitted nodes.
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame.
virtual RLoopManager * GetLoopManagerUnchecked()
Definition: RNodeBase.hxx:64
Encapsulates the columns defined by the user.
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
bool HasName(std::string_view name) const
Check if the provided name is tracked in the names list.
const RCustomColumnBasePtrMap_t & GetColumns() const
Returns the list of the pointers to the defined columns.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2906
Small helper to keep current directory context.
Definition: TDirectory.h:41
A TFriendElement TF describes a TTree object TF in a file.
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
The number of matches is returned, this equals the full match + sub-pattern matches.
Definition: TPRegexp.cxx:339
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
A TTree represents a columnar dataset.
Definition: TTree.h:72
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition: TTree.cxx:5170
virtual TObjArray * GetListOfBranches()
Definition: TTree.h:475
virtual TList * GetListOfFriends() const
Definition: TTree.h:477
TText * text
basic_string_view< char > string_view
RResultPtr< T > MakeResultPtr(const std::shared_ptr< T > &r, RLoopManager &df, std::shared_ptr< ROOT::Internal::RDF::RActionBase > actionPtr)
Create an RResultPtr and set its pointer to the corresponding RAction. This overload is invoked by non-...
Definition: RResultPtr.hxx:346
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
std::string BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
std::vector< std::string > ReplaceDots(const ColumnNames_t &colNames)
bool IsValidCppVarName(const std::string &var)
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
ColumnNames_t ConvertRegexToColumns(const RDFInternal::RBookedCustomColumns &customColumns, TTree *tree, ROOT::RDF::RDataSource *dataSource, std::string_view columnNameRegexp, std::string_view callerName)
std::string PrettyPrintAddr(const void *const addr)
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::set< std::string > GetPotentialColumnNames(const std::string &expr)
A tokeniser for the expression which is in C++. The goal is to extract all names which are potentially...
std::vector< std::string > FindUsedColumnNames(std::string_view expression, ColumnNames_t branches, const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns, const std::map< std::string, std::string > &aliasMap)
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
HeadNode_t CreateSnapshotRDF(const ColumnNames_t &validCols, std::string_view treeName, std::string_view fileName, bool isLazy, RLoopManager &loopManager, std::unique_ptr< RDFInternal::RActionBase > actionPtr)
std::string JitBuildAction(const ColumnNames_t &bl, void *prevNode, const std::type_info &art, const std::type_info &at, void *rOnHeap, TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds, std::shared_ptr< RJittedAction > *jittedActionOnHeap, unsigned int namespaceID)
bool IsInternalColumn(std::string_view colName)
std::string ColumnName2ColumnTypeName(const std::string &colName, unsigned int namespaceID, TTree *tree, RDataSource *ds, bool isCustomColumn, bool vector2rvec, unsigned int customColID)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:210
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validCustomColumns, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
void GetTopLevelBranchNamesImpl(TTree &t, std::set< std::string > &bNamesReg, ColumnNames_t &bNames, std::set< TTree * > &analysedTrees)
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames, const std::vector< std::string > &colTypes, bool hasReturnStmt)
void BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const std::shared_ptr< RJittedCustomColumn > &jittedCustomColumn, const RDFInternal::RBookedCustomColumns &customCols, const ColumnNames_t &branches)
void BookFilterJit(RJittedFilter *jittedFilter, void *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree, RDataSource *ds, unsigned int namespaceID)
std::vector< std::string > ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames, const std::map< std::string, std::string > &aliasMap, TTree *tree, RDataSource *ds, std::string &expr, unsigned int namespaceID, const RDFInternal::RBookedCustomColumns &customCols)
void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &dataSourceColumns)
VSD Structures.
Definition: StringConv.hxx:21
ROOT::Detail::RDF::ColumnNames_t ColumnNames_t
Definition: RDataFrame.cxx:788
char * DemangleTypeIdName(const std::type_info &ti, int &errorCode)
Demangle in a portable way the type id name.
static constexpr double s
static constexpr double ns
Definition: tree.py:1