Logo ROOT  
Reference Guide
RDFUtils.cxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #include "RConfigure.h" // R__USE_IMT
12 #include "ROOT/RDataSource.hxx"
13 #include "ROOT/RDF/RDefineBase.hxx"
15 #include "ROOT/RDF/Utils.hxx"
16 #include "ROOT/RLogger.hxx"
17 #include "RtypesCore.h"
18 #include "TBranch.h"
19 #include "TBranchElement.h"
20 #include "TClass.h"
21 #include "TClassEdit.h"
22 #include "TClassRef.h"
23 #include "TError.h" // Info
24 #include "TInterpreter.h"
25 #include "TLeaf.h"
26 #include "TROOT.h" // IsImplicitMTEnabled, GetThreadPoolSize
27 #include "TTree.h"
28 
29 #include <stdexcept>
30 #include <string>
31 #include <cstring>
32 #include <typeinfo>
33 
34 using namespace ROOT::Detail::RDF;
35 using namespace ROOT::RDF;
36 
37 ROOT::Experimental::RLogChannel &ROOT::Detail::RDF::RDFLogChannel()
38 {
39  static ROOT::Experimental::RLogChannel c("ROOT.RDF");
40  return c;
41 }
42 
43 namespace ROOT {
44 namespace Internal {
45 namespace RDF {
46 
47 /// Return the type_info associated to a name. If the association fails, an
48 /// exception is thrown.
49 /// References and pointers are not supported since those cannot be stored in
50 /// columns.
51 const std::type_info &TypeName2TypeID(const std::string &name)
52 {
53  if (auto c = TClass::GetClass(name.c_str())) {
54  if (!c->GetTypeInfo()) {
55  std::string msg("Cannot extract type_info of type ");
56  msg += name.c_str();
57  msg += ".";
58  throw std::runtime_error(msg);
59  }
60  return *c->GetTypeInfo();
61  } else if (name == "char" || name == "Char_t")
62  return typeid(char);
63  else if (name == "unsigned char" || name == "UChar_t")
64  return typeid(unsigned char);
65  else if (name == "int" || name == "Int_t")
66  return typeid(int);
67  else if (name == "unsigned int" || name == "UInt_t")
68  return typeid(unsigned int);
69  else if (name == "short" || name == "Short_t")
70  return typeid(short);
71  else if (name == "unsigned short" || name == "UShort_t")
72  return typeid(unsigned short);
73  else if (name == "long" || name == "Long_t")
74  return typeid(long);
75  else if (name == "unsigned long" || name == "ULong_t")
76  return typeid(unsigned long);
77  else if (name == "double" || name == "Double_t")
78  return typeid(double);
79  else if (name == "float" || name == "Float_t")
80  return typeid(float);
81  else if (name == "long long" || name == "long long int" || name == "Long64_t")
82  return typeid(Long64_t);
83  else if (name == "unsigned long long" || name == "unsigned long long int" || name == "ULong64_t")
84  return typeid(ULong64_t);
85  else if (name == "bool" || name == "Bool_t")
86  return typeid(bool);
87  else {
88  std::string msg("Cannot extract type_info of type ");
89  msg += name.c_str();
90  msg += ".";
91  throw std::runtime_error(msg);
92  }
93 }
94 
95 /// Returns the name of a type starting from its type_info
96 /// An empty string is returned in case of failure
97 /// References and pointers are not supported since those cannot be stored in
98 /// columns.
99 std::string TypeID2TypeName(const std::type_info &id)
100 {
101  if (auto c = TClass::GetClass(id)) {
102  return c->GetName();
103  } else if (id == typeid(char))
104  return "char";
105  else if (id == typeid(unsigned char))
106  return "unsigned char";
107  else if (id == typeid(int))
108  return "int";
109  else if (id == typeid(unsigned int))
110  return "unsigned int";
111  else if (id == typeid(short))
112  return "short";
113  else if (id == typeid(unsigned short))
114  return "unsigned short";
115  else if (id == typeid(long))
116  return "long";
117  else if (id == typeid(unsigned long))
118  return "unsigned long";
119  else if (id == typeid(double))
120  return "double";
121  else if (id == typeid(float))
122  return "float";
123  else if (id == typeid(Long64_t))
124  return "Long64_t";
125  else if (id == typeid(ULong64_t))
126  return "ULong64_t";
127  else if (id == typeid(bool))
128  return "bool";
129  else
130  return "";
131 }
132 
/// Build the name of the RVec instantiation holding elements of type `valueType`,
/// i.e. "ROOT::VecOps::RVec<valueType>".
std::string ComposeRVecTypeName(const std::string &valueType)
{
   std::string rvecName("ROOT::VecOps::RVec<");
   rvecName += valueType;
   rvecName += '>';
   return rvecName;
}
137 
138 std::string GetLeafTypeName(TLeaf *leaf, const std::string &colName)
139 {
140  std::string colType = leaf->GetTypeName();
141  if (colType.empty())
142  throw std::runtime_error("Could not deduce type of leaf " + colName);
143  if (leaf->GetLeafCount() != nullptr && leaf->GetLenStatic() == 1) {
144  // this is a variable-sized array
145  colType = ComposeRVecTypeName(colType);
146  } else if (leaf->GetLeafCount() == nullptr && leaf->GetLenStatic() > 1) {
147  // this is a fixed-sized array (we do not differentiate between variable- and fixed-sized arrays)
148  colType = ComposeRVecTypeName(colType);
149  } else if (leaf->GetLeafCount() != nullptr && leaf->GetLenStatic() > 1) {
150  // we do not know how to deal with this branch
151  throw std::runtime_error("TTree leaf " + colName +
152  " has both a leaf count and a static length. This is not supported.");
153  }
154 
155  return colType;
156 }
157 
158 /// Return the typename of object colName stored in t, if any. Return an empty string if colName is not in t.
159 /// Supported cases:
160 /// - leaves corresponding to single values, variable- and fixed-length arrays, with following syntax:
161 /// - "leafname", as long as TTree::GetLeaf resolves it
162 /// - "b1.b2...leafname", as long as TTree::GetLeaf("b1.b2....", "leafname") resolves it
163 /// - TBranchElements, as long as TTree::GetBranch resolves their names
164 std::string GetBranchOrLeafTypeName(TTree &t, const std::string &colName)
165 {
166  // look for TLeaf either with GetLeaf(colName) or with GetLeaf(branchName, leafName) (splitting on last dot)
167  auto leaf = t.GetLeaf(colName.c_str());
168  if (!leaf)
169  leaf = t.FindLeaf(colName.c_str()); // try harder
170  if (!leaf) {
171  // try splitting branchname and leafname
172  const auto dotPos = colName.find_last_of('.');
173  const auto hasDot = dotPos != std::string::npos;
174  if (hasDot) {
175  const auto branchName = colName.substr(0, dotPos);
176  const auto leafName = colName.substr(dotPos + 1);
177  leaf = t.GetLeaf(branchName.c_str(), leafName.c_str());
178  }
179  }
180  if (leaf)
181  return GetLeafTypeName(leaf, colName);
182 
183  // we could not find a leaf named colName, so we look for a TBranchElement
184  auto branch = t.GetBranch(colName.c_str());
185  if (!branch)
186  branch = t.FindBranch(colName.c_str()); // try harder
187  if (branch) {
188  static const TClassRef tbranchelement("TBranchElement");
189  if (branch->InheritsFrom(tbranchelement)) {
190  auto be = static_cast<TBranchElement *>(branch);
191  if (auto currentClass = be->GetCurrentClass())
192  return currentClass->GetName();
193  else {
194  // Here we have a special case for getting right the type of data members
195  // of classes sorted in TClonesArrays: ROOT-9674
196  auto mother = be->GetMother();
197  if (mother && mother->InheritsFrom(tbranchelement) && mother != be) {
198  auto beMom = static_cast<TBranchElement *>(mother);
199  auto beMomClass = beMom->GetClass();
200  if (beMomClass && 0 == std::strcmp("TClonesArray", beMomClass->GetName()))
201  return be->GetTypeName();
202  }
203  return be->GetClassName();
204  }
205  }
206  }
207 
208  // colName is not a leaf nor a TBranchElement
209  return std::string();
210 }
211 
212 /// Return a string containing the type of the given branch. Works both with real TTree branches and with temporary
213 /// column created by Define. Throws if type name deduction fails.
214 /// Note that for fixed- or variable-sized c-style arrays the returned type name will be RVec<T>.
215 /// vector2rvec specifies whether typename 'std::vector<T>' should be converted to 'RVec<T>' or returned as is
216 /// customColID is only used if isDefine is true, and must correspond to the custom column's unique identifier
217 /// returned by its `GetID()` method.
218 std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, RDataSource *ds, RDefineBase *define,
219  bool vector2rvec)
220 {
221  std::string colType;
222 
223  if (ds && ds->HasColumn(colName))
224  colType = ds->GetTypeName(colName);
225 
226  if (colType.empty() && tree) {
227  colType = GetBranchOrLeafTypeName(*tree, colName);
228  if (vector2rvec && TClassEdit::IsSTLCont(colType) == ROOT::ESTLType::kSTLvector) {
229  std::vector<std::string> split;
230  int dummy;
231  TClassEdit::GetSplit(colType.c_str(), split, dummy);
232  auto &valueType = split[1];
233  colType = ComposeRVecTypeName(valueType);
234  }
235  }
236 
237  if (colType.empty() && define) {
238  colType = define->GetTypeName();
239  }
240 
241  if (colType.empty())
242  throw std::runtime_error("Column \"" + colName +
243  "\" is not in a dataset and is not a custom column been defined.");
244 
245  return colType;
246 }
247 
/// Convert type name (e.g. "Float_t") to ROOT type code (e.g. 'F') -- see TBranch documentation.
/// Both the ROOT typedef spellings and the plain C++ spellings listed in the table below are accepted.
/// \param[in] b The type name to convert.
/// \return The single-character type code, or a space ' ' in case no match was found.
char TypeName2ROOTTypeName(const std::string &b)
{
   struct TypeAlias {
      const char *fName;
      char fCode;
   };
   static const TypeAlias kAliases[] = {
      {"Char_t", 'B'},    {"char", 'B'},
      {"UChar_t", 'b'},   {"unsigned char", 'b'},
      {"Short_t", 'S'},   {"short", 'S'},              {"short int", 'S'},
      {"UShort_t", 's'},  {"unsigned short", 's'},     {"unsigned short int", 's'},
      {"Int_t", 'I'},     {"int", 'I'},
      {"UInt_t", 'i'},    {"unsigned", 'i'},           {"unsigned int", 'i'},
      {"Float_t", 'F'},   {"float", 'F'},
      {"Double_t", 'D'},  {"double", 'D'},
      {"Long64_t", 'L'},  {"long long", 'L'},          {"long long int", 'L'},
      {"ULong64_t", 'l'}, {"unsigned long long", 'l'}, {"unsigned long long int", 'l'},
      {"Long_t", 'G'},    {"long", 'G'},               {"long int", 'G'},
      {"ULong_t", 'g'},   {"unsigned long", 'g'},      {"unsigned long int", 'g'},
      {"Bool_t", 'O'},    {"bool", 'O'},
   };

   for (const auto &alias : kAliases) {
      if (b == alias.fName)
         return alias.fCode;
   }
   return ' ';
}
280 
/// Return the number of processing slots to use.
/// \return 1 when ROOT is built without implicit multi-threading support (R__USE_IMT undefined)
///         or when implicit multi-threading is not enabled at runtime; otherwise the size of
///         ROOT's thread pool as reported by ROOT::GetThreadPoolSize().
unsigned int GetNSlots()
{
   unsigned int nSlots = 1;
#ifdef R__USE_IMT
   // Only consult the thread pool when implicit MT is actually switched on, so that the
   // sequential default of one slot is kept otherwise.
   // NOTE(review): presumably the pool size is not meaningful while IMT is off -- confirm against TROOT.
   if (ROOT::IsImplicitMTEnabled())
      nSlots = ROOT::GetThreadPoolSize();
#endif // R__USE_IMT
   return nSlots;
}
290 
291 /// Replace occurrences of '.' with '_' in each string passed as argument.
292 /// An Info message is printed when this happens. Dots at the end of the string are not replaced.
293 /// An exception is thrown in case the resulting set of strings would contain duplicates.
294 std::vector<std::string> ReplaceDotWithUnderscore(const std::vector<std::string> &columnNames)
295 {
296  auto newColNames = columnNames;
297  for (auto &col : newColNames) {
298  const auto dotPos = col.find('.');
299  if (dotPos != std::string::npos && dotPos != col.size() - 1 && dotPos != 0u) {
300  auto oldName = col;
301  std::replace(col.begin(), col.end(), '.', '_');
302  if (std::find(columnNames.begin(), columnNames.end(), col) != columnNames.end())
303  throw std::runtime_error("Column " + oldName + " would be written as " + col +
304  " but this column already exists. Please use Alias to select a new name for " +
305  oldName);
306  Info("Snapshot", "Column %s will be saved as %s", oldName.c_str(), col.c_str());
307  }
308  }
309 
310  return newColNames;
311 }
312 
313 void InterpreterDeclare(const std::string &code)
314 {
315  R__LOG_DEBUG(10, RDFLogChannel()) << "Declaring the following code to cling:\n\n" << code << '\n';
316 
317  if (!gInterpreter->Declare(code.c_str())) {
318  const auto msg =
319  "\nRDataFrame: An error occurred during just-in-time compilation. The lines above might indicate the cause of "
320  "the crash\n All RDF objects that have not run an event loop yet should be considered in an invalid state.\n";
321  throw std::runtime_error(msg);
322  }
323 }
324 
325 Long64_t InterpreterCalc(const std::string &code, const std::string &context)
326 {
327  R__LOG_DEBUG(10, RDFLogChannel()) << "Jitting and executing the following code:\n\n" << code << '\n';
328 
330  auto res = gInterpreter->Calc(code.c_str(), &errorCode);
331  if (errorCode != TInterpreter::EErrorCode::kNoError) {
332  std::string msg = "\nAn error occurred during just-in-time compilation";
333  if (!context.empty())
334  msg += " in " + context;
335  msg += ". The lines above might indicate the cause of the crash\nAll RDF objects that have not run their event "
336  "loop yet should be considered in an invalid state.\n";
337  throw std::runtime_error(msg);
338  }
339  return res;
340 }
341 
/// Check whether a column name follows the naming pattern of internal columns.
/// \param[in] colName The column name to check.
/// \return true if colName is longer than three characters, starts with "rdf" or "tdf"
///         and ends with an underscore; false otherwise.
bool IsInternalColumn(std::string_view colName)
{
   const auto str = colName.data();
   const auto goodPrefix = colName.size() > 3 &&               // has at least more characters than {r,t}df
                           ('r' == str[0] || 't' == str[0]) && // starts with r or t
                           0 == std::strncmp("df", str + 1, 2); // 2nd and 3rd letters are df
   return goodPrefix && '_' == colName.back();                 // also ends with '_'
}
350 
351 } // end NS RDF
352 } // end NS Internal
353 } // end NS ROOT
c
#define c(i)
Definition: RSha256.hxx:101
ROOT::Internal::RDF::InterpreterCalc
Long64_t InterpreterCalc(const std::string &code, const std::string &context)
Definition: RDFUtils.cxx:325
ROOT::kSTLvector
@ kSTLvector
Definition: ESTLType.h:30
ROOT::Detail::RDF::RDefineBase::GetTypeName
std::string GetTypeName() const
Definition: RDefineBase.cxx:44
TTree::FindBranch
virtual TBranch * FindBranch(const char *name)
Return the branch that corresponds to the path 'branchname', which can include the name of the tree or...
Definition: TTree.cxx:4774
ROOT::Internal::RDF::ComposeRVecTypeName
std::string ComposeRVecTypeName(const std::string &valueType)
Definition: RDFUtils.cxx:133
TTree::GetLeaf
virtual TLeaf * GetLeaf(const char *branchname, const char *leafname)
Return pointer to the 1st Leaf named name in any Branch of this Tree or any branch in the list of fri...
Definition: TTree.cxx:6112
TBranchElement
A Branch for the case of an object.
Definition: TBranchElement.h:39
ROOT::Internal::RDF::TypeName2ROOTTypeName
char TypeName2ROOTTypeName(const std::string &b)
Convert type name (e.g. "Float_t") to ROOT type code (e.g. 'F').
Definition: RDFUtils.cxx:250
tree
Definition: tree.py:1
TBranch.h
Long64_t
long long Long64_t
Definition: RtypesCore.h:73
string_view
basic_string_view< char > string_view
Definition: libcpp_string_view.h:785
TTree
A TTree represents a columnar dataset.
Definition: TTree.h:79
TLeaf::GetTypeName
virtual const char * GetTypeName() const
Definition: TLeaf.h:138
extract_docstrings.ds
ds
Definition: extract_docstrings.py:40
gInterpreter
#define gInterpreter
Definition: TInterpreter.h:560
Utils.hxx
ROOT::Internal::RDF::ReplaceDotWithUnderscore
std::vector< std::string > ReplaceDotWithUnderscore(const std::vector< std::string > &columnNames)
Replace occurrences of '.' with '_' in each string passed as argument.
Definition: RDFUtils.cxx:294
TClass.h
TInterpreter::EErrorCode
EErrorCode
Definition: TInterpreter.h:76
long
long
Definition: Converters.cxx:858
RDefineBase.hxx
TTree.h
ROOT::Experimental::RLogChannel
A log configuration for a channel, e.g.
Definition: RLogger.hxx:101
b
#define b(i)
Definition: RSha256.hxx:100
ROOT::Detail::RDF
Definition: GraphUtils.hxx:28
ROOT::RDF::RDataSource
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Definition: RDataSource.hxx:106
bool
RDataSource.hxx
TROOT.h
ROOT::Internal::RDF::ColumnName2ColumnTypeName
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, RDataSource *ds, RDefineBase *define, bool vector2rvec)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:218
TTree::GetBranch
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition: TTree.cxx:5221
ROOT::Internal::RDF::InterpreterDeclare
void InterpreterDeclare(const std::string &code)
Definition: RDFUtils.cxx:313
RLogger.hxx
TClassEdit::IsSTLCont
ROOT::ESTLType IsSTLCont(std::string_view type)
type : type name: vector<list<classA,allocator>,allocator> result: 0 : not stl container code of cont...
Definition: TClassEdit.cxx:1358
ROOT::Detail::RDF::RDefineBase
Definition: RDefineBase.hxx:34
TBranchElement::GetClass
virtual TClass * GetClass() const
Definition: TBranchElement.h:187
TBranchElement.h
TLeaf.h
ROOT::Internal::RDF::GetBranchOrLeafTypeName
std::string GetBranchOrLeafTypeName(TTree &t, const std::string &colName)
Return the typename of object colName stored in t, if any.
Definition: RDFUtils.cxx:164
TLeaf
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition: TLeaf.h:57
TLeaf::GetLeafCount
virtual TLeaf * GetLeafCount() const
If this leaf stores a variable-sized array or a multi-dimensional array whose last dimension has vari...
Definition: TLeaf.h:120
ROOT::GetThreadPoolSize
UInt_t GetThreadPoolSize()
Returns the size of ROOT's thread pool.
Definition: TROOT.cxx:563
double
double
Definition: Converters.cxx:921
TClass::GetClass
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2946
ROOT::Internal::RDF::IsInternalColumn
bool IsInternalColumn(std::string_view colName)
Definition: RDFUtils.cxx:342
TTree::FindLeaf
virtual TLeaf * FindLeaf(const char *name)
Find leaf..
Definition: TTree.cxx:4846
TLeaf::GetLenStatic
virtual Int_t GetLenStatic() const
Return the fixed length of this leaf.
Definition: TLeaf.h:131
TClassEdit::GetSplit
int GetSplit(const char *type, std::vector< std::string > &output, int &nestedLoc, EModType mode=TClassEdit::kNone)
Stores in output (after emptying it) the split type.
Definition: TClassEdit.cxx:1009
ROOT::RDF
Definition: RArrowDS.hxx:20
ULong64_t
unsigned long long ULong64_t
Definition: RtypesCore.h:74
ROOT::Internal::RDF::GetNSlots
unsigned int GetNSlots()
Definition: RDFUtils.cxx:281
ROOT::IsImplicitMTEnabled
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition: TROOT.cxx:556
RtypesCore.h
TClassRef.h
TClassEdit.h
TInterpreter.h
Info
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition: TError.cxx:220
ULong64_t
short
l unsigned short
Definition: Converters.cxx:862
name
char name[80]
Definition: TGX11.cxx:110
ROOT::Internal::RDF::TypeName2TypeID
const std::type_info & TypeName2TypeID(const std::string &name)
Return the type_info associated to a name.
Definition: RDFUtils.cxx:51
TInterpreter::kNoError
@ kNoError
Definition: TInterpreter.h:77
ROOT::Internal::RDF::GetLeafTypeName
std::string GetLeafTypeName(TLeaf *leaf, const std::string &colName)
Definition: RDFUtils.cxx:138
TClassRef
TClassRef is used to implement a permanent reference to a TClass object.
Definition: TClassRef.h:28
RLoopManager.hxx
TNamed::GetName
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
R__LOG_DEBUG
#define R__LOG_DEBUG(DEBUGLEVEL,...)
Definition: RLogger.hxx:365
ROOT::Internal::RDF::TypeID2TypeName
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition: RDFUtils.cxx:99
ROOT
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Definition: EExecutionPolicy.hxx:4
int
TError.h