Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterfaceBase.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud CERN 08/2022
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/InternalTreeUtils.hxx> // GetFriendInfo, GetFileNamesFromTree
14#include <ROOT/RDF/Utils.hxx>
16#include <string_view>
17#include <TTree.h>
18
19#include <algorithm> // std::for_each
20#include <iomanip> // std::setw
21#include <memory>
22#include <set>
23#include <sstream>
24#include <string>
25#include <unordered_set>
26
28{
   // When a data source is attached, report the file count it exposes.
29 if (auto dataSource = GetDataSource()) {
30 return dataSource->GetNFiles();
31 }
   // No data source available: there are no files to count.
32 return 0;
33}
34
36{
37 // Datasource as input
38 if (auto ds = GetDataSource()) {
40 }
41 // Trivial/empty datasource
42 else {
43 const auto n = fLoopManager->GetNEmptyEntries();
44 if (n == 1) {
45 return "Empty dataframe filling 1 row";
46 } else {
47 return "Empty dataframe filling " + std::to_string(n) + " rows";
48 }
49 }
50}
51
52ROOT::RDF::RInterfaceBase::RInterfaceBase(std::shared_ptr<RDFDetail::RLoopManager> lm)
53 : fLoopManager(lm), fColRegister(lm.get())
54{
56}
57
59 : fLoopManager(std::shared_ptr<ROOT::Detail::RDF::RLoopManager>{&lm, [](ROOT::Detail::RDF::RLoopManager *) {}}),
60 fColRegister(colRegister)
61{
62}
63
64/////////////////////////////////////////////////////////////////////////////
65/// \brief Returns the names of the available columns.
66/// \return the container of column names.
67///
68/// This is not an action nor a transformation, just a query to the RDataFrame object.
69///
70/// ### Example usage:
71/// ~~~{.cpp}
72/// auto colNames = d.GetColumnNames();
73/// // Print columns' names
74/// for (auto &&colName : colNames) std::cout << colName << std::endl;
75/// ~~~
76///
78{
79 // there could be duplicates between Redefined columns and columns in the data source
80 std::unordered_set<std::string> allColumns;
81
82 auto addIfNotInternal = [&allColumns](std::string_view colName) {
84 allColumns.emplace(colName);
85 };
86
87 auto definedColumns = fColRegister.GenerateColumnNames();
88
90
91 if (auto ds = GetDataSource()) {
93 if (s.rfind("R_rdf_sizeof", 0) != 0)
94 allColumns.emplace(s);
95 }
96 }
97
99 std::sort(ret.begin(), ret.end());
100 return ret;
101}
102
103/////////////////////////////////////////////////////////////////////////////
104/// \brief Retrieve the names of the top-level fields
105///
106/// For data sources that support hierarchical dataset schemas, such as TTree
107/// or RNTuple, this function will retrieve the names of top-level fields. For
108/// example, if the schema contains a user class with a data member, only
109/// the name of the top-level field containing the user class object would be
110/// reported, but not the name of the data member sub-field.
111///
112/// For all other data sources, returns the list of all available dataset columns.
114{
116 if (auto ds = GetDataSource()) {
118 // Sorting to be consistent with GetColumnNames
119 std::sort(ret.begin(), ret.end());
120 }
121
122 return ret;
123}
124
125/////////////////////////////////////////////////////////////////////////////
126/// \brief Return the type of a given column as a string.
127/// \return the type of the required column.
128///
129/// This is not an action nor a transformation, just a query to the RDataFrame object.
130///
131/// ### Example usage:
132/// ~~~{.cpp}
133/// auto colType = d.GetColumnType("columnName");
134/// // Print column type
135/// std::cout << "Column columnName has type " << colType << std::endl;
136/// ~~~
137///
138std::string ROOT::RDF::RInterfaceBase::GetColumnType(std::string_view column)
139{
140 const auto col = fColRegister.ResolveAlias(column);
141
142 RDFDetail::RDefineBase *define = fColRegister.GetDefine(col);
143
144 const bool convertVector2RVec = true;
145 return RDFInternal::ColumnName2ColumnTypeName(std::string(col), nullptr, fLoopManager->GetDataSource(), define,
147}
148
149/////////////////////////////////////////////////////////////////////////////
150/// \brief Return information about the dataframe.
151/// \return information about the dataframe as RDFDescription object
152///
153/// This convenience function describes the dataframe and combines the following information:
154/// - Number of event loops run, see GetNRuns()
155/// - Number of total and defined columns, see GetColumnNames() and GetDefinedColumnNames()
156/// - Column names, see GetColumnNames()
157/// - Column types, see GetColumnType()
158/// - Number of processing slots, see GetNSlots()
159///
160/// This is not an action nor a transformation, just a query to the RDataFrame object.
161/// The result is dependent on the node from which this method is called, e.g. the list of
162/// defined columns returned by GetDefinedColumnNames().
163///
164/// Please note that this is a convenience feature and the layout of the output can be subject
165/// to change and should be parsed via RDFDescription methods.
166///
167/// ### Example usage:
168/// ~~~{.cpp}
169/// RDataFrame df(10);
170/// auto df2 = df.Define("x", "1.f").Define("s", "\"myStr\"");
171/// // Describe the dataframe
172/// df2.Describe().Print();
173/// df2.Describe().Print(/*shortFormat=*/true);
174/// std::cout << df2.Describe().AsString() << std::endl;
175/// std::cout << df2.Describe().AsString(/*shortFormat=*/true) << std::endl;
176/// ~~~
177///
179{
180 // Build set of defined column names to find later in all column names
181 // the defined columns more efficiently
182 const auto columnNames = GetColumnNames();
183 std::set<std::string> definedColumnNamesSet;
184 for (const auto &name : GetDefinedColumnNames())
186
187 // Get information for the metadata table
188 const std::vector<std::string> metadataProperties = {"Columns in total", "Columns from defines", "Event loops run",
189 "Processing slots"};
190 const std::vector<std::string> metadataValues = {std::to_string(columnNames.size()),
191 std::to_string(definedColumnNamesSet.size()),
192 std::to_string(GetNRuns()), std::to_string(GetNSlots())};
193
194 // Set header for metadata table
196 // The column width of the values is required to make right-bound numbers and is equal
197 // to the maximum of the string "Value" and all values to be put in this column.
198 const auto columnWidthValues =
199 std::max(std::max_element(metadataValues.begin(), metadataValues.end())->size(), static_cast<std::size_t>(5u));
200 std::stringstream ss;
201 ss << std::left << std::setw(columnWidthProperties) << "Property" << std::setw(columnWidthValues) << "Value\n"
202 << std::setw(columnWidthProperties) << "--------" << std::setw(columnWidthValues) << "-----\n";
203
204 // Build metadata table
205 // All numbers should be bound to the right and strings bound to the left.
206 for (auto i = 0u; i < metadataProperties.size(); i++) {
207 ss << std::left << std::setw(columnWidthProperties) << metadataProperties[i] << std::right
208 << std::setw(columnWidthValues) << metadataValues[i] << '\n';
209 }
210 ss << '\n'; // put space between this and the next table
211
212 // Set header for columns table
214 const auto columnTypes = GetColumnTypeNamesList(columnNames);
216 ss << std::left << std::setw(columnWidthNames) << "Column" << std::setw(columnWidthTypes) << "Type"
217 << "Origin\n"
218 << std::setw(columnWidthNames) << "------" << std::setw(columnWidthTypes) << "----"
219 << "------\n";
220
221 // Build columns table
222 const auto nCols = columnNames.size();
223 for (auto i = 0u; i < nCols; i++) {
224 auto origin = "Dataset";
226 origin = "Define";
227 ss << std::left << std::setw(columnWidthNames) << columnNames[i] << std::setw(columnWidthTypes) << columnTypes[i]
228 << origin << '\n';
229 }
230 // Use the string returned from DescribeDataset() as the 'brief' description
231 // Use the converted to string stringstream ss as the 'full' description
232 return RDFDescription(DescribeDataset(), ss.str(), GetNFiles());
233}
234
235/// \brief Returns the names of the defined columns.
236/// \return the container of the defined column names.
237///
238/// This is not an action nor a transformation, just a simple utility to
239/// get the names of the columns that have been defined up to the node.
240/// If no column has been defined, e.g. on a root node, it returns an
241/// empty collection.
242///
243/// ### Example usage:
244/// ~~~{.cpp}
245/// auto defColNames = d.GetDefinedColumnNames();
246/// // Print defined columns' names
247/// for (auto &&defColName : defColNames) std::cout << defColName << std::endl;
248/// ~~~
249///
251{
253
254 const auto columns = fColRegister.BuildDefineNames();
255 for (const auto &column : columns) {
257 definedColumns.emplace_back(column);
258 }
259
260 return definedColumns;
261}
262
263/// \brief Return a descriptor for the systematic variations registered in this branch of the computation graph.
264///
265/// This is not an action nor a transformation, just a simple utility to
266/// inspect the systematic variations that have been registered with Vary() up to this node.
267/// When called on the root node, it returns an empty descriptor.
268///
269/// ### Example usage:
270/// ~~~{.cpp}
271/// auto variations = d.GetVariations();
272/// variations.Print();
273/// ~~~
274///
276{
   // Delegate to the column register, which builds the description of the variations it holds.
277 return fColRegister.BuildVariationsDescription();
278}
279
280/// \brief Checks if a column is present in the dataset.
281/// \return true if the column is available, false otherwise
282///
283/// This method checks if a column is part of the input ROOT dataset, has
284/// been defined or can be provided by the data source.
285///
286/// Example usage:
287/// ~~~{.cpp}
288/// ROOT::RDataFrame base(1);
289/// auto rdf = base.Define("definedColumn", [](){return 0;});
290/// rdf.HasColumn("definedColumn"); // true: we defined it
291/// rdf.HasColumn("rdfentry_"); // true: it's always there
292/// rdf.HasColumn("foo"); // false: it is not there
293/// ~~~
295{
   // A name the column register knows as a Define or an Alias is available.
296 if (fColRegister.IsDefineOrAlias(columnName))
297 return true;
298
   // Otherwise ask the data source, but only when one is attached (ds may be null).
299 if (auto ds = GetDataSource(); ds && ds->HasColumn(columnName))
300 return true;
301
   // Neither the register nor the data source knows this column.
302 return false;
303}
304
305/// \brief Gets the number of data processing slots.
306/// \return The number of data processing slots used by this RDataFrame instance
307///
308/// This method returns the number of data processing slots used by this RDataFrame
309/// instance. This number is influenced by the global switch ROOT::EnableImplicitMT().
310///
311/// Example usage:
312/// ~~~{.cpp}
313/// ROOT::EnableImplicitMT(6);
314/// ROOT::RDataFrame df(1);
315/// std::cout << df.GetNSlots() << std::endl; // prints "6"
316/// ~~~
318{
   // Forward the query to the loop manager.
319 return fLoopManager->GetNSlots();
320}
321
322/// \brief Gets the number of event loops run.
323/// \return The number of event loops run by this RDataFrame instance
324///
325/// This method returns the number of event loops run so far by this RDataFrame instance.
326///
327/// Example usage:
328/// ~~~{.cpp}
329/// ROOT::RDataFrame df(1);
330/// std::cout << df.GetNRuns() << std::endl; // prints "0"
331/// df.Sum("rdfentry_").GetValue(); // trigger the event loop
332/// std::cout << df.GetNRuns() << std::endl; // prints "1"
333/// df.Sum("rdfentry_").GetValue(); // trigger another event loop
334/// std::cout << df.GetNRuns() << std::endl; // prints "2"
335/// ~~~
337{
   // Forward the query to the loop manager.
338 return fLoopManager->GetNRuns();
339}
340
342{
   // Collect one type name per entry of columnList, in the same order as the input.
343 std::vector<std::string> types;
344
   // NOTE(review): `auto column` copies each name on every iteration; binding by
   // const reference would avoid that copy. Left unchanged here.
345 for (auto column : columnList) {
346 types.push_back(GetColumnType(column));
347 }
348 return types;
349}
350
352{
354 std::string error(callerName);
355 error += " was called with ImplicitMT enabled, but multi-thread is not supported.";
356 throw std::runtime_error(error);
357 }
358}
359
361{
   // Registers two defined columns ("rdfentry_" and "rdfslot_") with the column register,
   // then adds the aliases "tdfentry_" and "tdfslot_" that resolve to them.
362 // Entry number column
363 const std::string entryColName = "rdfentry_";
364 const std::string entryColType = "ULong64_t";
   // The generator ignores its first (unsigned int) argument and returns the entry number as is.
365 auto entryColGen = [](unsigned int, ULong64_t entry) { return entry; };
   // Presumably the SlotAndEntry tag makes RDefine call the generator with (slot, entry) -- confirm.
366 using NewColEntry_t = RDFDetail::RDefine<decltype(entryColGen), RDFDetail::ExtraArgsForDefine::SlotAndEntry>;
367
   // An empty ColumnNames_t is passed as the list of columns for this define.
368 auto entryColumn = std::make_shared<NewColEntry_t>(entryColName, entryColType, std::move(entryColGen),
369 ColumnNames_t{}, fColRegister, *fLoopManager);
370 fColRegister.AddDefine(std::move(entryColumn));
371
372 // Slot number column
373 const std::string slotColName = "rdfslot_";
374 const std::string slotColType = "unsigned int";
   // The generator returns the slot number it is given.
375 auto slotColGen = [](unsigned int slot) { return slot; };
   // Presumably the Slot tag makes RDefine call the generator with the slot only -- confirm.
376 using NewColSlot_t = RDFDetail::RDefine<decltype(slotColGen), RDFDetail::ExtraArgsForDefine::Slot>;
377
378 auto slotColumn = std::make_shared<NewColSlot_t>(slotColName, slotColType, std::move(slotColGen), ColumnNames_t{},
379 fColRegister, *fLoopManager);
380 fColRegister.AddDefine(std::move(slotColumn));
381
   // Alternative "tdf"-prefixed names for the two columns above.
   // NOTE(review): these look like legacy names kept for backward compatibility -- confirm.
382 fColRegister.AddAlias("tdfentry_", entryColName);
383 fColRegister.AddAlias("tdfslot_", slotColName);
384}
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
char name[80]
Definition TGX11.cxx:148
The head node of a RDF computation graph.
A binder for user-defined columns, variations and aliases.
A DFDescription contains useful information about a given RDataFrame computation graph.
RVariationsDescription GetVariations() const
Return a descriptor for the systematic variations registered in this branch of the computation graph.
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
RDFDescription Describe()
Return information about the dataframe.
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
unsigned int GetNRuns() const
Gets the number of event loops run.
ColumnNames_t GetDatasetTopLevelFieldNames()
Retrieve the names of the top-level fields.
RDataSource * GetDataSource() const
ColumnNames_t GetDefinedColumnNames()
Returns the names of the defined columns.
void CheckIMTDisabled(std::string_view callerName)
unsigned int GetNSlots() const
Gets the number of data processing slots.
RInterfaceBase(std::shared_ptr< RDFDetail::RLoopManager > lm)
bool HasColumn(std::string_view columnName)
Checks if a column is present in the dataset.
std::string DescribeDataset() const
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
A descriptor for the systematic variations known to a given RDataFrame node.
const_iterator begin() const
const_iterator end() const
const Int_t n
Definition legend1.C:16
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:650
unsigned int GetColumnWidth(const std::vector< std::string > &names, const unsigned int minColumnSpace=8u)
Get optimal column width for printing a table given the names and the desired minimal space between c...
Definition RDFUtils.cxx:492
std::string DescribeDataset(ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:666
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2RVec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:330
bool IsInternalColumn(std::string_view colName)
Whether custom column with name colName is an "internal" column such as rdfentry_ or rdfslot_.
Definition RDFUtils.cxx:483
const std::vector< std::string > & GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:655
std::vector< std::string > ColumnNames_t
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:675