Logo ROOT  
Reference Guide
RArrowDS.cxx
Go to the documentation of this file.
1 // Author: Giulio Eulisse CERN 2/2018
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 // clang-format off
12 /** \class ROOT::RDF::RArrowDS
13  \ingroup dataframe
14  \brief RDataFrame data source class to interface with Apache Arrow.
15 
16 The RArrowDS implements a proxy RDataSource to be able to use Apache Arrow
17 tables with RDataFrame.
18 
19 An RDataFrame that adapts an arrow::Table can be constructed using the factory method
20 ROOT::RDF::MakeArrowDataFrame, which accepts two parameters: an arrow::Table smart pointer
21 and the names of the columns to use (if the list is empty, all the columns of the table are used).
22 
23 The types of the columns are derived from the types in the associated
24 arrow::Schema.
25 
26 */
27 // clang-format on
28 
29 #include <ROOT/RDF/Utils.hxx>
30 #include <ROOT/TSeq.hxx>
31 #include <ROOT/RArrowDS.hxx>
32 #include <ROOT/RMakeUnique.hxx>
33 #include <snprintf.h>
34 
#include <algorithm>
#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
38 
39 #if defined(__GNUC__)
40 #pragma GCC diagnostic push
41 #pragma GCC diagnostic ignored "-Wshadow"
42 #pragma GCC diagnostic ignored "-Wunused-parameter"
43 #endif
44 #include <arrow/table.h>
45 #include <arrow/stl.h>
46 #if defined(__GNUC__)
47 #pragma GCC diagnostic pop
48 #endif
49 
50 namespace ROOT {
51 namespace Internal {
52 namespace RDF {
53 
/// Maps a C++ value type onto the Arrow type class that stores it.
///
/// Arrow 0.12.0 dropped the
///
///     using ArrowType = ArrowType_;
///
/// alias from its own ARROW_STL_CONVERSION machinery, so the mapping is
/// provided here. The primary template is intentionally empty: only the
/// specializations generated by ROOT_ARROW_STL_CONVERSION carry an
/// `ArrowType` member.
template <typename T>
struct RootConversionTraits {
};

/// Specializes RootConversionTraits for `CppType`, exposing
/// `::arrow::ArrowTypeName` as its nested `ArrowType`.
#define ROOT_ARROW_STL_CONVERSION(CppType, ArrowTypeName) \
   template <>                                            \
   struct RootConversionTraits<CppType> {                 \
      using ArrowType = ::arrow::ArrowTypeName;           \
   };
67 
68 ROOT_ARROW_STL_CONVERSION(bool, BooleanType)
69 ROOT_ARROW_STL_CONVERSION(int8_t, Int8Type)
70 ROOT_ARROW_STL_CONVERSION(int16_t, Int16Type)
71 ROOT_ARROW_STL_CONVERSION(int32_t, Int32Type)
74 ROOT_ARROW_STL_CONVERSION(uint16_t, UInt16Type)
75 ROOT_ARROW_STL_CONVERSION(uint32_t, UInt32Type)
77 ROOT_ARROW_STL_CONVERSION(float, FloatType)
78 ROOT_ARROW_STL_CONVERSION(double, DoubleType)
79 ROOT_ARROW_STL_CONVERSION(std::string, StringType)
80 
81 // Per slot visitor of an Array.
82 class ArrayPtrVisitor : public ::arrow::ArrayVisitor {
83 private:
84  /// The pointer to update.
85  void **fResult;
86  bool fCachedBool{false}; // Booleans need to be unpacked, so we use a cached entry.
87  // FIXME: I should really use a variant here
88  RVec<float> fCachedRVecFloat;
89  RVec<double> fCachedRVecDouble;
90  RVec<ULong64_t> fCachedRVecULong64;
91  RVec<UInt_t> fCachedRVecUInt;
92  RVec<Long64_t> fCachedRVecLong64;
93  RVec<Int_t> fCachedRVecInt;
94  std::string fCachedString;
95  /// The entry in the array which should be looked up.
96  ULong64_t fCurrentEntry;
97 
98  template <typename T>
99  void *getTypeErasedPtrFrom(arrow::ListArray const &array, int32_t entry, RVec<T> &cache)
100  {
101  using ArrowType = typename RootConversionTraits<T>::ArrowType;
102  using ArrayType = typename arrow::TypeTraits<ArrowType>::ArrayType;
103  auto values = reinterpret_cast<ArrayType *>(array.values().get());
104  auto offset = array.value_offset(entry);
105  // Here the cast to void* is a worksround while we figure out the
106  // issues we have with long long types, signed and unsigned.
107  RVec<T> tmp(reinterpret_cast<T *>((void *)values->raw_values()) + offset, array.value_length(entry));
108  std::swap(cache, tmp);
109  return (void *)(&cache);
110  }
111 
112 public:
113  ArrayPtrVisitor(void **result) : fResult{result}, fCurrentEntry{0} {}
114 
115  void SetEntry(ULong64_t entry) { fCurrentEntry = entry; }
116 
117  /// Check if we are asking the same entry as before.
118  virtual arrow::Status Visit(arrow::Int32Array const &array) final
119  {
120  *fResult = (void *)(array.raw_values() + fCurrentEntry);
121  return arrow::Status::OK();
122  }
123 
124  virtual arrow::Status Visit(arrow::Int64Array const &array) final
125  {
126  *fResult = (void *)(array.raw_values() + fCurrentEntry);
127  return arrow::Status::OK();
128  }
129 
130  /// Check if we are asking the same entry as before.
131  virtual arrow::Status Visit(arrow::UInt32Array const &array) final
132  {
133  *fResult = (void *)(array.raw_values() + fCurrentEntry);
134  return arrow::Status::OK();
135  }
136 
137  virtual arrow::Status Visit(arrow::UInt64Array const &array) final
138  {
139  *fResult = (void *)(array.raw_values() + fCurrentEntry);
140  return arrow::Status::OK();
141  }
142 
143  virtual arrow::Status Visit(arrow::FloatArray const &array) final
144  {
145  *fResult = (void *)(array.raw_values() + fCurrentEntry);
146  return arrow::Status::OK();
147  }
148 
149  virtual arrow::Status Visit(arrow::DoubleArray const &array) final
150  {
151  *fResult = (void *)(array.raw_values() + fCurrentEntry);
152  return arrow::Status::OK();
153  }
154 
155  virtual arrow::Status Visit(arrow::BooleanArray const &array) final
156  {
157  fCachedBool = array.Value(fCurrentEntry);
158  *fResult = reinterpret_cast<void *>(&fCachedBool);
159  return arrow::Status::OK();
160  }
161 
162  virtual arrow::Status Visit(arrow::StringArray const &array) final
163  {
164  fCachedString = array.GetString(fCurrentEntry);
165  *fResult = reinterpret_cast<void *>(&fCachedString);
166  return arrow::Status::OK();
167  }
168 
169  virtual arrow::Status Visit(arrow::ListArray const &array) final
170  {
171  switch (array.value_type()->id()) {
172  case arrow::Type::FLOAT: {
173  *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecFloat);
174  return arrow::Status::OK();
175  }
176  case arrow::Type::DOUBLE: {
177  *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecDouble);
178  return arrow::Status::OK();
179  }
180  case arrow::Type::UINT32: {
181  *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecUInt);
182  return arrow::Status::OK();
183  }
184  case arrow::Type::UINT64: {
185  *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecULong64);
186  return arrow::Status::OK();
187  }
188  case arrow::Type::INT32: {
189  *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecInt);
190  return arrow::Status::OK();
191  }
192  case arrow::Type::INT64: {
193  *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecLong64);
194  return arrow::Status::OK();
195  }
196  default: return arrow::Status::TypeError("Type not supported");
197  }
198  }
199 
200  using ::arrow::ArrayVisitor::Visit;
201 };
202 
203 /// Helper class which keeps track for each slot where to get the entry.
204 class TValueGetter {
205 private:
206  std::vector<void *> fValuesPtrPerSlot;
207  std::vector<ULong64_t> fLastEntryPerSlot;
208  std::vector<ULong64_t> fLastChunkPerSlot;
209  std::vector<ULong64_t> fFirstEntryPerChunk;
210  std::vector<ArrayPtrVisitor> fArrayVisitorPerSlot;
211  /// Since data can be chunked in different arrays we need to construct an
212  /// index which contains the first element of each chunk, so that we can
213  /// quickly move to the correct chunk.
214  std::vector<ULong64_t> fChunkIndex;
215  arrow::ArrayVector fChunks;
216 
217 public:
218  TValueGetter(size_t slots, arrow::ArrayVector chunks)
219  : fValuesPtrPerSlot(slots, nullptr), fLastEntryPerSlot(slots, 0), fLastChunkPerSlot(slots, 0), fChunks{chunks}
220  {
221  fChunkIndex.reserve(fChunks.size());
222  size_t next = 0;
223  for (auto &chunk : chunks) {
224  fFirstEntryPerChunk.push_back(next);
225  next += chunk->length();
226  fChunkIndex.push_back(next);
227  }
228  for (size_t si = 0, se = fValuesPtrPerSlot.size(); si != se; ++si) {
229  fArrayVisitorPerSlot.push_back(ArrayPtrVisitor{fValuesPtrPerSlot.data() + si});
230  }
231  }
232 
233  /// This returns the ptr to the ptr to actual data.
234  std::vector<void *> SlotPtrs()
235  {
236  std::vector<void *> result;
237  for (size_t i = 0; i < fValuesPtrPerSlot.size(); ++i) {
238  result.push_back(fValuesPtrPerSlot.data() + i);
239  }
240  return result;
241  }
242 
243  // Convenience method to avoid code duplication between
244  // SetEntry and InitSlot
245  void UncachedSlotLookup(unsigned int slot, ULong64_t entry)
246  {
247  // If entry is greater than the previous one,
248  // we can skip all the chunks before the last one we
249  // queried.
250  size_t ci = 0;
251  assert(slot < fLastChunkPerSlot.size());
252  if (fLastEntryPerSlot[slot] < entry) {
253  ci = fLastChunkPerSlot.at(slot);
254  }
255 
256  for (size_t ce = fChunkIndex.size(); ci != ce; ++ci) {
257  if (entry < fChunkIndex[ci]) {
258  assert(slot < fLastChunkPerSlot.size());
259  fLastChunkPerSlot[slot] = ci;
260  break;
261  }
262  }
263 
264  // Update the pointer to the requested entry.
265  // Notice that we need to find the entry
266  auto chunk = fChunks.at(fLastChunkPerSlot[slot]);
267  assert(slot < fArrayVisitorPerSlot.size());
268  fArrayVisitorPerSlot[slot].SetEntry(entry - fFirstEntryPerChunk[fLastChunkPerSlot[slot]]);
269  fLastEntryPerSlot[slot] = entry;
270  auto status = chunk->Accept(fArrayVisitorPerSlot.data() + slot);
271  if (!status.ok()) {
272  std::string msg = "Could not get pointer for slot ";
273  msg += std::to_string(slot) + " looking at entry " + std::to_string(entry);
274  throw std::runtime_error(msg);
275  }
276  }
277 
278  /// Set the current entry to be retrieved
279  void SetEntry(unsigned int slot, ULong64_t entry)
280  {
281  // Same entry as before
282  if (fLastEntryPerSlot[slot] == entry) {
283  return;
284  }
285  UncachedSlotLookup(slot, entry);
286  }
287 };
288 
289 } // namespace RDF
290 } // namespace Internal
291 
292 namespace RDF {
293 
294 /// Helper to get the contents of a given column
295 
296 /// Helper to get the human readable name of type
297 class RDFTypeNameGetter : public ::arrow::TypeVisitor {
298 private:
299  std::vector<std::string> fTypeName;
300 
301 public:
302  arrow::Status Visit(const arrow::Int64Type &) override
303  {
304  fTypeName.push_back("Long64_t");
305  return arrow::Status::OK();
306  }
307  arrow::Status Visit(const arrow::Int32Type &) override
308  {
309  fTypeName.push_back("Int_t");
310  return arrow::Status::OK();
311  }
312  arrow::Status Visit(const arrow::UInt64Type &) override
313  {
314  fTypeName.push_back("ULong64_t");
315  return arrow::Status::OK();
316  }
317  arrow::Status Visit(const arrow::UInt32Type &) override
318  {
319  fTypeName.push_back("UInt_t");
320  return arrow::Status::OK();
321  }
322  arrow::Status Visit(const arrow::FloatType &) override
323  {
324  fTypeName.push_back("float");
325  return arrow::Status::OK();
326  }
327  arrow::Status Visit(const arrow::DoubleType &) override
328  {
329  fTypeName.push_back("double");
330  return arrow::Status::OK();
331  }
332  arrow::Status Visit(const arrow::StringType &) override
333  {
334  fTypeName.push_back("string");
335  return arrow::Status::OK();
336  }
337  arrow::Status Visit(const arrow::BooleanType &) override
338  {
339  fTypeName.push_back("bool");
340  return arrow::Status::OK();
341  }
342  arrow::Status Visit(const arrow::ListType &l) override
343  {
344  /// Recursively visit List types and map them to
345  /// an RVec. We accumulate the result of the recursion on
346  /// fTypeName so that we can create the actual type
347  /// when the recursion is done.
348  fTypeName.push_back("ROOT::VecOps::RVec<%s>");
349  return l.value_type()->Accept(this);
350  }
351  std::string result()
352  {
353  // This recursively builds a nested type.
354  std::string result = "%s";
355  char buffer[8192];
356  for (size_t i = 0; i < fTypeName.size(); ++i) {
357  snprintf(buffer, 8192, result.c_str(), fTypeName[i].c_str());
358  result = buffer;
359  }
360  return result;
361  }
362 
363  using ::arrow::TypeVisitor::Visit;
364 };
365 
366 /// Helper to determine if a given Column is a supported type.
367 class VerifyValidColumnType : public ::arrow::TypeVisitor {
368 private:
369 public:
370  virtual arrow::Status Visit(const arrow::Int64Type &) override { return arrow::Status::OK(); }
371  virtual arrow::Status Visit(const arrow::UInt64Type &) override { return arrow::Status::OK(); }
372  virtual arrow::Status Visit(const arrow::Int32Type &) override { return arrow::Status::OK(); }
373  virtual arrow::Status Visit(const arrow::UInt32Type &) override { return arrow::Status::OK(); }
374  virtual arrow::Status Visit(const arrow::FloatType &) override { return arrow::Status::OK(); }
375  virtual arrow::Status Visit(const arrow::DoubleType &) override { return arrow::Status::OK(); }
376  virtual arrow::Status Visit(const arrow::StringType &) override { return arrow::Status::OK(); }
377  virtual arrow::Status Visit(const arrow::BooleanType &) override { return arrow::Status::OK(); }
378  virtual arrow::Status Visit(const arrow::ListType &) override { return arrow::Status::OK(); }
379 
380  using ::arrow::TypeVisitor::Visit;
381 };
382 
383 ////////////////////////////////////////////////////////////////////////
384 /// Constructor to create an Arrow RDataSource for RDataFrame.
385 /// \param[in] inTable the arrow Table to observe.
386 /// \param[in] inColumns the name of the columns to use
387 /// In case columns is empty, we use all the columns found in the table
388 RArrowDS::RArrowDS(std::shared_ptr<arrow::Table> inTable, std::vector<std::string> const &inColumns)
389  : fTable{inTable}, fColumnNames{inColumns}
390 {
391  auto &columnNames = fColumnNames;
392  auto &table = fTable;
393  auto &index = fGetterIndex;
394  // We want to allow people to specify which columns they
395  // need so that we can think of upfront IO optimizations.
396  auto filterWantedColumns = [&columnNames, &table]() {
397  if (columnNames.empty()) {
398  for (auto &field : table->schema()->fields()) {
399  columnNames.push_back(field->name());
400  }
401  }
402  };
403 
404  // To support both arrow 0.14.0 and 0.16.0
405  using ColumnType = decltype(fTable->column(0));
406 
407  auto getRecordsFirstColumn = [&columnNames, &table]() {
408  if (columnNames.empty()) {
409  throw std::runtime_error("At least one column required");
410  }
411  const auto name = columnNames.front();
412  const auto columnIdx = table->schema()->GetFieldIndex(name);
413  return table->column(columnIdx)->length();
414  };
415 
416  // All columns are supposed to have the same number of entries.
417  auto verifyColumnSize = [&table](ColumnType column, int columnIdx, int nRecords) {
418  if (column->length() != nRecords) {
419  std::string msg = "Column ";
420  msg += table->schema()->field(columnIdx)->name() + " has a different number of entries.";
421  throw std::runtime_error(msg);
422  }
423  };
424 
425  /// For the moment we support only a few native types.
426  auto verifyColumnType = [&table](ColumnType column, int columnIdx) {
427  auto verifyType = std::make_unique<VerifyValidColumnType>();
428  auto result = column->type()->Accept(verifyType.get());
429  if (result.ok() == false) {
430  std::string msg = "Column ";
431  msg += table->schema()->field(columnIdx)->name() + " contains an unsupported type.";
432  throw std::runtime_error(msg);
433  }
434  };
435 
436  /// This is used to create an index between the columnId
437  /// and the associated getter.
438  auto addColumnToGetterIndex = [&index](int columnId) { index.push_back(std::make_pair(columnId, index.size())); };
439 
440  /// Assuming we can get called more than once, we need to
441  /// reset the getter index each time.
442  auto resetGetterIndex = [&index]() { index.clear(); };
443 
444  /// This is what initialization actually does
445  filterWantedColumns();
446  resetGetterIndex();
447  auto nRecords = getRecordsFirstColumn();
448  for (auto &columnName : fColumnNames) {
449  auto columnIdx = fTable->schema()->GetFieldIndex(columnName);
450  addColumnToGetterIndex(columnIdx);
451 
452  auto column = fTable->column(columnIdx);
453  verifyColumnSize(column, columnIdx, nRecords);
454  verifyColumnType(column, columnIdx);
455  }
456 }
457 
458 ////////////////////////////////////////////////////////////////////////
459 /// Destructor.
461 {
462 }
463 
464 const std::vector<std::string> &RArrowDS::GetColumnNames() const
465 {
466  return fColumnNames;
467 }
468 
469 std::vector<std::pair<ULong64_t, ULong64_t>> RArrowDS::GetEntryRanges()
470 {
471  auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
472  return entryRanges;
473 }
474 
475 std::string RArrowDS::GetTypeName(std::string_view colName) const
476 {
477  auto field = fTable->schema()->GetFieldByName(std::string(colName));
478  if (!field) {
479  std::string msg = "The dataset does not have column ";
480  msg += colName;
481  throw std::runtime_error(msg);
482  }
483  RDFTypeNameGetter typeGetter;
484  auto status = field->type()->Accept(&typeGetter);
485  if (status.ok() == false) {
486  std::string msg = "RArrowDS does not support a column of type ";
487  msg += field->type()->name();
488  throw std::runtime_error(msg);
489  }
490  return typeGetter.result();
491 }
492 
494 {
495  auto field = fTable->schema()->GetFieldByName(std::string(colName));
496  if (!field) {
497  return false;
498  }
499  return true;
500 }
501 
502 bool RArrowDS::SetEntry(unsigned int slot, ULong64_t entry)
503 {
504  for (auto link : fGetterIndex) {
505  auto &getter = fValueGetters[link.second];
506  getter->SetEntry(slot, entry);
507  }
508  return true;
509 }
510 
511 void RArrowDS::InitSlot(unsigned int slot, ULong64_t entry)
512 {
513  for (auto link : fGetterIndex) {
514  auto &getter = fValueGetters[link.second];
515  getter->UncachedSlotLookup(slot, entry);
516  }
517 }
518 
519 void splitInEqualRanges(std::vector<std::pair<ULong64_t, ULong64_t>> &ranges, int nRecords, unsigned int nSlots)
520 {
521  ranges.clear();
522  const auto chunkSize = nRecords / nSlots;
523  const auto remainder = 1U == nSlots ? 0 : nRecords % nSlots;
524  auto start = 0UL;
525  auto end = 0UL;
526  for (auto i : ROOT::TSeqU(nSlots)) {
527  start = end;
528  end += chunkSize;
529  ranges.emplace_back(start, end);
530  (void)i;
531  }
532  ranges.back().second += remainder;
533 }
534 
535 int getNRecords(std::shared_ptr<arrow::Table> &table, std::vector<std::string> &columnNames)
536 {
537  auto index = table->schema()->GetFieldIndex(columnNames.front());
538  return table->column(index)->length();
539 };
540 
541 template <typename T>
542 std::shared_ptr<arrow::ChunkedArray> getData(T p)
543 {
544  return p->data();
545 }
546 
547 template <>
548 std::shared_ptr<arrow::ChunkedArray>
549 getData<std::shared_ptr<arrow::ChunkedArray>>(std::shared_ptr<arrow::ChunkedArray> p)
550 {
551  return p;
552 }
553 
554 void RArrowDS::SetNSlots(unsigned int nSlots)
555 {
556  assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
557  fNSlots = nSlots;
558  // We dump all the previous getters structures and we rebuild it.
559  auto nColumns = fGetterIndex.size();
560 
561  fValueGetters.clear();
562  for (size_t ci = 0; ci != nColumns; ++ci) {
563  auto chunkedArray = getData(fTable->column(fGetterIndex[ci].first));
564  fValueGetters.emplace_back(std::make_unique<ROOT::Internal::RDF::TValueGetter>(nSlots, chunkedArray->chunks()));
565  }
566 }
567 
568 /// This needs to return a pointer to the pointer each value getter
569 /// will point to.
570 std::vector<void *> RArrowDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &)
571 {
572  auto &index = fGetterIndex;
573  auto findGetterIndex = [&index](unsigned int column) {
574  for (auto &entry : index) {
575  if (entry.first == column) {
576  return entry.second;
577  }
578  }
579  throw std::runtime_error("No column found at index " + std::to_string(column));
580  };
581 
582  const int columnIdx = fTable->schema()->GetFieldIndex(std::string(colName));
583  const int getterIdx = findGetterIndex(columnIdx);
584  assert(getterIdx != -1);
585  assert((unsigned int)getterIdx < fValueGetters.size());
586  return fValueGetters[getterIdx]->SlotPtrs();
587 }
588 
590 {
591  auto nRecords = getNRecords(fTable, fColumnNames);
593 }
594 
595 std::string RArrowDS::GetLabel()
596 {
597  return "ArrowDS";
598 }
599 
600 /// Creates a RDataFrame using an arrow::Table as input.
601 /// \param[in] table the arrow Table to observe.
602 /// \param[in] columnNames the name of the columns to use
603 /// In case columnNames is empty, we use all the columns found in the table
604 RDataFrame MakeArrowDataFrame(std::shared_ptr<arrow::Table> table, std::vector<std::string> const &columnNames)
605 {
606  ROOT::RDataFrame tdf(std::make_unique<RArrowDS>(table, columnNames));
607  return tdf;
608 }
609 
610 } // namespace RDF
611 
612 } // namespace ROOT
l
auto * l
Definition: textangle.C:4
snprintf
#define snprintf
Definition: civetweb.c:1540
ROOT::RDF::RArrowDS::fColumnNames
std::vector< std::string > fColumnNames
Definition: RArrowDS.hxx:26
ROOT::RDF::RArrowDS::HasColumn
bool HasColumn(std::string_view colName) const override
Checks if the dataset has a certain column.
Definition: RArrowDS.cxx:493
ROOT_ARROW_STL_CONVERSION
#define ROOT_ARROW_STL_CONVERSION(c_type, ArrowType_)
Definition: RArrowDS.cxx:62
Long64_t
long long Long64_t
Definition: RtypesCore.h:73
string_view
basic_string_view< char > string_view
Definition: libcpp_string_view.h:785
Utils.hxx
uint8_t
uint8_t
Definition: Converters.cxx:858
ROOT::RDF::RArrowDS::GetColumnReadersImpl
std::vector< void * > GetColumnReadersImpl(std::string_view name, const std::type_info &type) override
This needs to return a pointer to the pointer each value getter will point to.
Definition: RArrowDS.cxx:570
ROOT::RDataFrame
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Definition: RDataFrame.hxx:42
ROOT::RDF::RArrowDS::GetTypeName
std::string GetTypeName(std::string_view colName) const override
Type of a column as a string, e.g.
Definition: RArrowDS.cxx:475
ROOT::RDF::RArrowDS::RArrowDS
RArrowDS(std::shared_ptr< arrow::Table > table, std::vector< std::string > const &columns)
Constructor to create an Arrow RDataSource for RDataFrame.
Definition: RArrowDS.cxx:388
ROOT::RDF::MakeArrowDataFrame
RDataFrame MakeArrowDataFrame(std::shared_ptr< arrow::Table > table, std::vector< std::string > const &columns)
Factory method to create a Apache Arrow RDataFrame.
Definition: RArrowDS.cxx:604
ROOT::RDF::getNRecords
int getNRecords(std::shared_ptr< arrow::Table > &table, std::vector< std::string > &columnNames)
Definition: RArrowDS.cxx:535
ROOT::RDF::getData
std::shared_ptr< arrow::ChunkedArray > getData(T p)
Definition: RArrowDS.cxx:542
ROOT::RDF::RArrowDS::SetEntry
bool SetEntry(unsigned int slot, ULong64_t entry) override
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RArrowDS.cxx:502
ROOT::RDF::RArrowDS::InitSlot
void InitSlot(unsigned int slot, ULong64_t firstEntry) override
Convenience method called at the start of the data processing associated to a slot.
Definition: RArrowDS.cxx:511
TSeq.hxx
ROOT::RDF::RArrowDS::fEntryRanges
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition: RArrowDS.hxx:25
ROOT::RDF::splitInEqualRanges
void splitInEqualRanges(std::vector< std::pair< ULong64_t, ULong64_t >> &ranges, int nRecords, unsigned int nSlots)
Definition: RArrowDS.cxx:519
ROOT::RDF::RArrowDS::fValueGetters
std::vector< std::unique_ptr< ROOT::Internal::RDF::TValueGetter > > fValueGetters
Definition: RArrowDS.hxx:30
ROOT::RDF::RArrowDS::fNSlots
size_t fNSlots
Definition: RArrowDS.hxx:27
ROOT::RDF::RArrowDS::GetLabel
std::string GetLabel() override
Return a string representation of the datasource type.
Definition: RArrowDS.cxx:595
void
typedef void((*Func_t)())
RArrowDS.hxx
ROOT::RDF::RArrowDS::Initialise
void Initialise() override
Convenience method called before starting an event-loop.
Definition: RArrowDS.cxx:589
ULong64_t
unsigned long long ULong64_t
Definition: RtypesCore.h:74
ROOT::RDF::RArrowDS::~RArrowDS
~RArrowDS()
Destructor.
Definition: RArrowDS.cxx:460
ROOT::RDF::RArrowDS::GetColumnNames
const std::vector< std::string > & GetColumnNames() const override
Returns a reference to the collection of the dataset's column names.
Definition: RArrowDS.cxx:464
name
char name[80]
Definition: TGX11.cxx:110
ROOT::Experimental::Internal::swap
void swap(RDirectoryEntry &e1, RDirectoryEntry &e2) noexcept
Definition: RDirectoryEntry.hxx:94
ROOT::RDF::RArrowDS::GetEntryRanges
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() override
Return ranges of entries to distribute to tasks.
Definition: RArrowDS.cxx:469
ROOT::Math::Chebyshev::T
double T(double x)
Definition: ChebyshevPol.h:34
ROOT::RDF::RArrowDS::SetNSlots
void SetNSlots(unsigned int nSlots) override
Inform RDataSource of the number of processing slots (i.e.
Definition: RArrowDS.cxx:554
RMakeUnique.hxx
ROOT::TSeq
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
fTypeName
struct void * fTypeName
Definition: cppyy.h:9
ROOT::RDF::RArrowDS::fTable
std::shared_ptr< arrow::Table > fTable
Definition: RArrowDS.hxx:24
ROOT::RDF::RArrowDS::fGetterIndex
std::vector< std::pair< size_t, size_t > > fGetterIndex
Definition: RArrowDS.hxx:29
ROOT
VSD Structures.
Definition: StringConv.hxx:21