Logo ROOT  
Reference Guide
RCsvDS.hxx
Go to the documentation of this file.
1 // Author: Enric Tejedor CERN 10/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_RCSVTDS
12 #define ROOT_RCSVTDS
13 
14 #include "ROOT/RDataFrame.hxx"
15 #include "ROOT/RDataSource.hxx"
16 
17 #include <cstdint>
18 #include <deque>
19 #include <list>
20 #include <map>
21 #include <memory>
22 #include <vector>
23 
24 #include <TRegexp.h>
25 
26 namespace ROOT {
27 
28 namespace Internal {
29 class RRawFile; // forward declaration only: in this header RCsvDS refers to RRawFile solely through a std::unique_ptr member
30 }
31 
32 namespace RDF {
33 
/// \brief RDataFrame data source class for reading CSV files.
34 class RCsvDS final : public ROOT::RDF::RDataSource {
35 
36 private:
37  // One-character column type code: d, b, l or s. A single char suffices only because we treat just double, bool, Long64_t and string
38  using ColType_t = char;
39  static const std::map<ColType_t, std::string> fgColTypeMap; // type code -> string (presumably the type name; confirm in RCsvDS.cxx)
40 
41  // Regular expressions for type inference
42  static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
43 
44  std::uint64_t fDataPos = 0; // NOTE(review): presumably the file offset at which the data rows begin -- confirm
45  bool fReadHeaders = false; // presumably mirrors the constructor's readHeaders argument -- confirm
46  unsigned int fNSlots = 0U; // number of processing slots (presumably set by SetNSlots)
47  std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile; // owning handle to the raw file being read
48  const char fDelimiter; // field delimiter character
49  const Long64_t fLinesChunkSize; // NOTE(review): presumably the number of lines read per chunk -- confirm
50  ULong64_t fEntryRangesRequested = 0ULL; // NOTE(review): presumably counts calls to GetEntryRanges -- confirm
51  ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
52  std::vector<std::string> fHeaders; // column names (presumably what GetColumnNames returns)
53  std::map<std::string, ColType_t> fColTypes; // type code per column, presumably keyed by column name
54  std::list<ColType_t> fColTypesList; // type codes as a list, presumably in column order
55  std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot]
56  std::vector<Record_t> fRecords; // fRecords[entry][column]
57  std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
58  std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
59  std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
60  // This must be a deque to avoid the specialisation vector<bool>, which would not
61  // work because a pointer to an individual boolean cannot be taken in that case
62  std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
63 
64  void FillHeaders(const std::string &);
65  void FillRecord(const std::string &, Record_t &);
66  void GenerateHeaders(size_t);
67  std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &); // type-erased vector of pointers to pointers to column values - one per slot
68  void InferColTypes(std::vector<std::string> &);
69  void InferType(const std::string &, unsigned int);
70  std::vector<std::string> ParseColumns(const std::string &);
71  size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
72  ColType_t GetType(std::string_view colName) const; // type code of the named column
73 
74 protected:
75  std::string AsString();
76 
77 public:
78  RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL); // creates a CSV RDataSource for RDataFrame
79  void Finalise(); // convenience method called after concluding an event-loop
80  void FreeRecords();
81  ~RCsvDS(); // declared here and defined in the implementation file
82  const std::vector<std::string> &GetColumnNames() const; // reference to the collection of the dataset's column names
83  std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges(); // ranges of entries to distribute to tasks
84  std::string GetTypeName(std::string_view colName) const; // type of a column as a string
85  bool HasColumn(std::string_view colName) const; // checks if the dataset has a certain column
86  bool SetEntry(unsigned int slot, ULong64_t entry); // advances the column "cursors" to the selected entry for a particular slot
87  void SetNSlots(unsigned int nSlots); // informs the data source of the number of processing slots
88  std::string GetLabel(); // string representation of the datasource type
89 };
90 
91 ////////////////////////////////////////////////////////////////////////////////////////////////
92 /// \brief Factory method to create a CSV RDataFrame.
93 /// \param[in] fileName Path of the CSV file.
94 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
95 /// (default `true`).
96 /// \param[in] delimiter Delimiter character (default ',').
/// \param[in] linesChunkSize NOTE(review): presumably the number of lines read per chunk, with a negative
/// value meaning the whole file is read at once -- confirm against RCsvDS.cxx (default `-1`).
/// \return The newly created RDataFrame.
97 RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
98  Long64_t linesChunkSize = -1LL);
99 
100 } // ns RDF
101 
102 } // ns ROOT
103 
104 #endif
ROOT::RDF::RCsvDS::GetEntryRanges
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:354
ROOT::RDF::RCsvDS::fColTypesList
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:54
ROOT::RDF::RCsvDS::SetEntry
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:422
ROOT::RDF::RCsvDS::ParseColumns
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:219
ROOT::RDF::RCsvDS::fBoolEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:62
ROOT::RDF::RCsvDS::fCsvFile
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition: RCsvDS.hxx:47
ROOT::RDF::RCsvDS::fgIntRegex
static const TRegexp fgIntRegex
Definition: RCsvDS.hxx:42
ROOT::RDF::RCsvDS::GetTypeName
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g. "double".
Definition: RCsvDS.cxx:412
ROOT::RDF::RCsvDS::fColTypes
std::map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:53
ROOT::RDF::RCsvDS::Finalise
void Finalise()
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:341
ROOT::RDF::RCsvDS::FillHeaders
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:106
ROOT::RDF::RCsvDS::fProcessedLines
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:51
Long64_t
long long Long64_t
Definition: RtypesCore.h:73
ROOT::RDF::RCsvDS::GetLabel
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:470
string_view
basic_string_view< char > string_view
Definition: libcpp_string_view.h:785
ROOT::RDF::RCsvDS::RCsvDS
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:261
ROOT::RDF::RCsvDS::fgColTypeMap
static const std::map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:39
ROOT::RDF::RCsvDS::fEntryRangesRequested
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:50
ROOT::RDF::RCsvDS::ColType_t
char ColType_t
Definition: RCsvDS.hxx:38
ROOT::RDF::RCsvDS::fgDoubleRegex3
static const TRegexp fgDoubleRegex3
Definition: RCsvDS.hxx:42
ROOT::RDF::RCsvDS::fLong64EvtValues
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:58
ROOT::RDF::RCsvDS
RDataFrame data source class for reading CSV files.
Definition: RCsvDS.hxx:34
ROOT::RDataFrame
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Definition: RDataFrame.hxx:42
ROOT::RDF::RDataSource
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Definition: RDataSource.hxx:106
RDataFrame.hxx
ROOT::RDF::RCsvDS::fDelimiter
const char fDelimiter
Definition: RCsvDS.hxx:48
ROOT::RDF::RCsvDS::fgDoubleRegex1
static const TRegexp fgDoubleRegex1
Definition: RCsvDS.hxx:42
ROOT::RDF::RCsvDS::GetType
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:401
RDataSource.hxx
TRegexp.h
ROOT::RDF::RCsvDS::fColAddresses
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:55
ROOT::RDF::RCsvDS::fLinesChunkSize
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:49
ROOT::RDF::RCsvDS::SetNSlots
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
Definition: RCsvDS.cxx:453
ROOT::RDF::RCsvDS::fgDoubleRegex2
static const TRegexp fgDoubleRegex2
Definition: RCsvDS.hxx:42
ROOT::RDF::RCsvDS::InferType
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:197
ROOT::RDF::RCsvDS::fDoubleEvtValues
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:57
ROOT::RDF::RCsvDS::GenerateHeaders
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:149
ROOT::RDF::RCsvDS::GetColumnNames
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:349
ROOT::RDF::RCsvDS::fReadHeaders
bool fReadHeaders
Definition: RCsvDS.hxx:45
ROOT::RDF::RCsvDS::GetColumnReadersImpl
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:156
ROOT::RDF::RCsvDS::AsString
std::string AsString()
Definition: RCsvDS.cxx:90
ROOT::RDF::RCsvDS::fgTrueRegex
static const TRegexp fgTrueRegex
Definition: RCsvDS.hxx:42
ROOT::RDF::RCsvDS::fDataPos
std::uint64_t fDataPos
Definition: RCsvDS.hxx:44
ROOT::RDF::RCsvDS::FreeRecords
void FreeRecords()
Definition: RCsvDS.cxx:305
TRegexp
Regular expression class.
Definition: TRegexp.h:31
ROOT::RDF::RCsvDS::InferColTypes
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:188
ROOT::RDF::MakeCsvDataFrame
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:475
ROOT::RDF::RCsvDS::HasColumn
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:417
ULong64_t
unsigned long long ULong64_t
Definition: RtypesCore.h:74
ROOT::RDF::RDataSource::Record_t
std::vector< void * > Record_t
Definition: RDataSource.hxx:109
ROOT::RDF::RCsvDS::FillRecord
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:114
ROOT::RDF::RCsvDS::ParseValue
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:230
ROOT::RDF::RCsvDS::fNSlots
unsigned int fNSlots
Definition: RCsvDS.hxx:46
ROOT::RDF::RCsvDS::~RCsvDS
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:336
ROOT::RDF::RCsvDS::fRecords
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:56
ROOT::RDF::RCsvDS::fgFalseRegex
static const TRegexp fgFalseRegex
Definition: RCsvDS.hxx:42
ROOT::RDF::RCsvDS::fStringEvtValues
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:59
ROOT
VSD Structures.
Definition: StringConv.hxx:21
ROOT::RDF::RCsvDS::fHeaders
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:52