Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.hxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RCSVTDS
12#define ROOT_RCSVTDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16
17#include <cstdint>
18#include <deque>
19#include <list>
20#include <unordered_map>
21#include <set>
22#include <memory>
23#include <vector>
24
25#include <TRegexp.h>
26
27namespace ROOT {
28
29namespace Internal {
30class RRawFile;
31}
32
33namespace RDF {
34
35class RCsvDS final : public ROOT::RDF::RDataSource {
36public:
37 /// Options that control how the CSV file is parsed
38 struct ROptions {
39 /// The first line describes the columns. The names are used as RDF column names
40 /// unless fColumnNames is not empty, in which case it replaces the given names.
41 /// If both, fHeaders is false and fColumnNames is empty, generic column names Col1.n.Col$n$ are used.
42 bool fHeaders = true;
43 char fDelimiter = ','; ///< Column delimiter character
44 bool fLeftTrim = false; ///< Leading whitespaces are removed
45 bool fRightTrim = false; ///< Trailing whitespaces are removed
46 bool fSkipBlankLines = true; ///< Ignore empty lines (after trimming, if trimming is enabled)
47 std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
48 std::int64_t fSkipLastNLines = 0; ///< Ignore the last N lines of the file
49 std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
50 /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
51 /// If it is the first character of the line (after trimming), the line is ignored altogether.
52 /// Note that the comment character must not be part of the data, e.g. in strings.
53 char fComment = '\0';
54 /// Impose column names. This can be used if a header is missing or if the header has unparsable or
55 /// unwanted column names.
56 std::vector<std::string> fColumnNames;
57 /// Specify custom column types, accepts an unordered map with keys being column name, values being type alias
58 /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
59 std::unordered_map<std::string, char> fColumnTypes;
60 };
61
62private:
63 // Possible values are D, O, L, T. This is possible only because we treat double, bool, Long64_t and string
64 using ColType_t = char;
65 static const std::unordered_map<ColType_t, std::string> fgColTypeMap;
66
67 // Regular expressions for type inference
69
71 std::uint64_t fDataPos = 0;
72 std::int64_t fDataLineNumber = 0;
73 std::int64_t fLineNumber = 0; // used to skip the last lines
74 std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
75 unsigned int fNSlots = 0U;
76 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
78 ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
79 std::vector<std::string> fHeaders; // the column names
80 std::unordered_map<std::string, ColType_t> fColTypes;
81 std::set<std::string> fColContainingEmpty; // store columns which had empty entry
82 std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
83 std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot] (same ordering as fHeaders)
84 std::vector<Record_t> fRecords; // fRecords[entry][column] (same ordering as fHeaders)
85 std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
86 std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
87 std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
88 // This must be a deque to avoid the specialisation vector<bool>. This would not
89 // work given that the pointer to the boolean in that case cannot be taken
90 std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
91
92 void Construct();
93
94 bool Readln(std::string &line);
95 void RewindToData();
96 void FillHeaders(const std::string &);
97 void FillRecord(const std::string &, Record_t &);
98 void GenerateHeaders(size_t);
99 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
100 void ValidateColTypes(std::vector<std::string> &) const;
101 void InferColTypes(std::vector<std::string> &);
102 void InferType(const std::string &, unsigned int);
103 std::vector<std::string> ParseColumns(const std::string &);
104 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
105 ColType_t GetType(std::string_view colName) const;
106 void FreeRecords();
107
108protected:
109 std::string AsString() final;
110
111public:
112 RCsvDS(std::string_view fileName, const ROptions &options);
113 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
114 std::unordered_map<std::string, char> &&colTypes = {});
115 // Rule of five
116 RCsvDS(const RCsvDS &) = delete;
117 RCsvDS &operator=(const RCsvDS &) = delete;
118 RCsvDS(RCsvDS &&) = delete;
119 RCsvDS &operator=(RCsvDS &&) = delete;
120 ~RCsvDS() final;
121
122 void Finalize() final;
123 std::size_t GetNFiles() const final { return 1; }
124 const std::vector<std::string> &GetColumnNames() const final;
125 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
126 std::string GetTypeName(std::string_view colName) const final;
127 bool HasColumn(std::string_view colName) const final;
128 bool SetEntry(unsigned int slot, ULong64_t entry) final;
129 void SetNSlots(unsigned int nSlots) final;
130 std::string GetLabel() final;
131};
132
133////////////////////////////////////////////////////////////////////////////////////////////////
134/// \brief Factory method to create a CSV RDataFrame.
135/// \param[in] fileName Path of the CSV file.
136/// \param[in] options File parsing settings.
137RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
138
139////////////////////////////////////////////////////////////////////////////////////////////////
140/// \brief Factory method to create a CSV RDataFrame.
141/// \param[in] fileName Path of the CSV file.
142/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
143/// (default `true`).
144/// \param[in] delimiter Delimiter character (default ',').
145/// \param[in] linesChunkSize Number of lines to read, use -1 to read all (default -1).
146/// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
147/// column name, values being type alias ('O' for boolean, 'D' for double, 'L' for
148/// Long64_t, 'T' for std::string)
149RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
150 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
151
152} // ns RDF
153
154} // ns ROOT
155
156#endif
long long Long64_t
Definition RtypesCore.h:69
unsigned long long ULong64_t
Definition RtypesCore.h:70
RDataFrame data source class for reading CSV files.
Definition RCsvDS.hxx:35
std::int64_t fDataLineNumber
Definition RCsvDS.hxx:72
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition RCsvDS.cxx:597
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:179
void Finalize() final
Convenience method called after concluding an event-loop.
Definition RCsvDS.cxx:515
std::size_t GetNFiles() const final
Returns the number of files from which the dataset is constructed.
Definition RCsvDS.hxx:123
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:586
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:85
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:319
std::uint64_t fDataPos
Definition RCsvDS.hxx:71
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition RCsvDS.cxx:638
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:65
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:351
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:68
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:222
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:83
unsigned int fNSlots
Definition RCsvDS.hxx:75
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:523
std::string AsString() final
Definition RCsvDS.cxx:98
bool Readln(std::string &line)
Definition RCsvDS.cxx:114
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:79
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:77
std::int64_t fMaxLineNumber
Definition RCsvDS.hxx:74
RCsvDS & operator=(RCsvDS &&)=delete
ULong64_t fProcessedLines
Definition RCsvDS.hxx:78
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:602
std::int64_t fLineNumber
Definition RCsvDS.hxx:73
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:286
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:80
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:86
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:68
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:84
RCsvDS(RCsvDS &&)=delete
ROptions fOptions
Definition RCsvDS.hxx:70
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:81
~RCsvDS() final
Destructor.
Definition RCsvDS.cxx:510
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:68
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:68
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:267
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:68
RCsvDS(const RCsvDS &)=delete
void RewindToData()
Definition RCsvDS.cxx:159
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:340
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:165
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:76
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition RCsvDS.cxx:528
std::string GetLabel() final
Return a string representation of the datasource type.
Definition RCsvDS.cxx:655
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
Definition RCsvDS.cxx:235
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:68
RCsvDS & operator=(const RCsvDS &)=delete
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:87
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:90
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RCsvDS.cxx:607
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:82
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Definition TRegexp.h:31
TLine * line
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:660
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:38
bool fHeaders
The first line describes the columns.
Definition RCsvDS.hxx:42
bool fRightTrim
Trailing whitespaces are removed.
Definition RCsvDS.hxx:45
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
Definition RCsvDS.hxx:47
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:56
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
Definition RCsvDS.hxx:48
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:59
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
Definition RCsvDS.hxx:46
char fDelimiter
Column delimiter character.
Definition RCsvDS.hxx:43
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
Definition RCsvDS.hxx:53
bool fLeftTrim
Leading whitespaces are removed.
Definition RCsvDS.hxx:44
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.
Definition RCsvDS.hxx:49