Logo ROOT   6.16/01
Reference Guide
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18An RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::MakeCsvDataFrame, which accepts four parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
4. Number of lines to load into memory at a time (optional, default `-1`, which loads the whole file at once).
24
25The types of the columns in the CSV file are automatically inferred. The supported
26types are:
27- Integer: stored as a 64-bit long long int.
28- Floating point number: stored with double precision.
29- Boolean: matches the literals `true` and `false`.
30- String: stored as an std::string, matches anything that does not fall into any of the
31previous types.
32
33These are some formatting rules expected by the RCsvDS implementation:
34- All records must have the same number of fields, in the same order.
35- Any field may be quoted.
36~~~
37 "1997","Ford","E350"
38~~~
39- Fields with embedded delimiters (e.g. comma) must be quoted.
40~~~
41 1997,Ford,E350,"Super, luxurious truck"
42~~~
43- Fields with double-quote characters must be quoted, and each of the embedded
44double-quote characters must be represented by a pair of double-quote characters.
45~~~
46 1997,Ford,E350,"Super, ""luxurious"" truck"
47~~~
48- Fields with embedded line breaks are not supported, even when quoted.
49~~~
50 1997,Ford,E350,"Go get one now
51 they are going fast"
52~~~
53- Spaces are considered part of a field and are not ignored.
54~~~
55 1997, Ford , E350
56 not same as
57 1997,Ford,E350
58 but same as
59 1997, "Ford" , E350
60~~~
61- If a header row is provided, it must contain column names for each of the fields.
62~~~
63 Year,Make,Model
64 1997,Ford,E350
65 2000,Mercury,Cougar
66~~~
67
68By default, the current implementation of RCsvDS reads the entire CSV file content into memory before
69RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame with the default chunk size, it is
70important to check both how much memory is available and the size of the CSV file.
71*/
72// clang-format on
73
74#include <ROOT/RDF/Utils.hxx>
75#include <ROOT/TSeq.hxx>
76#include <ROOT/RCsvDS.hxx>
77#include <ROOT/RMakeUnique.hxx>
78#include <TError.h>
79
80#include <algorithm>
81#include <iostream>
82#include <sstream>
83#include <string>
84
85namespace ROOT {
86
87namespace RDF {
88
89std::string RCsvDS::AsString()
90{
91 return "CSV data source";
92}
93
94// Regular expressions for type inference
95TRegexp RCsvDS::intRegex("^[-+]?[0-9]+$");
96TRegexp RCsvDS::doubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
97TRegexp RCsvDS::doubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
100
101const std::map<RCsvDS::ColType_t, std::string>
102 RCsvDS::fgColTypeMap({{'b', "bool"}, {'d', "double"}, {'l', "Long64_t"}, {'s', "std::string"}});
103
104void RCsvDS::FillHeaders(const std::string &line)
105{
106 auto columns = ParseColumns(line);
107 for (auto &col : columns) {
108 fHeaders.emplace_back(col);
109 }
110}
111
112void RCsvDS::FillRecord(const std::string &line, Record_t &record)
113{
114 std::istringstream lineStream(line);
115 auto i = 0U;
116
117 auto columns = ParseColumns(line);
118
119 for (auto &col : columns) {
120 auto colType = fColTypes[fHeaders[i]];
121
122 switch (colType) {
123 case 'd': {
124 record.emplace_back(new double(std::stod(col)));
125 break;
126 }
127 case 'l': {
128 record.emplace_back(new Long64_t(std::stoll(col)));
129 break;
130 }
131 case 'b': {
132 auto b = new bool();
133 record.emplace_back(b);
134 std::istringstream is(col);
135 is >> std::boolalpha >> *b;
136 break;
137 }
138 case 's': {
139 record.emplace_back(new std::string(col));
140 break;
141 }
142 }
143 ++i;
144 }
145}
146
147void RCsvDS::GenerateHeaders(size_t size)
148{
149 for (size_t i = 0; i < size; ++i) {
150 fHeaders.push_back("Col" + std::to_string(i));
151 }
152}
153
154std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
155{
156 const auto colType = GetType(colName);
157
158 if ((colType == 'd' && typeid(double) != ti) || (colType == 'l' && typeid(Long64_t) != ti) ||
159 (colType == 's' && typeid(std::string) != ti) || (colType == 'b' && typeid(bool) != ti)) {
160 std::string err = "The type selected for column \"";
161 err += colName;
162 err += "\" does not correspond to column type, which is ";
163 err += fgColTypeMap.at(colType);
164 throw std::runtime_error(err);
165 }
166
167 const auto &colNames = GetColumnNames();
168 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
169 std::vector<void *> ret(fNSlots);
170 for (auto slot : ROOT::TSeqU(fNSlots)) {
171 auto &val = fColAddresses[index][slot];
172 if (ti == typeid(double)) {
173 val = &fDoubleEvtValues[index][slot];
174 } else if (ti == typeid(Long64_t)) {
175 val = &fLong64EvtValues[index][slot];
176 } else if (ti == typeid(std::string)) {
177 val = &fStringEvtValues[index][slot];
178 } else {
179 val = &fBoolEvtValues[index][slot];
180 }
181 ret[slot] = &val;
182 }
183 return ret;
184}
185
186void RCsvDS::InferColTypes(std::vector<std::string> &columns)
187{
188 auto i = 0U;
189 for (auto &col : columns) {
190 InferType(col, i);
191 ++i;
192 }
193}
194
195void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
196{
198 int dummy;
199
200 if (intRegex.Index(col, &dummy) != -1) {
201 type = 'l'; // Long64_t
202 } else if (doubleRegex1.Index(col, &dummy) != -1 || doubleRegex2.Index(col, &dummy) != -1) {
203 type = 'd'; // double
204 } else if (trueRegex.Index(col, &dummy) != -1 || falseRegex.Index(col, &dummy) != -1) {
205 type = 'b'; // bool
206 } else { // everything else is a string
207 type = 's'; // std::string
208 }
209 // TODO: Date
210
211 fColTypes[fHeaders[idxCol]] = type;
212 fColTypesList.push_back(type);
213}
214
215std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
216{
217 std::vector<std::string> columns;
218
219 for (size_t i = 0; i < line.size(); ++i) {
220 i = ParseValue(line, columns, i);
221 }
222
223 return columns;
224}
225
226size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
227{
228 std::stringstream val;
229 bool quoted = false;
230
231 for (; i < line.size(); ++i) {
232 if (line[i] == fDelimiter && !quoted) {
233 break;
234 } else if (line[i] == '"') {
235 // Keep just one quote for escaped quotes, none for the normal quotes
236 if (line[i + 1] != '"') {
237 quoted = !quoted;
238 } else {
239 val << line[++i];
240 }
241 } else {
242 val << line[i];
243 }
244 }
245
246 columns.emplace_back(val.str());
247
248 return i;
249}
250
251////////////////////////////////////////////////////////////////////////
252/// Constructor to create a CSV RDataSource for RDataFrame.
253/// \param[in] fileName Path of the CSV file.
254/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
255/// (default `true`).
256/// \param[in] delimiter Delimiter character (default ',').
257RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize) // TODO: Let users specify types?
258 : fReadHeaders(readHeaders),
259 fStream(std::string(fileName)),
260 fDelimiter(delimiter),
261 fLinesChunkSize(linesChunkSize)
262{
263 std::string line;
264
265 // Read the headers if present
266 if (fReadHeaders) {
267 if (std::getline(fStream, line)) {
269 } else {
270 std::string msg = "Error reading headers of CSV file ";
271 msg += fileName;
272 throw std::runtime_error(msg);
273 }
274 }
275
276 fDataPos = fStream.tellg();
277 if (std::getline(fStream, line)) {
278 auto columns = ParseColumns(line);
279
280 // Generate headers if not present
281 if (!fReadHeaders) {
282 GenerateHeaders(columns.size());
283 }
284
285 // Infer types of columns with first record
286 InferColTypes(columns);
287
288 // rewind one line
289 fStream.seekg(fDataPos);
290 }
291}
292
294{
295 for (auto &record : fRecords) {
296 for (size_t i = 0; i < record.size(); ++i) {
297 void *p = record[i];
298 const auto colType = fColTypes[fHeaders[i]];
299 switch (colType) {
300 case 'd': {
301 delete static_cast<double *>(p);
302 break;
303 }
304 case 'l': {
305 delete static_cast<Long64_t *>(p);
306 break;
307 }
308 case 'b': {
309 delete static_cast<bool *>(p);
310 break;
311 }
312 case 's': {
313 delete static_cast<std::string *>(p);
314 break;
315 }
316 }
317 }
318 }
319 fRecords.clear();
320}
321
322////////////////////////////////////////////////////////////////////////
323/// Destructor.
325{
326 FreeRecords();
327}
328
330{
331 fStream.clear();
332 fStream.seekg(fDataPos);
333 fProcessedLines = 0ULL;
335 FreeRecords();
336}
337
338const std::vector<std::string> &RCsvDS::GetColumnNames() const
339{
340 return fHeaders;
341}
342
343std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
344{
345
346 // Read records and store them in memory
347 auto linesToRead = fLinesChunkSize;
348 FreeRecords();
349
350 std::string line;
351 while ((-1LL == fLinesChunkSize || 0 != linesToRead--) && std::getline(fStream, line)) {
352 fRecords.emplace_back();
353 FillRecord(line, fRecords.back());
354 }
355
356 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
357 const auto nRecords = fRecords.size();
358 if (0 == nRecords)
359 return entryRanges;
360
361 const auto chunkSize = nRecords / fNSlots;
362 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
363 auto start = 0ULL == fEntryRangesRequested ? 0ULL : fProcessedLines;
364 auto end = start;
365
366 for (auto i : ROOT::TSeqU(fNSlots)) {
367 start = end;
368 end += chunkSize;
369 entryRanges.emplace_back(start, end);
370 (void)i;
371 }
372 entryRanges.back().second += remainder;
373
374 fProcessedLines += nRecords;
376
377 return entryRanges;
378}
379
381{
382 if (!HasColumn(colName)) {
383 std::string msg = "The dataset does not have column ";
384 msg += colName;
385 throw std::runtime_error(msg);
386 }
387
388 return fColTypes.at(colName.data());
389}
390
391std::string RCsvDS::GetTypeName(std::string_view colName) const
392{
393 return fgColTypeMap.at(GetType(colName));
394}
395
397{
398 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
399}
400
401bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
402{
403 // Here we need to normalise the entry to the number of lines we already processed.
404 const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
405 const auto recordPos = entry - offset;
406 int colIndex = 0;
407 for (auto &colType : fColTypesList) {
408 auto dataPtr = fRecords[recordPos][colIndex];
409 switch (colType) {
410 case 'd': {
411 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
412 break;
413 }
414 case 'l': {
415 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
416 break;
417 }
418 case 'b': {
419 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
420 break;
421 }
422 case 's': {
423 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
424 break;
425 }
426 }
427 colIndex++;
428 }
429 return true;
430}
431
432void RCsvDS::SetNSlots(unsigned int nSlots)
433{
434 R__ASSERT(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
435
436 fNSlots = nSlots;
437
438 const auto nColumns = fHeaders.size();
439 // Initialise the entire set of addresses
440 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
441
442 // Initialize the per event data holders
443 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
444 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
445 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
446 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
447}
448
449std::string RCsvDS::GetLabel()
450{
451 return "RCsv";
452}
453
454RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize)
455{
456 ROOT::RDataFrame tdf(std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize));
457 return tdf;
458}
459
460} // ns RDF
461
462} // ns ROOT
#define b(i)
Definition: RSha256.hxx:100
static RooMathCoreReg dummy
long long Long64_t
Definition: RtypesCore.h:69
unsigned long long ULong64_t
Definition: RtypesCore.h:70
#define R__ASSERT(e)
Definition: TError.h:96
int type
Definition: TGX11.cxx:120
typedef void((*Func_t)())
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:257
std::map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:44
static TRegexp falseRegex
Definition: RCsvDS.hxx:55
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:112
std::ifstream fStream
Definition: RCsvDS.hxx:38
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:380
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:48
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:195
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:226
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:147
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:46
unsigned int fNSlots
Definition: RCsvDS.hxx:37
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:449
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:40
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:43
static TRegexp doubleRegex2
Definition: RCsvDS.hxx:55
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:41
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:338
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:42
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:154
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:186
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:49
static TRegexp trueRegex
Definition: RCsvDS.hxx:55
static const std::map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:33
const char fDelimiter
Definition: RCsvDS.hxx:39
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:47
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:343
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:401
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:396
std::streampos fDataPos
Definition: RCsvDS.hxx:35
static TRegexp doubleRegex1
Definition: RCsvDS.hxx:55
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition: RCsvDS.cxx:432
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition: RCsvDS.cxx:391
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:215
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:104
std::string AsString()
Definition: RCsvDS.cxx:89
void Finalise()
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:329
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:50
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:53
static TRegexp intRegex
Definition: RCsvDS.hxx:55
void FreeRecords()
Definition: RCsvDS.cxx:293
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:324
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:45
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Definition: RDataFrame.hxx:41
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TLine * line
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:454
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
STL namespace.
double stod(std::string_view str, size_t *pos)
Definition: RStringView.hxx:48
basic_string_view< char > string_view
Definition: RStringView.hxx:35