Logo ROOT  
Reference Guide
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
An RDataFrame that reads from a CSV file can be constructed using the factory method
ROOT::RDF::MakeCsvDataFrame, which accepts four parameters:
1. Path to the CSV file.
2. Boolean that specifies whether the first row of the CSV file contains headers or
not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
3. Delimiter (optional, default ',').
4. Chunk size (optional, default `-1`): maximum number of non-empty lines read into memory at a time; `-1` reads the whole file in one go.
24
25The types of the columns in the CSV file are automatically inferred. The supported
26types are:
27- Integer: stored as a 64-bit long long int.
28- Floating point number: stored with double precision.
29- Boolean: matches the literals `true` and `false`.
30- String: stored as an std::string, matches anything that does not fall into any of the
31previous types.
32
33These are some formatting rules expected by the RCsvDS implementation:
34- All records must have the same number of fields, in the same order.
35- Any field may be quoted.
36~~~
37 "1997","Ford","E350"
38~~~
39- Fields with embedded delimiters (e.g. comma) must be quoted.
40~~~
41 1997,Ford,E350,"Super, luxurious truck"
42~~~
43- Fields with double-quote characters must be quoted, and each of the embedded
44double-quote characters must be represented by a pair of double-quote characters.
45~~~
46 1997,Ford,E350,"Super, ""luxurious"" truck"
47~~~
48- Fields with embedded line breaks are not supported, even when quoted.
49~~~
50 1997,Ford,E350,"Go get one now
51 they are going fast"
52~~~
53- Spaces are considered part of a field and are not ignored.
54~~~
55 1997, Ford , E350
56 not same as
57 1997,Ford,E350
58 but same as
59 1997, "Ford" , E350
60~~~
61- If a header row is provided, it must contain column names for each of the fields.
62~~~
63 Year,Make,Model
64 1997,Ford,E350
65 2000,Mercury,Cougar
66~~~
67
By default, RCsvDS reads the entire CSV file content into memory before
RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame, it is
important to check both how much memory is available and the size of the CSV file.
To bound memory usage, pass a positive chunk size so that the file is read in chunks of at most that many lines.
71*/
72// clang-format on
73
74#include <ROOT/RDF/Utils.hxx>
75#include <ROOT/TSeq.hxx>
76#include <ROOT/RCsvDS.hxx>
77#include <ROOT/RRawFile.hxx>
78#include <TError.h>
79
80#include <algorithm>
81#include <iostream>
82#include <memory>
83#include <sstream>
84#include <string>
85
86namespace ROOT {
87
88namespace RDF {
89
90std::string RCsvDS::AsString()
91{
92 return "CSV data source";
93}
94
95// Regular expressions for type inference
96const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
97const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
98const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
99const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
100const TRegexp RCsvDS::fgTrueRegex("^true$");
101const TRegexp RCsvDS::fgFalseRegex("^false$");
102
103const std::map<RCsvDS::ColType_t, std::string>
104 RCsvDS::fgColTypeMap({{'b', "bool"}, {'d', "double"}, {'l', "Long64_t"}, {'s', "std::string"}});
105
106void RCsvDS::FillHeaders(const std::string &line)
107{
108 auto columns = ParseColumns(line);
109 for (auto &col : columns) {
110 fHeaders.emplace_back(col);
111 }
112}
113
114void RCsvDS::FillRecord(const std::string &line, Record_t &record)
115{
116 std::istringstream lineStream(line);
117 auto i = 0U;
118
119 auto columns = ParseColumns(line);
120
121 for (auto &col : columns) {
122 auto colType = fColTypes[fHeaders[i]];
123
124 switch (colType) {
125 case 'd': {
126 record.emplace_back(new double(std::stod(col)));
127 break;
128 }
129 case 'l': {
130 record.emplace_back(new Long64_t(std::stoll(col)));
131 break;
132 }
133 case 'b': {
134 auto b = new bool();
135 record.emplace_back(b);
136 std::istringstream is(col);
137 is >> std::boolalpha >> *b;
138 break;
139 }
140 case 's': {
141 record.emplace_back(new std::string(col));
142 break;
143 }
144 }
145 ++i;
146 }
147}
148
150{
151 for (size_t i = 0; i < size; ++i) {
152 fHeaders.push_back("Col" + std::to_string(i));
153 }
154}
155
156std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
157{
158 const auto colType = GetType(colName);
159
160 if ((colType == 'd' && typeid(double) != ti) || (colType == 'l' && typeid(Long64_t) != ti) ||
161 (colType == 's' && typeid(std::string) != ti) || (colType == 'b' && typeid(bool) != ti)) {
162 std::string err = "The type selected for column \"";
163 err += colName;
164 err += "\" does not correspond to column type, which is ";
165 err += fgColTypeMap.at(colType);
166 throw std::runtime_error(err);
167 }
168
169 const auto &colNames = GetColumnNames();
170 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
171 std::vector<void *> ret(fNSlots);
172 for (auto slot : ROOT::TSeqU(fNSlots)) {
173 auto &val = fColAddresses[index][slot];
174 if (ti == typeid(double)) {
175 val = &fDoubleEvtValues[index][slot];
176 } else if (ti == typeid(Long64_t)) {
177 val = &fLong64EvtValues[index][slot];
178 } else if (ti == typeid(std::string)) {
179 val = &fStringEvtValues[index][slot];
180 } else {
181 val = &fBoolEvtValues[index][slot];
182 }
183 ret[slot] = &val;
184 }
185 return ret;
186}
187
188void RCsvDS::InferColTypes(std::vector<std::string> &columns)
189{
190 auto i = 0U;
191 for (auto &col : columns) {
192 InferType(col, i);
193 ++i;
194 }
195}
196
197void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
198{
200 int dummy;
201
202 if (fgIntRegex.Index(col, &dummy) != -1) {
203 type = 'l'; // Long64_t
204 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 ||
205 fgDoubleRegex2.Index(col, &dummy) != -1 ||
206 fgDoubleRegex3.Index(col, &dummy) != -1) {
207 type = 'd'; // double
208 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
209 type = 'b'; // bool
210 } else { // everything else is a string
211 type = 's'; // std::string
212 }
213 // TODO: Date
214
215 fColTypes[fHeaders[idxCol]] = type;
216 fColTypesList.push_back(type);
217}
218
219std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
220{
221 std::vector<std::string> columns;
222
223 for (size_t i = 0; i < line.size(); ++i) {
224 i = ParseValue(line, columns, i);
225 }
226
227 return columns;
228}
229
230size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
231{
232 std::stringstream val;
233 bool quoted = false;
234
235 for (; i < line.size(); ++i) {
236 if (line[i] == fDelimiter && !quoted) {
237 break;
238 } else if (line[i] == '"') {
239 // Keep just one quote for escaped quotes, none for the normal quotes
240 if (line[i + 1] != '"') {
241 quoted = !quoted;
242 } else {
243 val << line[++i];
244 }
245 } else {
246 val << line[i];
247 }
248 }
249
250 columns.emplace_back(val.str());
251
252 return i;
253}
254
255////////////////////////////////////////////////////////////////////////
256/// Constructor to create a CSV RDataSource for RDataFrame.
257/// \param[in] fileName Path or URL of the CSV file.
258/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
259/// (default `true`).
260/// \param[in] delimiter Delimiter character (default ',').
261RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize) // TODO: Let users specify types?
262 : fReadHeaders(readHeaders),
263 fCsvFile(ROOT::Internal::RRawFile::Create(fileName)),
264 fDelimiter(delimiter),
265 fLinesChunkSize(linesChunkSize)
266{
267 std::string line;
268
269 // Read the headers if present
270 if (fReadHeaders) {
271 if (fCsvFile->Readln(line)) {
273 } else {
274 std::string msg = "Error reading headers of CSV file ";
275 msg += fileName;
276 throw std::runtime_error(msg);
277 }
278 }
279
280 fDataPos = fCsvFile->GetFilePos();
281 bool eof = false;
282 do {
283 eof = !fCsvFile->Readln(line);
284 } while (line.empty() && !eof);
285 if (!eof) {
286 auto columns = ParseColumns(line);
287
288 // Generate headers if not present
289 if (!fReadHeaders) {
290 GenerateHeaders(columns.size());
291 }
292
293 // Infer types of columns with first record
294 InferColTypes(columns);
295
296 // rewind
297 fCsvFile->Seek(fDataPos);
298 } else {
299 std::string msg = "Could not infer column types of CSV file ";
300 msg += fileName;
301 throw std::runtime_error(msg);
302 }
303}
304
306{
307 for (auto &record : fRecords) {
308 for (size_t i = 0; i < record.size(); ++i) {
309 void *p = record[i];
310 const auto colType = fColTypes[fHeaders[i]];
311 switch (colType) {
312 case 'd': {
313 delete static_cast<double *>(p);
314 break;
315 }
316 case 'l': {
317 delete static_cast<Long64_t *>(p);
318 break;
319 }
320 case 'b': {
321 delete static_cast<bool *>(p);
322 break;
323 }
324 case 's': {
325 delete static_cast<std::string *>(p);
326 break;
327 }
328 }
329 }
330 }
331 fRecords.clear();
332}
333
334////////////////////////////////////////////////////////////////////////
335/// Destructor.
337{
338 FreeRecords();
339}
340
342{
343 fCsvFile->Seek(fDataPos);
344 fProcessedLines = 0ULL;
346 FreeRecords();
347}
348
349const std::vector<std::string> &RCsvDS::GetColumnNames() const
350{
351 return fHeaders;
352}
353
354std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
355{
356
357 // Read records and store them in memory
358 auto linesToRead = fLinesChunkSize;
359 FreeRecords();
360
361 std::string line;
362 while ((-1LL == fLinesChunkSize || 0 != linesToRead) && fCsvFile->Readln(line)) {
363 if (line.empty()) continue; // skip empty lines
364 fRecords.emplace_back();
365 FillRecord(line, fRecords.back());
366 --linesToRead;
367 }
368
369 if (gDebug > 0) {
370 if (fLinesChunkSize == -1LL) {
371 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
372 } else {
373 Info("GetEntryRanges", "Attempted to read chunk of %lld lines of CSV file into memory, %zu lines read", fLinesChunkSize, fRecords.size());
374 }
375 }
376
377 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
378 const auto nRecords = fRecords.size();
379 if (0 == nRecords)
380 return entryRanges;
381
382 const auto chunkSize = nRecords / fNSlots;
383 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
384 auto start = 0ULL == fEntryRangesRequested ? 0ULL : fProcessedLines;
385 auto end = start;
386
387 for (auto i : ROOT::TSeqU(fNSlots)) {
388 start = end;
389 end += chunkSize;
390 entryRanges.emplace_back(start, end);
391 (void)i;
392 }
393 entryRanges.back().second += remainder;
394
395 fProcessedLines += nRecords;
397
398 return entryRanges;
399}
400
402{
403 if (!HasColumn(colName)) {
404 std::string msg = "The dataset does not have column ";
405 msg += colName;
406 throw std::runtime_error(msg);
407 }
408
409 return fColTypes.at(colName.data());
410}
411
412std::string RCsvDS::GetTypeName(std::string_view colName) const
413{
414 return fgColTypeMap.at(GetType(colName));
415}
416
418{
419 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
420}
421
422bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
423{
424 // Here we need to normalise the entry to the number of lines we already processed.
425 const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
426 const auto recordPos = entry - offset;
427 int colIndex = 0;
428 for (auto &colType : fColTypesList) {
429 auto dataPtr = fRecords[recordPos][colIndex];
430 switch (colType) {
431 case 'd': {
432 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
433 break;
434 }
435 case 'l': {
436 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
437 break;
438 }
439 case 'b': {
440 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
441 break;
442 }
443 case 's': {
444 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
445 break;
446 }
447 }
448 colIndex++;
449 }
450 return true;
451}
452
453void RCsvDS::SetNSlots(unsigned int nSlots)
454{
455 R__ASSERT(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
456
457 fNSlots = nSlots;
458
459 const auto nColumns = fHeaders.size();
460 // Initialise the entire set of addresses
461 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
462
463 // Initialize the per event data holders
464 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
465 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
466 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
467 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
468}
469
470std::string RCsvDS::GetLabel()
471{
472 return "RCsv";
473}
474
475RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize)
476{
477 ROOT::RDataFrame tdf(std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize));
478 return tdf;
479}
480
481} // ns RDF
482
483} // ns ROOT
typedef void(GLAPIENTRYP _GLUfuncptr)(void)
#define b(i)
Definition: RSha256.hxx:100
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition: RtypesCore.h:80
unsigned long long ULong64_t
Definition: RtypesCore.h:81
#define R__ASSERT(e)
Definition: TError.h:118
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition: TError.cxx:220
int type
Definition: TGX11.cxx:121
Int_t gDebug
Definition: TROOT.cxx:592
The RRawFile provides read-only access to local and remote files.
Definition: RRawFile.hxx:43
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:261
std::map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:53
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:114
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:401
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:57
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:197
std::uint64_t fDataPos
Definition: RCsvDS.hxx:44
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:230
static const TRegexp fgTrueRegex
Definition: RCsvDS.hxx:42
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:149
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:55
unsigned int fNSlots
Definition: RCsvDS.hxx:46
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:470
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:49
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:52
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:50
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:349
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:51
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:156
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:188
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:58
static const std::map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:39
const char fDelimiter
Definition: RCsvDS.hxx:48
static const TRegexp fgDoubleRegex2
Definition: RCsvDS.hxx:42
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:56
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:354
static const TRegexp fgFalseRegex
Definition: RCsvDS.hxx:42
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:422
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:417
static const TRegexp fgDoubleRegex3
Definition: RCsvDS.hxx:42
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition: RCsvDS.cxx:453
static const TRegexp fgIntRegex
Definition: RCsvDS.hxx:42
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition: RCsvDS.cxx:412
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:219
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:106
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition: RCsvDS.hxx:47
static const TRegexp fgDoubleRegex1
Definition: RCsvDS.hxx:42
std::string AsString()
Definition: RCsvDS.cxx:90
void Finalise()
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:341
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:59
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:62
void FreeRecords()
Definition: RCsvDS.cxx:305
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:336
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:54
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTree,...
Definition: RDataFrame.hxx:40
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TLine * line
basic_string_view< char > string_view
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:475
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...