Logo ROOT   6.14/05
Reference Guide
RCsvDS.cxx
Go to the documentation of this file.
1 // Author: Enric Tejedor CERN 10/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 // clang-format off
12 /** \class ROOT::RDF::RCsvDS
13  \ingroup dataframe
14  \brief RDataFrame data source class for reading CSV files.
15 
16 The RCsvDS class implements a CSV file reader for RDataFrame.
17 
An RDataFrame that reads from a CSV file can be constructed using the factory method
ROOT::RDF::MakeCsvDataFrame, which accepts four parameters:
1. Path to the CSV file.
2. Boolean that specifies whether the first row of the CSV file contains headers or
not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
3. Delimiter (optional, default ',').
4. Number of lines to read from the file at a time (optional, default `-1`, meaning all lines are read at once).
24 
25 The types of the columns in the CSV file are automatically inferred. The supported
26 types are:
27 - Integer: stored as a 64-bit long long int.
28 - Floating point number: stored with double precision.
29 - Boolean: matches the literals `true` and `false`.
30 - String: stored as an std::string, matches anything that does not fall into any of the
31 previous types.
32 
33 These are some formatting rules expected by the RCsvDS implementation:
34 - All records must have the same number of fields, in the same order.
35 - Any field may be quoted.
36 ~~~
37  "1997","Ford","E350"
38 ~~~
39 - Fields with embedded delimiters (e.g. comma) must be quoted.
40 ~~~
41  1997,Ford,E350,"Super, luxurious truck"
42 ~~~
43 - Fields with double-quote characters must be quoted, and each of the embedded
44 double-quote characters must be represented by a pair of double-quote characters.
45 ~~~
46  1997,Ford,E350,"Super, ""luxurious"" truck"
47 ~~~
48 - Fields with embedded line breaks are not supported, even when quoted.
49 ~~~
50  1997,Ford,E350,"Go get one now
51  they are going fast"
52 ~~~
53 - Spaces are considered part of a field and are not ignored.
54 ~~~
55  1997, Ford , E350
56  not same as
57  1997,Ford,E350
58  but same as
59  1997, "Ford" , E350
60 ~~~
61 - If a header row is provided, it must contain column names for each of the fields.
62 ~~~
63  Year,Make,Model
64  1997,Ford,E350
65  2000,Mercury,Cougar
66 ~~~
67 
With the default chunk size (`-1`), RCsvDS reads the entire CSV file content into memory before
RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame with the default
settings, it is important to check both how much memory is available and the size of the CSV file.
When a positive chunk size is given, only that many lines are held in memory at a time.
71 */
72 // clang-format on
73 
74 #include <ROOT/RDFUtils.hxx>
75 #include <ROOT/TSeq.hxx>
76 #include <ROOT/RCsvDS.hxx>
77 #include <ROOT/RMakeUnique.hxx>
78 
79 #include <algorithm>
80 #include <iostream>
81 #include <sstream>
82 #include <string>
83 
84 namespace ROOT {
85 
86 namespace RDF {
87 
88 // Regular expressions for type inference
89 TRegexp RCsvDS::intRegex("^[-+]?[0-9]+$");
90 TRegexp RCsvDS::doubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
91 TRegexp RCsvDS::doubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
92 TRegexp RCsvDS::trueRegex("^true$");
93 TRegexp RCsvDS::falseRegex("^false$");
94 
95 const std::map<RCsvDS::ColType_t, std::string>
96  RCsvDS::fgColTypeMap({{'b', "bool"}, {'d', "double"}, {'l', "Long64_t"}, {'s', "std::string"}});
97 
98 void RCsvDS::FillHeaders(const std::string &line)
99 {
100  auto columns = ParseColumns(line);
101  for (auto &col : columns) {
102  fHeaders.emplace_back(col);
103  }
104 }
105 
106 void RCsvDS::FillRecord(const std::string &line, Record_t &record)
107 {
108  std::istringstream lineStream(line);
109  auto i = 0U;
110 
111  auto columns = ParseColumns(line);
112 
113  for (auto &col : columns) {
114  auto colType = fColTypes[fHeaders[i]];
115 
116  switch (colType) {
117  case 'd': {
118  record.emplace_back(new double(std::stod(col)));
119  break;
120  }
121  case 'l': {
122  record.emplace_back(new Long64_t(std::stoll(col)));
123  break;
124  }
125  case 'b': {
126  auto b = new bool();
127  record.emplace_back(b);
128  std::istringstream is(col);
129  is >> std::boolalpha >> *b;
130  break;
131  }
132  case 's': {
133  record.emplace_back(new std::string(col));
134  break;
135  }
136  }
137  ++i;
138  }
139 }
140 
141 void RCsvDS::GenerateHeaders(size_t size)
142 {
143  for (size_t i = 0; i < size; ++i) {
144  fHeaders.push_back("Col" + std::to_string(i));
145  }
146 }
147 
148 std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
149 {
150  const auto colType = GetType(colName);
151 
152  if ((colType == 'd' && typeid(double) != ti) || (colType == 'l' && typeid(Long64_t) != ti) ||
153  (colType == 's' && typeid(std::string) != ti) || (colType == 'b' && typeid(bool) != ti)) {
154  std::string err = "The type selected for column \"";
155  err += colName;
156  err += "\" does not correspond to column type, which is ";
157  err += fgColTypeMap.at(colType);
158  throw std::runtime_error(err);
159  }
160 
161  const auto &colNames = GetColumnNames();
162  const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
163  std::vector<void *> ret(fNSlots);
164  for (auto slot : ROOT::TSeqU(fNSlots)) {
165  auto &val = fColAddresses[index][slot];
166  if (ti == typeid(double)) {
167  val = &fDoubleEvtValues[index][slot];
168  } else if (ti == typeid(Long64_t)) {
169  val = &fLong64EvtValues[index][slot];
170  } else if (ti == typeid(std::string)) {
171  val = &fStringEvtValues[index][slot];
172  } else {
173  val = &fBoolEvtValues[index][slot];
174  }
175  ret[slot] = &val;
176  }
177  return ret;
178 }
179 
180 void RCsvDS::InferColTypes(std::vector<std::string> &columns)
181 {
182  auto i = 0U;
183  for (auto &col : columns) {
184  InferType(col, i);
185  ++i;
186  }
187 }
188 
189 void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
190 {
191  ColType_t type;
192  int dummy;
193 
194  if (intRegex.Index(col, &dummy) != -1) {
195  type = 'l'; // Long64_t
196  } else if (doubleRegex1.Index(col, &dummy) != -1 || doubleRegex2.Index(col, &dummy) != -1) {
197  type = 'd'; // double
198  } else if (trueRegex.Index(col, &dummy) != -1 || falseRegex.Index(col, &dummy) != -1) {
199  type = 'b'; // bool
200  } else { // everything else is a string
201  type = 's'; // std::string
202  }
203  // TODO: Date
204 
205  fColTypes[fHeaders[idxCol]] = type;
206  fColTypesList.push_back(type);
207 }
208 
209 std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
210 {
211  std::vector<std::string> columns;
212 
213  for (size_t i = 0; i < line.size(); ++i) {
214  i = ParseValue(line, columns, i);
215  }
216 
217  return columns;
218 }
219 
220 size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
221 {
222  std::stringstream val;
223  bool quoted = false;
224 
225  for (; i < line.size(); ++i) {
226  if (line[i] == fDelimiter && !quoted) {
227  break;
228  } else if (line[i] == '"') {
229  // Keep just one quote for escaped quotes, none for the normal quotes
230  if (line[i + 1] != '"') {
231  quoted = !quoted;
232  } else {
233  val << line[++i];
234  }
235  } else {
236  val << line[i];
237  }
238  }
239 
240  columns.emplace_back(val.str());
241 
242  return i;
243 }
244 
245 ////////////////////////////////////////////////////////////////////////
246 /// Constructor to create a CSV RDataSource for RDataFrame.
247 /// \param[in] fileName Path of the CSV file.
248 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
249 /// (default `true`).
250 /// \param[in] delimiter Delimiter character (default ',').
251 RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize) // TODO: Let users specify types?
252  : fStream(std::string(fileName)),
253  fDelimiter(delimiter),
254  fLinesChunkSize(linesChunkSize)
255 {
256  std::string line;
257 
258  // Read the headers if present
259  if (readHeaders) {
260  if (std::getline(fStream, line)) {
261  FillHeaders(line);
262  } else {
263  std::string msg = "Error reading headers of CSV file ";
264  msg += fileName;
265  throw std::runtime_error(msg);
266  }
267  }
268 
269  if (std::getline(fStream, line)) {
270  auto columns = ParseColumns(line);
271 
272  // Generate headers if not present
273  if (!readHeaders) {
274  GenerateHeaders(columns.size());
275  }
276 
277  // Infer types of columns with first record
278  InferColTypes(columns);
279 
280  // Fill with the content of the first line
281  fRecords.emplace_back();
282  FillRecord(line, fRecords.back());
283  }
284 }
285 
287 {
288  for (auto &record : fRecords) {
289  for (size_t i = 0; i < record.size(); ++i) {
290  void *p = record[i];
291  const auto colType = fColTypes[fHeaders[i]];
292  switch (colType) {
293  case 'd': {
294  delete static_cast<double *>(p);
295  break;
296  }
297  case 'l': {
298  delete static_cast<Long64_t *>(p);
299  break;
300  }
301  case 'b': {
302  delete static_cast<bool *>(p);
303  break;
304  }
305  case 's': {
306  delete static_cast<std::string *>(p);
307  break;
308  }
309  }
310  }
311  }
312  fRecords.clear();
313 }
314 
315 ////////////////////////////////////////////////////////////////////////
316 /// Destructor.
318 {
319  FreeRecords();
320 }
321 
322 const std::vector<std::string> &RCsvDS::GetColumnNames() const
323 {
324  return fHeaders;
325 }
326 
327 std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
328 {
329 
330  // Read records and store them in memory
331  // This might be the first time we invoke the method. We need to take
332  // into account the line which was read in the constructor to infer the
333  // column types.
334  // skips a line.
335  auto linesToRead = fLinesChunkSize;
336  if (0ULL == fEntryRangesRequested) {
337  linesToRead--;
338  } else {
339  FreeRecords();
340  }
341  std::string line;
342  while ((-1LL == fLinesChunkSize || 0 != linesToRead--) && std::getline(fStream, line)) {
343  fRecords.emplace_back();
344  FillRecord(line, fRecords.back());
345  }
346 
347  std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
348  const auto nRecords = fRecords.size();
349  if (0 == nRecords)
350  return entryRanges;
351 
352  const auto chunkSize = nRecords / fNSlots;
353  const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
354  auto start = 0ULL == fEntryRangesRequested ? 0ULL : fProcessedLines;
355  auto end = start;
356 
357  for (auto i : ROOT::TSeqU(fNSlots)) {
358  start = end;
359  end += chunkSize;
360  entryRanges.emplace_back(start, end);
361  (void)i;
362  }
363  entryRanges.back().second += remainder;
364 
365  fProcessedLines += nRecords;
367  return entryRanges;
368 }
369 
371 {
372  if (!HasColumn(colName)) {
373  std::string msg = "The dataset does not have column ";
374  msg += colName;
375  throw std::runtime_error(msg);
376  }
377 
378  return fColTypes.at(colName.data());
379 }
380 
381 std::string RCsvDS::GetTypeName(std::string_view colName) const
382 {
383  return fgColTypeMap.at(GetType(colName));
384 }
385 
387 {
388  return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
389 }
390 
391 bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
392 {
393  // Here we need to normalise the entry to the number of lines we already processed.
394  const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
395  const auto recordPos = entry - offset;
396  int colIndex = 0;
397  for (auto &colType : fColTypesList) {
398  auto dataPtr = fRecords[recordPos][colIndex];
399  switch (colType) {
400  case 'd': {
401  fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
402  break;
403  }
404  case 'l': {
405  fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
406  break;
407  }
408  case 'b': {
409  fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
410  break;
411  }
412  case 's': {
413  fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
414  break;
415  }
416  }
417  colIndex++;
418  }
419  return true;
420 }
421 
422 void RCsvDS::SetNSlots(unsigned int nSlots)
423 {
424  assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
425 
426  fNSlots = nSlots;
427 
428  const auto nColumns = fHeaders.size();
429  // Initialise the entire set of addresses
430  fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
431 
432  // Initialize the per event data holders
433  fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
434  fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
435  fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
436  fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
437 }
438 
439 RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize)
440 {
441  ROOT::RDataFrame tdf(std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize));
442  return tdf;
443 }
444 
445 } // ns RDF
446 
447 } // ns ROOT
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:209
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:41
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:220
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:106
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:439
long long Long64_t
Definition: RtypesCore.h:69
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
const char fDelimiter
Definition: RCsvDS.hxx:37
TLine * line
void FreeRecords()
Definition: RCsvDS.cxx:286
static TRegexp doubleRegex1
Definition: RCsvDS.hxx:53
static TRegexp trueRegex
Definition: RCsvDS.hxx:53
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition: RCsvDS.cxx:422
static TRegexp doubleRegex2
Definition: RCsvDS.hxx:53
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:44
Regular expression class.
Definition: TRegexp.h:31
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:46
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:189
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:39
static TRegexp intRegex
Definition: RCsvDS.hxx:53
std::map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:42
static TRegexp falseRegex
Definition: RCsvDS.hxx:53
STL namespace.
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot...
Definition: RCsvDS.cxx:391
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:386
std::ifstream fStream
Definition: RCsvDS.hxx:36
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition: RCsvDS.cxx:381
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:251
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:47
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:180
unsigned int fNSlots
Definition: RCsvDS.hxx:35
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:38
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:45
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:327
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:40
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:141
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:48
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees, CSV's and other data formats.
Definition: RDataFrame.hxx:42
std::vector< void * > Record_t
Definition: RDataSource.hxx:94
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
int type
Definition: TGX11.cxx:120
unsigned long long ULong64_t
Definition: RtypesCore.h:70
static RooMathCoreReg dummy
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:317
basic_string_view< char > string_view
Definition: RStringView.hxx:35
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:322
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:98
typedef void((*Func_t)())
static const std::map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:33
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Definition: TRolke.cxx:630
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:148
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:51
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:43
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:370