Logo ROOT   6.12/07
Reference Guide
TCsvDS.cxx
Go to the documentation of this file.
1 // Author: Enric Tejedor CERN 10/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 // clang-format off
12 /** \class ROOT::Experimental::TDF::TCsvDS
13  \ingroup dataframe
14  \brief TDataFrame data source class for reading CSV files.
15 
16 The TCsvDS class implements a CSV file reader for TDataFrame.
17 
18 A TDataFrame that reads from a CSV file can be constructed using the factory method
19 ROOT::Experimental::TDF::MakeCsvDataFrame, which accepts three parameters:
20 1. Path to the CSV file.
21 2. Boolean that specifies whether the first row of the CSV file contains headers or
22 not (optional, default `true`). If `false`, header names will be automatically generated.
23 3. Delimiter (optional, default ',').
24 
25 The types of the columns in the CSV file are automatically inferred. The supported
26 types are:
27 - Integer: stored as a 64-bit long long int.
28 - Floating point number: stored with double precision.
29 - Boolean: matches the literals `true` and `false`.
30 - String: stored as an std::string, matches anything that does not fall into any of the
31 previous types.
32 
33 These are some formatting rules expected by the TCsvDS implementation:
34 - All records must have the same number of fields, in the same order.
35 - Any field may be quoted.
36 ~~~
37  "1997","Ford","E350"
38 ~~~
39 - Fields with embedded delimiters (e.g. comma) must be quoted.
40 ~~~
41  1997,Ford,E350,"Super, luxurious truck"
42 ~~~
43 - Fields with double-quote characters must be quoted, and each of the embedded
44 double-quote characters must be represented by a pair of double-quote characters.
45 ~~~
46  1997,Ford,E350,"Super, ""luxurious"" truck"
47 ~~~
48 - Fields with embedded line breaks are not supported, even when quoted.
49 ~~~
50  1997,Ford,E350,"Go get one now
51  they are going fast"
52 ~~~
53 - Spaces are considered part of a field and are not ignored.
54 ~~~
55  1997, Ford , E350
56  not same as
57  1997,Ford,E350
58  but same as
59  1997, "Ford" , E350
60 ~~~
61 - If a header row is provided, it must contain column names for each of the fields.
62 ~~~
63  Year,Make,Model
64  1997,Ford,E350
65  2000,Mercury,Cougar
66 ~~~
67 
68 The current implementation of TCsvDS reads the entire CSV file content into memory before
69 TDataFrame starts processing it. Therefore, before creating a CSV TDataFrame, it is
70 important to check both how much memory is available and the size of the CSV file.
71 */
72 // clang-format on
73 
#include <ROOT/TDFUtils.hxx>
#include <ROOT/TSeq.hxx>
#include <ROOT/TCsvDS.hxx>
#include <ROOT/RMakeUnique.hxx>

#include <algorithm>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
83 
84 namespace ROOT {
85 namespace Experimental {
86 namespace TDF {
87 
88 // Regular expressions for type inference
89 TRegexp TCsvDS::intRegex("^[-+]?[0-9]+$");
90 TRegexp TCsvDS::doubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
91 TRegexp TCsvDS::doubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
92 TRegexp TCsvDS::trueRegex("^true$");
93 TRegexp TCsvDS::falseRegex("^false$");
94 
95 void TCsvDS::FillHeaders(const std::string &line)
96 {
97  auto columns = ParseColumns(line);
98  for (auto &col : columns) {
99  fHeaders.emplace_back(col);
100  }
101 }
102 
103 void TCsvDS::FillRecord(const std::string &line, Record &record)
104 {
105  std::istringstream lineStream(line);
106  auto i = 0U;
107 
108  auto columns = ParseColumns(line);
109 
110  for (auto &col : columns) {
111  auto &colType = fColTypes[fHeaders[i]];
112 
113  if (colType == "Long64_t") {
114  record.emplace_back(new Long64_t(std::stoll(col)));
115  } else if (colType == "double") {
116  record.emplace_back(new double(std::stod(col)));
117  } else if (colType == "bool") {
118  bool *b = new bool();
119  record.emplace_back(b);
120  std::istringstream is(col);
121  is >> std::boolalpha >> *b;
122  } else {
123  record.emplace_back(new std::string(col));
124  }
125  ++i;
126  }
127 }
128 
129 void TCsvDS::GenerateHeaders(size_t size)
130 {
131  for (size_t i = 0; i < size; ++i) {
132  fHeaders.push_back("Col" + std::to_string(i));
133  }
134 }
135 
136 std::vector<void *> TCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
137 {
138  const auto colTypeName = GetTypeName(colName);
139 
140  if ((colTypeName == "double" && typeid(double) != ti) || (colTypeName == "Long64_t" && typeid(Long64_t) != ti) ||
141  (colTypeName == "std::string" && typeid(std::string) != ti) || (colTypeName == "bool" && typeid(bool) != ti)) {
142  std::string err = "The type selected for column \"";
143  err += colName;
144  err += "\" does not correspond to column type, which is ";
145  err += colTypeName;
146  throw std::runtime_error(err);
147  }
148 
149  const auto &colNames = GetColumnNames();
150  const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
151  std::vector<void *> ret(fNSlots);
152  for (auto slot : ROOT::TSeqU(fNSlots)) {
153  auto &val = fColAddresses[index][slot];
154  if (ti == typeid(double)) {
155  val = &fDoubleEvtValues[index][slot];
156  } else if (ti == typeid(Long64_t)) {
157  val = &fLong64EvtValues[index][slot];
158  } else if (ti == typeid(std::string)) {
159  val = &fStringEvtValues[index][slot];
160  } else {
161  val = &fBoolEvtValues[index][slot];
162  }
163  ret[slot] = &val;
164  }
165  return ret;
166 }
167 
168 void TCsvDS::InferColTypes(std::vector<std::string> &columns)
169 {
170  auto i = 0U;
171  for (auto &col : columns) {
172  InferType(col, i);
173  ++i;
174  }
175 }
176 
177 void TCsvDS::InferType(const std::string &col, unsigned int idxCol)
178 {
179  std::string type;
180  int dummy;
181 
182  if (intRegex.Index(col, &dummy) != -1) {
183  type = "Long64_t";
184  } else if (doubleRegex1.Index(col, &dummy) != -1 || doubleRegex2.Index(col, &dummy) != -1) {
185  type = "double";
186  } else if (trueRegex.Index(col, &dummy) != -1 || falseRegex.Index(col, &dummy) != -1) {
187  type = "bool";
188  } else { // everything else is a string
189  type = "std::string";
190  }
191  // TODO: Date
192 
193  fColTypes[fHeaders[idxCol]] = type;
194  fColTypesList.push_back(type);
195 }
196 
197 std::vector<std::string> TCsvDS::ParseColumns(const std::string &line)
198 {
199  std::vector<std::string> columns;
200 
201  for (size_t i = 0; i < line.size(); ++i) {
202  i = ParseValue(line, columns, i);
203  }
204 
205  return columns;
206 }
207 
208 size_t TCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
209 {
210  std::stringstream val;
211  bool quoted = false;
212 
213  for (; i < line.size(); ++i) {
214  if (line[i] == fDelimiter && !quoted) {
215  break;
216  } else if (line[i] == '"') {
217  // Keep just one quote for escaped quotes, none for the normal quotes
218  if (line[i + 1] != '"') {
219  quoted = !quoted;
220  } else {
221  val << line[++i];
222  }
223  } else {
224  val << line[i];
225  }
226  }
227 
228  columns.emplace_back(val.str());
229 
230  return i;
231 }
232 
233 ////////////////////////////////////////////////////////////////////////
234 /// Constructor to create a CSV TDataSource for TDataFrame.
235 /// \param[in] fileName Path of the CSV file.
236 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
237 /// (default `true`).
238 /// \param[in] delimiter Delimiter character (default ',').
239 TCsvDS::TCsvDS(std::string_view fileName, bool readHeaders, char delimiter) // TODO: Let users specify types?
240  : fFileName(fileName),
241  fDelimiter(delimiter)
242 {
243  std::ifstream stream(fFileName);
244  std::string line;
245 
246  // Read the headers if present
247  if (readHeaders) {
248  if (std::getline(stream, line)) {
249  FillHeaders(line);
250  } else {
251  std::string msg = "Error reading headers of CSV file ";
252  msg += fileName;
253  throw std::runtime_error(msg);
254  }
255  }
256 
257  if (std::getline(stream, line)) {
258  auto columns = ParseColumns(line);
259 
260  // Generate headers if not present
261  if (!readHeaders) {
262  GenerateHeaders(columns.size());
263  }
264 
265  // Infer types of columns with first record
266  InferColTypes(columns);
267 
268  // Read all records and store them in memory
269  do {
270  fRecords.emplace_back();
271  FillRecord(line, fRecords.back());
272  } while (std::getline(stream, line));
273  }
274 }
275 
276 ////////////////////////////////////////////////////////////////////////
277 /// Destructor.
279 {
280  for (auto &record : fRecords) {
281  for (size_t i = 0; i < record.size(); ++i) {
282  void *p = record[i];
283  auto &colType = fColTypes[fHeaders[i]];
284 
285  if (colType == "Long64_t") {
286  delete static_cast<Long64_t *>(p);
287  } else if (colType == "double") {
288  delete static_cast<double *>(p);
289  } else if (colType == "bool") {
290  delete static_cast<bool *>(p);
291  } else {
292  delete static_cast<std::string *>(p);
293  }
294  }
295  }
296 }
297 
298 const std::vector<std::string> &TCsvDS::GetColumnNames() const
299 {
300  return fHeaders;
301 }
302 
303 std::vector<std::pair<ULong64_t, ULong64_t>> TCsvDS::GetEntryRanges()
304 {
305  auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
306  return entryRanges;
307 }
308 
309 std::string TCsvDS::GetTypeName(std::string_view colName) const
310 {
311  if (!HasColumn(colName)) {
312  std::string msg = "The dataset does not have column ";
313  msg += colName;
314  throw std::runtime_error(msg);
315  }
316 
317  return fColTypes.at(colName.data());
318 }
319 
321 {
322  return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
323 }
324 
325 void TCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
326 {
327  int colIndex = 0;
328  for (auto &&colType : fColTypesList) {
329  auto dataPtr = fRecords[entry][colIndex];
330  if (colType == "double") {
331  fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
332  } else if (colType == "Long64_t") {
333  fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
334  } else if (colType == "std::string") {
335  fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
336  } else {
337  fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
338  }
339  colIndex++;
340  }
341 }
342 
343 void TCsvDS::SetNSlots(unsigned int nSlots)
344 {
345  assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
346 
347  fNSlots = nSlots;
348 
349  const auto nColumns = fHeaders.size();
350  // Initialise the entire set of addresses
351  fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
352 
353  // Initialize the per event data holders
354  fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
355  fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
356  fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
357  fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
358 }
359 
361 {
362  const auto nRecords = fRecords.size();
363  const auto chunkSize = nRecords / fNSlots;
364  const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
365  auto start = 0UL;
366  auto end = 0UL;
367 
368  for (auto i : ROOT::TSeqU(fNSlots)) {
369  start = end;
370  end += chunkSize;
371  fEntryRanges.emplace_back(start, end);
372  (void)i;
373  }
374  fEntryRanges.back().second += remainder;
375 }
376 
377 TDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter)
378 {
379  ROOT::Experimental::TDataFrame tdf(std::make_unique<TCsvDS>(fileName, readHeaders, delimiter));
380  return tdf;
381 }
382 
383 } // ns TDF
384 } // ns Experimental
385 } // ns ROOT
std::vector< std::string > fHeaders
Definition: TCsvDS.hxx:26
long long Long64_t
Definition: RtypesCore.h:69
basic_string_view< char > string_view
Definition: RStringView.h:35
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
TLine * line
Regular expression class.
Definition: TRegexp.h:31
void SetNSlots(unsigned int nSlots)
Inform TDataSource of the number of processing slots (i.e.
Definition: TCsvDS.cxx:343
std::vector< std::deque< bool > > fBoolEvtValues
Definition: TCsvDS.hxx:37
void InferType(const std::string &, unsigned int)
Definition: TCsvDS.cxx:177
void SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot...
Definition: TCsvDS.cxx:325
std::vector< std::vector< std::string > > fStringEvtValues
Definition: TCsvDS.hxx:34
static TRegexp doubleRegex2
Definition: TCsvDS.hxx:39
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: TCsvDS.cxx:320
void FillHeaders(const std::string &)
Definition: TCsvDS.cxx:95
void FillRecord(const std::string &, Record &)
Definition: TCsvDS.cxx:103
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: TCsvDS.hxx:33
void Initialise()
Convenience method called before starting an event-loop.
Definition: TCsvDS.cxx:360
std::vector< void * > Record
Definition: TCsvDS.hxx:21
std::vector< std::vector< void * > > fColAddresses
Definition: TCsvDS.hxx:29
void InferColTypes(std::vector< std::string > &)
Definition: TCsvDS.cxx:168
std::vector< std::vector< double > > fDoubleEvtValues
Definition: TCsvDS.hxx:32
TDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',')
Factory method to create a CSV TDataFrame.
Definition: TCsvDS.cxx:377
std::vector< Record > fRecords
Definition: TCsvDS.hxx:31
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: TCsvDS.cxx:136
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition: TCsvDS.hxx:30
std::vector< std::string > ParseColumns(const std::string &)
Definition: TCsvDS.cxx:197
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: TCsvDS.cxx:208
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: TCsvDS.cxx:303
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',')
Constructor to create a CSV TDataSource for TDataFrame.
Definition: TCsvDS.cxx:239
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
int type
Definition: TGX11.cxx:120
unsigned long long ULong64_t
Definition: RtypesCore.h:70
static RooMathCoreReg dummy
std::list< std::string > fColTypesList
Definition: TCsvDS.hxx:28
static TRegexp doubleRegex1
Definition: TCsvDS.hxx:39
typedef void((*Func_t)())
ROOT's TDataFrame offers a high level interface for analyses of data stored in TTrees.
Definition: TDataFrame.hxx:39
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Definition: TRolke.cxx:630
std::map< std::string, std::string > fColTypes
Definition: TCsvDS.hxx:27
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition: TCsvDS.cxx:309
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: TCsvDS.cxx:298