Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18A RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::FromCSV, which accepts five parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
244. Chunk size (optional, default is -1 to read all) - number of lines to read at a time
255. Column Types (optional, default is an empty map). A map with column names as keys and their type
26(expressed as a single character, see below) as values.
27
28The type of columns that do not appear in the map is inferred from the data.
29The supported types are:
30- Integer: stored as a 64-bit long long int; can be specified in the column types map with 'L'.
31- Floating point number: stored with double precision; specified with 'D'.
32- Boolean: matches the literals `true` and `false`; specified with 'O'.
33- String: stored as an std::string, matches anything that does not fall into any of the
34previous types; specified with 'T'.
35
36These are some formatting rules expected by the RCsvDS implementation:
37- All records must have the same number of fields, in the same order.
38- Any field may be quoted.
39~~~
40 "1997","Ford","E350"
41~~~
42- Fields with embedded delimiters (e.g. comma) must be quoted.
43~~~
44 1997,Ford,E350,"Super, luxurious truck"
45~~~
46- Fields with double-quote characters must be quoted, and each of the embedded
47double-quote characters must be represented by a pair of double-quote characters.
48~~~
49 1997,Ford,E350,"Super, ""luxurious"" truck"
50~~~
51- Fields with embedded line breaks are not supported, even when quoted.
52~~~
53 1997,Ford,E350,"Go get one now
54 they are going fast"
55~~~
56- Spaces are considered part of a field and are not ignored.
57~~~
58 1997, Ford , E350
59 not same as
60 1997,Ford,E350
61 but same as
62 1997, "Ford" , E350
63~~~
64- If a header row is provided, it must contain column names for each of the fields.
65~~~
66 Year,Make,Model
67 1997,Ford,E350
68 2000,Mercury,Cougar
69~~~
70
71By default (chunk size of -1), RCsvDS reads the entire CSV file content into memory before
72RDataFrame starts processing it; with a positive chunk size, only that many lines are held in memory at a time.
73Therefore, before creating a CSV RDataFrame, it is important to check both how much memory is available and the size of the CSV file.
74
75RCsvDS can handle empty cells and also allows the usage of the special keywords "NaN" and "nan" to
76indicate `nan` values. If the column is of type double, these cells are stored internally as `nan`.
77Empty cells and explicit `nan`-s inside columns of type Long64_t/bool are stored as zeros.
78*/
79// clang-format on
80
81#include <ROOT/TSeq.hxx>
82#include <ROOT/RCsvDS.hxx>
83#include <ROOT/RRawFile.hxx>
84#include <TError.h>
85
86#include <algorithm>
87#include <cctype>
88#include <cinttypes>
89#include <iterator>
90#include <memory>
91#include <sstream>
92#include <string>
93
94namespace ROOT {
95
96namespace RDF {
97
98std::string RCsvDS::AsString()
99{
100 return "CSV data source";
101}
102
103// Regular expressions for type inference
104const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
105const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
106const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
107const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
108const TRegexp RCsvDS::fgTrueRegex("^true$");
109const TRegexp RCsvDS::fgFalseRegex("^false$");
110
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{'O', "bool"}, {'D', "double"}, {'L', "Long64_t"}, {'T', "std::string"}});
113
114bool RCsvDS::Readln(std::string &line)
115{
116 auto fnLeftTrim = [](std::string &s) {
117 const auto N = s.size();
118 std::size_t idxStart = 0;
119 for (; idxStart < N && std::isspace(s[idxStart]); ++idxStart)
120 ;
121 if (idxStart)
122 s.erase(0, idxStart);
123 };
124
125 auto fnRightTrim = [](std::string &s) {
126 size_t nTrim = 0;
127 for (auto itr = s.rbegin(); itr != s.rend() && std::isspace(*itr); ++itr, ++nTrim)
128 ;
129 if (nTrim)
130 s.resize(s.size() - nTrim);
131 };
132
133 while (true) {
134 const bool eof = !fCsvFile->Readln(line);
135 if (eof)
136 return false;
137 fLineNumber++;
138 if ((fMaxLineNumber >= 0) && (fLineNumber > fMaxLineNumber))
139 return false;
140
142 fnLeftTrim(line);
143 if (fOptions.fComment) {
144 auto idxComment = line.find(fOptions.fComment);
145 if (idxComment == 0)
146 continue;
147 if (idxComment != std::string::npos)
148 line.resize(idxComment);
149 }
151 fnRightTrim(line);
152 if (fOptions.fSkipBlankLines && line.empty())
153 continue;
154
155 return true;
156 }
157}
158
159void RCsvDS::RewindToData()
160{
161 fCsvFile->Seek(fDataPos);
163}
164
165void RCsvDS::FillHeaders(const std::string &line)
166{
167 const auto columns = ParseColumns(line);
168
169 if (!fOptions.fColumnNames.empty()) {
170 if (fOptions.fColumnNames.size() != columns.size()) {
171 auto msg = std::string("Error: passed ") + std::to_string(fOptions.fColumnNames.size()) +
172 " column names for a CSV file containing " + std::to_string(columns.size()) + " columns!";
173 throw std::runtime_error(msg);
174 }
175 std::swap(fHeaders, fOptions.fColumnNames);
176 return;
177 }
178
179 fHeaders.reserve(columns.size());
180 for (auto &col : columns) {
181 fHeaders.emplace_back(col);
182 }
183}
184
185void RCsvDS::FillRecord(const std::string &line, Record_t &record)
186{
187 auto i = 0U;
188
189 auto columns = ParseColumns(line);
190
191 for (auto &col : columns) {
192 auto colType = fColTypes[fHeaders[i]];
193
194 switch (colType) {
195 case 'D': {
196 record.emplace_back(new double((col != "nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
197 break;
198 }
199 case 'L': {
200 if (col != "nan") {
201 record.emplace_back(new Long64_t(std::stoll(col)));
202 } else {
203 fColContainingEmpty.insert(fHeaders[i]);
204 record.emplace_back(new Long64_t(0));
205 }
206 break;
207 }
208 case 'O': {
209 auto b = new bool();
210 record.emplace_back(b);
211 if (col != "nan") {
212 std::istringstream(col) >> std::boolalpha >> *b;
213 } else {
214 fColContainingEmpty.insert(fHeaders[i]);
215 *b = false;
216 }
217 break;
218 }
219 case 'T': {
220 record.emplace_back(new std::string(col));
221 break;
222 }
223 }
224 ++i;
225 }
226}
227
228void RCsvDS::GenerateHeaders(size_t size)
229{
230 if (!fOptions.fColumnNames.empty()) {
231 if (fOptions.fColumnNames.size() != size) {
232 auto msg = std::string("Error: passed ") + std::to_string(fOptions.fColumnNames.size()) +
233 " column names for a CSV file containing " + std::to_string(size) + " columns!";
234 throw std::runtime_error(msg);
235 }
236 std::swap(fHeaders, fOptions.fColumnNames);
237 return;
238 }
239
240 fHeaders.reserve(size);
241 for (size_t i = 0u; i < size; ++i) {
242 fHeaders.push_back("Col" + std::to_string(i));
243 }
244}
245
246std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
247{
248 const auto colType = GetType(colName);
249
250 if ((colType == 'D' && typeid(double) != ti) || (colType == 'L' && typeid(Long64_t) != ti) ||
251 (colType == 'T' && typeid(std::string) != ti) || (colType == 'O' && typeid(bool) != ti)) {
252 std::string err = "The type selected for column \"";
253 err += colName;
254 err += "\" does not correspond to column type, which is ";
255 err += fgColTypeMap.at(colType);
256 throw std::runtime_error(err);
257 }
258
259 const auto &colNames = GetColumnNames();
260 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
261 std::vector<void *> ret(fNSlots);
262 for (auto slot : ROOT::TSeqU(fNSlots)) {
263 auto &val = fColAddresses[index][slot];
264 if (ti == typeid(double)) {
265 val = &fDoubleEvtValues[index][slot];
266 } else if (ti == typeid(Long64_t)) {
267 val = &fLong64EvtValues[index][slot];
268 } else if (ti == typeid(std::string)) {
269 val = &fStringEvtValues[index][slot];
270 } else {
271 val = &fBoolEvtValues[index][slot];
272 }
273 ret[slot] = &val;
274 }
275 return ret;
276}
277
278void RCsvDS::ValidateColTypes(std::vector<std::string> &columns) const
279{
280 for (const auto &col : fColTypes) {
281 if (!HasColumn(col.first)) {
282 std::string msg = "There is no column with name \"" + col.first + "\".";
283 if (!fOptions.fHeaders) {
284 msg += "\nSince the input csv file does not contain headers, valid column names";
285 msg += " are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) + "\"].";
286 }
287 throw std::runtime_error(msg);
288 }
289 if (std::string("ODLT").find(col.second) == std::string::npos) {
290 std::string msg = "Type alias '" + std::string(1, col.second) + "' is not supported.\n";
291 msg += "Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
292 throw std::runtime_error(msg);
293 }
294 }
295}
296
297void RCsvDS::InferColTypes(std::vector<std::string> &columns)
298{
299 const auto second_line = fCsvFile->GetFilePos();
300
301 for (auto i = 0u; i < columns.size(); ++i) {
302 const auto userSpecifiedType = fColTypes.find(fHeaders[i]);
303 if (userSpecifiedType != fColTypes.end()) {
304 fColTypesList.push_back(userSpecifiedType->second);
305 continue;
306 }
307
308 // read <=10 extra lines until a non-empty cell on this column is found, so that type is determined
309 for (auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] == "nan"; ++extraRowsRead) {
310 std::string line;
311 if (!Readln(line))
312 break; // EOF
313 const auto temp_columns = ParseColumns(line);
314 if (temp_columns[i] != "nan")
315 columns[i] = temp_columns[i]; // will break the loop in the next iteration
316 }
317 // reset the reading from the second line, because the first line is already loaded in `columns`
318 fCsvFile->Seek(second_line);
319
320 if (columns[i] == "nan") {
321 // could not find a non-empty value, default to double
322 fColTypes[fHeaders[i]] = 'D';
323 fColTypesList.push_back('D');
324 } else {
325 InferType(columns[i], i);
326 }
327 }
328}
329
330void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
331{
333 int dummy;
334
335 if (fgIntRegex.Index(col, &dummy) != -1) {
336 type = 'L'; // Long64_t
337 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 || fgDoubleRegex2.Index(col, &dummy) != -1 ||
338 fgDoubleRegex3.Index(col, &dummy) != -1) {
339 type = 'D'; // double
340 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
341 type = 'O'; // bool
342 } else { // everything else is a string
343 type = 'T'; // std::string
344 }
345 // TODO: Date
346
347 fColTypes[fHeaders[idxCol]] = type;
348 fColTypesList.push_back(type);
349}
350
351std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
352{
353 std::vector<std::string> columns;
354
355 for (size_t i = 0; i < line.size(); ++i) {
356 i = ParseValue(line, columns, i);
357 }
358
359 return columns;
360}
361
362size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
363{
364 std::string val;
365 bool quoted = false;
366 const size_t prevPos = i; // used to check if cell is empty
367
368 for (; i < line.size(); ++i) {
369 if (line[i] == fOptions.fDelimiter && !quoted) {
370 break;
371 } else if (line[i] == '"') {
372 // Keep just one quote for escaped quotes, none for the normal quotes
373 if (line[i + 1] != '"') {
374 quoted = !quoted;
375 } else {
376 val += line[++i];
377 }
378 } else {
379 val += line[i];
380 }
381 }
382
383 if (prevPos == i || val == "nan" || val == "NaN") // empty cell or explicit nan/NaN
384 columns.emplace_back("nan");
385 else
386 columns.emplace_back(std::move(val));
387
388 // if the line ends with the delimiter, we need to append the default column value
389 // for the _next_, last column that won't be parsed (because we are out of characters)
390 if (i == line.size() - 1 && line[i] == fOptions.fDelimiter)
391 columns.emplace_back("nan");
392
393 return i;
394}
395
396void RCsvDS::Construct()
397{
398 std::string line;
399
401 // It is possible to not read the file twice, but the implementation would be more complicated
402 std::int64_t nLines = 0;
403 std::string tmp;
404 while (fCsvFile->Readln(tmp))
405 nLines++;
406 if (nLines < fOptions.fSkipLastNLines) {
407 std::string msg = "Error: too many footer lines to skip in CSV file ";
408 msg += fCsvFile->GetUrl();
409 throw std::runtime_error(msg);
410 }
411 fCsvFile->Seek(0);
413 }
414
415 for (std::int64_t i = 0; i < fOptions.fSkipFirstNLines; ++i) {
416 if (!fCsvFile->Readln(line))
417 break;
418 fLineNumber++;
419 }
420
421 // Read the headers if present
422 if (fOptions.fHeaders) {
423 if (Readln(line)) {
425 } else {
426 std::string msg = "Error reading headers of CSV file ";
427 msg += fCsvFile->GetUrl();
428 throw std::runtime_error(msg);
429 }
430 }
431
432 fDataPos = fCsvFile->GetFilePos();
434 if (Readln(line)) {
435 auto columns = ParseColumns(line);
436
437 // Generate headers if not present
438 if (!fOptions.fHeaders) {
439 GenerateHeaders(columns.size());
440 }
441
442 // Ensure user is trying to set types only of existing columns
443 ValidateColTypes(columns);
444
445 // Infer types of columns with first record
446 InferColTypes(columns);
447
448 // rewind
449 RewindToData();
450 } else {
451 std::string msg = "Could not infer column types of CSV file ";
452 msg += fCsvFile->GetUrl();
453 throw std::runtime_error(msg);
454 }
455}
456
457////////////////////////////////////////////////////////////////////////
458/// Constructor to create a CSV RDataSource for RDataFrame.
459/// \param[in] fileName Path or URL of the CSV file.
460/// \param[in] options File parsing settings
461RCsvDS::RCsvDS(std::string_view fileName, const ROptions &options)
462 : fOptions(options), fCsvFile(ROOT::Internal::RRawFile::Create(fileName))
463{
464 std::swap(fColTypes, fOptions.fColumnTypes);
465
466 Construct();
467}
468
469////////////////////////////////////////////////////////////////////////
470/// Constructor to create a CSV RDataSource for RDataFrame.
471/// \param[in] fileName Path or URL of the CSV file.
472/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
473/// (default `true`).
474/// \param[in] delimiter Delimiter character (default ',').
475/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
476/// \param[in] colTypes Allows users to manually specify column types. Accepts an unordered map with keys being
477/// column names, values being type specifiers ('O' for boolean, 'D' for double, 'L' for
478/// Long64_t, 'T' for std::string)
479RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
480 std::unordered_map<std::string, char> &&colTypes)
481 : fCsvFile(ROOT::Internal::RRawFile::Create(fileName)), fColTypes(std::move(colTypes))
482{
483 fOptions.fHeaders = readHeaders;
484 fOptions.fDelimiter = delimiter;
485 fOptions.fLinesChunkSize = linesChunkSize;
486
487 Construct();
488}
489
490void RCsvDS::FreeRecords()
491{
492 for (auto &record : fRecords) {
493 for (size_t i = 0; i < record.size(); ++i) {
494 void *p = record[i];
495 const auto colType = fColTypes[fHeaders[i]];
496 switch (colType) {
497 case 'D': {
498 delete static_cast<double *>(p);
499 break;
500 }
501 case 'L': {
502 delete static_cast<Long64_t *>(p);
503 break;
504 }
505 case 'O': {
506 delete static_cast<bool *>(p);
507 break;
508 }
509 case 'T': {
510 delete static_cast<std::string *>(p);
511 break;
512 }
513 }
514 }
515 }
516 fRecords.clear();
517}
518
519////////////////////////////////////////////////////////////////////////
520/// Destructor.
521RCsvDS::~RCsvDS()
522{
523 FreeRecords();
524}
525
526void RCsvDS::Finalize()
527{
528 RewindToData();
529 fProcessedLines = 0ULL;
531 FreeRecords();
532}
533
534const std::vector<std::string> &RCsvDS::GetColumnNames() const
535{
536 return fHeaders;
537}
538
539std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
540{
541 // Read records and store them in memory
542 auto linesToRead = fOptions.fLinesChunkSize;
543 FreeRecords();
544
545 std::string line;
546 while ((-1LL == fOptions.fLinesChunkSize || 0 != linesToRead) && Readln(line)) {
547 fRecords.emplace_back();
548 FillRecord(line, fRecords.back());
549 --linesToRead;
550 }
551
552 if (!fColContainingEmpty.empty()) {
553 std::string msg = "";
554 for (const auto &col : fColContainingEmpty) {
555 const auto colT = GetTypeName(col);
556 msg += "Column \"" + col + "\" of type " + colT + " contains empty cell(s) or NaN(s).\n";
557 msg += "There is no `nan` equivalent for type " + colT + ", hence ";
558 msg += std::string(colT == "Long64_t" ? "`0`" : "`false`") + " is stored.\n";
559 }
560 msg += "Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
561 Warning("RCsvDS", "%s", msg.c_str());
562 }
563
564 if (gDebug > 0) {
565 if (fOptions.fLinesChunkSize == -1LL) {
566 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
567 } else {
568 Info("GetEntryRanges", "Attempted to read chunk of %" PRId64 " lines of CSV file into memory, %zu lines read",
570 }
571 }
572
573 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
574 const auto nRecords = fRecords.size();
575 if (0 == nRecords)
576 return entryRanges;
577
578 const auto chunkSize = nRecords / fNSlots;
579 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
580 auto start = fProcessedLines;
581 auto end = start;
582
583 for (auto i : ROOT::TSeqU(fNSlots)) {
584 start = end;
585 end += chunkSize;
586 entryRanges.emplace_back(start, end);
587 (void)i;
588 }
589 entryRanges.back().second += remainder;
590
591 fProcessedLines += nRecords;
593
594 return entryRanges;
595}
596
597RCsvDS::ColType_t RCsvDS::GetType(std::string_view colName) const
598{
599 if (!HasColumn(colName)) {
600 std::string msg = "The dataset does not have column ";
601 msg += colName;
602 throw std::runtime_error(msg);
603 }
604
605 return fColTypes.at(colName.data());
606}
607
608std::string RCsvDS::GetTypeName(std::string_view colName) const
609{
610 return fgColTypeMap.at(GetType(colName));
611}
612
613bool RCsvDS::HasColumn(std::string_view colName) const
614{
615 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
616}
617
618bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
619{
620 // Here we need to normalise the entry to the number of lines we already processed.
622 const auto recordPos = entry - offset;
623 int colIndex = 0;
624 for (auto &colType : fColTypesList) {
625 auto dataPtr = fRecords[recordPos][colIndex];
626 switch (colType) {
627 case 'D': {
628 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
629 break;
630 }
631 case 'L': {
632 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
633 break;
634 }
635 case 'O': {
636 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
637 break;
638 }
639 case 'T': {
640 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
641 break;
642 }
643 }
644 colIndex++;
645 }
646 return true;
647}
648
649void RCsvDS::SetNSlots(unsigned int nSlots)
650{
651 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
652
653 fNSlots = nSlots;
654
655 const auto nColumns = fHeaders.size();
656 // Initialize the entire set of addresses
657 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
658
659 // Initialize the per event data holders
660 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
661 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
662 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
663 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
664}
665
666std::string RCsvDS::GetLabel()
667{
668 return "RCsv";
669}
670
671RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
672{
673 ROOT::RDataFrame rdf(std::make_unique<RCsvDS>(fileName, options));
674 return rdf;
675}
676
677RDataFrame FromCSV(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
678 std::unordered_map<std::string, char> &&colTypes)
679{
681 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
682 return rdf;
683}
684
685} // namespace RDF
686
687} // namespace ROOT
#define b(i)
Definition RSha256.hxx:100
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:69
unsigned long long ULong64_t
Definition RtypesCore.h:70
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:229
#define N
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Definition TROOT.cxx:597
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
std::int64_t fDataLineNumber
Definition RCsvDS.hxx:73
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition RCsvDS.cxx:608
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:185
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:597
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:86
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:330
std::uint64_t fDataPos
Definition RCsvDS.hxx:72
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:66
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:362
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:69
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:228
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:84
unsigned int fNSlots
Definition RCsvDS.hxx:76
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:534
bool Readln(std::string &line)
Definition RCsvDS.cxx:114
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:80
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:78
std::int64_t fMaxLineNumber
Definition RCsvDS.hxx:75
ULong64_t fProcessedLines
Definition RCsvDS.hxx:79
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:613
std::int64_t fLineNumber
Definition RCsvDS.hxx:74
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:297
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:81
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:87
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:69
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:85
ROptions fOptions
Definition RCsvDS.hxx:71
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:82
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:69
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:69
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:278
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:69
void RewindToData()
Definition RCsvDS.cxx:159
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:351
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:165
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:77
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:69
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:88
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:91
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:83
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Definition TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition TRegexp.cxx:213
TLine * line
std::ostream & Info()
Definition hadd.cxx:163
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:671
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:38
bool fHeaders
The first line describes the columns.
Definition RCsvDS.hxx:42
bool fRightTrim
Trailing whitespaces are removed.
Definition RCsvDS.hxx:45
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
Definition RCsvDS.hxx:47
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:57
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
Definition RCsvDS.hxx:48
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:60
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
Definition RCsvDS.hxx:46
char fDelimiter
Column delimiter character.
Definition RCsvDS.hxx:43
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
Definition RCsvDS.hxx:53
bool fLeftTrim
Leading whitespaces are removed.
Definition RCsvDS.hxx:44
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.
Definition RCsvDS.hxx:49