Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18A RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::FromCSV, which accepts five parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
244. Chunk size (optional, default is -1 to read all) - number of lines to read at a time
255. Column Types (optional, default is an empty map). A map with column names as keys and their type
26(expressed as a single character, see below) as values.
27
28The type of columns that do not appear in the map is inferred from the data.
29The supported types are:
30- Integer: stored as a 64-bit long long int; can be specified in the column types map with 'L'.
31- Floating point number: stored with double precision; specified with 'D'.
32- Boolean: matches the literals `true` and `false`; specified with 'O'.
33- String: stored as an std::string, matches anything that does not fall into any of the
34previous types; specified with 'T'.
35
36These are some formatting rules expected by the RCsvDS implementation:
37- All records must have the same number of fields, in the same order.
38- Any field may be quoted.
39~~~
40 "1997","Ford","E350"
41~~~
42- Fields with embedded delimiters (e.g. comma) must be quoted.
43~~~
44 1997,Ford,E350,"Super, luxurious truck"
45~~~
46- Fields with double-quote characters must be quoted, and each of the embedded
47double-quote characters must be represented by a pair of double-quote characters.
48~~~
49 1997,Ford,E350,"Super, ""luxurious"" truck"
50~~~
51- Fields with embedded line breaks are not supported, even when quoted.
52~~~
53 1997,Ford,E350,"Go get one now
54 they are going fast"
55~~~
56- Spaces are considered part of a field and are not ignored.
57~~~
58 1997, Ford , E350
59 not same as
60 1997,Ford,E350
61 but same as
62 1997, "Ford" , E350
63~~~
64- If a header row is provided, it must contain column names for each of the fields.
65~~~
66 Year,Make,Model
67 1997,Ford,E350
68 2000,Mercury,Cougar
69~~~
70
71By default (chunk size -1) RCsvDS reads the entire CSV file content into memory before
72RDataFrame starts processing it; with a positive chunk size, only that many lines are held in memory at a time.
73Therefore, before creating a CSV RDataFrame, it is important to check both how much memory is available and the size of the CSV file.
74
75RCsvDS can handle empty cells and also allows the usage of the special keywords "NaN" and "nan" to
76indicate `nan` values. If the column is of type double, these cells are stored internally as `nan`.
77Empty cells and explicit `nan`-s inside columns of type Long64_t/bool are stored as zeros.
78*/
79// clang-format on
80
81#include <ROOT/TSeq.hxx>
82#include <ROOT/RCsvDS.hxx>
83#include <ROOT/RRawFile.hxx>
84#include <TError.h>
85
86#include <algorithm>
87#include <cctype>
88#include <cinttypes>
89#include <iterator>
90#include <memory>
91#include <sstream>
92#include <string>
93
94namespace ROOT {
95
96namespace RDF {
97
98std::string RCsvDS::AsString()
99{
100 return "CSV data source";
101}
102
103// Regular expressions for type inference
104const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
105const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
106const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
107const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
108const TRegexp RCsvDS::fgTrueRegex("^true$");
109const TRegexp RCsvDS::fgFalseRegex("^false$");
110
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{'O', "bool"}, {'D', "double"}, {'L', "Long64_t"}, {'T', "std::string"}});
113
114bool RCsvDS::Readln(std::string &line)
115{
116 auto fnLeftTrim = [](std::string &s) {
117 const auto N = s.size();
118 std::size_t idxStart = 0;
119 for (; idxStart < N && std::isspace(s[idxStart]); ++idxStart)
120 ;
121 if (idxStart)
122 s.erase(0, idxStart);
123 };
124
125 auto fnRightTrim = [](std::string &s) {
126 size_t nTrim = 0;
127 for (auto itr = s.rbegin(); itr != s.rend() && std::isspace(*itr); ++itr, ++nTrim)
128 ;
129 if (nTrim)
130 s.resize(s.size() - nTrim);
131 };
132
133 while (true) {
134 const bool eof = !fCsvFile->Readln(line);
135 if (eof)
136 return false;
137 fLineNumber++;
138 if ((fMaxLineNumber >= 0) && (fLineNumber > fMaxLineNumber))
139 return false;
140
142 fnLeftTrim(line);
143 if (fOptions.fComment) {
144 auto idxComment = line.find(fOptions.fComment);
145 if (idxComment == 0)
146 continue;
147 if (idxComment != std::string::npos)
148 line.resize(idxComment);
149 }
151 fnRightTrim(line);
152 if (fOptions.fSkipBlankLines && line.empty())
153 continue;
154
155 return true;
156 }
157}
158
159void RCsvDS::RewindToData()
160{
161 fCsvFile->Seek(fDataPos);
163}
164
165void RCsvDS::FillHeaders(const std::string &line)
166{
167 if (!fOptions.fColumnNames.empty()) {
168 std::swap(fHeaders, fOptions.fColumnNames);
169 return;
170 }
171
172 auto columns = ParseColumns(line);
173 fHeaders.reserve(columns.size());
174 for (auto &col : columns) {
175 fHeaders.emplace_back(col);
176 }
177}
178
179void RCsvDS::FillRecord(const std::string &line, Record_t &record)
180{
181 auto i = 0U;
182
183 auto columns = ParseColumns(line);
184
185 for (auto &col : columns) {
186 auto colType = fColTypes[fHeaders[i]];
187
188 switch (colType) {
189 case 'D': {
190 record.emplace_back(new double((col != "nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
191 break;
192 }
193 case 'L': {
194 if (col != "nan") {
195 record.emplace_back(new Long64_t(std::stoll(col)));
196 } else {
197 fColContainingEmpty.insert(fHeaders[i]);
198 record.emplace_back(new Long64_t(0));
199 }
200 break;
201 }
202 case 'O': {
203 auto b = new bool();
204 record.emplace_back(b);
205 if (col != "nan") {
206 std::istringstream(col) >> std::boolalpha >> *b;
207 } else {
208 fColContainingEmpty.insert(fHeaders[i]);
209 *b = false;
210 }
211 break;
212 }
213 case 'T': {
214 record.emplace_back(new std::string(col));
215 break;
216 }
217 }
218 ++i;
219 }
220}
221
222void RCsvDS::GenerateHeaders(size_t size)
223{
224 if (!fOptions.fColumnNames.empty()) {
225 std::swap(fHeaders, fOptions.fColumnNames);
226 return;
227 }
228
229 fHeaders.reserve(size);
230 for (size_t i = 0u; i < size; ++i) {
231 fHeaders.push_back("Col" + std::to_string(i));
232 }
233}
234
235std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
236{
237 const auto colType = GetType(colName);
238
239 if ((colType == 'D' && typeid(double) != ti) || (colType == 'L' && typeid(Long64_t) != ti) ||
240 (colType == 'T' && typeid(std::string) != ti) || (colType == 'O' && typeid(bool) != ti)) {
241 std::string err = "The type selected for column \"";
242 err += colName;
243 err += "\" does not correspond to column type, which is ";
244 err += fgColTypeMap.at(colType);
245 throw std::runtime_error(err);
246 }
247
248 const auto &colNames = GetColumnNames();
249 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
250 std::vector<void *> ret(fNSlots);
251 for (auto slot : ROOT::TSeqU(fNSlots)) {
252 auto &val = fColAddresses[index][slot];
253 if (ti == typeid(double)) {
254 val = &fDoubleEvtValues[index][slot];
255 } else if (ti == typeid(Long64_t)) {
256 val = &fLong64EvtValues[index][slot];
257 } else if (ti == typeid(std::string)) {
258 val = &fStringEvtValues[index][slot];
259 } else {
260 val = &fBoolEvtValues[index][slot];
261 }
262 ret[slot] = &val;
263 }
264 return ret;
265}
266
267void RCsvDS::ValidateColTypes(std::vector<std::string> &columns) const
268{
269 for (const auto &col : fColTypes) {
270 if (!HasColumn(col.first)) {
271 std::string msg = "There is no column with name \"" + col.first + "\".";
272 if (!fOptions.fHeaders) {
273 msg += "\nSince the input csv file does not contain headers, valid column names";
274 msg += " are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) + "\"].";
275 }
276 throw std::runtime_error(msg);
277 }
278 if (std::string("ODLT").find(col.second) == std::string::npos) {
279 std::string msg = "Type alias '" + std::string(1, col.second) + "' is not supported.\n";
280 msg += "Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
281 throw std::runtime_error(msg);
282 }
283 }
284}
285
286void RCsvDS::InferColTypes(std::vector<std::string> &columns)
287{
288 const auto second_line = fCsvFile->GetFilePos();
289
290 for (auto i = 0u; i < columns.size(); ++i) {
291 const auto userSpecifiedType = fColTypes.find(fHeaders[i]);
292 if (userSpecifiedType != fColTypes.end()) {
293 fColTypesList.push_back(userSpecifiedType->second);
294 continue;
295 }
296
297 // read <=10 extra lines until a non-empty cell on this column is found, so that type is determined
298 for (auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] == "nan"; ++extraRowsRead) {
299 std::string line;
300 if (!Readln(line))
301 break; // EOF
302 const auto temp_columns = ParseColumns(line);
303 if (temp_columns[i] != "nan")
304 columns[i] = temp_columns[i]; // will break the loop in the next iteration
305 }
306 // reset the reading from the second line, because the first line is already loaded in `columns`
307 fCsvFile->Seek(second_line);
308
309 if (columns[i] == "nan") {
310 // could not find a non-empty value, default to double
311 fColTypes[fHeaders[i]] = 'D';
312 fColTypesList.push_back('D');
313 } else {
314 InferType(columns[i], i);
315 }
316 }
317}
318
319void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
320{
322 int dummy;
323
324 if (fgIntRegex.Index(col, &dummy) != -1) {
325 type = 'L'; // Long64_t
326 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 || fgDoubleRegex2.Index(col, &dummy) != -1 ||
327 fgDoubleRegex3.Index(col, &dummy) != -1) {
328 type = 'D'; // double
329 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
330 type = 'O'; // bool
331 } else { // everything else is a string
332 type = 'T'; // std::string
333 }
334 // TODO: Date
335
336 fColTypes[fHeaders[idxCol]] = type;
337 fColTypesList.push_back(type);
338}
339
340std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
341{
342 std::vector<std::string> columns;
343
344 for (size_t i = 0; i < line.size(); ++i) {
345 i = ParseValue(line, columns, i);
346 }
347
348 return columns;
349}
350
351size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
352{
353 std::string val;
354 bool quoted = false;
355 const size_t prevPos = i; // used to check if cell is empty
356
357 for (; i < line.size(); ++i) {
358 if (line[i] == fOptions.fDelimiter && !quoted) {
359 break;
360 } else if (line[i] == '"') {
361 // Keep just one quote for escaped quotes, none for the normal quotes
362 if (line[i + 1] != '"') {
363 quoted = !quoted;
364 } else {
365 val += line[++i];
366 }
367 } else {
368 val += line[i];
369 }
370 }
371
372 if (prevPos == i || val == "nan" || val == "NaN") // empty cell or explicit nan/NaN
373 columns.emplace_back("nan");
374 else
375 columns.emplace_back(std::move(val));
376
377 // if the line ends with the delimiter, we need to append the default column value
378 // for the _next_, last column that won't be parsed (because we are out of characters)
379 if (i == line.size() - 1 && line[i] == fOptions.fDelimiter)
380 columns.emplace_back("nan");
381
382 return i;
383}
384
385void RCsvDS::Construct()
386{
387 std::string line;
388
390 // It is possible to not read the file twice, but the implementation would be more complicated
391 std::int64_t nLines = 0;
392 std::string tmp;
393 while (fCsvFile->Readln(tmp))
394 nLines++;
395 if (nLines < fOptions.fSkipLastNLines) {
396 std::string msg = "Error: too many footer lines to skip in CSV file ";
397 msg += fCsvFile->GetUrl();
398 throw std::runtime_error(msg);
399 }
400 fCsvFile->Seek(0);
402 }
403
404 for (std::int64_t i = 0; i < fOptions.fSkipFirstNLines; ++i) {
405 if (!fCsvFile->Readln(line))
406 break;
407 fLineNumber++;
408 }
409
410 // Read the headers if present
411 if (fOptions.fHeaders) {
412 if (Readln(line)) {
414 } else {
415 std::string msg = "Error reading headers of CSV file ";
416 msg += fCsvFile->GetUrl();
417 throw std::runtime_error(msg);
418 }
419 }
420
421 fDataPos = fCsvFile->GetFilePos();
423 if (Readln(line)) {
424 auto columns = ParseColumns(line);
425
426 // Generate headers if not present
427 if (!fOptions.fHeaders) {
428 GenerateHeaders(columns.size());
429 }
430
431 // Ensure user is trying to set types only of existing columns
432 ValidateColTypes(columns);
433
434 // Infer types of columns with first record
435 InferColTypes(columns);
436
437 // rewind
438 RewindToData();
439 } else {
440 std::string msg = "Could not infer column types of CSV file ";
441 msg += fCsvFile->GetUrl();
442 throw std::runtime_error(msg);
443 }
444}
445
446////////////////////////////////////////////////////////////////////////
447/// Constructor to create a CSV RDataSource for RDataFrame.
448/// \param[in] fileName Path or URL of the CSV file.
449/// \param[in] options File parsing settings
450RCsvDS::RCsvDS(std::string_view fileName, const ROptions &options)
451 : fOptions(options), fCsvFile(ROOT::Internal::RRawFile::Create(fileName))
452{
453 std::swap(fColTypes, fOptions.fColumnTypes);
454
455 Construct();
456}
457
458////////////////////////////////////////////////////////////////////////
459/// Constructor to create a CSV RDataSource for RDataFrame.
460/// \param[in] fileName Path or URL of the CSV file.
461/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
462/// (default `true`).
463/// \param[in] delimiter Delimiter character (default ',').
464/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
465/// \param[in] colTypes Allows users to manually specify column types. Accepts an unordered map with keys being
466/// column names, values being type specifiers ('O' for boolean, 'D' for double, 'L' for
467/// Long64_t, 'T' for std::string)
468RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
469 std::unordered_map<std::string, char> &&colTypes)
470 : fCsvFile(ROOT::Internal::RRawFile::Create(fileName)), fColTypes(std::move(colTypes))
471{
472 fOptions.fHeaders = readHeaders;
473 fOptions.fDelimiter = delimiter;
474 fOptions.fLinesChunkSize = linesChunkSize;
475
476 Construct();
477}
478
479void RCsvDS::FreeRecords()
480{
481 for (auto &record : fRecords) {
482 for (size_t i = 0; i < record.size(); ++i) {
483 void *p = record[i];
484 const auto colType = fColTypes[fHeaders[i]];
485 switch (colType) {
486 case 'D': {
487 delete static_cast<double *>(p);
488 break;
489 }
490 case 'L': {
491 delete static_cast<Long64_t *>(p);
492 break;
493 }
494 case 'O': {
495 delete static_cast<bool *>(p);
496 break;
497 }
498 case 'T': {
499 delete static_cast<std::string *>(p);
500 break;
501 }
502 }
503 }
504 }
505 fRecords.clear();
506}
507
508////////////////////////////////////////////////////////////////////////
509/// Destructor.
510RCsvDS::~RCsvDS()
511{
512 FreeRecords();
513}
514
515void RCsvDS::Finalize()
516{
517 RewindToData();
518 fProcessedLines = 0ULL;
520 FreeRecords();
521}
522
523const std::vector<std::string> &RCsvDS::GetColumnNames() const
524{
525 return fHeaders;
526}
527
528std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
529{
530 // Read records and store them in memory
531 auto linesToRead = fOptions.fLinesChunkSize;
532 FreeRecords();
533
534 std::string line;
535 while ((-1LL == fOptions.fLinesChunkSize || 0 != linesToRead) && Readln(line)) {
536 fRecords.emplace_back();
537 FillRecord(line, fRecords.back());
538 --linesToRead;
539 }
540
541 if (!fColContainingEmpty.empty()) {
542 std::string msg = "";
543 for (const auto &col : fColContainingEmpty) {
544 const auto colT = GetTypeName(col);
545 msg += "Column \"" + col + "\" of type " + colT + " contains empty cell(s) or NaN(s).\n";
546 msg += "There is no `nan` equivalent for type " + colT + ", hence ";
547 msg += std::string(colT == "Long64_t" ? "`0`" : "`false`") + " is stored.\n";
548 }
549 msg += "Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
550 Warning("RCsvDS", "%s", msg.c_str());
551 }
552
553 if (gDebug > 0) {
554 if (fOptions.fLinesChunkSize == -1LL) {
555 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
556 } else {
557 Info("GetEntryRanges", "Attempted to read chunk of %" PRId64 " lines of CSV file into memory, %zu lines read",
559 }
560 }
561
562 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
563 const auto nRecords = fRecords.size();
564 if (0 == nRecords)
565 return entryRanges;
566
567 const auto chunkSize = nRecords / fNSlots;
568 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
569 auto start = fProcessedLines;
570 auto end = start;
571
572 for (auto i : ROOT::TSeqU(fNSlots)) {
573 start = end;
574 end += chunkSize;
575 entryRanges.emplace_back(start, end);
576 (void)i;
577 }
578 entryRanges.back().second += remainder;
579
580 fProcessedLines += nRecords;
582
583 return entryRanges;
584}
585
586RCsvDS::ColType_t RCsvDS::GetType(std::string_view colName) const
587{
588 if (!HasColumn(colName)) {
589 std::string msg = "The dataset does not have column ";
590 msg += colName;
591 throw std::runtime_error(msg);
592 }
593
594 return fColTypes.at(colName.data());
595}
596
597std::string RCsvDS::GetTypeName(std::string_view colName) const
598{
599 return fgColTypeMap.at(GetType(colName));
600}
601
602bool RCsvDS::HasColumn(std::string_view colName) const
603{
604 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
605}
606
607bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
608{
609 // Here we need to normalise the entry to the number of lines we already processed.
611 const auto recordPos = entry - offset;
612 int colIndex = 0;
613 for (auto &colType : fColTypesList) {
614 auto dataPtr = fRecords[recordPos][colIndex];
615 switch (colType) {
616 case 'D': {
617 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
618 break;
619 }
620 case 'L': {
621 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
622 break;
623 }
624 case 'O': {
625 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
626 break;
627 }
628 case 'T': {
629 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
630 break;
631 }
632 }
633 colIndex++;
634 }
635 return true;
636}
637
638void RCsvDS::SetNSlots(unsigned int nSlots)
639{
640 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
641
642 fNSlots = nSlots;
643
644 const auto nColumns = fHeaders.size();
645 // Initialize the entire set of addresses
646 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
647
648 // Initialize the per event data holders
649 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
650 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
651 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
652 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
653}
654
655std::string RCsvDS::GetLabel()
656{
657 return "RCsv";
658}
659
660RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
661{
662 ROOT::RDataFrame rdf(std::make_unique<RCsvDS>(fileName, options));
663 return rdf;
664}
665
666RDataFrame FromCSV(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
667 std::unordered_map<std::string, char> &&colTypes)
668{
670 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
671 return rdf;
672}
673
674} // ns RDF
675
676} // ns ROOT
#define b(i)
Definition RSha256.hxx:100
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:69
unsigned long long ULong64_t
Definition RtypesCore.h:70
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:218
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:229
#define N
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Definition TROOT.cxx:597
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
std::int64_t fDataLineNumber
Definition RCsvDS.hxx:72
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition RCsvDS.cxx:597
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:179
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:586
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:85
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:319
std::uint64_t fDataPos
Definition RCsvDS.hxx:71
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:65
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:351
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:68
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:222
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:83
unsigned int fNSlots
Definition RCsvDS.hxx:75
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:523
bool Readln(std::string &line)
Definition RCsvDS.cxx:114
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:79
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:77
std::int64_t fMaxLineNumber
Definition RCsvDS.hxx:74
ULong64_t fProcessedLines
Definition RCsvDS.hxx:78
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:602
std::int64_t fLineNumber
Definition RCsvDS.hxx:73
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:286
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:80
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:86
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:68
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:84
ROptions fOptions
Definition RCsvDS.hxx:70
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:81
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:68
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:68
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:267
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:68
void RewindToData()
Definition RCsvDS.cxx:159
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:340
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:165
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:76
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:68
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:87
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:90
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:82
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Definition TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition TRegexp.cxx:213
TLine * line
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:660
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:38
bool fHeaders
The first line describes the columns.
Definition RCsvDS.hxx:42
bool fRightTrim
Trailing whitespaces are removed.
Definition RCsvDS.hxx:45
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
Definition RCsvDS.hxx:47
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:56
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
Definition RCsvDS.hxx:48
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:59
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
Definition RCsvDS.hxx:46
char fDelimiter
Column delimiter character.
Definition RCsvDS.hxx:43
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
Definition RCsvDS.hxx:53
bool fLeftTrim
Leading whitespaces are removed.
Definition RCsvDS.hxx:44
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.
Definition RCsvDS.hxx:49