98std::string RCsvDS::AsString()
100 return "CSV data source";
// Patterns used to classify the textual content of a CSV cell.
// Each pattern is anchored with ^ and $, so it has to match the whole cell.

// Integer: optional sign followed by one or more decimal digits.
104const TRegexp RCsvDS::fgIntRegex(
"^[-+]?[0-9]+$");
// Floating point, form 1: optional sign, at least one digit, a dot, then optional fractional digits ("1.", "1.5").
105const TRegexp RCsvDS::fgDoubleRegex1(
"^[-+]?[0-9]+\\.[0-9]*$");
// Floating point, form 2: optional sign, optional integer digits, a dot, then at least one fractional digit (".5", "1.5").
106const TRegexp RCsvDS::fgDoubleRegex2(
"^[-+]?[0-9]*\\.[0-9]+$");
// Floating point, form 3: form 2 followed by an exponent marker (one of e, E, d, D, q, Q),
// an optional sign and at least one exponent digit. The dot is mandatory in this pattern.
107const TRegexp RCsvDS::fgDoubleRegex3(
"^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
// Boolean literals: exactly "true" / exactly "false" (lower case only).
108const TRegexp RCsvDS::fgTrueRegex(
"^true$");
109const TRegexp RCsvDS::fgFalseRegex(
"^false$");
// Lookup table from the single-character column type code to the name of the
// C++ type it stands for:
//   'O' -> "bool", 'D' -> "double", 'L' -> "Long64_t", 'T' -> "std::string".
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{
'O',
"bool"}, {
'D',
"double"}, {
'L',
"Long64_t"}, {
'T',
"std::string"}});
114bool RCsvDS::Readln(std::string &
line)
116 auto fnLeftTrim = [](std::string &s) {
117 const auto N = s.size();
118 std::size_t idxStart = 0;
119 for (; idxStart <
N && std::isspace(s[idxStart]); ++idxStart)
122 s.erase(0, idxStart);
125 auto fnRightTrim = [](std::string &s) {
127 for (
auto itr = s.rbegin(); itr != s.rend() && std::isspace(*itr); ++itr, ++nTrim)
130 s.resize(s.size() - nTrim);
147 if (idxComment != std::string::npos)
148 line.resize(idxComment);
159void RCsvDS::RewindToData()
165void RCsvDS::FillHeaders(
const std::string &
line)
172 " column names for a CSV file containing " + std::to_string(columns.size()) +
" columns!";
173 throw std::runtime_error(msg);
180 for (
auto &col : columns) {
191 for (
auto &col : columns) {
196 record.emplace_back(
new double((col !=
"nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
201 record.emplace_back(
new Long64_t(std::stoll(col)));
204 record.emplace_back(
new Long64_t(0));
210 record.emplace_back(
b);
212 std::istringstream(col) >> std::boolalpha >> *
b;
220 record.emplace_back(
new std::string(col));
228void RCsvDS::GenerateHeaders(
size_t size)
233 " column names for a CSV file containing " + std::to_string(
size) +
" columns!";
234 throw std::runtime_error(msg);
241 for (
size_t i = 0u; i <
size; ++i) {
242 fHeaders.push_back(
"Col" + std::to_string(i));
246std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName,
const std::type_info &ti)
248 const auto colType =
GetType(colName);
250 if ((colType ==
'D' &&
typeid(
double) != ti) || (colType ==
'L' &&
typeid(
Long64_t) != ti) ||
251 (colType ==
'T' &&
typeid(std::string) != ti) || (colType ==
'O' &&
typeid(
bool) != ti)) {
252 std::string err =
"The type selected for column \"";
254 err +=
"\" does not correspond to column type, which is ";
256 throw std::runtime_error(err);
260 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
261 std::vector<void *> ret(
fNSlots);
264 if (ti ==
typeid(
double)) {
266 }
else if (ti ==
typeid(
Long64_t)) {
268 }
else if (ti ==
typeid(std::string)) {
278void RCsvDS::ValidateColTypes(std::vector<std::string> &columns)
const
282 std::string msg =
"There is no column with name \"" + col.first +
"\".";
284 msg +=
"\nSince the input csv file does not contain headers, valid column names";
285 msg +=
" are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) +
"\"].";
287 throw std::runtime_error(msg);
289 if (std::string(
"ODLT").find(col.second) == std::string::npos) {
290 std::string msg =
"Type alias '" + std::string(1, col.second) +
"' is not supported.\n";
291 msg +=
"Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
292 throw std::runtime_error(msg);
297void RCsvDS::InferColTypes(std::vector<std::string> &columns)
299 const auto second_line =
fCsvFile->GetFilePos();
301 for (
auto i = 0u; i < columns.size(); ++i) {
303 if (userSpecifiedType !=
fColTypes.end()) {
309 for (
auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] ==
"nan"; ++extraRowsRead) {
314 if (temp_columns[i] !=
"nan")
315 columns[i] = temp_columns[i];
320 if (columns[i] ==
"nan") {
330void RCsvDS::InferType(
const std::string &col,
unsigned int idxCol)
351std::vector<std::string> RCsvDS::ParseColumns(
const std::string &
line)
353 std::vector<std::string> columns;
355 for (
size_t i = 0; i <
line.size(); ++i) {
362size_t RCsvDS::ParseValue(
const std::string &
line, std::vector<std::string> &columns,
size_t i)
366 const size_t prevPos = i;
368 for (; i <
line.size(); ++i) {
371 }
else if (
line[i] ==
'"') {
373 if (
line[i + 1] !=
'"') {
383 if (prevPos == i || val ==
"nan" || val ==
"NaN")
384 columns.emplace_back(
"nan");
386 columns.emplace_back(std::move(val));
391 columns.emplace_back(
"nan");
396void RCsvDS::Construct()
402 std::int64_t nLines = 0;
407 std::string msg =
"Error: too many footer lines to skip in CSV file ";
409 throw std::runtime_error(msg);
426 std::string msg =
"Error reading headers of CSV file ";
428 throw std::runtime_error(msg);
451 std::string msg =
"Could not infer column types of CSV file ";
453 throw std::runtime_error(msg);
461RCsvDS::RCsvDS(std::string_view fileName,
const ROptions &options)
462 : fOptions(options), fCsvFile(
ROOT::Internal::
RRawFile::Create(fileName))
479RCsvDS::RCsvDS(std::string_view fileName,
bool readHeaders,
char delimiter,
Long64_t linesChunkSize,
480 std::unordered_map<std::string, char> &&colTypes)
481 : fCsvFile(
ROOT::Internal::
RRawFile::Create(fileName)), fColTypes(std::move(colTypes))
490void RCsvDS::FreeRecords()
493 for (
size_t i = 0; i < record.size(); ++i) {
498 delete static_cast<double *
>(
p);
506 delete static_cast<bool *
>(
p);
510 delete static_cast<std::string *
>(
p);
526void RCsvDS::Finalize()
534const std::vector<std::string> &RCsvDS::GetColumnNames()
const
539std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
553 std::string msg =
"";
556 msg +=
"Column \"" + col +
"\" of type " + colT +
" contains empty cell(s) or NaN(s).\n";
557 msg +=
"There is no `nan` equivalent for type " + colT +
", hence ";
558 msg += std::string(colT ==
"Long64_t" ?
"`0`" :
"`false`") +
" is stored.\n";
560 msg +=
"Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
561 Warning(
"RCsvDS",
"%s", msg.c_str());
566 Info(
"GetEntryRanges",
"Attempted to read entire CSV file into memory, %zu lines read",
fRecords.size());
568 Info(
"GetEntryRanges",
"Attempted to read chunk of %" PRId64
" lines of CSV file into memory, %zu lines read",
573 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
574 const auto nRecords =
fRecords.size();
578 const auto chunkSize = nRecords /
fNSlots;
586 entryRanges.emplace_back(start, end);
589 entryRanges.back().second += remainder;
600 std::string msg =
"The dataset does not have column ";
602 throw std::runtime_error(msg);
608std::string RCsvDS::GetTypeName(std::string_view colName)
const
613bool RCsvDS::HasColumn(std::string_view colName)
const
618bool RCsvDS::SetEntry(
unsigned int slot,
ULong64_t entry)
622 const auto recordPos = entry -
offset;
625 auto dataPtr =
fRecords[recordPos][colIndex];
649void RCsvDS::SetNSlots(
unsigned int nSlots)
651 assert(0U ==
fNSlots &&
"Setting the number of slots even if the number of slots is different from zero.");
655 const auto nColumns =
fHeaders.size();
666std::string RCsvDS::GetLabel()
678 std::unordered_map<std::string, char> &&colTypes)
681 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
unsigned long long ULong64_t
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
The RRawFile provides read-only access to local and remote files.
std::int64_t fDataLineNumber
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g. "double" or "Long64_t".
void FillRecord(const std::string &, Record_t &)
ColType_t GetType(std::string_view colName) const
std::vector< std::vector< double > > fDoubleEvtValues
void InferType(const std::string &, unsigned int)
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
static const TRegexp fgTrueRegex
void GenerateHeaders(size_t)
std::vector< std::vector< void * > > fColAddresses
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
bool Readln(std::string &line)
std::vector< std::string > fHeaders
ULong64_t fEntryRangesRequested
std::int64_t fMaxLineNumber
ULong64_t fProcessedLines
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void InferColTypes(std::vector< std::string > &)
std::unordered_map< std::string, ColType_t > fColTypes
std::vector< std::vector< Long64_t > > fLong64EvtValues
static const TRegexp fgDoubleRegex2
std::vector< Record_t > fRecords
std::set< std::string > fColContainingEmpty
static const TRegexp fgFalseRegex
static const TRegexp fgDoubleRegex3
void ValidateColTypes(std::vector< std::string > &) const
static const TRegexp fgIntRegex
std::vector< std::string > ParseColumns(const std::string &)
void FillHeaders(const std::string &)
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
static const TRegexp fgDoubleRegex1
std::vector< std::vector< std::string > > fStringEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
std::list< ColType_t > fColTypesList
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, CSV and other data formats.
Regular expression class.
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tbb::task_arena without forward declaring tbb::interface7.
TSeq< unsigned int > TSeqU
Options that control how the CSV file is parsed.
bool fHeaders
The first line describes the columns.
bool fRightTrim
Trailing whitespaces are removed.
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
std::vector< std::string > fColumnNames
Impose column names.
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name, values being type alias ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string).
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
char fDelimiter
Column delimiter character.
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
bool fLeftTrim
Leading whitespaces are removed.
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.