98std::string RCsvDS::AsString()
100 return "CSV data source";
104const TRegexp RCsvDS::fgIntRegex(
"^[-+]?[0-9]+$");
105const TRegexp RCsvDS::fgDoubleRegex1(
"^[-+]?[0-9]+\\.[0-9]*$");
106const TRegexp RCsvDS::fgDoubleRegex2(
"^[-+]?[0-9]*\\.[0-9]+$");
107const TRegexp RCsvDS::fgDoubleRegex3(
"^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
108const TRegexp RCsvDS::fgTrueRegex(
"^true$");
109const TRegexp RCsvDS::fgFalseRegex(
"^false$");
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{
'O',
"bool"}, {
'D',
"double"}, {
'L',
"Long64_t"}, {
'T',
"std::string"}});
114bool RCsvDS::Readln(std::string &
line)
116 auto fnLeftTrim = [](std::string &s) {
117 const auto N = s.size();
118 std::size_t idxStart = 0;
119 for (; idxStart <
N && std::isspace(s[idxStart]); ++idxStart)
122 s.erase(0, idxStart);
125 auto fnRightTrim = [](std::string &s) {
127 for (
auto itr = s.rbegin(); itr != s.rend() && std::isspace(*itr); ++itr, ++nTrim)
130 s.resize(s.size() - nTrim);
147 if (idxComment != std::string::npos)
148 line.resize(idxComment);
159void RCsvDS::RewindToData()
165void RCsvDS::FillHeaders(
const std::string &
line)
174 for (
auto &col : columns) {
185 for (
auto &col : columns) {
190 record.emplace_back(
new double((col !=
"nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
195 record.emplace_back(
new Long64_t(std::stoll(col)));
198 record.emplace_back(
new Long64_t(0));
204 record.emplace_back(
b);
206 std::istringstream(col) >> std::boolalpha >> *
b;
214 record.emplace_back(
new std::string(col));
222void RCsvDS::GenerateHeaders(
size_t size)
230 for (
size_t i = 0u; i <
size; ++i) {
231 fHeaders.push_back(
"Col" + std::to_string(i));
235std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName,
const std::type_info &ti)
237 const auto colType =
GetType(colName);
239 if ((colType ==
'D' &&
typeid(
double) != ti) || (colType ==
'L' &&
typeid(
Long64_t) != ti) ||
240 (colType ==
'T' &&
typeid(std::string) != ti) || (colType ==
'O' &&
typeid(
bool) != ti)) {
241 std::string err =
"The type selected for column \"";
243 err +=
"\" does not correspond to column type, which is ";
245 throw std::runtime_error(err);
249 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
250 std::vector<void *> ret(
fNSlots);
253 if (ti ==
typeid(
double)) {
255 }
else if (ti ==
typeid(
Long64_t)) {
257 }
else if (ti ==
typeid(std::string)) {
267void RCsvDS::ValidateColTypes(std::vector<std::string> &columns)
const
271 std::string msg =
"There is no column with name \"" + col.first +
"\".";
273 msg +=
"\nSince the input csv file does not contain headers, valid column names";
274 msg +=
" are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) +
"\"].";
276 throw std::runtime_error(msg);
278 if (std::string(
"ODLT").find(col.second) == std::string::npos) {
279 std::string msg =
"Type alias '" + std::string(1, col.second) +
"' is not supported.\n";
280 msg +=
"Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
281 throw std::runtime_error(msg);
286void RCsvDS::InferColTypes(std::vector<std::string> &columns)
288 const auto second_line =
fCsvFile->GetFilePos();
290 for (
auto i = 0u; i < columns.size(); ++i) {
292 if (userSpecifiedType !=
fColTypes.end()) {
298 for (
auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] ==
"nan"; ++extraRowsRead) {
303 if (temp_columns[i] !=
"nan")
304 columns[i] = temp_columns[i];
309 if (columns[i] ==
"nan") {
319void RCsvDS::InferType(
const std::string &col,
unsigned int idxCol)
340std::vector<std::string> RCsvDS::ParseColumns(
const std::string &
line)
342 std::vector<std::string> columns;
344 for (
size_t i = 0; i <
line.size(); ++i) {
351size_t RCsvDS::ParseValue(
const std::string &
line, std::vector<std::string> &columns,
size_t i)
355 const size_t prevPos = i;
357 for (; i <
line.size(); ++i) {
360 }
else if (
line[i] ==
'"') {
362 if (
line[i + 1] !=
'"') {
372 if (prevPos == i || val ==
"nan" || val ==
"NaN")
373 columns.emplace_back(
"nan");
375 columns.emplace_back(std::move(val));
380 columns.emplace_back(
"nan");
385void RCsvDS::Construct()
391 std::int64_t nLines = 0;
396 std::string msg =
"Error: too many footer lines to skip in CSV file ";
398 throw std::runtime_error(msg);
415 std::string msg =
"Error reading headers of CSV file ";
417 throw std::runtime_error(msg);
440 std::string msg =
"Could not infer column types of CSV file ";
442 throw std::runtime_error(msg);
450RCsvDS::RCsvDS(std::string_view fileName,
const ROptions &options)
451 : fOptions(options), fCsvFile(
ROOT::Internal::
RRawFile::Create(fileName))
468RCsvDS::RCsvDS(std::string_view fileName,
bool readHeaders,
char delimiter,
Long64_t linesChunkSize,
469 std::unordered_map<std::string, char> &&colTypes)
470 : fCsvFile(
ROOT::Internal::
RRawFile::Create(fileName)), fColTypes(std::move(colTypes))
479void RCsvDS::FreeRecords()
482 for (
size_t i = 0; i < record.size(); ++i) {
487 delete static_cast<double *
>(
p);
495 delete static_cast<bool *
>(
p);
499 delete static_cast<std::string *
>(
p);
515void RCsvDS::Finalize()
523const std::vector<std::string> &RCsvDS::GetColumnNames()
const
528std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
542 std::string msg =
"";
545 msg +=
"Column \"" + col +
"\" of type " + colT +
" contains empty cell(s) or NaN(s).\n";
546 msg +=
"There is no `nan` equivalent for type " + colT +
", hence ";
547 msg += std::string(colT ==
"Long64_t" ?
"`0`" :
"`false`") +
" is stored.\n";
549 msg +=
"Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
550 Warning(
"RCsvDS",
"%s", msg.c_str());
555 Info(
"GetEntryRanges",
"Attempted to read entire CSV file into memory, %zu lines read",
fRecords.size());
557 Info(
"GetEntryRanges",
"Attempted to read chunk of %" PRId64
" lines of CSV file into memory, %zu lines read",
562 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
563 const auto nRecords =
fRecords.size();
567 const auto chunkSize = nRecords /
fNSlots;
575 entryRanges.emplace_back(start, end);
578 entryRanges.back().second += remainder;
589 std::string msg =
"The dataset does not have column ";
591 throw std::runtime_error(msg);
597std::string RCsvDS::GetTypeName(std::string_view colName)
const
602bool RCsvDS::HasColumn(std::string_view colName)
const
607bool RCsvDS::SetEntry(
unsigned int slot,
ULong64_t entry)
611 const auto recordPos = entry -
offset;
614 auto dataPtr =
fRecords[recordPos][colIndex];
638void RCsvDS::SetNSlots(
unsigned int nSlots)
640 assert(0U ==
fNSlots &&
"Setting the number of slots even if the number of slots is different from zero.");
644 const auto nColumns =
fHeaders.size();
655std::string RCsvDS::GetLabel()
667 std::unordered_map<std::string, char> &&colTypes)
670 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
unsigned long long ULong64_t
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
The RRawFile provides read-only access to local and remote files.
std::int64_t fDataLineNumber
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
void FillRecord(const std::string &, Record_t &)
ColType_t GetType(std::string_view colName) const
std::vector< std::vector< double > > fDoubleEvtValues
void InferType(const std::string &, unsigned int)
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
static const TRegexp fgTrueRegex
void GenerateHeaders(size_t)
std::vector< std::vector< void * > > fColAddresses
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
bool Readln(std::string &line)
std::vector< std::string > fHeaders
ULong64_t fEntryRangesRequested
std::int64_t fMaxLineNumber
ULong64_t fProcessedLines
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void InferColTypes(std::vector< std::string > &)
std::unordered_map< std::string, ColType_t > fColTypes
std::vector< std::vector< Long64_t > > fLong64EvtValues
static const TRegexp fgDoubleRegex2
std::vector< Record_t > fRecords
std::set< std::string > fColContainingEmpty
static const TRegexp fgFalseRegex
static const TRegexp fgDoubleRegex3
void ValidateColTypes(std::vector< std::string > &) const
static const TRegexp fgIntRegex
std::vector< std::string > ParseColumns(const std::string &)
void FillHeaders(const std::string &)
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
static const TRegexp fgDoubleRegex1
std::vector< std::vector< std::string > > fStringEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
std::list< ColType_t > fColTypesList
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Options that control how the CSV file is parsed.
bool fHeaders
The first line describes the columns.
bool fRightTrim
Trailing whitespaces are removed.
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
std::vector< std::string > fColumnNames
Impose column names.
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
char fDelimiter
Column delimiter character.
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
bool fLeftTrim
Leading whitespaces are removed.
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.