// Returns a fixed, human-readable description of this data source.
// NOTE(review): the brace lines of this definition are absent from this listing.
90std::string RCsvDS::AsString()
92 return "CSV data source";
// Anchored patterns for classifying the text of a CSV cell.
// NOTE(review): presumably consumed by InferType to choose a column type -- confirm against its body.

// Integer: optional sign followed by one or more digits.
96const TRegexp RCsvDS::fgIntRegex(
"^[-+]?[0-9]+$");
// Floating point, form 1: digits are required before the dot and optional after it.
97const TRegexp RCsvDS::fgDoubleRegex1(
"^[-+]?[0-9]+\\.[0-9]*$");
// Floating point, form 2: digits are optional before the dot and required after it.
98const TRegexp RCsvDS::fgDoubleRegex2(
"^[-+]?[0-9]*\\.[0-9]+$");
// Floating point with exponent: the mantissa must contain a dot followed by digits,
// then one exponent marker out of e, E, d, D, q, Q, an optional sign and digits.
// A value written without a dot (for instance 1e5) does not match this pattern.
99const TRegexp RCsvDS::fgDoubleRegex3(
"^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
// Boolean literals: the whole cell must be exactly the lowercase word true ...
100const TRegexp RCsvDS::fgTrueRegex(
"^true$");
// ... or exactly the lowercase word false.
101const TRegexp RCsvDS::fgFalseRegex(
"^false$");
// Lookup table from the one-character column type code to the name of the C++ type
// it stands for: b -> bool, d -> double, l -> Long64_t, s -> std::string.
103const std::map<RCsvDS::ColType_t, std::string>
104 RCsvDS::fgColTypeMap({{
'b',
"bool"}, {
'd',
"double"}, {
'l',
"Long64_t"}, {
's',
"std::string"}});
106void RCsvDS::FillHeaders(
const std::string &
line)
109 for (
auto &col : columns) {
116 std::istringstream lineStream(
line);
121 for (
auto &col : columns) {
126 record.emplace_back(
new double(std::stod(col)));
130 record.emplace_back(
new Long64_t(std::stoll(col)));
135 record.emplace_back(
b);
136 std::istringstream is(col);
137 is >> std::boolalpha >> *
b;
141 record.emplace_back(
new std::string(col));
// Appends `size` synthetic column names to fHeaders, in order: Col0, Col1, ..., Col<size-1>.
// Existing entries of fHeaders are left in place; the names are pushed after them.
// NOTE(review): presumably used when the CSV file has no header row -- confirm at the call site.
149void RCsvDS::GenerateHeaders(
size_t size)
151 for (
size_t i = 0; i < size; ++i) {
// The column name is the fixed prefix followed by the zero-based column index.
152 fHeaders.push_back(
"Col" + std::to_string(i));
156std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName,
const std::type_info &ti)
158 const auto colType =
GetType(colName);
160 if ((colType ==
'd' &&
typeid(
double) != ti) || (colType ==
'l' &&
typeid(
Long64_t) != ti) ||
161 (colType ==
's' &&
typeid(std::string) != ti) || (colType ==
'b' &&
typeid(
bool) != ti)) {
162 std::string err =
"The type selected for column \"";
164 err +=
"\" does not correspond to column type, which is ";
166 throw std::runtime_error(err);
170 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
171 std::vector<void *> ret(
fNSlots);
174 if (ti ==
typeid(
double)) {
176 }
else if (ti ==
typeid(
Long64_t)) {
178 }
else if (ti ==
typeid(std::string)) {
188void RCsvDS::InferColTypes(std::vector<std::string> &columns)
191 for (
auto &col : columns) {
197void RCsvDS::InferType(
const std::string &col,
unsigned int idxCol)
219std::vector<std::string> RCsvDS::ParseColumns(
const std::string &
line)
221 std::vector<std::string> columns;
223 for (
size_t i = 0; i <
line.size(); ++i) {
230size_t RCsvDS::ParseValue(
const std::string &
line, std::vector<std::string> &columns,
size_t i)
232 std::stringstream val;
235 for (; i <
line.size(); ++i) {
238 }
else if (
line[i] ==
'"') {
240 if (
line[i + 1] !=
'"') {
250 columns.emplace_back(val.str());
261RCsvDS::RCsvDS(std::string_view fileName,
bool readHeaders,
char delimiter,
Long64_t linesChunkSize)
262 : fReadHeaders(readHeaders),
264 fDelimiter(delimiter),
265 fLinesChunkSize(linesChunkSize)
274 std::string msg =
"Error reading headers of CSV file ";
276 throw std::runtime_error(msg);
284 }
while (
line.empty() && !eof);
299 std::string msg =
"Could not infer column types of CSV file ";
301 throw std::runtime_error(msg);
305void RCsvDS::FreeRecords()
308 for (
size_t i = 0; i < record.size(); ++i) {
313 delete static_cast<double *
>(p);
321 delete static_cast<bool *
>(p);
325 delete static_cast<std::string *
>(p);
341void RCsvDS::Finalise()
349const std::vector<std::string> &RCsvDS::GetColumnNames()
const
354std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
363 if (
line.empty())
continue;
371 Info(
"GetEntryRanges",
"Attempted to read entire CSV file into memory, %zu lines read",
fRecords.size());
373 Info(
"GetEntryRanges",
"Attempted to read chunk of %lld lines of CSV file into memory, %zu lines read",
fLinesChunkSize,
fRecords.size());
377 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
378 const auto nRecords =
fRecords.size();
382 const auto chunkSize = nRecords /
fNSlots;
390 entryRanges.emplace_back(start, end);
393 entryRanges.back().second += remainder;
404 std::string msg =
"The dataset does not have column ";
406 throw std::runtime_error(msg);
412std::string RCsvDS::GetTypeName(std::string_view colName)
const
417bool RCsvDS::HasColumn(std::string_view colName)
const
422bool RCsvDS::SetEntry(
unsigned int slot,
ULong64_t entry)
426 const auto recordPos = entry - offset;
429 auto dataPtr =
fRecords[recordPos][colIndex];
453void RCsvDS::SetNSlots(
unsigned int nSlots)
455 R__ASSERT(0U ==
fNSlots &&
"Setting the number of slots even if the number of slots is different from zero.");
459 const auto nColumns =
fHeaders.size();
470std::string RCsvDS::GetLabel()
477 ROOT::RDataFrame tdf(std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize));
unsigned long long ULong64_t
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
typedef void((*Func_t)())
The RRawFile provides read-only access to local and remote files.
std::map< std::string, ColType_t > fColTypes
void FillRecord(const std::string &, Record_t &)
ColType_t GetType(std::string_view colName) const
std::vector< std::vector< double > > fDoubleEvtValues
void InferType(const std::string &, unsigned int)
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
static const TRegexp fgTrueRegex
void GenerateHeaders(size_t)
std::vector< std::vector< void * > > fColAddresses
const Long64_t fLinesChunkSize
std::vector< std::string > fHeaders
ULong64_t fEntryRangesRequested
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
ULong64_t fProcessedLines
void InferColTypes(std::vector< std::string > &)
std::vector< std::vector< Long64_t > > fLong64EvtValues
static const std::map< ColType_t, std::string > fgColTypeMap
static const TRegexp fgDoubleRegex2
std::vector< Record_t > fRecords
static const TRegexp fgFalseRegex
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
static const TRegexp fgDoubleRegex3
static const TRegexp fgIntRegex
std::vector< std::string > ParseColumns(const std::string &)
void FillHeaders(const std::string &)
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
static const TRegexp fgDoubleRegex1
std::vector< std::vector< std::string > > fStringEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
std::list< ColType_t > fColTypesList
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Regular expression class.
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU