Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleInspector.cxx
Go to the documentation of this file.
1/// \file RNTupleInspector.cxx
2/// \ingroup NTuple ROOT7
3/// \author Florine de Geus <florine.willemijn.de.geus@cern.ch>
4/// \date 2023-01-09
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2023, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
17#include <ROOT/RError.hxx>
21#include <ROOT/RError.hxx>
22
23#include <TFile.h>
24
25#include <algorithm>
26#include <cstring>
27#include <deque>
28#include <exception>
29#include <iomanip>
30#include <iostream>
31
33
35 std::unique_ptr<ROOT::Experimental::Internal::RPageSource> pageSource)
36 : fPageSource(std::move(pageSource))
37{
// The inspector takes ownership of the page source (moved into fPageSource above) and attaches it.
38 fPageSource->Attach();
// Obtain the page source's shared descriptor guard.
// NOTE(review): listing lines 40, 42 and 43 are absent from this extract; presumably they use the guard to
// copy the descriptor and then collect the column/field information -- confirm against the upstream file.
39 auto descriptorGuard = fPageSource->GetSharedDescriptorGuard();
41
44}
45
46// NOTE: outlined to avoid including RPageStorage in the header
48
50{
// Gathers column-level and RNTuple-level storage information: resets the RNTuple-wide size totals, then
// visits every non-alias column and every cluster containing it.
51 fCompressedSize = 0;
52 fUncompressedSize = 0;
53
54 for (const auto &colDesc : fDescriptor.GetColumnIterable()) {
// Alias columns are skipped; only columns with their own physical ID are accounted for.
55 if (colDesc.IsAliasColumn())
56 continue;
57
58 auto colId = colDesc.GetPhysicalId();
59
60 // We generate the default memory representation for the given column type in order
61 // to report the size _in memory_ of column elements.
62 std::uint32_t elemSize = RColumnElementBase::Generate(colDesc.GetType())->GetSize();
63 std::uint64_t nElems = 0;
64 std::vector<std::uint64_t> compressedPageSizes{};
65
66 for (const auto &clusterDescriptor : fDescriptor.GetClusterIterable()) {
// Clusters that do not contain this column, or in which its range is suppressed, contribute nothing.
67 if (!clusterDescriptor.ContainsColumn(colId)) {
68 continue;
69 }
70
71 auto columnRange = clusterDescriptor.GetColumnRange(colId);
72 if (columnRange.IsSuppressed())
73 continue;
74
75 nElems += columnRange.GetNElements();
76
// The first compression setting encountered is remembered; any later range reporting a different
// setting raises an RException.
77 if (!fCompressionSettings && columnRange.GetCompressionSettings()) {
78 fCompressionSettings = *columnRange.GetCompressionSettings();
79 } else if (fCompressionSettings && columnRange.GetCompressionSettings() &&
80 (*fCompressionSettings != *columnRange.GetCompressionSettings())) {
81 // Note that currently all clusters and columns are compressed with the same settings and it is not yet
82 // possible to do otherwise. This means that currently, this exception should never be thrown, but this
83 // could change in the future.
84 throw RException(R__FAIL("compression setting mismatch between column ranges (" +
85 std::to_string(*fCompressionSettings) + " vs " +
86 std::to_string(*columnRange.GetCompressionSettings()) +
87 ") for column with physical ID " + std::to_string(colId)));
88 }
89
90 const auto &pageRange = clusterDescriptor.GetPageRange(colId);
91
// Per page: keep the on-storage byte count for this column and add the in-memory size
// (number of elements times element size) to the RNTuple-wide uncompressed total.
92 for (const auto &page : pageRange.GetPageInfos()) {
93 compressedPageSizes.emplace_back(page.GetLocator().GetNBytesOnStorage());
94 fUncompressedSize += page.GetNElements() * elemSize;
95 }
96 }
97
// Add this column's summed on-storage page sizes to the RNTuple-wide compressed total.
98 fCompressedSize +=
99 std::accumulate(compressedPageSizes.begin(), compressedPageSizes.end(), static_cast<std::uint64_t>(0));
// NOTE(review): listing line 100 is absent from this extract; presumably it stores the per-column result
// (nElems is otherwise never read) into fColumnInfo -- confirm against the upstream file.
101 }
102}
103
106{
// Recursively gathers field-level information: the sizes of a field are the sum over its own columns
// plus the sums of all of its subfields.
107 std::uint64_t compressedSize = 0;
108 std::uint64_t uncompressedSize = 0;
109
// Columns attached directly to this field; their sizes come from the already collected column inspectors.
110 for (const auto &colDescriptor : fDescriptor.GetColumnIterable(fieldId)) {
111 auto colInfo = GetColumnInspector(colDescriptor.GetPhysicalId());
112 compressedSize += colInfo.GetCompressedSize();
113 uncompressedSize += colInfo.GetUncompressedSize();
114 }
115
// Recurse into each subfield; the recursive call also registers the subfield in fFieldTreeInfo.
116 for (const auto &subFieldDescriptor : fDescriptor.GetFieldIterable(fieldId)) {
117 auto subFieldId = subFieldDescriptor.GetId();
118
119 auto subFieldInfo = CollectFieldTreeInfo(subFieldId);
120
121 compressedSize += subFieldInfo.GetCompressedSize();
122 uncompressedSize += subFieldInfo.GetUncompressedSize();
123 }
124
// Record the aggregated result for this field and return a copy of it to the caller.
125 auto fieldInfo = RFieldTreeInspector(fDescriptor.GetFieldDescriptor(fieldId), compressedSize, uncompressedSize);
126 fFieldTreeInfo.emplace(fieldId, fieldInfo);
127 return fieldInfo;
128}
129
/// Collects the physical IDs of the columns that make up the given field, including its subfields.
/// NOTE(review): the signature line (listing line 131) is absent from this extract.
130std::vector<ROOT::DescriptorId_t>
132{
133 std::vector<ROOT::DescriptorId_t> colIds;
// Breadth-first walk over the field tree, starting at the requested field.
134 std::deque<ROOT::DescriptorId_t> fieldIdQueue{fieldId};
135
136 while (!fieldIdQueue.empty()) {
137 auto currId = fieldIdQueue.front();
138 fieldIdQueue.pop_front();
139
// Alias columns are left out of the result.
140 for (const auto &col : fDescriptor.GetColumnIterable(currId)) {
141 if (col.IsAliasColumn()) {
142 continue;
143 }
144
145 colIds.emplace_back(col.GetPhysicalId());
146 }
147
// Queue the direct subfields so that their columns are visited as well.
148 for (const auto &fld : fDescriptor.GetFieldIterable(currId)) {
149 fieldIdQueue.push_back(fld.GetId());
150 }
151 }
152
153 return colIds;
154}
155
156std::unique_ptr<ROOT::Experimental::RNTupleInspector>
162
/// Factory returning a new RNTupleInspector that owns `pageSource`.
/// NOTE(review): the signature (listing line 164) and the statement creating `pageSource` (listing line 166)
/// are absent from this extract -- confirm against the upstream file.
163std::unique_ptr<ROOT::Experimental::RNTupleInspector>
165{
// Built with an explicit `new` wrapped in unique_ptr rather than std::make_unique
// (presumably because the constructor is not publicly accessible -- confirm in the header).
167 return std::unique_ptr<RNTupleInspector>(new RNTupleInspector(std::move(pageSource)));
168}
169
171{
// Returns a human-readable description of the compression settings, or "unknown" when none were recorded.
172 if (!fCompressionSettings)
173 return "unknown";
174
// The setting is decoded as algorithm * 100 + level.
175 int algorithm = *fCompressionSettings / 100;
176 int level = *fCompressionSettings - (algorithm * 100);
177
// NOTE(review): listing line 178 (the start of the return expression, presumably the algorithm name)
// is absent from this extract -- confirm against the upstream file.
179 " (level " + std::to_string(level) + ")";
180}
181
182//------------------------------------------------------------------------------
183
186{
// Returns the storage information collected for the column with the given physical ID.
// Throws RException when no information is stored for that ID.
// The lookup is done directly in fColumnInfo instead of comparing the ID with the number of physical
// columns: that comparison (`>`) let an ID equal to the column count through, so the subsequent `.at()`
// threw std::out_of_range instead of the RException used everywhere else in this file.
187 const auto columnIt = fColumnInfo.find(physicalColumnId);
188 if (columnIt == fColumnInfo.end())
189 throw RException(R__FAIL("No column with physical ID " + std::to_string(physicalColumnId) + " present"));
190
191 return columnIt->second;
192}
193
195{
// Counts the collected columns whose type equals `colType` by scanning all entries of fColumnInfo.
196 size_t typeCount = 0;
197
198 for (auto &[colId, colInfo] : fColumnInfo) {
199 if (colInfo.GetType() == colType) {
200 ++typeCount;
201 }
202 }
203
204 return typeCount;
205}
206
/// Returns the IDs (keys of fColumnInfo) of all collected columns whose type equals `colType`.
/// The result is empty when no column has that type.
/// NOTE(review): the signature line (listing line 208) is absent from this extract.
207const std::vector<ROOT::DescriptorId_t>
209{
210 std::vector<ROOT::DescriptorId_t> colIds;
211
212 for (const auto &[colId, colInfo] : fColumnInfo) {
213 if (colInfo.GetType() == colType)
214 colIds.emplace_back(colId);
215 }
216
217 return colIds;
218}
219
/// Returns every column type that occurs among the collected columns, each type exactly once.
220const std::vector<ROOT::ENTupleColumnType> ROOT::Experimental::RNTupleInspector::GetColumnTypes()
221{
// A std::set removes duplicates and yields the types in sorted order.
222 std::set<ROOT::ENTupleColumnType> colTypes;
223
224 for (const auto &[colId, colInfo] : fColumnInfo) {
225 colTypes.emplace(colInfo.GetType());
226 }
227
// Copy the unique types into a vector for the caller.
228 return std::vector(colTypes.begin(), colTypes.end());
229}
230
232{
// Prints storage information aggregated per column type to `output`, in the layout selected by `format`.
// Local accumulator holding the per-type totals.
233 struct ColumnTypeInfo {
234 std::uint64_t nElems = 0;
235 std::uint64_t compressedSize = 0;
236 std::uint64_t uncompressedSize = 0;
237 std::uint64_t nPages = 0;
238 std::uint32_t count = 0;
239
// Adds one column's figures to the totals and bumps the column count.
// NOTE(review): the declaration line of this member (listing line 240) is absent from this extract; it is
// used below as `+=` with a column inspector on the right-hand side.
241 {
242 this->count++;
243 this->nElems += colInfo.GetNElements();
244 this->compressedSize += colInfo.GetCompressedSize();
245 this->uncompressedSize += colInfo.GetUncompressedSize();
246 this->nPages += colInfo.GetNPages();
247 }
248
249 // Helper method to calculate compression factor
// Returns uncompressed / compressed size; 1.0 is returned when the compressed size is zero to avoid a
// division by zero.
250 float GetCompressionFactor() const
251 {
252 if (compressedSize == 0)
253 return 1.0;
254 return static_cast<float>(uncompressedSize) / static_cast<float>(compressedSize);
255 }
256 };
257
// Keyed by column type; std::map keeps the output ordered by type.
258 std::map<ENTupleColumnType, ColumnTypeInfo> colTypeInfo;
259
260 // Collect information for each column
261 for (const auto &[colId, colInfo] : fColumnInfo) {
262 colTypeInfo[colInfo.GetType()] += colInfo;
263 }
264
// NOTE(review): the two `case` label lines (listing lines 266 and 279) are absent from this extract.
// The first branch writes a fixed-width table, the second writes comma-separated values with a header row.
// Any other format value trips the assertion in the default branch.
265 switch (format) {
267 output << " column type | count | # elements | compressed bytes | uncompressed bytes | compression ratio | "
268 "# pages \n"
269 << "----------------|---------|-------------|------------------|--------------------|-------------------|-"
270 "------"
271 << std::endl;
272 for (const auto &[colType, typeInfo] : colTypeInfo)
273 output << std::setw(15) << RColumnElementBase::GetColumnTypeName(colType) << " |" << std::setw(8)
274 << typeInfo.count << " |" << std::setw(12) << typeInfo.nElems << " |" << std::setw(17)
275 << typeInfo.compressedSize << " |" << std::setw(19) << typeInfo.uncompressedSize << " |" << std::fixed
276 << std::setprecision(3) << std::setw(18) << typeInfo.GetCompressionFactor() << " |" << std::setw(6)
277 << typeInfo.nPages << " " << std::endl;
278 break;
280 output << "columnType,count,nElements,compressedSize,uncompressedSize,compressionFactor,nPages" << std::endl;
281 for (const auto &[colType, typeInfo] : colTypeInfo) {
282 output << RColumnElementBase::GetColumnTypeName(colType) << "," << typeInfo.count << "," << typeInfo.nElems
283 << "," << typeInfo.compressedSize << "," << typeInfo.uncompressedSize << "," << std::fixed
284 << std::setprecision(3) << typeInfo.GetCompressionFactor() << "," << typeInfo.nPages << std::endl;
285 }
286 break;
287 default: R__ASSERT(false && "Invalid print format");
288 }
289}
290
/// Builds a histogram with one entry per column type, showing the quantity selected by `histKind`
/// (column count, number of elements, compressed size or uncompressed size).
/// Empty `histName` / `histTitle` are replaced by defaults that depend on `histKind`.
/// Throws RException for a `histKind` outside the four handled values.
/// NOTE(review): the first signature line (listing line 292) is absent from this extract.
291std::unique_ptr<TH1D>
293 std::string_view histName, std::string_view histTitle)
294{
295 if (histName.empty()) {
296 switch (histKind) {
297 case ENTupleInspectorHist::kCount: histName = "colTypeCountHist"; break;
298 case ENTupleInspectorHist::kNElems: histName = "colTypeElemCountHist"; break;
299 case ENTupleInspectorHist::kCompressedSize: histName = "colTypeCompSizeHist"; break;
300 case ENTupleInspectorHist::kUncompressedSize: histName = "colTypeUncompSizeHist"; break;
301 default: throw RException(R__FAIL("Unknown histogram type"));
302 }
303 }
304
305 if (histTitle.empty()) {
306 switch (histKind) {
307 case ENTupleInspectorHist::kCount: histTitle = "Column count by type"; break;
308 case ENTupleInspectorHist::kNElems: histTitle = "Number of elements by column type"; break;
309 case ENTupleInspectorHist::kCompressedSize: histTitle = "Compressed size by column type"; break;
310 case ENTupleInspectorHist::kUncompressedSize: histTitle = "Uncompressed size by column type"; break;
311 default: throw RException(R__FAIL("Unknown histogram type"));
312 }
313 }
314
// string_view is not guaranteed to be null-terminated, hence the temporary std::string copies.
// The histogram starts with a single bin on [0, 1).
315 auto hist = std::make_unique<TH1D>(std::string(histName).c_str(), std::string(histTitle).c_str(), 1, 0, 1);
316
// `data` is assigned in every non-throwing branch of the switch below before it is read.
317 double data;
318 for (const auto &[colId, colInfo] : fColumnInfo) {
319 switch (histKind) {
320 case ENTupleInspectorHist::kCount: data = 1.; break;
321 case ENTupleInspectorHist::kNElems: data = colInfo.GetNElements(); break;
322 case ENTupleInspectorHist::kCompressedSize: data = colInfo.GetCompressedSize(); break;
323 case ENTupleInspectorHist::kUncompressedSize: data = colInfo.GetUncompressedSize(); break;
324 default: throw RException(R__FAIL("Unknown histogram type"));
325 }
326
// The bin is looked up by the column type's name.
// NOTE(review): presumably the label-based FindBin adds a bin for a label not seen before -- confirm in
// the TAxis documentation.
327 hist->AddBinContent(hist->GetXaxis()->FindBin(RColumnElementBase::GetColumnTypeName(colInfo.GetType())), data);
328 }
329
330 return hist;
331}
332
/// Single-column convenience overload: supplies a default title mentioning the column ID and forwards to
/// the overload taking a list of column IDs.
/// NOTE(review): the first signature line (listing line 334) is absent from this extract.
333std::unique_ptr<TH1D>
335 std::string histName, std::string histTitle, size_t nBins)
336{
337 if (histTitle.empty())
338 histTitle = "Page size distribution for column with ID " + std::to_string(physicalColumnId);
339
// An empty histName is passed through unchanged; the callee picks the default name.
340 return GetPageSizeDistribution({physicalColumnId}, histName, histTitle, nBins);
341}
342
344 std::string histName,
345 std::string histTitle, size_t nBins)
346{
// Page size distribution for all columns of a single column type, returned as one TH1D.
// Empty `histName` / `histTitle` are replaced by defaults derived from the column type name.
// NOTE(review): the first signature line (listing line 343) is absent from this extract.
347 if (histName.empty())
348 histName = "pageSizeHistCol" + std::string{RColumnElementBase::GetColumnTypeName(colType)};
349 if (histTitle.empty())
350 histTitle =
351 "Page size distribution for columns with type " + std::string{RColumnElementBase::GetColumnTypeName(colType)};
352
// Delegate to the per-type overload, which returns a stack holding at most one histogram here.
353 auto perTypeHist = GetPageSizeDistribution({colType}, histName, histTitle, nBins);
354
// No pages of this type: return an empty histogram. It honours the requested number of bins
// (previously a hard-coded 64, which silently ignored a non-default `nBins`).
355 if (perTypeHist->GetNhists() < 1)
356 return std::make_unique<TH1D>(histName.c_str(), histTitle.c_str(), nBins, 0, 0);
357
// Take over the first (only) histogram of the stack.
// NOTE(review): a null result from the dynamic_cast would be dereferenced below -- confirm it cannot occur.
358 auto hist = std::unique_ptr<TH1D>(dynamic_cast<TH1D *>(perTypeHist->GetHists()->First()));
359
360 hist->SetName(histName.c_str());
361 hist->SetTitle(histTitle.c_str());
362 hist->SetXTitle("Page size (B)");
363 hist->SetYTitle("N_{pages}");
364 return hist;
365}
366
/// Page size distribution over the compressed pages of all columns in `colIds`, as a single TH1D.
/// Empty `histName` / `histTitle` fall back to "pageSizeHist" / "Page size distribution".
/// When no pages are found the histogram is returned without bins being set or filled.
/// NOTE(review): the first signature line (listing line 368) is absent from this extract.
367std::unique_ptr<TH1D>
369 std::string histName, std::string histTitle, size_t nBins)
370{
371 auto hist = std::make_unique<TH1D>();
372
373 if (histName.empty())
374 histName = "pageSizeHist";
375 hist->SetName(histName.c_str());
376 if (histTitle.empty())
377 histTitle = "Page size distribution";
378 hist->SetTitle(histTitle.c_str());
379 hist->SetXTitle("Page size (B)");
380 hist->SetYTitle("N_{pages}");
381
// Concatenate the compressed page sizes of every requested column.
382 std::vector<std::uint64_t> pageSizes;
383 std::for_each(colIds.begin(), colIds.end(), [this, &pageSizes](const auto colId) {
384 auto colInfo = GetColumnInspector(colId);
385 pageSizes.insert(pageSizes.end(), colInfo.GetCompressedPageSizes().begin(),
386 colInfo.GetCompressedPageSizes().end());
387 });
388
389 if (!pageSizes.empty()) {
// Axis range: from the smallest page size to the largest plus one bin width, so that the maximum falls
// inside the last bin rather than on the exclusive upper edge.
390 auto histMinMax = std::minmax_element(pageSizes.begin(), pageSizes.end());
391 hist->SetBins(nBins, *histMinMax.first,
392 *histMinMax.second + ((*histMinMax.second - *histMinMax.first) / static_cast<double>(nBins)));
393
394 for (const auto pageSize : pageSizes) {
395 hist->Fill(pageSize);
396 }
397 }
398
399 return hist;
400}
401
/// Page size distribution per column type, returned as a THStack with one histogram per type.
/// An empty `colTypes` list selects every column type present (via GetColumnTypes()).
/// Empty `histName` / `histTitle` fall back to "pageSizeHist" / "Per-column type page size distribution".
402std::unique_ptr<THStack>
403ROOT::Experimental::RNTupleInspector::GetPageSizeDistribution(std::initializer_list<ROOT::ENTupleColumnType> colTypes,
404 std::string histName, std::string histTitle, size_t nBins)
405{
406 if (histName.empty())
407 histName = "pageSizeHist";
408 if (histTitle.empty())
409 histTitle = "Per-column type page size distribution";
410
411 auto stackedHist = std::make_unique<THStack>(histName.c_str(), histTitle.c_str());
412
// Common axis range over all selected types, so that the stacked histograms share their binning.
413 double histMin = std::numeric_limits<double>::max();
414 double histMax = 0;
415 std::map<ROOT::ENTupleColumnType, std::vector<std::uint64_t>> pageSizes;
416
417 std::vector<ROOT::ENTupleColumnType> colTypeVec = colTypes;
418 if (std::empty(colTypes)) {
419 colTypeVec = GetColumnTypes();
420 }
421
// First pass: gather the page sizes per type and track the global minimum and maximum.
// Types without columns or without pages are skipped.
422 for (const auto colType : colTypeVec) {
423 auto colIds = GetColumnsByType(colType);
424
425 if (colIds.empty())
426 continue;
427
428 std::vector<std::uint64_t> pageSizesForColType;
429 std::for_each(colIds.cbegin(), colIds.cend(), [this, &pageSizesForColType](const auto colId) {
430 auto colInfo = GetColumnInspector(colId);
431 pageSizesForColType.insert(pageSizesForColType.end(), colInfo.GetCompressedPageSizes().begin(),
432 colInfo.GetCompressedPageSizes().end());
433 });
434 if (pageSizesForColType.empty())
435 continue;
436
// NOTE(review): listing line 437 is absent from this extract; presumably it stores pageSizesForColType in
// `pageSizes` (nothing else visible here fills that map) -- confirm against the upstream file.
438
439 auto histMinMax = std::minmax_element(pageSizesForColType.begin(), pageSizesForColType.end());
440 histMin = std::min(histMin, static_cast<double>(*histMinMax.first));
441 histMax = std::max(histMax, static_cast<double>(*histMinMax.second));
442 }
443
// Second pass: one histogram per type, filled with that type's page sizes and added to the stack.
// NOTE(review): listing lines 446-447 (the leading TH1D constructor arguments) are absent from this extract.
// The visible upper edge is the global maximum plus one bin width.
444 for (const auto &[colType, pageSizesForColType] : pageSizes) {
445 auto hist = std::make_unique<TH1D>(
448 histMax + ((histMax - histMin) / static_cast<double>(nBins)));
449
450 for (const auto pageSize : pageSizesForColType) {
451 hist->Fill(pageSize);
452 }
453
// The raw pointer is released from the unique_ptr and handed to the stack.
454 stackedHist->Add(hist.release());
455 }
456
457 return stackedHist;
458}
459
460//------------------------------------------------------------------------------
461
464{
// Returns the storage information collected for the (sub)field with the given ID.
// Throws RException when the ID is not below the descriptor's field count.
465 if (fieldId >= fDescriptor.GetNFields()) {
466 throw RException(R__FAIL("No field with ID " + std::to_string(fieldId) + " present"));
467 }
468
// `.at()` throws std::out_of_range if no entry was collected for this ID.
469 return fFieldTreeInfo.at(fieldId);
470}
471
474{
// Name-based lookup: resolves the field name to an ID through the descriptor, then defers to the
// ID-based overload.
475 auto fieldId = fDescriptor.FindFieldId(fieldName);
476
// NOTE(review): listing line 477 (the condition opening this block, presumably a check for an invalid ID)
// is absent from this extract -- confirm against the upstream file.
478 throw RException(R__FAIL("Could not find field `" + std::string(fieldName) + "`"));
479 }
480
481 return GetFieldTreeInspector(fieldId);
482}
483
// Counts the collected fields whose type name fully matches `typeNamePattern` (std::regex_match).
// NOTE(review): the first signature line (listing line 484) is absent from this extract.
485 bool includeSubfields) const
486{
487 size_t typeCount = 0;
488
489 for (auto &[fldId, fldInfo] : fFieldTreeInfo) {
// Without includeSubfields, only fields whose parent is the zero field (top-level fields) are considered.
490 if (!includeSubfields && fldInfo.GetDescriptor().GetParentId() != fDescriptor.GetFieldZeroId()) {
491 continue;
492 }
493
494 if (std::regex_match(fldInfo.GetDescriptor().GetTypeName(), typeNamePattern)) {
495 typeCount++;
496 }
497 }
498
499 return typeCount;
500}
501
/// Returns the IDs of the collected fields whose name fully matches `fieldNamePattern` (std::regex_match).
/// NOTE(review): the signature line (listing line 503) is absent from this extract.
502const std::vector<ROOT::DescriptorId_t>
504{
505 std::vector<ROOT::DescriptorId_t> fieldIds;
506
507 for (auto &[fldId, fldInfo] : fFieldTreeInfo) {
508
// Without searchInSubfields, only fields whose parent is the zero field (top-level fields) are considered.
509 if (!searchInSubfields && fldInfo.GetDescriptor().GetParentId() != fDescriptor.GetFieldZeroId()) {
510 continue;
511 }
512
513 if (std::regex_match(fldInfo.GetDescriptor().GetFieldName(), fieldNamePattern)) {
514 fieldIds.emplace_back(fldId);
515 }
516 }
517
518 return fieldIds;
519}
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:299
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t format
std::string & operator+=(std::string &left, const TString &right)
Definition TString.h:486
The available trivial, native content types of a column.
static std::unique_ptr< RPageSourceFile > CreateFromAnchor(const RNTuple &anchor, const ROOT::RNTupleReadOptions &options=ROOT::RNTupleReadOptions())
Used from the RNTuple class to build a datasource if the anchor is already available.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const ROOT::RNTupleReadOptions &options=ROOT::RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
ROOT::DescriptorId_t GetFieldZeroId() const
Returns the logical parent of all top-level NTuple data fields.
Provides column-level storage information.
Inspect on-disk and storage-related information of an RNTuple.
std::vector< ROOT::DescriptorId_t > GetColumnsByFieldId(ROOT::DescriptorId_t fieldId) const
Get the columns that make up the given field, including its subfields.
const RFieldTreeInspector & GetFieldTreeInspector(ROOT::DescriptorId_t fieldId) const
Get storage information for a given (sub)field by ID.
std::unique_ptr< Internal::RPageSource > fPageSource
std::unique_ptr< TH1D > GetPageSizeDistribution(ROOT::DescriptorId_t physicalColumnId, std::string histName="", std::string histTitle="", size_t nBins=64)
Get a histogram containing the size distribution of the compressed pages for an individual column.
size_t GetColumnCountByType(ROOT::ENTupleColumnType colType) const
Get the number of columns of a given type present in the RNTuple.
const std::vector< ROOT::ENTupleColumnType > GetColumnTypes()
Get all column types present in the RNTuple being inspected.
RNTupleInspector(std::unique_ptr< Internal::RPageSource > pageSource)
size_t GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubfields=true) const
Get the number of fields of a given type or class present in the RNTuple.
const std::vector< ROOT::DescriptorId_t > GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubfields=true) const
Get the IDs of (sub-)fields whose name matches the given string.
std::string GetCompressionSettingsAsString() const
Get a string describing compression settings of the RNTuple being inspected.
RFieldTreeInspector CollectFieldTreeInfo(ROOT::DescriptorId_t fieldId)
Recursively gather field-level information.
void PrintColumnTypeInfo(ENTupleInspectorPrintFormat format=ENTupleInspectorPrintFormat::kTable, std::ostream &output=std::cout)
Print storage information per column type.
const RColumnInspector & GetColumnInspector(ROOT::DescriptorId_t physicalColumnId) const
Get storage information for a given column.
static std::unique_ptr< RNTupleInspector > Create(const RNTuple &sourceNTuple)
Create a new RNTupleInspector.
const std::vector< ROOT::DescriptorId_t > GetColumnsByType(ROOT::ENTupleColumnType colType)
Get the IDs of all columns with the given type.
void CollectColumnInfo()
Gather column-level and RNTuple-level information.
std::unique_ptr< TH1D > GetColumnTypeInfoAsHist(ENTupleInspectorHist histKind, std::string_view histName="", std::string_view histTitle="")
Get a histogram showing information for each column type present.
A column element encapsulates the translation between basic C++ types and their column representation...
static const char * GetColumnTypeName(ROOT::ENTupleColumnType type)
static std::unique_ptr< RColumnElementBase > Generate(ROOT::ENTupleColumnType type)
If CppT == void, use the default C++ type for the given column type.
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:69
const_iterator begin() const
const_iterator end() const
1-D histogram with a double per channel (see TH1 documentation)
Definition TH1.h:695
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
Definition TString.cxx:2378
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
EValues
Note: this is only temporarily a struct and will become a enum class hence the name convention used.
Definition Compression.h:88
static std::string AlgorithmToString(EAlgorithm::EValues algorithm)
static void output()