Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleInspector.cxx
Go to the documentation of this file.
1/// \file RNTupleInspector.cxx
2/// \ingroup NTuple ROOT7
3/// \author Florine de Geus <florine.willemijn.de.geus@cern.ch>
4/// \date 2023-01-09
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2023, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
17#include <ROOT/RError.hxx>
21#include <ROOT/RError.hxx>
22
23#include <TFile.h>
24
25#include <algorithm>
26#include <cstring>
27#include <deque>
28#include <exception>
29#include <iomanip>
30#include <iostream>
31
33 std::unique_ptr<ROOT::Experimental::Internal::RPageSource> pageSource)
34 : fPageSource(std::move(pageSource))
35{
36 fPageSource->Attach();
37 auto descriptorGuard = fPageSource->GetSharedDescriptorGuard();
38 fDescriptor = descriptorGuard->Clone();
39
41 CollectFieldTreeInfo(fDescriptor->GetFieldZeroId());
42}
43
44// NOTE: outlined to avoid including RPageStorage in the header
46
48{
49 fCompressedSize = 0;
50 fUncompressedSize = 0;
51
52 for (const auto &colDesc : fDescriptor->GetColumnIterable()) {
53 if (colDesc.IsAliasColumn())
54 continue;
55
56 auto colId = colDesc.GetPhysicalId();
57
58 // We generate the default memory representation for the given column type in order
59 // to report the size _in memory_ of column elements.
60 std::uint32_t elemSize = ROOT::Experimental::Internal::RColumnElementBase::Generate(colDesc.GetType())->GetSize();
61 std::uint64_t nElems = 0;
62 std::vector<std::uint64_t> compressedPageSizes{};
63
64 for (const auto &clusterDescriptor : fDescriptor->GetClusterIterable()) {
65 if (!clusterDescriptor.ContainsColumn(colId)) {
66 continue;
67 }
68
69 auto columnRange = clusterDescriptor.GetColumnRange(colId);
70 if (columnRange.fIsSuppressed)
71 continue;
72
73 nElems += columnRange.fNElements;
74
75 if (fCompressionSettings == -1) {
76 fCompressionSettings = columnRange.fCompressionSettings;
77 } else if (fCompressionSettings != columnRange.fCompressionSettings &&
78 columnRange.fCompressionSettings != kUnknownCompressionSettings) {
79 // Note that currently all clusters and columns are compressed with the same settings and it is not yet
80 // possible to do otherwise. This means that currently, this exception should never be thrown, but this
81 // could change in the future.
82 throw RException(R__FAIL("compression setting mismatch between column ranges (" +
83 std::to_string(fCompressionSettings) + " vs " +
84 std::to_string(columnRange.fCompressionSettings) +
85 ") for column with physical ID " + std::to_string(colId)));
86 }
87
88 const auto &pageRange = clusterDescriptor.GetPageRange(colId);
89
90 for (const auto &page : pageRange.fPageInfos) {
91 compressedPageSizes.emplace_back(page.fLocator.fBytesOnStorage);
92 fUncompressedSize += page.fNElements * elemSize;
93 }
94 }
95
96 fCompressedSize += std::accumulate(compressedPageSizes.begin(), compressedPageSizes.end(), static_cast<std::uint64_t>(0));
97 fColumnInfo.emplace(colId, RColumnInspector(colDesc, compressedPageSizes, elemSize, nElems));
98 }
99}
100
103{
104 std::uint64_t compressedSize = 0;
105 std::uint64_t uncompressedSize = 0;
106
107 for (const auto &colDescriptor : fDescriptor->GetColumnIterable(fieldId)) {
108 auto colInfo = GetColumnInspector(colDescriptor.GetPhysicalId());
109 compressedSize += colInfo.GetCompressedSize();
110 uncompressedSize += colInfo.GetUncompressedSize();
111 }
112
113 for (const auto &subFieldDescriptor : fDescriptor->GetFieldIterable(fieldId)) {
114 DescriptorId_t subFieldId = subFieldDescriptor.GetId();
115
116 auto subFieldInfo = CollectFieldTreeInfo(subFieldId);
117
118 compressedSize += subFieldInfo.GetCompressedSize();
119 uncompressedSize += subFieldInfo.GetUncompressedSize();
120 }
121
122 auto fieldInfo = RFieldTreeInspector(fDescriptor->GetFieldDescriptor(fieldId), compressedSize, uncompressedSize);
123 fFieldTreeInfo.emplace(fieldId, fieldInfo);
124 return fieldInfo;
125}
126
127std::vector<ROOT::Experimental::DescriptorId_t>
129{
130 std::vector<DescriptorId_t> colIds;
131 std::deque<DescriptorId_t> fieldIdQueue{fieldId};
132
133 while (!fieldIdQueue.empty()) {
134 auto currId = fieldIdQueue.front();
135 fieldIdQueue.pop_front();
136
137 for (const auto &col : fDescriptor->GetColumnIterable(currId)) {
138 if (col.IsAliasColumn()) {
139 continue;
140 }
141
142 colIds.emplace_back(col.GetPhysicalId());
143 }
144
145 for (const auto &fld : fDescriptor->GetFieldIterable(currId)) {
146 fieldIdQueue.push_back(fld.GetId());
147 }
148 }
149
150 return colIds;
151}
152
153std::unique_ptr<ROOT::Experimental::RNTupleInspector>
155{
156 auto pageSource = Internal::RPageSourceFile::CreateFromAnchor(sourceNTuple);
157 return std::unique_ptr<RNTupleInspector>(new RNTupleInspector(std::move(pageSource)));
158}
159
160std::unique_ptr<ROOT::Experimental::RNTupleInspector>
161ROOT::Experimental::RNTupleInspector::Create(std::string_view ntupleName, std::string_view sourceFileName)
162{
163 auto pageSource = ROOT::Experimental::Internal::RPageSource::Create(ntupleName, sourceFileName);
164 return std::unique_ptr<RNTupleInspector>(new RNTupleInspector(std::move(pageSource)));
165}
166
168{
169 int algorithm = fCompressionSettings / 100;
170 int level = fCompressionSettings - (algorithm * 100);
171
173 " (level " + std::to_string(level) + ")";
174}
175
176//------------------------------------------------------------------------------
177
180{
181 if (physicalColumnId > fDescriptor->GetNPhysicalColumns()) {
182 throw RException(R__FAIL("No column with physical ID " + std::to_string(physicalColumnId) + " present"));
183 }
184
185 return fColumnInfo.at(physicalColumnId);
186}
187
189{
190 size_t typeCount = 0;
191
192 for (auto &[colId, colInfo] : fColumnInfo) {
193 if (colInfo.GetType() == colType) {
194 ++typeCount;
195 }
196 }
197
198 return typeCount;
199}
200
201const std::vector<ROOT::Experimental::DescriptorId_t>
203{
204 std::vector<DescriptorId_t> colIds;
205
206 for (const auto &[colId, colInfo] : fColumnInfo) {
207 if (colInfo.GetType() == colType)
208 colIds.emplace_back(colId);
209 }
210
211 return colIds;
212}
213
214const std::vector<ROOT::Experimental::EColumnType> ROOT::Experimental::RNTupleInspector::GetColumnTypes()
215{
216 std::set<EColumnType> colTypes;
217
218 for (const auto &[colId, colInfo] : fColumnInfo) {
219 colTypes.emplace(colInfo.GetType());
220 }
221
222 return std::vector(colTypes.begin(), colTypes.end());
223}
224
226{
227 struct ColumnTypeInfo {
228 std::uint32_t count;
229 std::uint64_t nElems, compressedSize, uncompressedSize;
230
231 void operator+=(const RColumnInspector &colInfo)
232 {
233 this->count++;
234 this->nElems += colInfo.GetNElements();
235 this->compressedSize += colInfo.GetCompressedSize();
236 this->uncompressedSize += colInfo.GetUncompressedSize();
237 }
238 };
239
240 std::map<EColumnType, ColumnTypeInfo> colTypeInfo;
241
242 for (const auto &[colId, colInfo] : fColumnInfo) {
243 colTypeInfo[colInfo.GetType()] += colInfo;
244 }
245
246 switch (format) {
248 output << " column type | count | # elements | compressed bytes | uncompressed bytes\n"
249 << "----------------|---------|-----------------|-------------------|--------------------" << std::endl;
250 for (const auto &[colType, typeInfo] : colTypeInfo) {
251 output << std::setw(15) << Internal::RColumnElementBase::GetTypeName(colType) << " |" << std::setw(8)
252 << typeInfo.count << " |" << std::setw(16) << typeInfo.nElems << " |" << std::setw(18)
253 << typeInfo.compressedSize << " |" << std::setw(18) << typeInfo.uncompressedSize << " " << std::endl;
254 }
255 break;
257 output << "columnType,count,nElements,compressedSize,uncompressedSize" << std::endl;
258 for (const auto &[colType, typeInfo] : colTypeInfo) {
259 output << Internal::RColumnElementBase::GetTypeName(colType) << "," << typeInfo.count << "," << typeInfo.nElems
260 << "," << typeInfo.compressedSize << "," << typeInfo.uncompressedSize << std::endl;
261 }
262 break;
263 default: throw RException(R__FAIL("Invalid print format"));
264 }
265}
266
267std::unique_ptr<TH1D>
269 std::string_view histName, std::string_view histTitle)
270{
271 if (histName.empty()) {
272 switch (histKind) {
273 case ENTupleInspectorHist::kCount: histName = "colTypeCountHist"; break;
274 case ENTupleInspectorHist::kNElems: histName = "colTypeElemCountHist"; break;
275 case ENTupleInspectorHist::kCompressedSize: histName = "colTypeCompSizeHist"; break;
276 case ENTupleInspectorHist::kUncompressedSize: histName = "colTypeUncompSizeHist"; break;
277 default: throw RException(R__FAIL("Unknown histogram type"));
278 }
279 }
280
281 if (histTitle.empty()) {
282 switch (histKind) {
283 case ENTupleInspectorHist::kCount: histTitle = "Column count by type"; break;
284 case ENTupleInspectorHist::kNElems: histTitle = "Number of elements by column type"; break;
285 case ENTupleInspectorHist::kCompressedSize: histTitle = "Compressed size by column type"; break;
286 case ENTupleInspectorHist::kUncompressedSize: histTitle = "Uncompressed size by column type"; break;
287 default: throw RException(R__FAIL("Unknown histogram type"));
288 }
289 }
290
291 auto hist = std::make_unique<TH1D>(std::string(histName).c_str(), std::string(histTitle).c_str(), 1, 0, 1);
292
293 double data;
294 for (const auto &[colId, colInfo] : fColumnInfo) {
295 switch (histKind) {
296 case ENTupleInspectorHist::kCount: data = 1.; break;
297 case ENTupleInspectorHist::kNElems: data = colInfo.GetNElements(); break;
298 case ENTupleInspectorHist::kCompressedSize: data = colInfo.GetCompressedSize(); break;
299 case ENTupleInspectorHist::kUncompressedSize: data = colInfo.GetUncompressedSize(); break;
300 default: throw RException(R__FAIL("Unknown histogram type"));
301 }
302
303 hist->AddBinContent(
304 hist->GetXaxis()->FindBin(Internal::RColumnElementBase::GetTypeName(colInfo.GetType()).c_str()), data);
305 }
306
307 return hist;
308}
309
311 std::string histName,
312 std::string histTitle, size_t nBins)
313{
314 if (histTitle.empty())
315 histTitle = "Page size distribution for column with ID " + std::to_string(physicalColumnId);
316
317 return GetPageSizeDistribution({physicalColumnId}, histName, histTitle, nBins);
318}
319
320std::unique_ptr<TH1D>
322 std::string histName, std::string histTitle, size_t nBins)
323{
324 if (histName.empty())
325 histName = "pageSizeHistCol" + Internal::RColumnElementBase::GetTypeName(colType);
326 if (histTitle.empty())
327 histTitle = "Page size distribution for columns with type " + Internal::RColumnElementBase::GetTypeName(colType);
328
329 auto perTypeHist = GetPageSizeDistribution({colType}, histName, histTitle, nBins);
330
331 if (perTypeHist->GetNhists() < 1)
332 return std::make_unique<TH1D>(histName.c_str(), histTitle.c_str(), 64, 0, 0);
333
334 auto hist = std::unique_ptr<TH1D>(dynamic_cast<TH1D *>(perTypeHist->GetHists()->First()));
335
336 hist->SetName(histName.c_str());
337 hist->SetTitle(histTitle.c_str());
338 hist->SetXTitle("Page size (B)");
339 hist->SetYTitle("N_{pages}");
340 return hist;
341}
342
343std::unique_ptr<TH1D>
344ROOT::Experimental::RNTupleInspector::GetPageSizeDistribution(std::initializer_list<DescriptorId_t> colIds,
345 std::string histName, std::string histTitle, size_t nBins)
346{
347 auto hist = std::make_unique<TH1D>();
348
349 if (histName.empty())
350 histName = "pageSizeHist";
351 hist->SetName(histName.c_str());
352 if (histTitle.empty())
353 histTitle = "Page size distribution";
354 hist->SetTitle(histTitle.c_str());
355 hist->SetXTitle("Page size (B)");
356 hist->SetYTitle("N_{pages}");
357
358 std::vector<std::uint64_t> pageSizes;
359 std::for_each(colIds.begin(), colIds.end(), [this, &pageSizes](const auto colId) {
360 auto colInfo = GetColumnInspector(colId);
361 pageSizes.insert(pageSizes.end(), colInfo.GetCompressedPageSizes().begin(),
362 colInfo.GetCompressedPageSizes().end());
363 });
364
365 auto histMinMax = std::minmax_element(pageSizes.begin(), pageSizes.end());
366 hist->SetBins(nBins, *histMinMax.first,
367 *histMinMax.second + ((*histMinMax.second - *histMinMax.first) / static_cast<double>(nBins)));
368
369 for (const auto pageSize : pageSizes) {
370 hist->Fill(pageSize);
371 }
372
373 return hist;
374}
375
377 std::initializer_list<ROOT::Experimental::EColumnType> colTypes, std::string histName, std::string histTitle,
378 size_t nBins)
379{
380 if (histName.empty())
381 histName = "pageSizeHist";
382 if (histTitle.empty())
383 histTitle = "Per-column type page size distribution";
384
385 auto stackedHist = std::make_unique<THStack>(histName.c_str(), histTitle.c_str());
386
387 double histMin = std::numeric_limits<double>::max();
388 double histMax = 0;
389 std::map<EColumnType, std::vector<std::uint64_t>> pageSizes;
390
391 std::vector<EColumnType> colTypeVec = colTypes;
392 if (std::empty(colTypes)) {
393 colTypeVec = GetColumnTypes();
394 }
395
396 for (const auto colType : colTypeVec) {
397 auto colIds = GetColumnsByType(colType);
398
399 if (colIds.empty())
400 continue;
401
402 std::vector<std::uint64_t> pageSizesForColType;
403 std::for_each(colIds.cbegin(), colIds.cend(), [this, &pageSizesForColType](const auto colId) {
404 auto colInfo = GetColumnInspector(colId);
405 pageSizesForColType.insert(pageSizesForColType.end(), colInfo.GetCompressedPageSizes().begin(),
406 colInfo.GetCompressedPageSizes().end());
407 });
408 pageSizes.emplace(colType, pageSizesForColType);
409
410 auto histMinMax = std::minmax_element(pageSizesForColType.begin(), pageSizesForColType.end());
411 histMin = std::min(histMin, static_cast<double>(*histMinMax.first));
412 histMax = std::max(histMax, static_cast<double>(*histMinMax.second));
413 }
414
415 for (const auto &[colType, pageSizesForColType] : pageSizes) {
416 auto hist = std::make_unique<TH1D>(
417 TString::Format("%s%s", histName.c_str(), Internal::RColumnElementBase::GetTypeName(colType).c_str()),
418 Internal::RColumnElementBase::GetTypeName(colType).c_str(), nBins, histMin,
419 histMax + ((histMax - histMin) / static_cast<double>(nBins)));
420
421 for (const auto pageSize : pageSizesForColType) {
422 hist->Fill(pageSize);
423 }
424
425 stackedHist->Add(hist.release());
426 }
427
428 return stackedHist;
429}
430
431//------------------------------------------------------------------------------
432
435{
436 if (fieldId >= fDescriptor->GetNFields()) {
437 throw RException(R__FAIL("No field with ID " + std::to_string(fieldId) + " present"));
438 }
439
440 return fFieldTreeInfo.at(fieldId);
441}
442
445{
446 DescriptorId_t fieldId = fDescriptor->FindFieldId(fieldName);
447
448 if (fieldId == kInvalidDescriptorId) {
449 throw RException(R__FAIL("Could not find field `" + std::string(fieldName) + "`"));
450 }
451
452 return GetFieldTreeInspector(fieldId);
453}
454
455size_t ROOT::Experimental::RNTupleInspector::GetFieldCountByType(const std::regex &typeNamePattern,
456 bool includeSubFields) const
457{
458 size_t typeCount = 0;
459
460 for (auto &[fldId, fldInfo] : fFieldTreeInfo) {
461 if (!includeSubFields && fldInfo.GetDescriptor().GetParentId() != fDescriptor->GetFieldZeroId()) {
462 continue;
463 }
464
465 if (std::regex_match(fldInfo.GetDescriptor().GetTypeName(), typeNamePattern)) {
466 typeCount++;
467 }
468 }
469
470 return typeCount;
471}
472
473const std::vector<ROOT::Experimental::DescriptorId_t>
474ROOT::Experimental::RNTupleInspector::GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubFields) const
475{
476 std::vector<DescriptorId_t> fieldIds;
477
478 for (auto &[fldId, fldInfo] : fFieldTreeInfo) {
479
480 if (!searchInSubFields && fldInfo.GetDescriptor().GetParentId() != fDescriptor->GetFieldZeroId()) {
481 continue;
482 }
483
484 if (std::regex_match(fldInfo.GetDescriptor().GetFieldName(), fieldNamePattern)) {
485 fieldIds.emplace_back(fldId);
486 }
487 }
488
489 return fieldIds;
490}
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:290
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t format
std::string & operator+=(std::string &left, const TString &right)
Definition TString.h:486
The available trivial, native content types of a column.
static std::string GetTypeName(EColumnType type)
static std::unique_ptr< RColumnElementBase > Generate(EColumnType type)
If CppT == void, use the default C++ type for the given column type.
static std::unique_ptr< RPageSourceFile > CreateFromAnchor(const RNTuple &anchor, const RNTupleReadOptions &options=RNTupleReadOptions())
Used from the RNTuple class to build a datasource if the anchor is already available.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Base class for all ROOT issued exceptions.
Definition RError.hxx:78
Provides column-level storage information.
Inspect on-disk and storage-related information of an RNTuple.
const RColumnInspector & GetColumnInspector(DescriptorId_t physicalColumnId) const
Get storage information for a given column.
const std::vector< DescriptorId_t > GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubFields=true) const
Get the IDs of (sub-)fields whose name matches the given string.
std::unique_ptr< Internal::RPageSource > fPageSource
const std::vector< EColumnType > GetColumnTypes()
Get all column types present in the RNTuple being inspected.
size_t GetColumnCountByType(EColumnType colType) const
Get the number of columns of a given type present in the RNTuple.
RNTupleInspector(std::unique_ptr< Internal::RPageSource > pageSource)
std::string GetCompressionSettingsAsString() const
Get a string describing compression settings of the RNTuple being inspected.
std::unique_ptr< TH1D > GetPageSizeDistribution(DescriptorId_t physicalColumnId, std::string histName="", std::string histTitle="", size_t nBins=64)
Get a histogram containing the size distribution of the compressed pages for an individual column.
const std::vector< DescriptorId_t > GetColumnsByType(EColumnType colType)
Get the IDs of all columns with the given type.
void PrintColumnTypeInfo(ENTupleInspectorPrintFormat format=ENTupleInspectorPrintFormat::kTable, std::ostream &output=std::cout)
Print storage information per column type.
RFieldTreeInspector CollectFieldTreeInfo(DescriptorId_t fieldId)
Recursively gather field-level information.
std::unique_ptr< RNTupleDescriptor > fDescriptor
std::vector< DescriptorId_t > GetColumnsByFieldId(DescriptorId_t fieldId) const
Get the columns that make up the given field, including its subfields.
static std::unique_ptr< RNTupleInspector > Create(const RNTuple &sourceNTuple)
Create a new RNTupleInspector.
void CollectColumnInfo()
Gather column-level and RNTuple-level information.
size_t GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubFields=true) const
Get the number of fields of a given type or class present in the RNTuple.
const RFieldTreeInspector & GetFieldTreeInspector(DescriptorId_t fieldId) const
Get storage information for a given (sub)field by ID.
std::unique_ptr< TH1D > GetColumnTypeInfoAsHist(ENTupleInspectorHist histKind, std::string_view histName="", std::string_view histTitle="")
Get a histogram showing information for each column type present.
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:61
1-D histogram with a double per channel (see TH1 documentation)
Definition TH1.h:670
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
Definition TString.cxx:2378
constexpr int kUnknownCompressionSettings
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
EValues
Note: this is only temporarily a struct and will become a enum class hence the name convention used.
Definition Compression.h:88
static std::string AlgorithmToString(EAlgorithm::EValues algorithm)
static void output()