Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleExporter.cxx
Go to the documentation of this file.
1/// \file RNTupleExporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Giacomo Parolini <giacomo.parolini@cern.ch>
4/// \date 2024-12-10
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
17#include <ROOT/RPageStorage.hxx>
19#include <ROOT/RClusterPool.hxx>
20#include <ROOT/RLogger.hxx>
21#include <fstream>
22#include <sstream>
23
25
26namespace {
27
29{
30 static RLogChannel sLog("ROOT.RNTupleExporter");
31 return sLog;
32}
33
34struct RColumnExportInfo {
35 const RColumnDescriptor *fColDesc;
36 const RFieldDescriptor *fFieldDesc;
37 std::string fQualifiedName;
38
39 RColumnExportInfo(const RNTupleDescriptor &desc, const RColumnDescriptor &colDesc, const RFieldDescriptor &fieldDesc)
40 : fColDesc(&colDesc),
42 // NOTE: we don't need to keep the column representation index into account because exactly 1 representation
43 // is active per page, so there is no risk of name collisions.
44 fQualifiedName(desc.GetQualifiedFieldName(fieldDesc.GetId()) + '-' + std::to_string(colDesc.GetIndex()))
45 {
46 }
47};
48
/// Running tally produced while collecting columns.
struct RAddColumnsResult {
   /// Number of columns counted so far.
   int fNColsTotal = 0;

   /// Adds the tally of `other` to this one.
   /// \return A reference to this object, so that accumulations can be chained.
   RAddColumnsResult &operator+=(const RAddColumnsResult &other)
   {
      fNColsTotal = fNColsTotal + other.fNColsTotal;
      return *this;
   }
};
58
59template <typename T>
60bool ItemIsFilteredOut(const RNTupleExporter::RFilter<T> &filter, const T &item)
61{
62 bool filterHasType = filter.fSet.find(item) != filter.fSet.end();
64 return isFiltered;
65}
66
67RAddColumnsResult AddColumnsFromField(std::vector<RColumnExportInfo> &vec, const RNTupleDescriptor &desc,
68 const RFieldDescriptor &fieldDesc, const RNTupleExporter::RPagesOptions &options)
69{
70 R__LOG_DEBUG(1, RNTupleExporterLog()) << "processing field \"" << desc.GetQualifiedFieldName(fieldDesc.GetId())
71 << "\"";
72
73 RAddColumnsResult res{};
74
75 for (const auto &subfieldDesc : desc.GetFieldIterable(fieldDesc)) {
76 if (subfieldDesc.IsProjectedField())
77 continue;
78
79 for (const auto &colDesc : desc.GetColumnIterable(subfieldDesc)) {
80 // Filter columns by type
81 bool typeIsFiltered = ItemIsFilteredOut(options.fColumnTypeFilter, colDesc.GetType());
82 if (!typeIsFiltered)
83 vec.emplace_back(desc, colDesc, subfieldDesc);
84 res.fNColsTotal += 1;
85 }
86 res += AddColumnsFromField(vec, desc, subfieldDesc, options);
87 }
88
89 return res;
90}
91
92int CountPages(const RNTupleDescriptor &desc, std::span<const RColumnExportInfo> columns)
93{
94 int nPages = 0;
95 DescriptorId_t clusterId = desc.FindClusterId(0, 0);
97 const auto &clusterDesc = desc.GetClusterDescriptor(clusterId);
98 for (const auto &colInfo : columns) {
99 const auto &pages = clusterDesc.GetPageRange(colInfo.fColDesc->GetPhysicalId());
100 nPages += pages.fPageInfos.size();
101 }
102 clusterId = desc.FindNextClusterId(clusterId);
103 }
104 return nPages;
105}
106
107} // namespace
108
110{
111 RPagesResult res = {};
112
113 // make sure the source is attached
114 source.Attach();
115
116 auto desc = source.GetSharedDescriptorGuard();
118
119 // Collect column info
120 std::vector<RColumnExportInfo> columnInfos;
121 const RAddColumnsResult addColRes = AddColumnsFromField(columnInfos, desc.GetRef(), desc->GetFieldZero(), options);
122
123 // Collect ColumnSet for the cluster pool query
125 columnSet.reserve(columnInfos.size());
126 for (const auto &colInfo : columnInfos) {
127 columnSet.emplace(colInfo.fColDesc->GetPhysicalId());
128 }
129
130 const auto nPages = CountPages(desc.GetRef(), columnInfos);
131
132 const bool showProgress = (options.fFlags & RPagesOptions::kShowProgressBar) != 0;
133 res.fExportedFileNames.reserve(nPages);
134
135 // Iterate over the clusters in order and dump pages
136 DescriptorId_t clusterId = nPages > 0 ? desc->FindClusterId(0, 0) : kInvalidDescriptorId;
137 int pagesExported = 0;
138 int prevIntPercent = 0;
140 const auto &clusterDesc = desc->GetClusterDescriptor(clusterId);
141 const RCluster *cluster = clusterPool.GetCluster(clusterId, columnSet);
142 for (const auto &colInfo : columnInfos) {
143 DescriptorId_t columnId = colInfo.fColDesc->GetPhysicalId();
144 const auto &pages = clusterDesc.GetPageRange(columnId);
145 const auto &colRange = clusterDesc.GetColumnRange(columnId);
146 std::uint64_t pageIdx = 0;
147
149 << "exporting column \"" << colInfo.fQualifiedName << "\" (" << pages.fPageInfos.size() << " pages)";
150
151 // We should never try to export a suppressed column range
152 assert(!colRange.fIsSuppressed || pages.fPageInfos.empty());
153
154 for (const auto &pageInfo : pages.fPageInfos) {
156 const ROnDiskPage *onDiskPage = cluster->GetOnDiskPage(key);
157
158 // dump the page
159 const void *pageBuf = onDiskPage->GetAddress();
160 const bool incChecksum = (options.fFlags & RPagesOptions::kIncludeChecksums) != 0 && pageInfo.fHasChecksum;
161 const std::size_t maybeChecksumSize = incChecksum * 8;
162 const std::uint64_t pageBufSize = pageInfo.fLocator.GetNBytesOnStorage() + maybeChecksumSize;
163 std::ostringstream ss{options.fOutputPath, std::ios_base::ate};
164 assert(colRange.fCompressionSettings);
165 ss << "/cluster_" << clusterDesc.GetId() << "_" << colInfo.fQualifiedName << "_page_" << pageIdx
166 << "_elems_" << pageInfo.fNElements << "_comp_" << *colRange.fCompressionSettings << ".page";
167 const auto outFileName = ss.str();
168 std::ofstream outFile{outFileName, std::ios_base::binary};
169 if (!outFile)
170 throw ROOT::RException(
171 R__FAIL(std::string("output path ") + options.fOutputPath + " does not exist or is not writable!"));
172
173 outFile.write(reinterpret_cast<const char *>(pageBuf), pageBufSize);
174
175 res.fExportedFileNames.push_back(outFileName);
176
178
179 if (showProgress) {
180 int intPercent = static_cast<int>(100.f * pagesExported / res.fExportedFileNames.size());
181 if (intPercent != prevIntPercent) {
182 fprintf(stderr, "\rExport progress: %02d%%", intPercent);
183 if (intPercent == 100)
184 fprintf(stderr, "\n");
186 }
187 }
188 }
189 }
190 clusterId = desc->FindNextClusterId(clusterId);
191 }
192
193 assert(res.fExportedFileNames.size() == static_cast<size_t>(pagesExported));
194 std::ostringstream ss;
195 ss << "exported " << res.fExportedFileNames.size() << " pages (";
196 if (options.fColumnTypeFilter.fSet.empty()) {
197 ss << addColRes.fNColsTotal << " columns)";
198 } else {
199 auto nColsFilteredOut = addColRes.fNColsTotal - columnInfos.size();
200 ss << nColsFilteredOut << "/" << addColRes.fNColsTotal << " columns filtered out)";
201 }
203
204 return res;
205}
206
207} // namespace ROOT::Experimental::Internal
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:299
#define R__LOG_DEBUG(DEBUGLEVEL,...)
Definition RLogger.hxx:360
#define R__LOG_INFO(...)
Definition RLogger.hxx:359
const RColumnDescriptor * fColDesc
const RFieldDescriptor * fFieldDesc
int fNColsTotal
std::string fQualifiedName
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const RFieldDescriptor &srcFieldDesc, const RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
std::string & operator+=(std::string &left, const TString &right)
Definition TString.h:486
Manages a set of clusters containing compressed and packed pages.
An in-memory subset of the packed and compressed pages of a cluster.
Definition RCluster.hxx:152
std::unordered_set< DescriptorId_t > ColumnSet_t
Definition RCluster.hxx:154
@ kBlacklist
Don't export items contained in the filter's set.
static RPagesResult ExportPages(RPageSource &source, const RPagesOptions &options={})
Given a page source, writes all its pages to individual files (1 per page).
A page as being stored on disk, that is packed and compressed.
Definition RCluster.hxx:42
Abstract interface to read data from an ntuple.
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
A log configuration for a channel, e.g.
Definition RLogger.hxx:98
const_iterator end() const
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
RFilter< ENTupleColumnType > fColumnTypeFilter
Optional filter that determines which columns are included or excluded from being exported.
@ kShowProgressBar
If enabled, the exporter will report the current progress on the stderr.
On-disk pages within a page source are identified by the column and page number.
Definition RCluster.hxx:52