void logArchitectureInfo(bool useGPU)
{
   // Only log once per configuration: remember the last useGPU value and whether we logged already.
   static std::pair<bool, bool> lastUseGPU;
   if (lastUseGPU.second && lastUseGPU.first == useGPU)
      return;
   lastUseGPU = {useGPU, true};

   auto log = [](std::string_view message) {
      // ... forward the message to the RooFit message service
   };

   if (useGPU /* && the CUDA computing library could not be loaded (check elided) */) {
      throw std::runtime_error(std::string("In: ") + __func__ + "(), " + __FILE__ + ":" + std::to_string(__LINE__) +
                               ": Cuda implementation of the computing library is not available\n");
   }
   if (/* generic CPU architecture, see cpuArchitecture() */) {
      log("using generic CPU library compiled with no vectorizations");
   }
   // ...
   if (useGPU) {
      log("using CUDA computation library");
   }
}
/// A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
struct NodeInfo {
   // ...
   std::shared_ptr<Detail::AbsBuffer> buffer;
   // ...
   bool copyAfterEvaluation = false;
   // ...
   std::unique_ptr<RooFit::Detail::CudaInterface::CudaEvent> event;
   std::unique_ptr<RooFit::Detail::CudaInterface::CudaStream> stream;
   // ...
   void decrementRemainingClients() { /* ... */ }
};
/// Construct a new Evaluator. The metadata about the computation graph is analyzed and cached here.
Evaluator::Evaluator(const RooAbsReal &absReal, bool useGPU)
   : _bufferManager{std::make_unique<Detail::BufferManager>()},
     // ... (further members elided)
{
   // ... (only when ROOT was compiled without CUDA support and CUDA mode was requested)
   throw std::runtime_error("Can't create Evaluator in CUDA mode because ROOT was compiled without CUDA support!");
   // ...
   std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;

   // Fill the ordered nodes list and initialize the per-node bookkeeping. The loop runs over the
   // topologically sorted computation graph (filled earlier via getSortedComputationGraph();
   // `sortedNodes` stands in for the elided container name). _nodes was reserved beforehand so
   // the NodeInfo pointers stay valid.
   std::size_t iNode = 0;
   for (RooAbsArg *arg : sortedNodes) {
      _nodes.emplace_back();
      auto &nodeInfo = _nodes.back();
      nodeInfo.absArg = arg;
      nodeInfo.originalOperMode = arg->operMode();
      nodeInfo.iNode = iNode;
      nodeInfos[arg] = &nodeInfo;

      if (/* arg is a variable (RooRealVar) */) {
         nodeInfo.isVariable = true;
      } else {
         arg->setDataToken(iNode);
      }
      if (/* arg is a category */) {
         nodeInfo.isCategory = true;
      }
      ++iNode;
   }
   // Connect each node to its value servers and, in the other direction, to its clients.
   for (auto &info : _nodes) {
      info.serverInfos.reserve(info.absArg->servers().size());
      for (RooAbsArg *server : info.absArg->servers()) {
         if (server->isValueServer(*info.absArg)) {
            auto *serverInfo = nodeInfos.at(server);
            info.serverInfos.emplace_back(serverInfo);
            serverInfo->clientInfos.emplace_back(&info);
         }
      }
   }
   // In CUDA mode, an event and a stream are created for every node, and the stream is registered
   // in the per-node RooBatchCompute configuration.
   for (auto &info : _nodes) {
      info.event = std::make_unique<CudaInterface::CudaEvent>(false);
      info.stream = std::make_unique<CudaInterface::CudaStream>();
      RooBatchCompute::Config cfg;
      cfg.setCudaStream(info.stream.get());
      _dataMapCUDA.setConfig(info.absArg, cfg);
   }
   // ...
}
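// Usage sketch (not part of this file): the public workflow that the constructor above prepares.
// `pdf`, the observable name "x", and `xValues` are hypothetical; the calls are the ones declared
// for this class (Evaluator(), setInput(), run()).
void exampleEvaluatorUsage(RooAbsReal &pdf, std::span<const double> xValues)
{
   RooFit::Evaluator evaluator{pdf, /*useGPU=*/false};      // analyzes and caches the computation graph
   evaluator.setInput("x", xValues, /*isOnDevice=*/false);  // bind one value per event for observable "x"
   std::span<const double> results = evaluator.run();       // one entry per event for the top node
   (void)results;
}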
/// If there are servers with the same name that got de-duplicated in the `_nodes` list, their data
/// tokens need to be kept in sync as well. Such nodes are found by visiting the servers of every node.
void Evaluator::syncDataTokens()
{
   for (auto &info : _nodes) {
      std::size_t iValueServer = 0;
      for (RooAbsArg *server : info.absArg->servers()) {
         if (server->isValueServer(*info.absArg)) {
            auto *knownServer = info.serverInfos[iValueServer]->absArg;
            if (knownServer->hasDataToken()) {
               server->setDataToken(knownServer->dataToken());
            }
            ++iValueServer;
         }
      }
   }
}
void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
{
   // A device-resident input array is only accepted when the Evaluator was created in CUDA mode.
   if (isOnDevice && !_useGPU) {
      throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
   }
   // ...
   // Find the nodes that take their values from the given input array and register the spans in
   // the data maps. `namePtr` is the unique TNamed pointer for `name` (lookup elided).
   std::size_t iNode = 0;
   for (auto &info : _nodes) {
      const bool fromArrayInput = info.absArg->namePtr() == namePtr;
      if (fromArrayInput) {
         info.fromArrayInput = true;
         info.absArg->setDataToken(iNode);
         info.outputSize = inputArray.size();
         if (info.outputSize == 1) {
            // ... scalar inputs need no staging buffer
         } else if (isOnDevice) {
            // ... (elided) `gpuSpan` is the device-side input registered in _dataMapCUDA; it is
            // mirrored into a pinned host buffer via CudaInterface::copyDeviceToHost().
            _dataMapCPU.set(info.absArg, {info.buffer->cpuReadPtr(), gpuSpan.size()});
         } else {
            // ... (elided) `cpuSpan` is the host-side input registered in _dataMapCPU; it is
            // mirrored into a device buffer via CudaInterface::copyHostToDevice().
            _dataMapCUDA.set(info.absArg, {info.buffer->gpuReadPtr(), cpuSpan.size()});
         }
      }
      info.isDirty = !info.fromArrayInput;
      ++iNode;
   }
   // ...
}
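// Sketch (not part of this file): feeding a device-resident column in CUDA mode. `devicePtr` is
// assumed to already point to nEvents doubles in GPU memory, e.g. uploaded with
// RooFit::Detail::CudaInterface::copyHostToDevice(hostPtr, devicePtr, nEvents).
void exampleDeviceInput(RooFit::Evaluator &evaluator, const double *devicePtr, std::size_t nEvents)
{
   // With isOnDevice=true the evaluator registers the span in its CUDA data map and mirrors it
   // into a pinned host buffer for CPU-evaluated clients, as shown above.
   evaluator.setInput("x", {devicePtr, nEvents}, /*isOnDevice=*/true);
}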
   // When the input sizes changed, the per-node output sizes are recomputed (this is what the
   // _needToUpdateOutputSizes flag triggers). First collect the sizes of the nodes that are fed
   // directly from the input arrays:
   std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
   for (auto &info : _nodes) {
      if (info.fromArrayInput) {
         sizeMap[info.absArg] = info.outputSize;
      }
   }

   // The sizes are then propagated through the computation graph into `outputSizeMap` (helper
   // call elided); the lookup callback below returns zero for nodes not driven by an input array
   // (the original passes this lambda inline to the elided helper):
   auto lookupSize = [&](RooFit::Detail::DataKey key) {
      auto found = sizeMap.find(key);
      return found != sizeMap.end() ? found->second : 0;
   };

   // The result is applied to every node; non-scalar nodes also get their operation mode adjusted
   // (see setOperMode()).
   for (auto &info : _nodes) {
      info.outputSize = outputSizeMap.at(info.absArg);
      if (!info.isScalar()) {
         // ...
      }
   }
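// Sketch (not part of this file): the size-propagation helper elided above follows, conceptually,
// a simple rule: a node produces as many results as its largest vector-valued server, scalar nodes
// produce one, and reducer nodes (see isReducerNode()) collapse back to one. The function below is
// only an illustration of that rule, not the real helper.
#include <algorithm>
#include <cstddef>
#include <vector>

std::size_t exampleOutputSize(std::vector<std::size_t> const &serverSizes, bool isReducer)
{
   if (isReducer)
      return 1; // e.g. an integral or an NLL reduces its inputs to a single number
   std::size_t size = 1; // a node without vector-valued servers stays scalar
   for (std::size_t s : serverSizes) {
      size = std::max(size, s); // follow the largest input column
   }
   return size;
}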
Evaluator::~Evaluator()
{
   // Release the data tokens that were assigned to the nodes in the constructor.
   for (auto &info : _nodes) {
      info.absArg->resetDataToken();
   }
}
void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
{
   using namespace Detail;

   auto nodeAbsReal = static_cast<RooAbsReal const *>(node);

   const std::size_t nOut = info.outputSize;

   double *buffer = nullptr;
   if (nOut == 1) {
      // ... scalar results go into a small per-node scratch buffer
   } else {
      // In CUDA mode, log once per node that this class has no GPU implementation:
      //    ... << " could not be evaluated on the GPU because the class doesn't support it. "
      //           "Consider requesting or implementing it to benefit from a speed up."
      info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream.get())
                                             : _bufferManager->makeCpuBuffer(nOut); // pinned only if the GPU also needs the result
      buffer = info.buffer->cpuWritePtr();
   }
   _dataMapCPU.set(node, {buffer, nOut});
   nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU);
   if (info.copyAfterEvaluation) {
      // ... publish the result in the CUDA data map and record info.event on info.stream
   }
}
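// Sketch (not part of this file): the contract behind nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU)
// above is "read the input spans of your servers from the data map, then write one result per event into
// the output buffer". A hypothetical standalone kernel with that shape, using the DataMap::at() accessor
// declared for this class (`xVar` is an assumed server node):
void exampleSquareKernel(double *output, std::size_t nEvents, RooFit::Detail::DataMap &dataMap,
                         RooAbsArg const *xVar)
{
   std::span<const double> x = dataMap.at(xVar); // span registered earlier via dataMap.set()
   for (std::size_t i = 0; i < nEvents; ++i) {
      output[i] = x[i] * x[i]; // one result per event
   }
}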
/// Process a variable in the computation graph.
void Evaluator::processVariable(NodeInfo &nodeInfo)
{
   auto *var = static_cast<RooRealVar const *>(nodeInfo.absArg);
   // If the variable was assigned a new value since the last run (tracked via lastSetValCount,
   // elided here), flag its clients dirty and re-evaluate it:
   for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
      clientInfo->isDirty = true;
   }
   // ...
}

/// Flags all the clients of a given node dirty.
void Evaluator::setClientsDirty(NodeInfo &nodeInfo)
{
   for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
      clientInfo->isDirty = true;
   }
}
/// Returns the value of the top node in the computation graph. In CUDA mode the work is delegated
/// to getValHeterogeneous() below.
std::span<const double> Evaluator::run()
{
   // ...
   for (auto &nodeInfo : _nodes) {
      if (!nodeInfo.fromArrayInput) {
         if (nodeInfo.isVariable) {
            processVariable(nodeInfo);
         } else if (nodeInfo.isDirty) {
            setClientsDirty(nodeInfo);
            computeCPUNode(nodeInfo.absArg, nodeInfo);
            nodeInfo.isDirty = false;
         }
      }
   }
   // ... return the span of the top node from _dataMapCPU
}
/// Returns the value of the top node in the computation graph, interleaving CPU work with the
/// CUDA streams of the GPU-assigned nodes.
std::span<const double> Evaluator::getValHeterogeneous()
{
   // Reset the counters of remaining clients and servers for this evaluation.
   for (auto &info : _nodes) {
      info.remClients = info.clientInfos.size();
      info.remServers = info.serverInfos.size();
      if (info.buffer && !info.fromArrayInput) {
         info.buffer.reset(); // drop stale result buffers from the previous evaluation
      }
   }

   // Launch every GPU node that has no pending servers.
   for (auto &info : _nodes) {
      if (info.remServers == 0 && info.computeInGPU()) {
         assignToGPU(info);
      }
   }

   // Alternate between collecting finished GPU nodes and evaluating ready CPU nodes until the top
   // node is done (loop condition elided in this excerpt).
   while (/* top node not finished */) {
      // Find GPU nodes whose stream has finished and release their clients and servers.
      for (auto &info : _nodes) {
         if (info.remServers == -1 && !info.stream->isActive()) {
            // ...
            for (auto *infoClient : info.clientInfos) {
               --infoClient->remServers;
               if (infoClient->computeInGPU() && infoClient->remServers == 0) {
                  assignToGPU(*infoClient);
               }
            }
            for (auto *serverInfo : info.serverInfos) {
               serverInfo->decrementRemainingClients();
            }
         }
      }

      // Find the next CPU node whose inputs are all available.
      auto it = _nodes.begin();
      for (; it != _nodes.end(); it++) {
         if (it->remServers == 0 && !it->computeInGPU())
            break;
      }

      // If no CPU node is ready, sleep briefly instead of busy-waiting on the GPU.
      if (it == _nodes.end()) {
         std::this_thread::sleep_for(std::chrono::milliseconds(1));
         continue;
      }

      // Evaluate the CPU node (elided), then release its clients and servers.
      for (auto *infoClient : it->clientInfos) {
         if (--infoClient->remServers == 0 && infoClient->computeInGPU()) {
            assignToGPU(*infoClient);
         }
      }
      for (auto *serverInfo : it->serverInfos) {
         serverInfo->decrementRemainingClients();
      }
   }
   // ... return the span of the top node from _dataMapCUDA
}
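// Sketch (not part of this file): the counter-based scheduling idea used above, reduced to a
// self-contained example. Each task stores how many of its inputs ("servers") are still pending;
// when that counter reaches zero the task is ready to run, and finishing a task decrements the
// counter of every task that consumes its output ("clients").
#include <cstddef>
#include <queue>
#include <vector>

struct Task {
   std::vector<std::size_t> clients; // indices of tasks consuming this task's output
   int remServers = 0;               // number of inputs not yet produced
};

void runInDependencyOrder(std::vector<Task> &tasks)
{
   std::queue<std::size_t> ready;
   for (std::size_t i = 0; i < tasks.size(); ++i) {
      if (tasks[i].remServers == 0)
         ready.push(i); // tasks without inputs can start immediately
   }
   while (!ready.empty()) {
      std::size_t i = ready.front();
      ready.pop();
      // ... evaluate task i here ...
      for (std::size_t client : tasks[i].clients) {
         if (--tasks[client].remServers == 0)
            ready.push(client); // all inputs of this client are now available
      }
   }
}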
/// Assign a node to be computed in the GPU.
void Evaluator::assignToGPU(NodeInfo &info)
{
   using namespace Detail;
   // ...
   // Make this node's stream wait for the CUDA events of all of its servers.
   for (auto *infoServer : info.serverInfos) {
      if (infoServer->event)
         info.stream->waitForEvent(*infoServer->event);
   }

   const std::size_t nOut = info.outputSize;
   double *buffer = nullptr;
   // ...
   info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream.get())
                                          : _bufferManager->makeGpuBuffer(nOut); // pinned only if a CPU client needs the result
   buffer = info.buffer->gpuWritePtr();
   // ... launch the batched computation on info.stream and record info.event
   if (info.copyAfterEvaluation) {
      // ... also register the result in the CPU data map for CPU-evaluated clients
   }
}
/// Decides which nodes are assigned to the GPU in a CUDA fit.
void Evaluator::markGPUNodes()
{
   // ...
   for (auto &info : _nodes) {
      info.copyAfterEvaluation = false;
      // A result only has to be copied between device and host if the node is vector-valued and
      // at least one of its clients is evaluated on the other side.
      if (!info.isScalar()) {
         for (auto *clientInfo : info.clientInfos) {
            if (info.computeInGPU() != clientInfo->computeInGPU()) {
               info.copyAfterEvaluation = true;
               break;
            }
         }
      }
   }
}
/// Print a table with one row per node of the computation graph.
void Evaluator::print(std::ostream &os) const
{
   os << "--- RooFit BatchMode evaluation ---\n";

   std::vector<int> widths{9, 37, 20, 9, 10, 20};

   auto printElement = [&](int iCol, auto const &t) {
      const char separator = ' ';
      os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
   };

   auto printHorizontalRow = [&]() {
      int n = 0;
      for (int w : widths) {
         n += w; // ... (plus per-column padding, elided)
      }
      for (int i = 0; i < n; i++) {
         os << '-';
      }
      os << "\n";
   };

   printHorizontalRow();

   printElement(0, "Index");
   printElement(1, "Name");
   printElement(2, "Class");
   printElement(3, "Size");
   printElement(4, "From Data");
   printElement(5, "1st value");
   os << "\n";

   printHorizontalRow();

   for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
      auto &nodeInfo = _nodes[iNode];
      RooAbsArg *node = nodeInfo.absArg;
      auto span = _dataMapCPU.at(node);

      printElement(0, iNode);
      printElement(1, node->GetName());
      printElement(2, node->ClassName());
      printElement(3, nodeInfo.outputSize);
      printElement(4, nodeInfo.fromArrayInput);
      printElement(5, span[0]);
      os << "\n";
   }

   printHorizontalRow();
}
/// Gets all the parameters of the RooAbsReal that is being evaluated.
RooArgSet Evaluator::getParameters() const
{
   RooArgSet parameters;
   for (auto &nodeInfo : _nodes) {
      if (!nodeInfo.fromArrayInput && nodeInfo.isVariable) {
         parameters.add(*nodeInfo.absArg);
      }
   }
   // Sort the parameters by name for a reproducible ordering.
   parameters.sort();
   return parameters;
}
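// Usage sketch (not part of this file): inspecting an evaluator after it received its input data.
// `evaluator` is assumed to be a fully set up RooFit::Evaluator.
void exampleInspect(RooFit::Evaluator const &evaluator)
{
   RooArgSet params = evaluator.getParameters(); // the floating variables of the graph, sorted by name
   evaluator.print(std::cout);                   // table with Index, Name, Class, Size, From Data, 1st value
   (void)params;
}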