Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Evaluator.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file Evaluator.cxx
16\class RooFit::Evaluator
17\ingroup Roofitcore
18
19Evaluates a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
23computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The Evaluator class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFit/Evaluator.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
37#include <RooNameReg.h>
38#include <RooSimultaneous.h>
39
40#include <RooBatchCompute.h>
41
43#include "RooFitImplHelpers.h"
44
45#include <chrono>
46#include <iomanip>
47#include <numeric>
48#include <thread>
49
50namespace RooFit {
51
52namespace {
53
54// To avoid deleted move assignment.
55template <class T>
56void assignSpan(std::span<T> &to, std::span<T> const &from)
57{
58 to = from;
59}
60
// Logs (once per useGPU setting) which RooBatchCompute backend was loaded.
// NOTE(review): the signature line (embedded line 61) is missing from this
// excerpt — judging by the body it takes a `bool useGPU`. The `if` condition
// of embedded line 83 (selecting the generic-CPU branch) is also missing;
// presumably it checks whether the generic (non-vectorized) CPU library is in
// use — TODO confirm against the full source.
62{
63 // We have to exit early if the message stream is not active. Otherwise it's
64 // possible that this function skips logging because it thinks it has
65 // already logged, but actually it didn't.
66 if (!RooMsgService::instance().isActive(nullptr, RooFit::Fitting, RooFit::INFO)) {
67 return;
68 }
69
70 // Don't repeat logging architecture info if the useGPU option didn't change
71 {
72 // Second element of pair tracks whether this function has already been called
73 static std::pair<bool, bool> lastUseGPU;
74 if (lastUseGPU.second && lastUseGPU.first == useGPU)
75 return;
76 lastUseGPU = {useGPU, true};
77 }
78
// Helper lambda: routes messages to the RooFit INFO message stream.
79 auto log = [](std::string_view message) {
80 oocxcoutI(static_cast<RooAbsArg *>(nullptr), Fitting) << message << std::endl;
81 };
82
84 log("using generic CPU library compiled with no vectorizations");
85 } else {
86 log(std::string("using CPU computation library compiled with -m") + RooBatchCompute::cpuArchitectureName());
87 }
88 if (useGPU) {
89 log("using CUDA computation library");
90 }
91}
92
93} // namespace
94
95/// A struct used by the Evaluator to store information on the RooAbsArgs in
96/// the computation graph.
// NOTE(review): this excerpt is missing several declarations (embedded lines
// 102, 108, 123-124, 128, 135-141). Per the trailing cross-reference index,
// the missing members include `copyAfterEvaluation`, `originalOperMode`, the
// CUDA `stream`/`event` pointers, and the signature of
// decrementRemainingClients() — TODO confirm against the full source.
97struct NodeInfo {
98
// True if this node produces a single value rather than a batch.
99 bool isScalar() const { return outputSize == 1; }
100
// The RooFit object this node wraps (non-owning).
101 RooAbsArg *absArg = nullptr;
103
// Buffer holding this node's batch output; released when no client needs it.
104 std::shared_ptr<RooBatchCompute::AbsBuffer> buffer;
105 std::size_t iNode = 0;
// Countdown of clients/servers still pending during heterogeneous evaluation.
106 int remClients = 0;
107 int remServers = 0;
109 bool fromArrayInput = false;
110 bool isVariable = false;
111 bool isDirty = true;
112 bool isCategory = false;
113 bool hasLogged = false;
114 bool computeInGPU = false;
115 bool isValueServer = false; // if this node is a value server to the top node
116 std::size_t outputSize = 1;
// Sentinels (max value) mark "never seen", so the first comparison is a change.
117 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
118 int lastCatVal = std::numeric_limits<int>::max();
// Storage for scalar outputs, avoiding a heap buffer for single values.
119 double scalarBuffer = 0.0;
120 std::vector<NodeInfo *> serverInfos;
121 std::vector<NodeInfo *> clientInfos;
122
125
126 /// Check the servers of a node that has been computed and release its
127 /// resources if they are no longer needed.
129 {
130 if (--remClients == 0 && !fromArrayInput) {
131 buffer.reset();
132 }
133 }
134
142};
143
144/// Construct a new Evaluator. The constructor analyzes and saves metadata about the graph,
145/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
146/// there's also some CUDA-related initialization.
147///
148/// \param[in] absReal The RooAbsReal object that sits on top of the
149/// computation graph that we want to evaluate.
150/// \param[in] useGPU Whether the evaluation should be preferably done on the GPU.
// NOTE(review): the constructor's signature line (embedded 151,
// `Evaluator::Evaluator(const RooAbsReal &absReal, bool useGPU)` per the
// trailing index) and several statements (embedded 154, 159-167, 169, 221,
// 226-228) are missing from this excerpt — among them the declaration of
// `serverSet` used below (presumably filled via getSortedComputationGraph())
// and the creation of `cfg`/per-node CUDA streams. TODO confirm.
152 : _topNode{const_cast<RooAbsReal &>(absReal)}, _useGPU{useGPU}
153{
// Fail fast if CUDA was requested but the CUDA backend cannot be loaded.
155 if (useGPU && RooBatchCompute::initCUDA() != 0) {
156 throw std::runtime_error("Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
157 }
158 // Some checks and logging of used architectures
160

163

166

168 if (useGPU) {
170 }
171
172 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
173
174 // Fill the ordered nodes list and initialize the node info structs.
175 _nodes.reserve(serverSet.size());
176 std::size_t iNode = 0;
177 for (RooAbsArg *arg : serverSet) {
178
179 _nodes.emplace_back();
180 auto &nodeInfo = _nodes.back();
181 _nodesMap[arg->namePtr()] = &nodeInfo;
182
183 nodeInfo.absArg = arg;
184 nodeInfo.originalOperMode = arg->operMode();
185 nodeInfo.iNode = iNode;
186 nodeInfos[arg] = &nodeInfo;
187
// Variables get their data token assigned later (in setInput()); all other
// nodes are tagged with their position in the sorted node list now.
188 if (dynamic_cast<RooRealVar const *>(arg)) {
189 nodeInfo.isVariable = true;
190 } else {
191 arg->setDataToken(iNode);
192 }
193 if (dynamic_cast<RooAbsCategory const *>(arg)) {
194 nodeInfo.isCategory = true;
195 }
196
197 ++iNode;
198 }
199
// Wire up the client/server adjacency lists, restricted to value servers.
200 for (NodeInfo &info : _nodes) {
201 info.serverInfos.reserve(info.absArg->servers().size());
202 for (RooAbsArg *server : info.absArg->servers()) {
203 if (server->isValueServer(*info.absArg)) {
204 auto *serverInfo = nodeInfos.at(server);
205 info.serverInfos.emplace_back(serverInfo);
206 serverInfo->clientInfos.emplace_back(&info);
207 }
208 }
209 }
210
211 // Figure out which nodes are value servers to the top node
// The node list is topologically sorted with the top node last, so a single
// reverse sweep propagates the isValueServer flag to all transitive servers.
212 _nodes.back().isValueServer = true; // the top node itself
213 for (auto iter = _nodes.rbegin(); iter != _nodes.rend(); ++iter) {
214 if (!iter->isValueServer)
215 continue;
216 for (auto &serverInfo : iter->serverInfos) {
217 serverInfo->isValueServer = true;
218 }
219 }
220

222
223 if (_useGPU) {
224 // create events and streams for every node
225 for (auto &info : _nodes) {
229 cfg.setCudaStream(info.stream);
230 _evalContextCUDA.setConfig(info.absArg, cfg);
231 }
232 }
233}
234
235/// If there are servers with the same name that got de-duplicated in the
236/// `_nodes` list, we need to set their data tokens too. We find such nodes by
237/// visiting the servers of every known node.
// NOTE(review): the signature line (embedded 238) is missing from this
// excerpt; per the trailing index it is `void Evaluator::syncDataTokens()`.
239{
240 for (NodeInfo &info : _nodes) {
// Walks the value servers in the same order as serverInfos was filled in the
// constructor, so index iValueServer pairs each server with its known twin.
241 std::size_t iValueServer = 0;
242 for (RooAbsArg *server : info.absArg->servers()) {
243 if (server->isValueServer(*info.absArg)) {
244 auto *knownServer = info.serverInfos[iValueServer]->absArg;
245 if (knownServer->hasDataToken()) {
246 server->setDataToken(knownServer->dataToken());
247 }
248 ++iValueServer;
249 }
250 }
251 }
252}
253
/// Register an external data array as the output of the node called \p name.
///
/// \param[in] name       Name of the node in the computation graph to feed.
/// \param[in] inputArray Span over the data values (not copied; must outlive use).
/// \param[in] isOnDevice True if \p inputArray points to GPU device memory
///                       (only allowed in CUDA mode).
// NOTE(review): embedded lines 268, 277, 284-285, 293 and 299 are missing
// from this excerpt — presumably the _evalContext{CPU,CUDA}.set() calls that
// publish the raw span before the host/device mirroring below. TODO confirm.
254void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
255{
256 if (isOnDevice && !_useGPU) {
257 throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
258 }
259
260 // Check if "name" is used in the computation graph. If yes, add the span to
261 // the data map and set the node info accordingly.
262
263 auto found = _nodesMap.find(RooNameReg::ptr(name.c_str()));
264
// Silently ignore inputs that are not part of the graph.
265 if (found == _nodesMap.end())
266 return;
267

269
270 NodeInfo &info = *found->second;
271
272 info.fromArrayInput = true;
273 info.absArg->setDataToken(info.iNode);
274 info.outputSize = inputArray.size();
275
276 if (!_useGPU) {
278 return;
279 }
280
281 if (info.outputSize <= 1) {
282 // Empty or scalar observables from the data don't need to be
283 // copied to the GPU.
286 return;
287 }
288
289 // For simplicity, we put the data on both host and device for
290 // now. This could be optimized by inspecting the clients of the
291 // variable.
292 if (isOnDevice) {
// Data lives on the GPU: mirror it into a host-side buffer for CPU nodes.
294 auto gpuSpan = _evalContextCUDA.at(info.absArg);
295 info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
296 info.buffer->assignFromDevice(gpuSpan);
297 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
298 } else {
// Data lives on the host: mirror it into a device buffer for GPU nodes.
300 auto cpuSpan = _evalContextCPU.at(info.absArg);
301 info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
302 info.buffer->assignFromHost(cpuSpan);
303 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
304 }
305}
306
// Recompute the per-node output sizes after the registered data inputs have
// changed, invalidate stale buffers, and mark everything dirty.
// NOTE(review): the signature line (embedded 307) is missing from this
// excerpt — presumably something like `void Evaluator::updateOutputSizes()`
// (the header declares a `_needToUpdateOutputSizes` flag). Embedded lines
// 335 and 345 are also missing. TODO confirm against the full source.
308{
// Sizes of data-fed nodes are fixed by their input arrays; everything else
// must be re-derived from the graph.
309 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
310 for (auto &info : _nodes) {
311 if (info.fromArrayInput) {
312 sizeMap[info.absArg] = info.outputSize;
313 } else {
314 // any buffer for temporary results is invalidated by resetting the output sizes
315 info.buffer.reset();
316 }
317 }
318
// -1 tells determineOutputSizes() that the node's size is not externally fixed.
319 auto outputSizeMap =
320 RooFit::BatchModeDataHelpers::determineOutputSizes(_topNode, [&](RooFit::Detail::DataKey key) -> int {
321 auto found = sizeMap.find(key);
322 return found != sizeMap.end() ? found->second : -1;
323 });
324
325 for (auto &info : _nodes) {
326 info.outputSize = outputSizeMap.at(info.absArg);
327 info.isDirty = true;
328
329 // In principle we don't need dirty flag propagation because the driver
330 // takes care of deciding which node needs to be re-evaluated. However,
331 // disabling it also for scalar mode results in very long fitting times
332 // for specific models (test 14 in stressRooFit), which still needs to be
333 // understood. TODO.
334 if (!info.isScalar()) {
336 } else {
337 setOperMode(info.absArg, info.originalOperMode);
338 }
339 }
340
341 if (_useGPU) {
342 markGPUNodes();
343 }
344

346}
347
// Clears the data tokens that the constructor assigned to the non-variable
// nodes, restoring the wrapped RooAbsArgs to their pre-Evaluator state.
// NOTE(review): the signature line (embedded 348) is missing from this
// excerpt — presumably the destructor `Evaluator::~Evaluator()` or a
// token-reset helper. TODO confirm against the full source.
349{
350 for (auto &info : _nodes) {
351 if (!info.isVariable) {
352 info.absArg->resetDataToken();
353 }
354 }
355}
356
// Evaluate a single non-data node on the CPU, writing into either the scalar
// buffer or a (possibly pinned) batch buffer, and publish the result to the
// CPU — and, when needed, CUDA — evaluation contexts.
// NOTE(review): the signature line (embedded 357) is missing; per the
// trailing index it is
// `void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)`.
// Embedded lines 384, 387, 398-401 and 405 are also missing — presumably the
// actual `doEval(_evalContextCPU)` call and CUDA event recording. TODO confirm.
358{
359 using namespace Detail;
360
361 const std::size_t nOut = info.outputSize;
362
363 double *buffer = nullptr;
364 if (nOut == 1) {
// Scalar result: use the in-struct buffer, no heap allocation needed.
365 buffer = &info.scalarBuffer;
366 if (_useGPU) {
367 _evalContextCUDA.set(node, {buffer, nOut});
368 }
369 } else {
// In CUDA mode, warn once per node when a batch node falls back to the CPU.
370 if (!info.hasLogged && _useGPU) {
371 RooAbsArg const &arg = *info.absArg;
372 oocoutI(&arg, FastEvaluations) << "The argument " << arg.ClassName() << "::" << arg.GetName()
373 << " could not be evaluated on the GPU because the class doesn't support it. "
374 "Consider requesting or implementing it to benefit from a speed up."
375 << std::endl;
376 info.hasLogged = true;
377 }
// Pinned host memory is used when the result must be copied to the device.
378 if (!info.buffer) {
379 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
380 : _bufferManager->makeCpuBuffer(nOut);
381 }
382 buffer = info.buffer->hostWritePtr();
383 }
385 _evalContextCPU.set(node, {buffer, nOut});
386 if (nOut > 1) {
388 }
389 if (info.isCategory) {
390 auto nodeAbsCategory = static_cast<RooAbsCategory const *>(node);
391 if (nOut == 1) {
// Category values are exported as their integer index, cast to double.
392 buffer[0] = nodeAbsCategory->getCurrentIndex();
393 } else {
394 throw std::runtime_error("RooFit::Evaluator - non-scalar category values are not supported!");
395 }
396 } else {
397 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
399 }
402 if (info.copyAfterEvaluation) {
403 _evalContextCUDA.set(node, {info.buffer->deviceReadPtr(), nOut});
404 if (info.event) {
406 }
407 }
408}
409
410/// Process a variable in the computation graph. This is a separate non-inlined
411/// function such that we can see in performance profiles how long this takes.
// NOTE(review): the signature line (embedded 412) is missing; per the
// trailing index it is `void Evaluator::processVariable(NodeInfo &nodeInfo)`.
// Embedded line 421 (presumably the write of the variable's value into the
// evaluation context) is also missing. TODO confirm.
413{
414 RooAbsArg *node = nodeInfo.absArg;
415 auto *var = static_cast<RooRealVar const *>(node);
// The reset counter cheaply detects setVal() calls since the last run();
// only then do the clients need re-evaluation.
416 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
417 nodeInfo.lastSetValCount = var->valueResetCounter();
418 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
419 clientInfo->isDirty = true;
420 }
422 nodeInfo.isDirty = false;
423 }
424}
425
426/// Process a category in the computation graph. This is a separate non-inlined
427/// function such that we can see in performance profiles how long this takes.
// NOTE(review): the signature line (embedded 428) is missing; per the
// trailing index it is `void Evaluator::processCategory(NodeInfo &nodeInfo)`.
// Embedded line 437 (presumably the write of the category index into the
// evaluation context) is also missing. TODO confirm.
429{
430 RooAbsArg *node = nodeInfo.absArg;
431 auto *cat = static_cast<RooAbsCategory const *>(node);
// Re-evaluate the clients only if the category index actually changed.
432 if (nodeInfo.lastCatVal != cat->getCurrentIndex()) {
433 nodeInfo.lastCatVal = cat->getCurrentIndex();
434 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
435 clientInfo->isDirty = true;
436 }
438 nodeInfo.isDirty = false;
439 }
440}
441
442/// Flags all the clients of a given node dirty. This is a separate non-inlined
443/// function such that we can see in performance profiles how long this takes.
// NOTE(review): the signature line (embedded 444) is missing; per the
// trailing index it is `void Evaluator::setClientsDirty(NodeInfo &nodeInfo)`.
445{
446 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
447 clientInfo->isDirty = true;
448 }
449}
450
451/// Returns the value of the top node in the computation graph
// NOTE(review): embedded lines 454-457, 466, 468 and 471-472 are missing
// from this excerpt — presumably the calls to processVariable(),
// processCategory() and computeCPUNode() inside the branches below, plus
// setup at the top (e.g. a _needToUpdateOutputSizes check). TODO confirm.
452std::span<const double> Evaluator::run()
453{

456

458
// CUDA mode delegates to the heterogeneous CPU/GPU scheduler.
459 if (_useGPU) {
460 return getValHeterogeneous();
461 }
462
// CPU-only path: visit the topologically-sorted nodes once; data-fed nodes
// are already up to date and are skipped.
463 for (auto &nodeInfo : _nodes) {
464 if (!nodeInfo.fromArrayInput) {
465 if (nodeInfo.isVariable) {
467 } else if (nodeInfo.isCategory) {
469 } else {
470 if (nodeInfo.isDirty) {
473 nodeInfo.isDirty = false;
474 }
475 }
476 }
477 }
478
479 // return the final output
480 return _evalContextCPU.at(&_topNode);
481}
482
483/// Returns the value of the top node in the computation graph
// Schedules the graph across CPU and GPU: GPU nodes are launched
// asynchronously on their streams while ready CPU nodes are computed in
// between; remServers counts down until each node becomes ready.
// NOTE(review): embedded lines 497, 511, 545 and — critically — the final
// return statement (embedded 554) are missing from this excerpt; the missing
// calls are presumably assignToGPU() for ready GPU nodes and the return of
// the top node's span. TODO confirm against the full source.
484std::span<const double> Evaluator::getValHeterogeneous()
485{
486 for (auto &info : _nodes) {
487 info.remClients = info.clientInfos.size();
488 info.remServers = info.serverInfos.size();
// Stale temporary buffers from a previous run are dropped; data-input
// buffers are kept since their content does not change between runs.
489 if (info.buffer && !info.fromArrayInput) {
490 info.buffer.reset();
491 }
492 }
493
494 // find initial GPU nodes and assign them to GPU
495 for (auto &info : _nodes) {
496 if (info.remServers == 0 && info.computeInGPU) {
498 }
499 }
500
// remServers state machine: >0 waiting, 0 ready, -1 running on GPU,
// -2 finished. The loop ends once the top node reaches -2.
501 NodeInfo const &topNodeInfo = _nodes.back();
502 while (topNodeInfo.remServers != -2) {
503 // find finished GPU nodes
504 for (auto &info : _nodes) {
505 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->cudaStreamIsActive(info.stream)) {
506 info.remServers = -2;
507 // Decrement number of remaining servers for clients and start GPU computations
508 for (auto *infoClient : info.clientInfos) {
509 --infoClient->remServers;
510 if (infoClient->computeInGPU && infoClient->remServers == 0) {
512 }
513 }
514 for (auto *serverInfo : info.serverInfos) {
515 serverInfo->decrementRemainingClients();
516 }
517 }
518 }
519
520 // find next CPU node
521 auto it = _nodes.begin();
522 for (; it != _nodes.end(); it++) {
523 if (it->remServers == 0 && !it->computeInGPU)
524 break;
525 }
526
527 // if no CPU node available sleep for a while to save CPU usage
528 if (it == _nodes.end()) {
529 std::this_thread::sleep_for(std::chrono::milliseconds(1));
530 continue;
531 }
532
533 // compute next CPU node
534 NodeInfo &info = *it;
535 RooAbsArg const *node = info.absArg;
536 info.remServers = -2; // so that it doesn't get picked again
537
538 if (!info.fromArrayInput) {
539 computeCPUNode(node, info);
540 }
541
542 // Assign the clients that are computed on the GPU
543 for (auto *infoClient : info.clientInfos) {
544 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
546 }
547 }
548 for (auto *serverInfo : info.serverInfos) {
549 serverInfo->decrementRemainingClients();
550 }
551 }
552
553 // return the final value
555}
556
557/// Assign a node to be computed in the GPU. Scan its clients and also assign them
558/// in case they only depend on GPU nodes.
// NOTE(review): the signature line (embedded 559) is missing; per the
// trailing index it is `void Evaluator::assignToGPU(NodeInfo &info)`.
// Embedded lines 570, 584 and 587 are also missing — presumably the
// cudaStreamWaitForEvent() call for each server and CUDA event recording
// after the launch. TODO confirm against the full source.
560{
561 using namespace Detail;

// Mark the node as currently running on the GPU (see getValHeterogeneous()).
563 info.remServers = -1;
564
565 auto node = static_cast<RooAbsReal const *>(info.absArg);
566
567 // wait for every server to finish
568 for (auto *infoServer : info.serverInfos) {
569 if (infoServer->event)
571 }
572
573 const std::size_t nOut = info.outputSize;
574
575 double *buffer = nullptr;
576 if (nOut == 1) {
// Scalar results stay in the in-struct buffer, shared with the CPU context.
577 buffer = &info.scalarBuffer;
578 _evalContextCPU.set(node, {buffer, nOut});
579 } else {
// Pinned host memory when the result must also be visible on the CPU side.
580 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
581 : _bufferManager->makeGpuBuffer(nOut);
582 buffer = info.buffer->deviceWritePtr();
583 }
585 _evalContextCUDA.set(node, {buffer, nOut});
586 node->doEval(_evalContextCUDA);
588 if (info.copyAfterEvaluation) {
589 _evalContextCPU.set(node, {info.buffer->hostReadPtr(), nOut});
590 }
591}
592
593/// Decides which nodes are assigned to the GPU in a CUDA fit.
// NOTE(review): the signature line (embedded 594) is missing; per the
// trailing index it is `void Evaluator::markGPUNodes()`. The body itself
// appears complete.
595{
596 // Decide which nodes get evaluated on the GPU: we select nodes that support
597 // CUDA evaluation and have at least one input of size greater than one.
598 for (auto &info : _nodes) {
599 info.computeInGPU = false;
600 if (!info.absArg->canComputeBatchWithCuda()) {
601 continue;
602 }
603 for (NodeInfo const *serverInfo : info.serverInfos) {
604 if (serverInfo->outputSize > 1) {
605 info.computeInGPU = true;
606 break;
607 }
608 }
609 }
610
611 // In a second pass, figure out which nodes need to copy over their results.
// A copy is needed exactly when a batch node and one of its clients live on
// different devices.
612 for (auto &info : _nodes) {
613 info.copyAfterEvaluation = false;
614 // scalar nodes don't need copying
615 if (!info.isScalar()) {
616 for (auto *clientInfo : info.clientInfos) {
617 if (info.computeInGPU != clientInfo->computeInGPU) {
618 info.copyAfterEvaluation = true;
619 break;
620 }
621 }
622 }
623 }
624}
625
626/// Temporarily change the operation mode of a RooAbsArg until the
627/// Evaluator gets deleted.
// NOTE(review): the signature line (embedded 628) is missing; per the
// trailing index it is
// `void Evaluator::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)`.
629{
// The RAII object on the stack restores the original mode when the
// Evaluator (and thus _changeOperModeRAIIs) is destroyed.
630 if (opMode != arg->operMode()) {
631 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg, opMode));
632 }
633}
634
/// Print a table of all nodes in the computation graph to \p os: index,
/// name, class, output size, whether the node is fed from data, and the
/// first value of its current output span.
///
/// \param[in] os Output stream for the table rows.
// NOTE(review): embedded lines 658, 669 and 688 are missing — presumably the
// printHorizontalRow() calls framing the header and footer. Also note the
// banner and row terminators go to std::cout while the cells go to `os`, so
// output splits across streams when os != std::cout — looks intentional in
// the original, but TODO confirm.
635void Evaluator::print(std::ostream &os)
636{
637 std::cout << "--- RooFit BatchMode evaluation ---\n";
638
// Fixed column widths for: index, name, class, size, from-data, 1st value.
639 std::vector<int> widths{9, 37, 20, 9, 10, 20};
640
641 auto printElement = [&](int iCol, auto const &t) {
642 const char separator = ' ';
643 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
644 os << "|";
645 };
646
647 auto printHorizontalRow = [&]() {
648 int n = 0;
649 for (int w : widths) {
650 n += w + 2;
651 }
652 for (int i = 0; i < n; i++) {
653 os << '-';
654 }
655 os << "|\n";
656 };
657

659
660 os << "|";
661 printElement(0, "Index");
662 printElement(1, "Name");
663 printElement(2, "Class");
664 printElement(3, "Size");
665 printElement(4, "From Data");
666 printElement(5, "1st value");
667 std::cout << "\n";
668

670
671 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
672 auto &nodeInfo = _nodes[iNode];
673 RooAbsArg *node = nodeInfo.absArg;
674
675 auto span = _evalContextCPU.at(node);
676
677 os << "|";
678 printElement(0, iNode);
679 printElement(1, node->GetName());
680 printElement(2, node->ClassName());
681 printElement(3, nodeInfo.outputSize);
682 printElement(4, nodeInfo.fromArrayInput);
683 printElement(5, span[0]);
684
685 std::cout << "\n";
686 }
687

689}
690
691/// Gets all the parameters of the RooAbsReal. This is in principle not
692/// necessary, because we can always ask the RooAbsReal itself, but the
693/// Evaluator has the cached information to get the answer quicker.
694/// Therefore, this is not meant to be used in general, just where it matters.
695/// \warning If we find another solution to get the parameters efficiently,
696/// this function might be removed without notice.
// NOTE(review): the signature line (embedded 697) is missing; per the
// trailing index it is `RooArgSet Evaluator::getParameters() const`.
698{
699 RooArgSet parameters;
// Parameters are the fundamental (leaf) nodes that serve the top node's value.
700 for (auto &nodeInfo : _nodes) {
701 if (nodeInfo.isValueServer && nodeInfo.absArg->isFundamental()) {
702 parameters.add(*nodeInfo.absArg);
703 }
704 }
705 // Just like in RooAbsArg::getParameters(), we sort the parameters alphabetically.
706 parameters.sort();
707 return parameters;
708}
709
710/// \brief Sets the offset mode for evaluation.
711///
712/// This function sets the offset mode for evaluation to the specified mode.
713/// It updates the offset mode for both CPU and CUDA evaluation contexts.
714///
715/// \param mode The offset mode to be set.
716///
717/// \note This function marks reducer nodes as dirty if the offset mode is
718/// changed, because only reducer nodes can use offsetting.
// NOTE(review): the signature line (embedded 719,
// `void Evaluator::setOffsetMode(RooFit::EvalContext::OffsetMode)` per the
// trailing index), the early-exit condition (embedded 721), and the
// assignments to the CPU/CUDA contexts' offset modes (embedded 724-725) are
// missing from this excerpt. TODO confirm against the full source.
720{
// Early exit when the mode is unchanged (condition line missing above).
722 return;
723

726
// Only reducer nodes can use offsetting, so only they need re-evaluation.
727 for (auto &nodeInfo : _nodes) {
728 if (nodeInfo.absArg->isReducerNode()) {
729 nodeInfo.isDirty = true;
730 }
731 }
732}
733
734} // namespace RooFit
#define oocoutI(o, a)
#define oocxcoutI(o, a)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char mode
char name[80]
Definition TGX11.cxx:110
const_iterator begin() const
const_iterator end() const
Common abstract base class for objects that represent a value and a "shape" in RooFit.
Definition RooAbsArg.h:76
OperMode operMode() const
Query the operation mode of this node.
Definition RooAbsArg.h:419
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
Definition RooAbsReal.h:63
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition RooArgSet.h:24
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
Definition EvalContext.h:91
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void enableVectorBuffers(bool enable)
OffsetMode _offsetMode
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
std::span< double > _currentOutput
void resize(std::size_t n)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
const bool _useGPU
Definition Evaluator.h:62
std::unordered_map< TNamed const *, NodeInfo * > _nodesMap
Definition Evaluator.h:68
std::vector< NodeInfo > _nodes
Definition Evaluator.h:67
bool _needToUpdateOutputSizes
Definition Evaluator.h:64
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
void processCategory(NodeInfo &nodeInfo)
Process a category in the computation graph.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
Definition Evaluator.h:60
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
Definition Evaluator.h:66
RooFit::EvalContext _evalContextCPU
Definition Evaluator.h:65
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
std::stack< std::unique_ptr< ChangeOperModeRAII > > _changeOperModeRAIIs
Definition Evaluator.h:69
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
RooAbsReal & _topNode
Definition Evaluator.h:61
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
Definition RooRealVar.h:37
const char * GetName() const override
Returns name of object.
Definition TNamed.h:49
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition TObject.cxx:226
const Int_t n
Definition legend1.C:16
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition CodegenImpl.h:67
@ FastEvaluations
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
Definition Evaluator.cxx:97
RooBatchCompute::CudaInterface::CudaStream * stream
RooAbsArg * absArg
bool isScalar() const
Definition Evaluator.cxx:99
std::size_t iNode
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::size_t outputSize
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...