Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Evaluator.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file Evaluator.cxx
16\class RooFit::Evaluator
17\ingroup Roofitcore
18
19Evaluates a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
23computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The Evaluator class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFit/Evaluator.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
37#include <RooNameReg.h>
38#include <RooSimultaneous.h>
39
40#include <RooBatchCompute.h>
41
43#include "RooFitImplHelpers.h"
44
45#include <chrono>
46#include <iomanip>
47#include <numeric>
48#include <thread>
49
50namespace RooFit {
51
52namespace {
53
54// To avoid deleted move assignment.
55template <class T>
56void assignSpan(std::span<T> &to, std::span<T> const &from)
57{
58 to = from;
59}
60
/// Log which RooBatchCompute libraries are in use, at most once per change of
/// the `useGPU` flag. Nothing is logged (and no state is recorded) while the
/// Fitting/INFO message stream is inactive.
/// \param[in] useGPU Whether the CUDA computation library is used as well.
61void logArchitectureInfo(bool useGPU)
62{
63 // We have to exit early if the message stream is not active. Otherwise it's
64 // possible that this function skips logging because it thinks it has
65 // already logged, but actually it didn't.
66 if (!RooMsgService::instance().isActive(nullptr, RooFit::Fitting, RooFit::INFO)) {
67 return;
68 }
69
70 // Don't repeat logging architecture info if the useGPU option didn't change
71 {
72 // Second element of pair tracks whether this function has already been called
73 static std::pair<bool, bool> lastUseGPU;
74 if (lastUseGPU.second && lastUseGPU.first == useGPU)
75 return;
76 lastUseGPU = {useGPU, true};
77 }
78
// Helper that sends a single message line to the Fitting INFO stream.
79 auto log = [](std::string_view message) {
80 oocxcoutI(static_cast<RooAbsArg *>(nullptr), Fitting) << message << std::endl;
81 };
82
// NOTE(review): listing line 83 is absent from this extract. It presumably
// holds the `if (...) {` that opens the branch closed by `} else {` below --
// confirm against the original file.
84 log("using generic CPU library compiled with no vectorizations");
85 } else {
86 log(std::string("using CPU computation library compiled with -m") + RooBatchCompute::cpuArchitectureName());
87 }
// The CUDA library is reported in addition to the CPU library, not instead of it.
88 if (useGPU) {
89 log("using CUDA computation library");
90 }
91}
92
93} // namespace
94
95/// A struct used by the Evaluator to store information on the RooAbsArgs in
96/// the computation graph.
// NOTE(review): several listing lines of this struct (104, 110, 122-123, 127,
// 134, 137, 139) are absent from this extract. According to the member index
// of this page they include `copyAfterEvaluation`, `originalOperMode`, `event`,
// `stream` and the signature of `decrementRemainingClients()` -- confirm
// against the original file before editing.
97struct NodeInfo {
98
// A node is scalar when its output consists of exactly one value.
99 bool isScalar() const { return outputSize == 1; }
100
// True if the node is a reducer node or has a non-scalar output, and its
// RooAbsArg reports that it can compute batches with CUDA.
101 bool computeInGPU() const { return (absArg->isReducerNode() || !isScalar()) && absArg->canComputeBatchWithCuda(); }
102
// The graph node described by this struct.
103 RooAbsArg *absArg = nullptr;
105
// Storage for non-scalar results; reset when no longer needed.
106 std::shared_ptr<RooBatchCompute::AbsBuffer> buffer;
// Index of this node; assigned in the Evaluator constructor.
107 std::size_t iNode = 0;
// Scheduling counters, re-initialized at the start of every heterogeneous evaluation.
108 int remClients = 0;
109 int remServers = 0;
// Set when the node's values were supplied via Evaluator::setInput().
111 bool fromArrayInput = false;
// Set in the Evaluator constructor when absArg is a RooRealVar.
112 bool isVariable = false;
113 bool isDirty = true;
// Set in the Evaluator constructor when absArg is a RooAbsCategory.
114 bool isCategory = false;
// Ensures the "could not be evaluated on the GPU" message is logged only once.
115 bool hasLogged = false;
116 std::size_t outputSize = 1;
// Last value-reset counter seen for a variable; the max() default does not
// equal a freshly started counter, so the first check triggers an evaluation
// (presumably the intent -- confirm).
117 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
// Storage for the result of a node with a single output value.
118 double scalarBuffer = 0.0;
// Value servers and clients of this node; filled in the Evaluator constructor.
119 std::vector<NodeInfo *> serverInfos;
120 std::vector<NodeInfo *> clientInfos;
121
124
125 /// Check the servers of a node that has been computed and release its
126 /// resources if they are no longer needed.
// Implementation detail: drops the buffer once the last remaining client is
// done, unless the data came from an input array.
128 {
129 if (--remClients == 0 && !fromArrayInput) {
130 buffer.reset();
131 }
132 }
133
// NOTE(review): the signature (listing line 134) and the statements guarded by
// the two `if`s (lines 137, 139) are absent from this extract; presumably this
// is the destructor releasing the CUDA event and stream -- confirm.
135 {
136 if (event)
138 if (stream)
140 }
141};
142
143/// Construct a new Evaluator. The constructor analyzes and saves metadata about the graph,
144/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
145/// there's also some CUDA-related initialization.
146///
147/// \param[in] absReal The RooAbsReal object that sits on top of the
148/// computation graph that we want to evaluate.
149/// \param[in] useGPU Whether the evaluation should be preferably done on the GPU.
/// \throws std::runtime_error if `useGPU` is set but RooBatchCompute::initCUDA() fails.
// NOTE(review): listing lines 153, 160-161, 164, 208 and 214-215 are absent
// from this extract, so some statements of this constructor are not visible
// here -- confirm against the original file before editing.
150Evaluator::Evaluator(const RooAbsReal &absReal, bool useGPU)
151 : _topNode{const_cast<RooAbsReal &>(absReal)}, _useGPU{useGPU}
152{
154 if (useGPU && RooBatchCompute::initCUDA() != 0) {
155 throw std::runtime_error("Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
156 }
157 // Some checks and logging of used architectures
158 logArchitectureInfo(_useGPU);
159
162
// NOTE(review): the statement filling `serverSet` (listing line 164) is not
// visible; presumably it collects the sorted computation graph of the top node.
163 RooArgSet serverSet;
165
166 _evalContextCPU.resize(serverSet.size());
167 if (useGPU) {
168 _evalContextCUDA.resize(serverSet.size());
169 }
170
// Lookup from graph node to its NodeInfo, used below to wire servers and clients.
171 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
172
173 // Fill the ordered nodes list and initialize the node info structs.
// Reserving up front keeps the NodeInfo addresses stored in `nodeInfos` valid
// while `_nodes` is being filled.
174 _nodes.reserve(serverSet.size());
175 std::size_t iNode = 0;
176 for (RooAbsArg *arg : serverSet) {
177
178 _nodes.emplace_back();
179 auto &nodeInfo = _nodes.back();
180 nodeInfo.absArg = arg;
181 nodeInfo.originalOperMode = arg->operMode();
182 nodeInfo.iNode = iNode;
183 nodeInfos[arg] = &nodeInfo;
184
// RooRealVars are flagged as variables and get no data token here; every
// other node gets its index as data token.
185 if (dynamic_cast<RooRealVar const *>(arg)) {
186 nodeInfo.isVariable = true;
187 } else {
188 arg->setDataToken(iNode);
189 }
190 if (dynamic_cast<RooAbsCategory const *>(arg)) {
191 nodeInfo.isCategory = true;
192 }
193
194 ++iNode;
195 }
196
// Connect every node with its value servers, and register it as a client of each.
197 for (NodeInfo &info : _nodes) {
198 info.serverInfos.reserve(info.absArg->servers().size());
199 for (RooAbsArg *server : info.absArg->servers()) {
200 if (server->isValueServer(*info.absArg)) {
201 auto *serverInfo = nodeInfos.at(server);
202 info.serverInfos.emplace_back(serverInfo);
203 serverInfo->clientInfos.emplace_back(&info);
204 }
205 }
206 }
207
209
210 if (_useGPU) {
211 // create events and streams for every node
212 for (auto &info : _nodes) {
213 info.event = RooBatchCompute::dispatchCUDA->newCudaEvent(false);
// NOTE(review): listing lines 214-215 are not visible; presumably they create
// `info.stream` and declare the configuration object `cfg` used below.
216 cfg.setCudaStream(info.stream);
217 _evalContextCUDA.setConfig(info.absArg, cfg);
218 }
219 }
220}
221
222/// If there are servers with the same name that got de-duplicated in the
223/// `_nodes` list, we need to set their data tokens too. We find such nodes by
224/// visiting the servers of every known node.
// NOTE(review): the signature (listing line 225) is absent from this extract;
// the member index of this page lists it as `void syncDataTokens()`.
226{
227 for (NodeInfo &info : _nodes) {
// Position within `info.serverInfos`, which holds one entry per value server
// in the order the servers are iterated.
228 std::size_t iValueServer = 0;
229 for (RooAbsArg *server : info.absArg->servers()) {
230 if (server->isValueServer(*info.absArg)) {
231 auto *knownServer = info.serverInfos[iValueServer]->absArg;
// Only propagate a token if the known node actually has one.
232 if (knownServer->hasDataToken()) {
233 server->setDataToken(knownServer->dataToken());
234 }
235 ++iValueServer;
236 }
237 }
238 }
239}
240
241void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
242{
243 if (isOnDevice && !_useGPU) {
244 throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
245 }
246
247 auto namePtr = RooNameReg::ptr(name.c_str());
248
249 // Iterate over the given data spans and add them to the data map. Check if
250 // they are used in the computation graph. If yes, add the span to the data
251 // map and set the node info accordingly.
252 std::size_t iNode = 0;
253 for (auto &info : _nodes) {
254 const bool fromArrayInput = info.absArg->namePtr() == namePtr;
255 if (fromArrayInput) {
256 info.fromArrayInput = true;
257 info.absArg->setDataToken(iNode);
258 info.outputSize = inputArray.size();
259 if (_useGPU && info.outputSize <= 1) {
260 // Empty or scalar observables from the data don't need to be
261 // copied to the GPU.
262 _evalContextCPU.set(info.absArg, inputArray);
263 _evalContextCUDA.set(info.absArg, inputArray);
264 } else if (_useGPU && info.outputSize > 1) {
265 // For simplicity, we put the data on both host and device for
266 // now. This could be optimized by inspecting the clients of the
267 // variable.
268 if (isOnDevice) {
269 _evalContextCUDA.set(info.absArg, inputArray);
270 auto gpuSpan = _evalContextCUDA.at(info.absArg);
271 info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
272 info.buffer->assignFromDevice(gpuSpan);
273 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
274 } else {
275 _evalContextCPU.set(info.absArg, inputArray);
276 auto cpuSpan = _evalContextCPU.at(info.absArg);
277 info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
278 info.buffer->assignFromHost(cpuSpan);
279 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
280 }
281 } else {
282 _evalContextCPU.set(info.absArg, inputArray);
283 }
284 }
285 info.isDirty = !info.fromArrayInput;
286 ++iNode;
287 }
288
289 _needToUpdateOutputSizes = true;
290}
291
292void Evaluator::updateOutputSizes()
293{
294 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
295 for (auto &info : _nodes) {
296 if (info.fromArrayInput) {
297 sizeMap[info.absArg] = info.outputSize;
298 } else {
299 // any buffer for temporary results is invalidated by resetting the output sizes
300 info.buffer.reset();
301 }
302 }
303
304 auto outputSizeMap =
305 RooFit::Detail::BatchModeDataHelpers::determineOutputSizes(_topNode, [&](RooFit::Detail::DataKey key) -> int {
306 auto found = sizeMap.find(key);
307 return found != sizeMap.end() ? found->second : -1;
308 });
309
310 for (auto &info : _nodes) {
311 info.outputSize = outputSizeMap.at(info.absArg);
312
313 // In principle we don't need dirty flag propagation because the driver
314 // takes care of deciding which node needs to be re-evaluated. However,
315 // disabling it also for scalar mode results in very long fitting times
316 // for specific models (test 14 in stressRooFit), which still needs to be
317 // understood. TODO.
318 if (!info.isScalar()) {
319 setOperMode(info.absArg, RooAbsArg::ADirty);
320 } else {
321 setOperMode(info.absArg, info.originalOperMode);
322 }
323 }
324
325 if (_useGPU) {
326 markGPUNodes();
327 }
328
329 _needToUpdateOutputSizes = false;
330}
331
332Evaluator::~Evaluator()
333{
334 for (auto &info : _nodes) {
335 if(!info.isVariable) {
336 info.absArg->resetDataToken();
337 }
338 }
339}
340
/// Evaluate a single node on the CPU and register its output span in the CPU
/// evaluation context.
/// \param[in] node The node to evaluate; it is cast to RooAbsReal for the evaluation.
/// \param[in,out] info The bookkeeping struct of that node; provides the
///                output storage and is updated with buffer and logging state.
// NOTE(review): listing line 381 is absent from this extract, so the statement
// guarded by `if (info.event)` near the end is not visible here.
341void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
342{
343 using namespace Detail;
344
345 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
346
347 const std::size_t nOut = info.outputSize;
348
349 double *buffer = nullptr;
350 if (nOut == 1) {
// A single output value is stored directly in the NodeInfo. In CUDA mode the
// same storage is also registered with the CUDA evaluation context.
351 buffer = &info.scalarBuffer;
352 if (_useGPU) {
353 _evalContextCUDA.set(node, {buffer, nOut});
354 }
355 } else {
// Reaching this branch in CUDA mode means a non-scalar node is evaluated on
// the CPU; tell the user once per node.
356 if (!info.hasLogged && _useGPU) {
357 RooAbsArg const &arg = *info.absArg;
358 oocoutI(&arg, FastEvaluations) << "The argument " << arg.ClassName() << "::" << arg.GetName()
359 << " could not be evaluated on the GPU because the class doesn't support it. "
360 "Consider requesting or implementing it to benefit from a speed up."
361 << std::endl;
362 info.hasLogged = true;
363 }
// Allocate the output buffer lazily; a pinned buffer is requested when the
// result has to be copied after the evaluation.
364 if (!info.buffer) {
365 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
366 : _bufferManager->makeCpuBuffer(nOut);
367 }
368 buffer = info.buffer->hostWritePtr();
369 }
370 assignSpan(_evalContextCPU._currentOutput, {buffer, nOut});
371 _evalContextCPU.set(node, {buffer, nOut});
// Vector buffers are only enabled for non-scalar outputs and are always
// reset and disabled again after the evaluation.
372 if (nOut > 1) {
373 _evalContextCPU.enableVectorBuffers(true);
374 }
375 nodeAbsReal->doEval(_evalContextCPU);
376 _evalContextCPU.resetVectorBuffers();
377 _evalContextCPU.enableVectorBuffers(false);
378 if (info.copyAfterEvaluation) {
// Make the result available to the CUDA evaluation context as well.
379 _evalContextCUDA.set(node, {info.buffer->deviceReadPtr(), nOut});
380 if (info.event) {
382 }
383 }
384}
385
386/// Process a variable in the computation graph. This is a separate non-inlined
387/// function such that we can see in performance profiles how long this takes.
388void Evaluator::processVariable(NodeInfo &nodeInfo)
389{
390 RooAbsArg *node = nodeInfo.absArg;
391 auto *var = static_cast<RooRealVar const *>(node);
392 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
393 nodeInfo.lastSetValCount = var->valueResetCounter();
394 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
395 clientInfo->isDirty = true;
396 }
397 computeCPUNode(node, nodeInfo);
398 nodeInfo.isDirty = false;
399 }
400}
401
402/// Flags all the clients of a given node dirty. This is a separate non-inlined
403/// function such that we can see in performance profiles how long this takes.
404void Evaluator::setClientsDirty(NodeInfo &nodeInfo)
405{
406 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
407 clientInfo->isDirty = true;
408 }
409}
410
411/// Returns the value of the top node in the computation graph
412std::span<const double> Evaluator::run()
413{
414 if (_needToUpdateOutputSizes)
415 updateOutputSizes();
416
417 ++_nEvaluations;
418
419 if (_useGPU) {
420 return getValHeterogeneous();
421 }
422
423 for (auto &nodeInfo : _nodes) {
424 if (!nodeInfo.fromArrayInput) {
425 if (nodeInfo.isVariable) {
426 processVariable(nodeInfo);
427 } else {
428 if (nodeInfo.isDirty) {
429 setClientsDirty(nodeInfo);
430 computeCPUNode(nodeInfo.absArg, nodeInfo);
431 nodeInfo.isDirty = false;
432 }
433 }
434 }
435 }
436
437 // return the final output
438 return _evalContextCPU.at(&_topNode);
439}
440
441/// Returns the value of the top node in the computation graph
/// This is the evaluation path taken by run() in CUDA mode: nodes are
/// dispatched to the GPU or computed on the CPU as soon as all of their
/// servers are done.
///
/// The `remServers` counter of a node doubles as a state flag: a positive
/// value is the number of unfinished servers, 0 means ready to be computed,
/// -1 is set by assignToGPU() when the node is handed to the GPU, and -2
/// marks a node that is finished (or was already picked for the CPU).
442std::span<const double> Evaluator::getValHeterogeneous()
443{
// Reset the scheduling counters and drop result buffers of the previous run.
444 for (auto &info : _nodes) {
445 info.remClients = info.clientInfos.size();
446 info.remServers = info.serverInfos.size();
447 if (info.buffer && !info.fromArrayInput) {
448 info.buffer.reset();
449 }
450 }
451
452 // find initial GPU nodes and assign them to GPU
453 for (auto &info : _nodes) {
454 if (info.remServers == 0 && info.computeInGPU()) {
455 assignToGPU(info);
456 }
457 }
458
// The top node is the last entry of `_nodes`; loop until it is finished.
459 NodeInfo const &topNodeInfo = _nodes.back();
460 while (topNodeInfo.remServers != -2) {
461 // find finished GPU nodes
462 for (auto &info : _nodes) {
463 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->cudaStreamIsActive(info.stream)) {
464 info.remServers = -2;
465 // Decrement number of remaining servers for clients and start GPU computations
466 for (auto *infoClient : info.clientInfos) {
467 --infoClient->remServers;
468 if (infoClient->computeInGPU() && infoClient->remServers == 0) {
469 assignToGPU(*infoClient);
470 }
471 }
// This node no longer needs its servers, so they may release their buffers.
472 for (auto *serverInfo : info.serverInfos) {
473 serverInfo->decrementRemainingClients();
474 }
475 }
476 }
477
478 // find next CPU node
479 auto it = _nodes.begin();
480 for (; it != _nodes.end(); it++) {
481 if (it->remServers == 0 && !it->computeInGPU())
482 break;
483 }
484
485 // if no CPU node available sleep for a while to save CPU usage
486 if (it == _nodes.end()) {
487 std::this_thread::sleep_for(std::chrono::milliseconds(1));
488 continue;
489 }
490
491 // compute next CPU node
492 NodeInfo &info = *it;
493 RooAbsArg const *node = info.absArg;
494 info.remServers = -2; // so that it doesn't get picked again
495
// Nodes fed from input arrays already hold their values and are not evaluated.
496 if (!info.fromArrayInput) {
497 computeCPUNode(node, info);
498 }
499
500 // Assign the clients that are computed on the GPU
501 for (auto *infoClient : info.clientInfos) {
502 if (--infoClient->remServers == 0 && infoClient->computeInGPU()) {
503 assignToGPU(*infoClient);
504 }
505 }
506 for (auto *serverInfo : info.serverInfos) {
507 serverInfo->decrementRemainingClients();
508 }
509 }
510
511 // return the final value
512 return _evalContextCUDA.at(&_topNode);
513}
514
515/// Assign a node to be computed in the GPU. Scan its clients and also assign them
516/// in case they only depend on GPU nodes.
/// \param[in,out] info The bookkeeping struct of the node to evaluate.
// NOTE(review): the body visible here does not scan the clients itself; the
// callers in getValHeterogeneous() assign clients once their servers are done.
// NOTE(review): listing lines 528 and 545 are absent from this extract, so the
// statement guarded by `if (infoServer->event)` and one statement after the
// evaluation are not visible here.
517void Evaluator::assignToGPU(NodeInfo &info)
518{
519 using namespace Detail;
520
// Flag the node as handed to the GPU; getValHeterogeneous() checks for -1.
521 info.remServers = -1;
522
523 auto node = static_cast<RooAbsReal const *>(info.absArg);
524
525 // wait for every server to finish
526 for (auto *infoServer : info.serverInfos) {
527 if (infoServer->event)
529 }
530
531 const std::size_t nOut = info.outputSize;
532
533 double *buffer = nullptr;
534 if (nOut == 1) {
// A single output value is stored directly in the NodeInfo and is also
// registered with the CPU evaluation context.
535 buffer = &info.scalarBuffer;
536 _evalContextCPU.set(node, {buffer, nOut});
537 } else {
// A pinned buffer is requested when the result has to be copied after the
// evaluation, a plain GPU buffer otherwise.
538 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
539 : _bufferManager->makeGpuBuffer(nOut);
540 buffer = info.buffer->deviceWritePtr();
541 }
542 assignSpan(_evalContextCUDA._currentOutput, {buffer, nOut});
543 _evalContextCUDA.set(node, {buffer, nOut});
544 node->doEval(_evalContextCUDA);
546 if (info.copyAfterEvaluation) {
// Make the result available to the CPU evaluation context as well.
547 _evalContextCPU.set(node, {info.buffer->hostReadPtr(), nOut});
548 }
549}
550
551/// Decides which nodes are assigned to the GPU in a CUDA fit.
552void Evaluator::markGPUNodes()
553{
554 for (auto &info : _nodes) {
555 info.copyAfterEvaluation = false;
556 // scalar nodes don't need copying
557 if (!info.isScalar()) {
558 for (auto *clientInfo : info.clientInfos) {
559 if (info.computeInGPU() != clientInfo->computeInGPU()) {
560 info.copyAfterEvaluation = true;
561 break;
562 }
563 }
564 }
565 }
566}
567
568/// Temporarily change the operation mode of a RooAbsArg until the
569/// Evaluator gets deleted.
570void Evaluator::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
571{
572 if (opMode != arg->operMode()) {
573 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg, opMode));
574 }
575}
576
577void Evaluator::print(std::ostream &os)
578{
579 std::cout << "--- RooFit BatchMode evaluation ---\n";
580
581 std::vector<int> widths{9, 37, 20, 9, 10, 20};
582
583 auto printElement = [&](int iCol, auto const &t) {
584 const char separator = ' ';
585 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
586 os << "|";
587 };
588
589 auto printHorizontalRow = [&]() {
590 int n = 0;
591 for (int w : widths) {
592 n += w + 2;
593 }
594 for (int i = 0; i < n; i++) {
595 os << '-';
596 }
597 os << "|\n";
598 };
599
600 printHorizontalRow();
601
602 os << "|";
603 printElement(0, "Index");
604 printElement(1, "Name");
605 printElement(2, "Class");
606 printElement(3, "Size");
607 printElement(4, "From Data");
608 printElement(5, "1st value");
609 std::cout << "\n";
610
611 printHorizontalRow();
612
613 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
614 auto &nodeInfo = _nodes[iNode];
615 RooAbsArg *node = nodeInfo.absArg;
616
617 auto span = _evalContextCPU.at(node);
618
619 os << "|";
620 printElement(0, iNode);
621 printElement(1, node->GetName());
622 printElement(2, node->ClassName());
623 printElement(3, nodeInfo.outputSize);
624 printElement(4, nodeInfo.fromArrayInput);
625 printElement(5, span[0]);
626
627 std::cout << "\n";
628 }
629
630 printHorizontalRow();
631}
632
633/// Gets all the parameters of the RooAbsReal. This is in principle not
634/// necessary, because we can always ask the RooAbsReal itself, but the
635/// Evaluator has the cached information to get the answer quicker.
636/// Therefore, this is not meant to be used in general, just where it matters.
637/// \warning If we find another solution to get the parameters efficiently,
638/// this function might be removed without notice.
639RooArgSet Evaluator::getParameters() const
640{
641 RooArgSet parameters;
642 for (auto &nodeInfo : _nodes) {
643 if (nodeInfo.isVariable) {
644 parameters.add(*nodeInfo.absArg);
645 }
646 }
647 // Just like in RooAbsArg::getParameters(), we sort the parameters alphabetically.
648 parameters.sort();
649 return parameters;
650}
651
652/// \brief Sets the offset mode for evaluation.
653///
654/// This function sets the offset mode for evaluation to the specified mode.
655/// It updates the offset mode for both CPU and CUDA evaluation contexts.
656///
657/// \param mode The offset mode to be set.
658///
659/// \note This function marks reducer nodes as dirty if the offset mode is
660/// changed, because only reducer nodes can use offsetting.
661void Evaluator::setOffsetMode(RooFit::EvalContext::OffsetMode mode)
662{
663 if (mode == _evalContextCPU._offsetMode)
664 return;
665
666 _evalContextCPU._offsetMode = mode;
667 _evalContextCUDA._offsetMode = mode;
668
669 for (auto &nodeInfo : _nodes) {
670 if (nodeInfo.absArg->isReducerNode()) {
671 nodeInfo.isDirty = true;
672 }
673 }
674}
675
676} // namespace RooFit
#define oocoutI(o, a)
#define oocxcoutI(o, a)
Option_t Option_t TPoint TPoint const char mode
char name[80]
Definition TGX11.cxx:110
Common abstract base class for objects that represent a value and a "shape" in RooFit.
Definition RooAbsArg.h:77
virtual bool canComputeBatchWithCuda() const
Definition RooAbsArg.h:539
virtual bool isReducerNode() const
Definition RooAbsArg.h:540
OperMode operMode() const
Query the operation mode of this node.
Definition RooAbsArg.h:447
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::size_type size() const
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
Definition RooAbsReal.h:59
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition RooArgSet.h:24
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
Definition EvalContext.h:91
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
void resize(std::size_t n)
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
const bool _useGPU
Definition Evaluator.h:61
std::vector< NodeInfo > _nodes
Definition Evaluator.h:66
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
Definition Evaluator.h:59
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
Definition Evaluator.h:65
RooFit::EvalContext _evalContextCPU
Definition Evaluator.h:64
RooAbsReal & _topNode
Definition Evaluator.h:60
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
Definition RooRealVar.h:37
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition TObject.cxx:207
RVec< PromoteType< T > > log(const RVec< T > &v)
Definition RVec.hxx:1841
const Int_t n
Definition legend1.C:16
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition JSONIO.h:26
@ FastEvaluations
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
Definition Evaluator.cxx:97
RooBatchCompute::CudaInterface::CudaStream * stream
RooAbsArg * absArg
bool isScalar() const
Definition Evaluator.cxx:99
std::size_t iNode
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
bool computeInGPU() const
RooAbsArg::OperMode originalOperMode
std::size_t outputSize
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...