56void assignSpan(std::span<T> &to, std::span<T>
const &from)
79 auto log = [](std::string_view message) {
84 log(
"using generic CPU library compiled with no vectorizations");
89 log(
"using CUDA computation library");
104 std::shared_ptr<RooBatchCompute::AbsBuffer>
buffer;
154 throw std::runtime_error(
"Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
170 std::map<RooFit::Detail::DataKey, NodeInfo *>
nodeInfos;
174 std::size_t iNode = 0;
180 nodeInfo.originalOperMode = arg->operMode();
187 arg->setDataToken(iNode);
197 info.serverInfos.reserve(
info.absArg->servers().size());
243 throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
251 std::size_t iNode = 0;
253 const bool fromArrayInput =
info.absArg->namePtr() == namePtr;
254 if (fromArrayInput) {
255 info.fromArrayInput =
true;
256 info.absArg->setDataToken(iNode);
284 info.isDirty = !
info.fromArrayInput;
288 _needToUpdateOutputSizes =
true;
291void Evaluator::updateOutputSizes()
293 std::map<RooFit::Detail::DataKey, std::size_t>
sizeMap;
294 for (
auto &
info : _nodes) {
295 if (
info.fromArrayInput) {
305 auto found =
sizeMap.find(key);
306 return found !=
sizeMap.
end() ? found->second : -1;
309 for (
auto &
info : _nodes) {
317 if (!
info.isScalar()) {
320 setOperMode(
info.absArg,
info.originalOperMode);
328 _needToUpdateOutputSizes =
false;
331Evaluator::~Evaluator()
333 for (
auto &
info : _nodes) {
334 if (!
info.isVariable) {
335 info.absArg->resetDataToken();
342 using namespace Detail;
344 const std::size_t
nOut =
info.outputSize;
346 double *buffer =
nullptr;
348 buffer = &
info.scalarBuffer;
350 _evalContextCUDA.set(node, {buffer,
nOut});
353 if (!
info.hasLogged && _useGPU) {
356 <<
" could not be evaluated on the GPU because the class doesn't support it. "
357 "Consider requesting or implementing it to benefit from a speed up."
359 info.hasLogged =
true;
362 info.buffer =
info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(
nOut,
info.stream)
363 : _bufferManager->makeCpuBuffer(
nOut);
365 buffer =
info.buffer->hostWritePtr();
367 assignSpan(_evalContextCPU._currentOutput, {buffer, nOut});
368 _evalContextCPU.set(node, {buffer,
nOut});
370 _evalContextCPU.enableVectorBuffers(
true);
372 if (
info.isCategory) {
377 throw std::runtime_error(
"RooFit::Evaluator - non-scalar category values are not supported!");
383 _evalContextCPU.resetVectorBuffers();
384 _evalContextCPU.enableVectorBuffers(
false);
385 if (
info.copyAfterEvaluation) {
386 _evalContextCUDA.set(node, {
info.buffer->deviceReadPtr(),
nOut});
398 auto *var =
static_cast<RooRealVar const *
>(node);
399 if (
nodeInfo.lastSetValCount != var->valueResetCounter()) {
400 nodeInfo.lastSetValCount = var->valueResetCounter();
419std::span<const double> Evaluator::run()
421 if (_needToUpdateOutputSizes)
427 return getValHeterogeneous();
445 return _evalContextCPU.at(&_topNode);
449std::span<const double> Evaluator::getValHeterogeneous()
451 for (
auto &
info : _nodes) {
452 info.remClients =
info.clientInfos.size();
453 info.remServers =
info.serverInfos.size();
454 if (
info.buffer && !
info.fromArrayInput) {
460 for (
auto &
info : _nodes) {
461 if (
info.remServers == 0 &&
info.computeInGPU) {
469 for (
auto &
info : _nodes) {
471 info.remServers = -2;
486 auto it = _nodes.
begin();
487 for (; it != _nodes.end(); it++) {
488 if (it->remServers == 0 && !it->computeInGPU)
493 if (it == _nodes.end()) {
494 std::this_thread::sleep_for(std::chrono::milliseconds(1));
501 info.remServers = -2;
503 if (!
info.fromArrayInput) {
504 computeCPUNode(node,
info);
519 return _evalContextCUDA.at(&_topNode);
526 using namespace Detail;
528 info.remServers = -1;
538 const std::size_t
nOut =
info.outputSize;
540 double *buffer =
nullptr;
542 buffer = &
info.scalarBuffer;
543 _evalContextCPU.set(node, {buffer,
nOut});
545 info.buffer =
info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(
nOut,
info.stream)
546 : _bufferManager->makeGpuBuffer(
nOut);
547 buffer =
info.buffer->deviceWritePtr();
549 assignSpan(_evalContextCUDA._currentOutput, {buffer, nOut});
550 _evalContextCUDA.set(node, {buffer,
nOut});
551 node->doEval(_evalContextCUDA);
553 if (
info.copyAfterEvaluation) {
554 _evalContextCPU.set(node, {
info.buffer->hostReadPtr(),
nOut});
559void Evaluator::markGPUNodes()
563 for (
auto &
info : _nodes) {
564 info.computeInGPU =
false;
565 if (!
info.absArg->canComputeBatchWithCuda()) {
570 info.computeInGPU =
true;
577 for (
auto &
info : _nodes) {
578 info.copyAfterEvaluation =
false;
580 if (!
info.isScalar()) {
583 info.copyAfterEvaluation =
true;
596 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg,
opMode));
600void Evaluator::print(std::ostream &os)
602 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
604 std::vector<int>
widths{9, 37, 20, 9, 10, 20};
607 const char separator =
' ';
608 os << separator << std::left << std::setw(
widths[
iCol]) << std::setfill(separator) << t;
617 for (
int i = 0; i <
n; i++) {
636 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
640 auto span = _evalContextCPU.at(node);
666 if (
nodeInfo.absArg->isFundamental()) {
686 if (
mode == _evalContextCPU._offsetMode)
689 _evalContextCPU._offsetMode =
mode;
690 _evalContextCUDA._offsetMode =
mode;
693 if (
nodeInfo.absArg->isReducerNode()) {
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char mode
const_iterator begin() const
const_iterator end() const
Common abstract base class for objects that represent a value and a "shape" in RooFit.
OperMode operMode() const
Query the operation mode of this node.
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
void resize(std::size_t n)
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
std::vector< NodeInfo > _nodes
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
RooFit::EvalContext _evalContextCPU
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
RooBatchCompute::CudaInterface::CudaStream * stream
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...