56void assignSpan(std::span<T> &to, std::span<T>
const &from)
61void logArchitectureInfo(
bool useGPU)
73 static std::pair<bool, bool> lastUseGPU;
74 if (lastUseGPU.second && lastUseGPU.first == useGPU)
76 lastUseGPU = {useGPU,
true};
79 auto log = [](std::string_view message) {
80 oocxcoutI(
static_cast<RooAbsArg *
>(
nullptr),
Fitting) << message << std::endl;
84 log(
"using generic CPU library compiled with no vectorizations");
89 log(
"using CUDA computation library");
106 std::shared_ptr<RooBatchCompute::AbsBuffer>
buffer;
155 throw std::runtime_error(
"Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
171 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
175 std::size_t iNode = 0;
179 auto &nodeInfo =
_nodes.back();
180 nodeInfo.absArg = arg;
181 nodeInfo.originalOperMode = arg->operMode();
182 nodeInfo.iNode = iNode;
183 nodeInfos[arg] = &nodeInfo;
186 nodeInfo.isVariable =
true;
188 arg->setDataToken(iNode);
191 nodeInfo.isCategory =
true;
198 info.serverInfos.reserve(info.absArg->servers().size());
200 if (server->isValueServer(*info.absArg)) {
201 auto *serverInfo = nodeInfos.at(server);
202 info.serverInfos.emplace_back(serverInfo);
203 serverInfo->clientInfos.emplace_back(&info);
212 for (
auto &info :
_nodes) {
228 std::size_t iValueServer = 0;
230 if (server->isValueServer(*info.absArg)) {
231 auto *knownServer = info.serverInfos[iValueServer]->absArg;
232 if (knownServer->hasDataToken()) {
233 server->setDataToken(knownServer->dataToken());
244 throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
252 std::size_t iNode = 0;
253 for (
auto &info :
_nodes) {
254 const bool fromArrayInput = info.absArg->namePtr() == namePtr;
255 if (fromArrayInput) {
256 info.fromArrayInput =
true;
257 info.absArg->setDataToken(iNode);
258 info.outputSize = inputArray.size();
259 if (
_useGPU && info.outputSize <= 1) {
264 }
else if (
_useGPU && info.outputSize > 1) {
272 info.buffer->assignFromDevice(gpuSpan);
273 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
278 info.buffer->assignFromHost(cpuSpan);
279 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
285 info.isDirty = !info.fromArrayInput;
294 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
295 for (
auto &info :
_nodes) {
296 if (info.fromArrayInput) {
297 sizeMap[info.absArg] = info.outputSize;
306 auto found = sizeMap.find(key);
307 return found != sizeMap.end() ? found->second : -1;
310 for (
auto &info :
_nodes) {
311 info.outputSize = outputSizeMap.at(info.absArg);
318 if (!info.isScalar()) {
334 for (
auto &info :
_nodes) {
335 if(!info.isVariable) {
336 info.absArg->resetDataToken();
345 auto nodeAbsReal =
static_cast<RooAbsReal const *
>(node);
349 double *buffer =
nullptr;
359 <<
" could not be evaluated on the GPU because the class doesn't support it. "
360 "Consider requesting or implementing it to benefit from a speed up."
368 buffer = info.
buffer->hostWritePtr();
391 auto *var =
static_cast<RooRealVar const *
>(node);
423 for (
auto &nodeInfo :
_nodes) {
424 if (!nodeInfo.fromArrayInput) {
425 if (nodeInfo.isVariable) {
428 if (nodeInfo.isDirty) {
431 nodeInfo.isDirty =
false;
444 for (
auto &info :
_nodes) {
445 info.remClients = info.clientInfos.size();
446 info.remServers = info.serverInfos.size();
447 if (info.buffer && !info.fromArrayInput) {
453 for (
auto &info :
_nodes) {
454 if (info.remServers == 0 && info.computeInGPU()) {
462 for (
auto &info :
_nodes) {
464 info.remServers = -2;
466 for (
auto *infoClient : info.clientInfos) {
467 --infoClient->remServers;
468 if (infoClient->computeInGPU() && infoClient->remServers == 0) {
472 for (
auto *serverInfo : info.serverInfos) {
473 serverInfo->decrementRemainingClients();
480 for (; it !=
_nodes.end(); it++) {
481 if (it->remServers == 0 && !it->computeInGPU())
487 std::this_thread::sleep_for(std::chrono::milliseconds(1));
502 if (--infoClient->remServers == 0 && infoClient->computeInGPU()) {
507 serverInfo->decrementRemainingClients();
527 if (infoServer->event)
533 double *buffer =
nullptr;
540 buffer = info.
buffer->deviceWritePtr();
554 for (
auto &info :
_nodes) {
555 info.copyAfterEvaluation =
false;
557 if (!info.isScalar()) {
558 for (
auto *clientInfo : info.clientInfos) {
559 if (info.computeInGPU() != clientInfo->computeInGPU()) {
560 info.copyAfterEvaluation =
true;
579 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
581 std::vector<int> widths{9, 37, 20, 9, 10, 20};
583 auto printElement = [&](
int iCol,
auto const &t) {
584 const char separator =
' ';
585 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
589 auto printHorizontalRow = [&]() {
591 for (
int w : widths) {
594 for (
int i = 0;
i <
n;
i++) {
600 printHorizontalRow();
603 printElement(0,
"Index");
604 printElement(1,
"Name");
605 printElement(2,
"Class");
606 printElement(3,
"Size");
607 printElement(4,
"From Data");
608 printElement(5,
"1st value");
611 printHorizontalRow();
613 for (std::size_t iNode = 0; iNode <
_nodes.size(); ++iNode) {
614 auto &nodeInfo =
_nodes[iNode];
620 printElement(0, iNode);
621 printElement(1, node->
GetName());
623 printElement(3, nodeInfo.outputSize);
624 printElement(4, nodeInfo.fromArrayInput);
625 printElement(5, span[0]);
630 printHorizontalRow();
642 for (
auto &nodeInfo :
_nodes) {
643 if (nodeInfo.isVariable) {
644 parameters.
add(*nodeInfo.absArg);
669 for (
auto &nodeInfo :
_nodes) {
670 if (nodeInfo.absArg->isReducerNode()) {
671 nodeInfo.isDirty =
true;
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void w
Option_t Option_t TPoint TPoint const char mode
Common abstract base class for objects that represent a value and a "shape" in RooFit.
TIterator Use servers() and begin()
OperMode operMode() const
Query the operation mode of this node.
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::size_type size() const
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
std::vector< NodeInfo > _nodes
bool _needToUpdateOutputSizes
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
RooFit::EvalContext _evalContextCPU
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
std::stack< std::unique_ptr< ChangeOperModeRAII > > _changeOperModeRAIIs
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
RVec< PromoteType< T > > log(const RVec< T > &v)
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
RooBatchCompute::CudaInterface::CudaStream * stream
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
bool computeInGPU() const
RooAbsArg::OperMode originalOperMode
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...