51namespace Experimental {
67 static std::pair<RooFit::BatchModeOption, bool> lastBatchMode;
68 if (lastBatchMode.second && lastBatchMode.first == batchMode)
70 lastBatchMode = {batchMode,
true};
73 auto log = [](std::string_view message) {
78 throw std::runtime_error(std::string(
"In: ") + __func__ +
"(), " + __FILE__ +
":" + __LINE__ +
79 ": Cuda implementation of the computing library is not available\n");
82 log(
"using generic CPU library compiled with no vectorizations");
84 log(std::string(
"using CPU computation library compiled with -m") +
88 log(
"using CUDA computation library");
115 cudaEvent_t *
event =
nullptr;
149 : _topNode{const_cast<
RooAbsReal &>(absReal)}, _batchMode{batchMode}
163 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
167 std::size_t iNode = 0;
170 auto &nodeInfo =
_nodes[iNode];
171 nodeInfo.absArg = arg;
172 nodeInfo.iNode = iNode;
173 nodeInfos[arg] = &nodeInfo;
176 nodeInfo.isVariable =
true;
178 arg->setDataToken(iNode);
181 nodeInfo.isCategory =
true;
188 info.serverInfos.reserve(info.absArg->servers().size());
189 for (
RooAbsArg *server : info.absArg->servers()) {
190 if (server->isValueServer(*info.absArg)) {
191 auto *serverInfo = nodeInfos.at(server);
192 info.serverInfos.emplace_back(serverInfo);
193 serverInfo->clientInfos.emplace_back(&info);
202 for (
auto &info :
_nodes) {
215 std::size_t iValueServer = 0;
216 for (
RooAbsArg *server : info.absArg->servers()) {
217 if (server->isValueServer(*info.absArg)) {
218 auto *knownServer = info.serverInfos[iValueServer]->absArg;
219 if (knownServer->hasDataToken()) {
220 server->setDataToken(knownServer->dataToken());
229 bool skipZeroWeights,
bool takeGlobalObservablesFromData)
241 std::size_t totalSize = 0;
242 std::size_t iNode = 0;
243 for (
auto &info :
_nodes) {
246 info.buffer =
nullptr;
248 auto found = dataSpans.find(info.absArg->namePtr());
249 if (found != dataSpans.end()) {
250 info.absArg->setDataToken(iNode);
252 info.outputSize = found->second.size();
253 info.fromDataset =
true;
254 info.isDirty =
false;
255 totalSize += info.outputSize;
258 info.fromDataset =
false;
267 for (
auto &info :
_nodes) {
273 if (!info.isScalar()) {
286 for (
auto &info :
_nodes) {
287 if (!info.fromDataset)
289 std::size_t
size = info.outputSize;
304RooFitDriver::~RooFitDriver()
306 for (
auto &info : _nodes) {
307 info.absArg->resetDataToken();
315std::vector<double> RooFitDriver::getValues()
319 auto dataSpan = _dataMapCPU.at(&topNode());
320 std::vector<double> out;
321 out.reserve(dataSpan.size());
322 for (
auto const &
x : dataSpan) {
330 using namespace Detail;
332 auto nodeAbsReal =
static_cast<RooAbsReal const *
>(node);
336 double *buffer =
nullptr;
340 _dataMapCUDA.set(node, {buffer, nOut});
346 <<
" could not be evaluated on the GPU because the class doesn't support it. "
347 "Consider requesting or implementing it to benefit from a speed up."
353 : _bufferManager.makeCpuBuffer(nOut);
357 _dataMapCPU.set(node, {buffer, nOut});
358 nodeAbsReal->computeBatch(
nullptr, buffer, nOut, _dataMapCPU);
369void RooFitDriver::processVariable(
NodeInfo &nodeInfo)
372 auto *var =
static_cast<RooRealVar const *
>(node);
376 clientInfo->isDirty =
true;
378 computeCPUNode(node, nodeInfo);
385void RooFitDriver::setClientsDirty(
NodeInfo &nodeInfo)
388 clientInfo->isDirty =
true;
393double RooFitDriver::getVal()
395 ++_getValInvocations;
398 return getValHeterogeneous();
401 for (
auto &nodeInfo : _nodes) {
402 if (!nodeInfo.fromDataset) {
403 if (nodeInfo.isVariable) {
404 processVariable(nodeInfo);
406 if (nodeInfo.isDirty) {
407 setClientsDirty(nodeInfo);
408 computeCPUNode(nodeInfo.absArg, nodeInfo);
409 nodeInfo.isDirty =
false;
416 return _dataMapCPU.at(&topNode())[0];
420double RooFitDriver::getValHeterogeneous()
422 for (
auto &info : _nodes) {
423 info.remClients = info.clientInfos.size();
424 info.remServers = info.serverInfos.size();
427 info.buffer =
nullptr;
431 for (
auto &info : _nodes) {
432 if (info.remServers == 0 && info.computeInGPU()) {
437 NodeInfo const &topNodeInfo = _nodes.back();
440 for (
auto &info : _nodes) {
444 for (
auto *infoClient : info.clientInfos) {
445 --infoClient->remServers;
446 if (infoClient->computeInGPU() && infoClient->remServers == 0) {
447 assignToGPU(*infoClient);
450 for (
auto *serverInfo : info.serverInfos) {
451 serverInfo->decrementRemainingClients();
457 auto it = _nodes.begin();
458 for (; it != _nodes.end(); it++) {
459 if (it->remServers == 0 && !it->computeInGPU())
464 if (it == _nodes.end()) {
465 std::this_thread::sleep_for(std::chrono::milliseconds(1));
475 computeCPUNode(node, info);
480 if (--infoClient->remServers == 0 && infoClient->computeInGPU()) {
481 assignToGPU(*infoClient);
485 serverInfo->decrementRemainingClients();
490 return _dataMapCPU.at(&topNode())[0];
497 using namespace Detail;
504 if (infoServer->event)
510 double *buffer =
nullptr;
513 _dataMapCPU.set(node, {buffer, nOut});
516 : _bufferManager.makeGpuBuffer(nOut);
519 _dataMapCUDA.set(node, {buffer, nOut});
520 node->computeBatch(info.
stream, buffer, nOut, _dataMapCUDA);
528void RooFitDriver::markGPUNodes()
530 for (
auto &info : _nodes) {
531 info.copyAfterEvaluation =
false;
533 if (!info.isScalar()) {
534 for (
auto *clientInfo : info.clientInfos) {
535 if (info.computeInGPU() != clientInfo->computeInGPU()) {
536 info.copyAfterEvaluation =
true;
544void RooFitDriver::determineOutputSizes()
546 for (
auto &argInfo : _nodes) {
547 for (
auto *serverInfo : argInfo.serverInfos) {
548 if (!argInfo.absArg->isReducerNode()) {
549 argInfo.outputSize = std::max(serverInfo->outputSize, argInfo.outputSize);
560 _changeOperModeRAIIs.emplace(arg, opMode);
569void RooFitDriver::print(std::ostream &os)
const
571 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
573 std::vector<int> widths{9, 37, 20, 9, 10, 20};
575 auto printElement = [&](
int iCol,
auto const &t) {
576 const char separator =
' ';
577 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
581 auto printHorizontalRow = [&]() {
583 for (
int w : widths) {
586 for (
int i = 0; i <
n; i++) {
592 printHorizontalRow();
595 printElement(0,
"Index");
596 printElement(1,
"Name");
597 printElement(2,
"Class");
598 printElement(3,
"Size");
599 printElement(4,
"From Data");
600 printElement(5,
"1st value");
603 printHorizontalRow();
605 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
606 auto &nodeInfo = _nodes[iNode];
609 auto span = _dataMapCPU.at(node);
612 printElement(0, iNode);
613 printElement(1, node->
GetName());
615 printElement(3, nodeInfo.outputSize);
616 printElement(4, nodeInfo.fromDataset);
617 printElement(5, span[0]);
622 printHorizontalRow();
628 for (
auto &nodeInfo : _nodes) {
629 if (!nodeInfo.fromDataset && nodeInfo.isVariable) {
630 parameters.
add(*nodeInfo.absArg);
638RooAbsRealWrapper::RooAbsRealWrapper(std::unique_ptr<RooFitDriver> driver, std::string
const &rangeName,
640 :
RooAbsReal{
"RooFitDriverWrapper",
"RooFitDriverWrapper"},
641 _driver{std::move(driver)},
642 _topNode(
"topNode",
"top node", this, _driver->topNode()),
643 _rangeName{rangeName},
645 _takeGlobalObservablesFromData{takeGlobalObservablesFromData}
651 _driver{other._driver},
652 _topNode(
"topNode", this, other._topNode),
654 _rangeName{other._rangeName},
655 _simPdf{other._simPdf},
656 _takeGlobalObservablesFromData{other._takeGlobalObservablesFromData}
665 outputSet.
remove(*observables);
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
virtual double * gpuWritePtr()=0
virtual double * cpuWritePtr()=0
virtual double const * gpuReadPtr() const =0
virtual double const * cpuReadPtr() const =0
RooAbsRealWrapper(std::unique_ptr< RooFitDriver > driver, std::string const &rangeName, RooSimultaneous const *simPdf, bool takeGlobalObservablesFromData)
const bool _takeGlobalObservablesFromData
RooSimultaneous const * _simPdf
std::shared_ptr< RooFitDriver > _driver
bool setData(RooAbsData &data, bool cloneData) override
bool getParameters(const RooArgSet *observables, RooArgSet &outputSet, bool stripDisconnected) const override
Fills a list with leaf nodes in the arg tree starting with ourself as top node that don't match any o...
std::map< RooFit::Detail::DataKey, RooSpan< const double > > DataSpansMap
RooFitDriver(const RooAbsReal &absReal, RooFit::BatchModeOption batchMode=RooFit::BatchModeOption::Cpu)
Construct a new RooFitDriver.
const RooFit::BatchModeOption _batchMode
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the RooFitDriver gets deleted.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
RooFit::Detail::DataMap _dataMapCPU
RooFit::Detail::DataMap _dataMapCUDA
void setData(RooAbsData const &data, std::string const &rangeName="", RooSimultaneous const *simPdf=nullptr, bool skipZeroWeights=false, bool takeGlobalObservablesFromData=true)
void determineOutputSizes()
std::vector< NodeInfo > _nodes
std::stack< std::vector< double > > _vectorBuffers
RooAbsReal & topNode() const
RooAbsArg is the common abstract base class for objects that represent a value and a "shape" in RooFi...
virtual bool canComputeBatchWithCuda() const
virtual bool isReducerNode() const
OperMode operMode() const
Query the operation mode of this node.
A space to attach TBranches.
virtual bool remove(const RooAbsArg &var, bool silent=false, bool matchByNameOnly=false)
Remove the specified argument from our list.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::size_type size() const
virtual bool replace(const RooAbsArg &var1, const RooAbsArg &var2)
Replace var1 with var2 and return true for success.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
RooAbsData is the common abstract base class for binned and unbinned datasets.
RooArgSet const * getGlobalObservables() const
Returns snapshot of global observables stored in this data.
RooAbsReal is the common abstract base class for objects that represent a real value and implements f...
RooArgSet is a container object that can hold multiple RooAbsArg objects.
virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *)
virtual cudaEvent_t * newCudaEvent(bool)
virtual void * cudaMalloc(size_t)
virtual cudaStream_t * newCudaStream()
virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *)
virtual bool streamIsActive(cudaStream_t *)
virtual void cudaFree(void *)
virtual void deleteCudaStream(cudaStream_t *)
virtual void deleteCudaEvent(cudaEvent_t *)
virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t *=nullptr)
void set(RooAbsArg const *arg, RooSpan< const double > const &span)
RooSpan< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
auto resize(std::size_t n)
static RooMsgService & instance()
Return reference to singleton instance.
RooRealVar represents a variable that can be changed from the outside.
RooSimultaneous facilitates simultaneous fitting of multiple PDFs to subsets of a given dataset.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
This file contains a specialised ROOT message handler to test for diagnostics in unit tests.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
void init()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
std::map< RooFit::Detail::DataKey, RooSpan< const double > > getDataSpans(RooAbsData const &data, std::string const &rangeName, RooSimultaneous const *simPdf, bool skipZeroWeights, bool takeGlobalObservablesFromData, std::stack< std::vector< double > > &buffers)
Extract all content from a RooFit datasets as a map of spans.
BatchModeOption
For setting the batch mode flag with the BatchMode() command argument to RooAbsPdf::fitTo()
void getSortedComputationGraph(RooAbsReal const &func, RooArgSet &out)
Get the topologically-sorted list of all nodes in the computation graph.
A struct used by the RooFitDriver to store information on the RooAbsArgs in the computation graph.
std::vector< NodeInfo * > serverInfos
std::size_t lastSetValCount
bool computeInGPU() const
std::vector< NodeInfo * > clientInfos
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer n...
Detail::AbsBuffer * buffer