Logo ROOT  
Reference Guide
RooFitDriver.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file RooFitDriver.cxx
16\class RooFitDriver
17\ingroup Roofitcore
18
19This class can evaluate a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The RooFitDriver class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFitDriver.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooArgList.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
40#include <RooFit/CUDAHelpers.h>
42
44
45#include <iomanip>
46#include <numeric>
47#include <thread>
48#include <unordered_set>
49
50namespace ROOT {
51namespace Experimental {
52
/// A struct used by the RooFitDriver to store information on the RooAbsArgs in
/// the computation graph.
struct NodeInfo {

   // No copying because of the owned CUDA pointers and buffers
   NodeInfo(const NodeInfo &) = delete;
   NodeInfo &operator=(const NodeInfo &) = delete;

   /// Check the servers of a node that has been computed and release its resources
   /// if they are no longer needed.
   // NOTE(review): the function signature (decrementRemainingClients(), per the
   // member index at the bottom of this page) is missing from this extraction.
   {
      if (--remClients == 0) {
         // No client needs this node's output anymore: free the buffer.
         buffer.reset();
      }
   }

   RooAbsArg *absArg = nullptr; // the graph node this info describes (non-owning)

   std::unique_ptr<Detail::AbsBuffer> buffer; // owned buffer holding this node's batch output

   cudaEvent_t *event = nullptr;      // CUDA event marking the end of this node's computation
   cudaEvent_t *eventStart = nullptr; // CUDA event marking the start (used for timing)
   cudaStream_t *stream = nullptr;    // CUDA stream this node's kernels are launched on
   std::chrono::microseconds cpuTime{0}; // measured CPU evaluation time
   std::chrono::microseconds cudaTime{std::chrono::microseconds::max()}; // measured CUDA evaluation time
   std::chrono::microseconds timeLaunched{-1}; // simulated launch time; negative means not launched yet
   int nClients = 0;   // number of clients inside the computation graph
   int nServers = 0;   // number of servers inside the computation graph
   int remClients = 0; // clients not yet processed (countdown during evaluation)
   int remServers = 0; // servers not yet processed (countdown during evaluation)
   bool computeInScalarMode = false; // evaluate with the traditional scalar interface
   bool computeInGPU = false;        // evaluate on the GPU
   bool copyAfterEvaluation = false; // result must be copied to the other device after evaluation
   bool fromDataset = false;         // values come directly from the dataset, no computation
   std::size_t outputSize = 1;       // number of output values (1 for scalar nodes)
   // NOTE(review): the destructor signature and the statements guarded by the
   // conditions below are missing from this extraction; presumably the owned
   // CUDA event/stream objects are deleted here — confirm against the original.
   {
      if (event)
      if (eventStart)
      if (stream)
   }
};
101
/// Construct a new RooFitDriver. The constructor analyzes and saves metadata about the graph,
/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
/// there's also some CUDA-related initialization.
///
/// \param[in] absReal The RooAbsReal object that sits on top of the
/// computation graph that we want to evaluate.
/// \param[in] normSet Normalization set for the evaluation
/// \param[in] batchMode The computation mode, accepted values are
/// `RooBatchCompute::Cpu` and `RooBatchCompute::Cuda`.
// NOTE(review): the constructor signature line is missing from this extraction
// (per the member index: RooFitDriver(const RooAbsReal &absReal,
// RooArgSet const &normSet, RooFit::BatchModeOption batchMode) — confirm).
   : _batchMode{batchMode}
{
   // Wrap the top-level node so normalization integrals are unfolded into the graph.
   _integralUnfolder = std::make_unique<RooFit::NormalizationIntegralUnfolder>(absReal, normSet);

   // Initialize RooBatchCompute
   // NOTE(review): the init call itself is missing from this extraction.

   // Some checks and logging of used architectures
   // NOTE(review): the logging call is missing from this extraction.

   // Get the set of nodes in the computation graph. Do the detour via
   // RooArgList to avoid deduplication done after adding each element.
   RooArgList serverList;
   topNode().treeNodeServerList(&serverList, nullptr, true, true, false, true);
   // If we fill the servers in reverse order, they are approximately in
   // topological order so we save a bit of work in sortTopologically().
   RooArgSet serverSet;
   serverSet.add(serverList.rbegin(), serverList.rend(), /*silent=*/true);
   // Sort nodes topologically: the servers of any node will be before that
   // node in the collection.
   serverSet.sortTopologically();

   // Fill the ordered nodes list and initialize the node info structs.
   for (RooAbsArg *arg : serverSet) {
      _orderedNodes.add(*arg);
      _nodeInfos[arg]; // default-constructs a NodeInfo entry for this node
   }

   // NOTE(review): a line is missing here in this extraction — presumably a
   // check that the CUDA mode is active before the CUDA-specific setup below.
   // For the CUDA mode, we need to keep track of the number of servers and
   // clients of each node.
   for (RooAbsArg *arg : _orderedNodes) {
      auto &argInfo = _nodeInfos.at(arg);
      argInfo.absArg = arg;

      for (auto *client : arg->clients()) {
         // we use containsInstance instead of find to match by pointer and not name
         if (!serverSet.containsInstance(*client))
            continue;

         auto &clientInfo = _nodeInfos.at(client);

         ++clientInfo.nServers;
         ++argInfo.nClients;
      }
   }

   // create events and streams for every node
   for (auto &item : _nodeInfos) {
      item.second.event = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
      item.second.eventStart = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
      item.second.stream = RooBatchCompute::dispatchCUDA->newCudaStream();
   }
   }
}
167
// NOTE(review): the first line of this overload's signature is missing from
// this extraction (per the member index: setData(RooAbsData const &data,
// std::string_view rangeName, RooAbsCategory const *indexCatForSplitting)).
                    RooAbsCategory const *indexCatForSplitting)
{
   // Convert the dataset into spans and delegate to the span-based overload.
   setData(RooFit::BatchModeDataHelpers::getDataSpans(data, rangeName, indexCatForSplitting, _vectorBuffers));
}
173
// NOTE(review): the signature line of this overload is missing from this
// extraction (per the member index it takes a DataSpansMap, i.e. a
// std::map<const TNamed *, RooSpan<const double>>).
{
   // Guard against calling setData() a second time: the CPU data map would
   // already be filled while CUDA initialization has not happened yet.
   if (!_dataMapCPU.empty() && _dataMapCUDA.empty()) {
      throw std::runtime_error("You can call RooFitDriver::setData() only on a freshly-constructed RooFitDriver!");
   }

   // Iterate over the given data spans and add them to the data map. Check if
   // they are used in the computation graph. If yes, add the span to the data
   // map and set the node info accordingly.
   for (auto const &span : dataSpans) {
      _dataMapCPU[span.first] = span.second;
      auto found = _nodeInfos.find(span.first);
      if (found != _nodeInfos.end()) {
         auto &argInfo = found->second;
         argInfo.outputSize = span.second.size();
         argInfo.fromDataset = true;
      }
   }

   // NOTE(review): a call is missing here in this extraction — presumably
   // determineOutputSizes(), which propagates output sizes through the graph.

   for (auto *arg : _orderedNodes) {
      auto &info = _nodeInfos.at(arg);

      // If the node evaluation doesn't involve a loop over entries, we can
      // always use the scalar mode.
      info.computeInScalarMode = info.outputSize == 1 && !arg->isReducerNode();

      // We don't need dirty flag propagation for nodes evaluated in batch
      // mode, because the driver takes care of deciding which node needs to be
      // re-evaluated. However, dirty flag propagation must be kept for reducer
      // nodes, because their clients are evaluated in scalar mode.
      if (!info.computeInScalarMode && !arg->isReducerNode()) {
         // NOTE(review): the statement inside this branch (presumably a
         // setOperMode() call) is missing from this extraction.
      }
   }

   // Extra steps for initializing in cuda mode
   // NOTE(review): the condition guarding this early return (presumably "not
   // in CUDA mode") is missing from this extraction.
      return;

   // Total number of values in the dataset, to size one big GPU allocation.
   std::size_t totalSize = 0;
   for (auto &record : _dataMapCPU) {
      totalSize += record.second.size();
   }
   // copy observable data to the GPU
   // TODO: use separate buffers here
   _cudaMemDataset = static_cast<double *>(RooBatchCompute::dispatchCUDA->cudaMalloc(totalSize * sizeof(double)));
   size_t idx = 0;
   for (auto &record : _dataMapCPU) {
      std::size_t size = record.second.size();
      // Each span gets a contiguous slice of the single device allocation.
      _dataMapCUDA[record.first] = RooSpan<double>(_cudaMemDataset + idx, size);
      RooBatchCompute::dispatchCUDA->memcpyToCUDA(_cudaMemDataset + idx, record.second.data(), size * sizeof(double));
      idx += size;
   }
}
230
// NOTE(review): the destructor signature (RooFitDriver::~RooFitDriver()) and
// its body statements are missing from this extraction; only braces remain.
// Presumably it releases _cudaMemDataset — confirm against the original source.
{
   }
}
237
238std::vector<double> RooFitDriver::getValues()
239{
240 getVal();
241 auto const &nodeInfo = _nodeInfos.at(&topNode());
242 if (nodeInfo.computeInGPU) {
243 std::size_t nOut = nodeInfo.outputSize;
244 std::vector<double> out(nOut);
245 RooBatchCompute::dispatchCUDA->memcpyToCPU(out.data(), _dataMapCPU.at(&topNode()).data(), nOut * sizeof(double));
246 _dataMapCPU[&topNode()] = RooSpan<const double>(out.data(), nOut);
247 return out;
248 }
249 // We copy the data to the output vector
250 auto dataSpan = _dataMapCPU.at(&topNode());
251 std::vector<double> out;
252 out.reserve(dataSpan.size());
253 for (auto const &x : dataSpan) {
254 out.push_back(x);
255 }
256 return out;
257}
258
// NOTE(review): the signature line is missing from this extraction (per the
// member index: void RooFitDriver::computeCPUNode(const RooAbsArg *node, NodeInfo &info)).
{
   using namespace Detail;

   // The node must be either real-valued or a category.
   auto nodeAbsReal = dynamic_cast<RooAbsReal const *>(node);
   auto nodeAbsCategory = dynamic_cast<RooAbsCategory const *>(node);
   assert(nodeAbsReal || nodeAbsCategory);

   const std::size_t nOut = info.outputSize;

   if (info.computeInScalarMode) {
      // compute in scalar mode
      _nonDerivedValues.push_back(nodeAbsCategory ? nodeAbsCategory->getIndex() : nodeAbsReal->getVal());
      // NOTE(review): a line is missing here in this extraction (presumably
      // registering the scalar result in the CPU data map).
   } else if (nOut == 1) {
      // Single output value: store it directly in the non-derived values buffer.
      _nonDerivedValues.push_back(0.0);
      // NOTE(review): a line is missing here in this extraction.
      nodeAbsReal->computeBatch(nullptr, &_nonDerivedValues.back(), nOut, _dataMapCPU);
   } else {
      // Batch output: allocate a buffer (pinned when the result must also be
      // visible on the GPU) and register it in the CPU data map.
      info.buffer = info.copyAfterEvaluation ? makePinnedBuffer(nOut, info.stream) : makeCpuBuffer(nOut);
      double *buffer = info.buffer->cpuWritePtr();
      _dataMapCPU[node] = RooSpan<const double>(buffer, nOut);
      // compute node and measure the time the first time
      if (_getValInvocations == 1) {
         using namespace std::chrono;
         auto start = steady_clock::now();
         nodeAbsReal->computeBatch(nullptr, buffer, nOut, _dataMapCPU);
         info.cpuTime = duration_cast<microseconds>(steady_clock::now() - start);
      } else {
         nodeAbsReal->computeBatch(nullptr, buffer, nOut, _dataMapCPU);
      }
      if (info.copyAfterEvaluation) {
         // Make the result visible to GPU consumers as well.
         _dataMapCUDA[node] = RooSpan<const double>(info.buffer->gpuReadPtr(), nOut);
         if (info.event) {
            // NOTE(review): the statement here (presumably recording the CUDA
            // event on the node's stream) is missing from this extraction.
         }
      }
   }
}
298
/// Returns the value of the top node in the computation graph
// NOTE(review): the signature line (double RooFitDriver::getVal(), per the
// member index at the bottom of this page) is missing from this extraction.
{
   // NOTE(review): a line is missing here in this extraction (presumably
   // incrementing _getValInvocations).

   _nonDerivedValues.clear();
   _nonDerivedValues.reserve(_orderedNodes.size()); // to avoid reallocation

   // NOTE(review): the condition guarding the heterogeneous (CUDA) code path
   // is missing from this extraction.
      return getValHeterogeneous();
   }

   // CPU-only path: evaluate all nodes in topological order.
   for (std::size_t iNode = 0; iNode < _orderedNodes.size(); ++iNode) {
      RooAbsArg *node = _orderedNodes.at(iNode);
      auto &nodeInfo = _nodeInfos.at(node);
      if (!nodeInfo.fromDataset) {
         computeCPUNode(node, nodeInfo);
      }
   }
   // return the final value
   return _dataMapCPU.at(&topNode())[0];
}
321
/// Returns the value of the top node in the computation graph
// NOTE(review): the signature line (double RooFitDriver::getValHeterogeneous(),
// per the member index at the bottom of this page) is missing from this extraction.
{
   // Reset the per-node countdown of clients/servers still to be processed.
   for (auto &item : _nodeInfos) {
      item.second.remClients = item.second.nClients;
      item.second.remServers = item.second.nServers;
   }

   // In a cuda fit, use first 3 fits to determine the execution times
   // and the hardware that computes each part of the graph
   // NOTE(review): the condition guarding this call (presumably on
   // _getValInvocations) is missing from this extraction.
   markGPUNodes();

   // find initial gpu nodes and assign them to gpu
   for (const auto &it : _nodeInfos)
      if (it.second.remServers == 0 && it.second.computeInGPU)
         assignToGPU(it.second.absArg);

   // Keep scheduling work until the top node is fully done (remServers == -2).
   auto const &topNodeInfo = _nodeInfos.at(&topNode());
   while (topNodeInfo.remServers != -2) {
      // find finished gpu nodes
      for (auto &it : _nodeInfos) {
         if (it.second.remServers == -1 && !RooBatchCompute::dispatchCUDA->streamIsActive(it.second.stream)) {
            if (_getValInvocations == 2) {
               // Second invocation: accumulate the measured GPU time of this node.
               float ms = RooBatchCompute::dispatchCUDA->cudaEventElapsedTime(it.second.eventStart, it.second.event);
               it.second.cudaTime += std::chrono::microseconds{int(1000.0 * ms)};
            }
            it.second.remServers = -2; // mark as finished
            // Decrement number of remaining servers for clients and start GPU computations
            for (auto *client : it.second.absArg->clients()) {
               // client not part of the computation graph
               if (!isInComputationGraph(client))
                  continue;
               NodeInfo &infoClient = _nodeInfos.at(client);

               --infoClient.remServers;
               if (infoClient.computeInGPU && infoClient.remServers == 0) {
                  assignToGPU(client);
               }
            }
            // Possibly release the buffers of servers with no clients left.
            for (auto *server : it.second.absArg->servers()) {
               _nodeInfos.at(server).decrementRemainingClients();
            }
         }
      }

      // find next CPU node
      auto it = _nodeInfos.begin();
      for (; it != _nodeInfos.end(); it++) {
         if (it->second.remServers == 0 && !it->second.computeInGPU)
            break;
      }

      // if no CPU node available sleep for a while to save CPU usage
      if (it == _nodeInfos.end()) {
         std::this_thread::sleep_for(std::chrono::milliseconds(1));
         continue;
      }

      // compute next CPU node
      RooAbsArg const *node = it->second.absArg;
      NodeInfo &info = it->second;
      info.remServers = -2; // so that it doesn't get picked again

      if (!info.fromDataset) {
         computeCPUNode(node, info);
      }

      // Assign the clients that are computed on the GPU
      for (auto *client : node->clients()) {
         // client not part of the computation graph
         if (!isInComputationGraph(client))
            continue;
         NodeInfo &infoClient = _nodeInfos.at(client);
         if (--infoClient.remServers == 0 && infoClient.computeInGPU) {
            assignToGPU(client);
         }
      }
      // Possibly release the buffers of servers with no clients left.
      for (auto *server : node->servers()) {
         _nodeInfos.at(server).decrementRemainingClients();
      }
   }

   // return the final value
   return _dataMapCPU.at(&topNode())[0];
}
408
/// Assign a node to be computed in the GPU. Scan its clients and also assign them
/// in case they only depend on gpu nodes.
// NOTE(review): the signature line (void RooFitDriver::assignToGPU(const RooAbsArg *node),
// per the member index at the bottom of this page) is missing from this extraction.
{
   using namespace Detail;

   auto nodeAbsReal = dynamic_cast<RooAbsReal const *>(node);
   assert(nodeAbsReal || dynamic_cast<RooAbsCategory const *>(node));

   NodeInfo &info = _nodeInfos.at(node);
   const std::size_t nOut = info.outputSize;

   // Mark the node as "launched on the GPU, not finished yet".
   info.remServers = -1;
   // wait for every server to finish
   for (auto *server : node->servers()) {
      if (_nodeInfos.count(server) == 0)
         continue;
      const auto &infoServer = _nodeInfos.at(server);
      if (infoServer.event)
         // NOTE(review): the statement here (presumably a cudaStreamWaitEvent()
         // call on this node's stream) is missing from this extraction.
   }

   // Allocate the output buffer (pinned when the result must also be read on
   // the CPU) and register it in the GPU data map.
   info.buffer = info.copyAfterEvaluation ? makePinnedBuffer(nOut, info.stream) : makeGpuBuffer(nOut);
   double *buffer = info.buffer->gpuWritePtr();
   _dataMapCUDA[node] = RooSpan<const double>(buffer, nOut);
   // measure launching overhead (add computation time later)
   if (_getValInvocations == 2) {
      using namespace std::chrono;
      // NOTE(review): a line is missing here in this extraction (presumably
      // recording the eventStart CUDA event).
      auto start = steady_clock::now();
      nodeAbsReal->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
      info.cudaTime = duration_cast<microseconds>(steady_clock::now() - start);
   } else
      nodeAbsReal->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
   // NOTE(review): a line is missing here in this extraction (presumably
   // recording info.event on the node's stream).
   if (info.copyAfterEvaluation) {
      // Make the result visible to CPU consumers as well.
      _dataMapCPU[node] = RooSpan<const double>(info.buffer->cpuReadPtr(), nOut);
   }
}
448
/// This method simulates the computation of the whole graph and the time it takes
/// and decides what to compute in gpu. The decision is made on the basis of avoiding
/// leaving either the gpu or the cpu idle at any time, if possible, and on assigning
/// to each piece of hardware a computation that is significantly slower on the other part.
/// The nodes may be assigned to the non-efficient side (cpu or gpu) to prevent idleness
/// only if the absolute difference cpuTime-cudaTime does not exceed the diffThreshold.
///
/// \param[in] h2dTime Benchmark host-to-device transfer time.
/// \param[in] d2hTime Benchmark device-to-host transfer time.
/// \param[in] diffThreshold Maximum |cpuTime - cudaTime| for which a node may
///            still be assigned to its slower side to prevent idleness.
/// \return The total simulated wall-clock time of the fit.
std::chrono::microseconds RooFitDriver::simulateFit(std::chrono::microseconds h2dTime,
                                                    std::chrono::microseconds d2hTime,
                                                    std::chrono::microseconds diffThreshold)
{
   using namespace std::chrono;

   std::size_t nNodes = _nodeInfos.size();
   // launch scalar nodes (assume they are computed in 0 time)
   for (auto &it : _nodeInfos) {
      if (it.second.computeInScalarMode) {
         nNodes--;
         it.second.timeLaunched = microseconds{0};
      } else
         it.second.timeLaunched = microseconds{-1}; // negative means not launched yet
   }

   RooAbsArg const *cpuNode = nullptr;  // node currently "running" on the CPU in the simulation
   RooAbsArg const *cudaNode = nullptr; // node currently "running" on the GPU in the simulation
   microseconds simulatedTime{0};
   while (nNodes) {
      // Pick the best candidate for each side: the CPU favors the node with
      // the smallest diff (relatively faster on CPU), the GPU the largest.
      microseconds minDiff = microseconds::max(), maxDiff = -minDiff; // diff = cpuTime - cudaTime
      RooAbsArg const *cpuCandidate = nullptr;
      RooAbsArg const *cudaCandidate = nullptr;
      microseconds cpuDelay, cudaDelay;
      for (auto &it : _nodeInfos) {
         RooAbsArg const *absArg = it.second.absArg;
         if (it.second.timeLaunched >= microseconds{0})
            continue; // already launched
         microseconds diff{it.second.cpuTime - it.second.cudaTime}, cpuWait{0}, cudaWait{0};

         // Compute how long each side would wait for this node's servers,
         // including a device transfer when a server ran on the other side.
         for (auto *server : absArg->servers()) {
            if (_nodeInfos.count(server) == 0)
               continue;
            auto &info = _nodeInfos.at(server);
            if (info.computeInScalarMode)
               continue;

            // dependencies not computed yet
            if (info.timeLaunched < microseconds{0})
               goto nextCandidate;
            if (info.computeInGPU)
               cpuWait = std::max(cpuWait, info.timeLaunched + info.cudaTime + d2hTime - simulatedTime);
            else
               cudaWait = std::max(cudaWait, info.timeLaunched + info.cpuTime + h2dTime - simulatedTime);
         }

         diff += cpuWait - cudaWait;
         if (diff < minDiff) {
            minDiff = diff;
            cpuDelay = cpuWait;
            cpuCandidate = absArg;
         }
         if (diff > maxDiff && absArg->canComputeBatchWithCuda()) {
            maxDiff = diff;
            cudaDelay = cudaWait;
            cudaCandidate = absArg;
         }
      nextCandidate:;
      } // for (auto& it:_nodeInfos)

      // Reject a candidate whose cpu/cuda time difference exceeds the
      // threshold: it is significantly faster on the other side.
      auto calcDiff = [&](const RooAbsArg *node) { return _nodeInfos.at(node).cpuTime - _nodeInfos.at(node).cudaTime; };
      if (cpuCandidate && calcDiff(cpuCandidate) > diffThreshold)
         cpuCandidate = nullptr;
      if (cudaCandidate && -calcDiff(cudaCandidate) > diffThreshold)
         cudaCandidate = nullptr;
      // don't compute same node twice
      if (cpuCandidate == cudaCandidate && !cpuNode && !cudaNode) {
         if (minDiff < microseconds{0})
            cudaCandidate = nullptr;
         else
            cpuCandidate = nullptr;
      }
      // Launch a candidate on any idle side and record the assignment.
      if (cpuCandidate && !cpuNode) {
         cpuNode = cpuCandidate;
         _nodeInfos.at(cpuNode).timeLaunched = simulatedTime + cpuDelay;
         _nodeInfos.at(cpuNode).computeInGPU = false;
         nNodes--;
      }
      if (cudaCandidate && !cudaNode) {
         cudaNode = cudaCandidate;
         _nodeInfos.at(cudaNode).timeLaunched = simulatedTime + cudaDelay;
         _nodeInfos.at(cudaNode).computeInGPU = true;
         nNodes--;
      }

      // Advance the simulated clock to whichever side finishes first and free it.
      microseconds etaCPU{microseconds::max()}, etaCUDA{microseconds::max()};
      if (cpuNode)
         etaCPU = _nodeInfos.at(cpuNode).timeLaunched + _nodeInfos.at(cpuNode).cpuTime;
      if (cudaNode)
         etaCUDA = _nodeInfos.at(cudaNode).timeLaunched + _nodeInfos.at(cudaNode).cudaTime;
      simulatedTime = std::min(etaCPU, etaCUDA);
      if (etaCPU < etaCUDA)
         cpuNode = nullptr;
      else
         cudaNode = nullptr;
   } // while(nNodes)
   return simulatedTime;
}
553
/// Decides which nodes are assigned to the gpu in a cuda fit. In the 1st iteration,
/// everything is computed in cpu for measuring the cpu time. In the 2nd iteration,
/// everything is computed in gpu (if possible) to measure the gpu time.
/// In the 3rd iteration, simulate the computation of the graph by calling simulateFit
/// with every distinct threshold found as timeDiff within the nodes of the graph and select
/// the best configuration. In the end, mark the nodes and handle the details accordingly.
// NOTE(review): the signature line (void RooFitDriver::markGPUNodes(), per the
// member index at the bottom of this page) is missing from this extraction.
{
   using namespace std::chrono;

   if (_getValInvocations == 1) {
      // leave everything to be computed (and timed) in cpu
      return;
   } else if (_getValInvocations == 2) {
      // compute (and time) as much as possible in gpu
      for (auto &item : _nodeInfos) {
         item.second.computeInGPU = !item.second.computeInScalarMode && item.second.absArg->canComputeBatchWithCuda();
      }
   } else {
      // Assign nodes to gpu using a greedy algorithm: for the number of bytes
      // in this benchmark we take the maximum size of spans in the dataset.
      std::size_t nBytes = 1;
      for (auto const &item : _dataMapCUDA) {
         nBytes = std::max(nBytes, item.second.size() * sizeof(double));
      }
      // Benchmark the host-to-device and device-to-host transfer times once.
      auto transferTimes = RooFit::CUDAHelpers::memcpyBenchmark(nBytes);

      microseconds h2dTime = transferTimes.first;
      microseconds d2hTime = transferTimes.second;
      ooccoutD(static_cast<RooAbsArg *>(nullptr), FastEvaluations) << "------Copying times------\n";
      ooccoutD(static_cast<RooAbsArg *>(nullptr), FastEvaluations)
         << "h2dTime=" << h2dTime.count() << "us\td2hTime=" << d2hTime.count() << "us\n";

      // Each node's cpu/cuda time difference is a candidate threshold for
      // simulateFit(); try them all and keep the one with the best total time.
      std::vector<microseconds> diffTimes;
      for (auto &item : _nodeInfos)
         if (!item.second.computeInScalarMode)
            diffTimes.push_back(item.second.cpuTime - item.second.cudaTime);
      microseconds bestTime = microseconds::max(), bestThreshold, ret;
      for (auto &threshold : diffTimes)
         if ((ret = simulateFit(h2dTime, d2hTime, microseconds{std::abs(threshold.count())})) < bestTime) {
            bestTime = ret;
            bestThreshold = threshold;
         }
      // finalize the marking of the best configuration
      simulateFit(h2dTime, d2hTime, microseconds{std::abs(bestThreshold.count())});
      ooccoutD(static_cast<RooAbsArg *>(nullptr), FastEvaluations)
         << "Best threshold=" << bestThreshold.count() << "us" << std::endl;

      // deletion of the timing events (to be replaced later by non-timing events)
      for (auto &item : _nodeInfos) {
         item.second.copyAfterEvaluation = false;
         // NOTE(review): a line is missing here in this extraction (presumably
         // deleting item.second.event, mirroring the eventStart deletion below).
         RooBatchCompute::dispatchCUDA->deleteCudaEvent(item.second.eventStart);
         item.second.event = item.second.eventStart = nullptr;
      }
   } // else (_getValInvocations > 2)

   for (auto &item : _nodeInfos) {
      // scalar nodes don't need copying
      if (!item.second.computeInScalarMode) {
         // A node's result must be copied whenever any of its clients is
         // evaluated on the other device.
         for (auto *client : item.second.absArg->clients()) {
            if (_nodeInfos.count(client) == 0)
               continue;
            auto &info = _nodeInfos.at(client);
            if (item.second.computeInGPU != info.computeInGPU) {
               item.second.copyAfterEvaluation = true;
               break;
            }
         }
      }
   }

   // restore a cudaEventDisableTiming event when necessary
   if (_getValInvocations == 3) {
      for (auto &item : _nodeInfos)
         if (item.second.computeInGPU || item.second.copyAfterEvaluation)
            item.second.event = RooBatchCompute::dispatchCUDA->newCudaEvent(false);

      // Log the final CPU/GPU assignment together with the measured times.
      ooccoutD(static_cast<RooAbsArg *>(nullptr), FastEvaluations)
         << "------Nodes------\t\t\t\tCpu time: \t Cuda time\n";
      for (auto &item : _nodeInfos)
         ooccoutD(static_cast<RooAbsArg *>(nullptr), FastEvaluations)
            << std::setw(20) << item.first->GetName() << "\t" << item.second.absArg << "\t"
            << (item.second.computeInGPU ? "CUDA" : "CPU") << "\t" << item.second.cpuTime.count() << "us\t"
            << item.second.cudaTime.count() << "us\n";
   }
}
641
// NOTE(review): the signature line is missing from this extraction; this
// function is not listed in the page index — likely something like
// void RooFitDriver::determineOutputSizes(). Confirm against the original.
{
   // Propagate output sizes through the graph in topological order: each
   // non-reducer node's output is at least as large as its largest value server.
   for (auto *arg : _orderedNodes) {
      auto &argInfo = _nodeInfos.at(arg);
      for (auto *server : arg->servers()) {
         if (server->isValueServer(*arg)) {
            if (!arg->isReducerNode()) {
               argInfo.outputSize = std::max(_nodeInfos.at(server).outputSize, argInfo.outputSize);
            }
         }
      }
   }
}
655
// NOTE(review): the signature line is missing from this extraction (per the
// member index: bool RooFitDriver::isInComputationGraph(RooAbsArg const *arg) const).
{
   // Besides the map lookup, require that the stored absArg is the very same
   // instance, not merely an entry under an equivalent key.
   auto found = _nodeInfos.find(arg);
   return found != _nodeInfos.end() && found->second.absArg == arg;
}
661
/// Temporarily change the operation mode of a RooAbsArg until the
/// RooFitDriver gets deleted.
// NOTE(review): the signature line is missing from this extraction (per the
// member index: void RooFitDriver::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)).
{
   // Only register a change when the mode actually differs; the stored RAII
   // object restores the original mode when the driver is destroyed.
   if (opMode != arg->operMode()) {
      _changeOperModeRAIIs.emplace(arg, opMode);
   }
}
670
// NOTE(review): the signature line is missing from this extraction (per the
// member index: RooAbsReal const &RooFitDriver::topNode() const).
{
   // The top node is the argument held by the normalization-integral unfolder.
   return static_cast<RooAbsReal const &>(_integralUnfolder->arg());
}
675
676} // namespace Experimental
677} // namespace ROOT
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
#define ooccoutD(o, a)
Definition: RooMsgService.h:56
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
const RooFit::BatchModeOption _batchMode
Definition: RooFitDriver.h:77
bool isInComputationGraph(RooAbsArg const *arg) const
void setData(RooAbsData const &data, std::string_view rangeName="", RooAbsCategory const *indexCatForSplitting=nullptr)
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the RooFitDriver gets deleted.
RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, RooFit::BatchModeOption batchMode=RooFit::BatchModeOption::Cpu)
Construct a new RooFitDriver.
RooFit::Detail::DataMap _dataMapCPU
Definition: RooFitDriver.h:82
std::map< RooFit::Detail::DataKey, NodeInfo > _nodeInfos
Definition: RooFitDriver.h:84
double getValHeterogeneous()
Returns the value of the top node in the computation graph.
double getVal()
Returns the value of the top node in the computation graph.
RooFit::Detail::DataMap _dataMapCUDA
Definition: RooFitDriver.h:83
std::unique_ptr< RooFit::NormalizationIntegralUnfolder > _integralUnfolder
Definition: RooFitDriver.h:92
std::vector< double > _nonDerivedValues
Definition: RooFitDriver.h:90
std::map< const TNamed *, RooSpan< const double > > DataSpansMap
Definition: RooFitDriver.h:43
std::vector< double > getValues()
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
std::stack< std::vector< double > > _vectorBuffers
Definition: RooFitDriver.h:91
void markGPUNodes()
Decides which nodes are assigned to the gpu in a cuda fit.
std::stack< RooHelpers::ChangeOperModeRAII > _changeOperModeRAIIs
Definition: RooFitDriver.h:95
RooAbsReal const & topNode() const
std::chrono::microseconds simulateFit(std::chrono::microseconds h2dTime, std::chrono::microseconds d2hTime, std::chrono::microseconds diffThreshold)
This methods simulates the computation of the whole graph and the time it takes and decides what to c...
void assignToGPU(const RooAbsArg *node)
Assign a node to be computed in the GPU.
RooAbsArg is the common abstract base class for objects that represent a value and a "shape" in RooFi...
Definition: RooAbsArg.h:77
void treeNodeServerList(RooAbsCollection *list, const RooAbsArg *arg=0, bool doBranch=true, bool doLeaf=true, bool valueOnly=false, bool recurseNonDerived=false) const
Fill supplied list with nodes of the arg tree, following all server links, starting with ourself as t...
Definition: RooAbsArg.cxx:527
virtual bool canComputeBatchWithCuda() const
Definition: RooAbsArg.h:598
const RefCountList_t & servers() const
List of all servers of this object.
Definition: RooAbsArg.h:204
const RefCountList_t & clients() const
List of all clients of this object.
Definition: RooAbsArg.h:190
OperMode operMode() const
Query the operation mode of this node.
Definition: RooAbsArg.h:507
A space to attach TBranches.
void sortTopologically()
Sort collection topologically: the servers of any RooAbsArg will be before that RooAbsArg in the coll...
Storage_t::const_reverse_iterator rend() const
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::const_reverse_iterator rbegin() const
Storage_t::size_type size() const
RooAbsData is the common abstract base class for binned and unbinned datasets.
Definition: RooAbsData.h:61
RooAbsReal is the common abstract base class for objects that represent a real value and implements f...
Definition: RooAbsReal.h:64
RooArgList is a container object that can hold multiple RooAbsArg objects.
Definition: RooArgList.h:22
RooAbsArg * at(Int_t idx) const
Return object at given index, or nullptr if index is out of range.
Definition: RooArgList.h:110
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition: RooArgSet.h:57
bool containsInstance(const RooAbsArg &var) const override
Check if this exact instance is in this collection.
Definition: RooArgSet.h:171
virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *)
virtual cudaEvent_t * newCudaEvent(bool)
virtual float cudaEventElapsedTime(cudaEvent_t *, cudaEvent_t *)
virtual void memcpyToCPU(void *, const void *, size_t, cudaStream_t *=nullptr)
virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *)
virtual bool streamIsActive(cudaStream_t *)
virtual void deleteCudaStream(cudaStream_t *)
virtual void deleteCudaEvent(cudaEvent_t *)
virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t *=nullptr)
A simple container to hold a batch of data values.
Definition: RooSpan.h:34
RVec< PromoteType< T > > abs(const RVec< T > &v)
Definition: RVec.hxx:1739
Double_t x[n]
Definition: legend1.C:17
basic_string_view< char > string_view
std::unique_ptr< AbsBuffer > makeGpuBuffer(std::size_t size)
Definition: Buffers.cxx:224
std::unique_ptr< AbsBuffer > makeCpuBuffer(std::size_t size)
Definition: Buffers.cxx:220
std::unique_ptr< AbsBuffer > makePinnedBuffer(std::size_t size, cudaStream_t *stream=nullptr)
Definition: Buffers.cxx:228
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
void init()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
std::map< const TNamed *, RooSpan< const double > > getDataSpans(RooAbsData const &data, std::string_view rangeName, RooAbsCategory const *indexCat, std::stack< std::vector< double > > &buffers)
void logArchitectureInfo(RooFit::BatchModeOption batchMode)
std::pair< std::chrono::microseconds, std::chrono::microseconds > memcpyBenchmark(std::size_t nBytes)
Measure the time for transfering data from host to device and vice-versa.
Definition: CUDAHelpers.cxx:18
BatchModeOption
For setting the batch mode flag with the BatchMode() command argument to RooAbsPdf::fitTo();.
Definition: RooGlobalFunc.h:70
@ FastEvaluations
Definition: RooGlobalFunc.h:65
static constexpr double ms
A struct used by the RooFitDriver to store information on the RooAbsArgs in the computation graph.
std::chrono::microseconds timeLaunched
std::chrono::microseconds cudaTime
NodeInfo(const NodeInfo &)=delete
std::unique_ptr< Detail::AbsBuffer > buffer
std::chrono::microseconds cpuTime
NodeInfo & operator=(const NodeInfo &)=delete
void decrementRemainingClients()
Check the servers of a node that has been computed and release it's resources if they are no longer n...