Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RooBatchCompute.cu
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Emmanouil Michalainas, CERN, September 2020
5 *
6 * Copyright (c) 2021, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13/**
14\file RooBatchCompute.cu
15\class RbcClass
16\ingroup roofit_dev_docs_batchcompute
17
18This file contains the code for cuda computations using the RooBatchCompute library.
19**/
20
21#include "RooBatchCompute.h"
22#include "Batches.h"
23#include "CudaInterface.h"
24
25#include <algorithm>
26#include <functional>
27#include <map>
28#include <queue>
29#include <vector>
30
31namespace RooBatchCompute {
32namespace CUDA {
33
34constexpr int blockSize = 512;
35
36namespace {
37
38void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
39{
40 batches.nEvents = nEvents;
41 batches.nBatches = nBatches;
42 batches.nExtra = nExtraArgs;
43 batches.output = output;
44}
45
46void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
47{
48 for (int i = 0; i < vars.size(); i++) {
49 const std::span<const double> &span = vars[i];
50 arrays[i]._isVector = span.empty() || span.size() >= nEvents;
51 if (!arrays[i]._isVector) {
52 // In the scalar case, the value is not on the GPU yet, so we have to
53 // copy the value to the GPU buffer.
54 buffer[i] = span[0];
55 arrays[i]._array = bufferDevice + i;
56 } else {
57 // In the vector input cases, they are already on the GPU, so we can
58 // fill be buffer with some dummy value and set the input span
59 // directly.
60 buffer[i] = 0.0;
61 arrays[i]._array = span.data();
62 }
63 }
64}
65
66int getGridSize(std::size_t n)
67{
68 // The grid size should be not larger than the order of number of streaming
69 // multiprocessors (SMs) in an Nvidia GPU. The number 84 was chosen because
70 // the developers were using an Nvidia RTX A4500, which has 46 SMs. This was
71 // multiplied by a factor of 1.5, as recommended by stackoverflow.
72 //
73 // But when there are not enough elements to load the GPU, the number should
74 // be lower: that's why there is the std::ceil().
75 //
76 // Note: for grid sizes larger than 512, the Kahan summation kernels give
77 // wrong results. This problem is not understood, but also not really worth
78 // investigating further, as that number is unreasonably large anyway.
79 constexpr int maxGridSize = 84;
80 return std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
81}
82
83} // namespace
84
85std::vector<void (*)(Batches &)> getFunctions();
86
87/// This class overrides some RooBatchComputeInterface functions, for the
88/// purpose of providing a cuda specific implementation of the library.
90
91public:
93 {
94 dispatchCUDA = this; // Set the dispatch pointer to this instance of the library upon loading
95 }
96
97 Architecture architecture() const override { return Architecture::CUDA; }
98 std::string architectureName() const override { return "cuda"; }
99
100 /** Compute multiple values using cuda kernels.
101 This method creates a Batches object and passes it to the correct compute function.
102 The compute function is launched as a cuda kernel.
103 \param computer An enum specifying the compute function to be used.
104 \param output The array where the computation results are stored.
105 \param vars A std::span containing pointers to the variables involved in the computation.
106 \param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
107 void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span<double> output, VarSpan vars,
108 ArgSpan extraArgs) override
109 {
110 using namespace CudaInterface;
111
112 std::size_t nEvents = output.size();
113
114 const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
115 extraArgs.size() * sizeof(double);
116
117 std::vector<char> hostMem(memSize);
118 auto batches = reinterpret_cast<Batches *>(hostMem.data());
119 auto arrays = reinterpret_cast<Batch *>(batches + 1);
120 auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
121 auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());
122
124 auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
125 auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
126 auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
127 auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());
128
129 fillBatches(*batches, output.data(), nEvents, vars.size(), extraArgs.size());
130 fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
131 batches->args = arraysDevice;
132
133 if (!extraArgs.empty()) {
134 std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
135 batches->extra = extraArgsDevice;
136 }
137
138 copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());
139
140 const int gridSize = getGridSize(nEvents);
141 _computeFunctions[computer]<<<gridSize, blockSize, 0, *cfg.cudaStream()>>>(*batchesDevice);
142
143 // The compute might have modified the mutable extra args, so we need to
144 // copy them back. This can be optimized if necessary in the future by
145 // flagging if the extra args were actually changed.
146 if (!extraArgs.empty()) {
147 copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
148 }
149 }
150 /// Return the sum of an input array
151 double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override;
152 ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
153 std::span<const double> weights, std::span<const double> offsetProbas) override;
154
155 std::unique_ptr<AbsBufferManager> createBufferManager() const;
156
158 {
160 }
162 void deleteCudaEvent(CudaInterface::CudaEvent *event) const override { delete event; }
163 void deleteCudaStream(CudaInterface::CudaStream *stream) const override { delete stream; }
164
166 {
167 CudaInterface::cudaEventRecord(*event, *stream);
168 }
170 {
171 stream->waitForEvent(*event);
172 }
173 bool cudaStreamIsActive(CudaInterface::CudaStream *stream) const override { return stream->isActive(); }
174
175private:
176 const std::vector<void (*)(Batches &)> _computeFunctions;
177
178}; // End class RooBatchComputeClass
179
180inline __device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
181{
182 // c is zero the first time around. Then is done a summation as the c variable is NEGATIVE
183 const double y = a - (carry + otherCarry);
184 const double t = sum + y; // Alas, sum is big, y small, so low-order digits of y are lost.
185
186 // (t - sum) cancels the high-order part of y; subtracting y recovers NEGATIVE (low part of y)
187 carry = (t - sum) - y;
188
189 // Algebraically, c should always be zero. Beware overly-aggressive optimizing compilers!
190 sum = t;
191}
192
193// This is the same implementation of the ROOT::Math::KahanSum::operator+=(KahanSum) but in GPU
194inline __device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
195{
196 // Stride in first iteration = half of the block dim. Then the half of the half...
197 for (int i = blockDim.x / 2; i > 0; i >>= 1) {
198 if (threadIdx.x < i && (threadIdx.x + i) < n) {
200 }
202 } // Next time around, the lost low part will be added to y in a fresh attempt.
203 // Wait until all threads of the block have finished its work
204
205 if (threadIdx.x == 0) {
206 result[blockIdx.x] = shared[0];
208 }
209}
210
211__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
212 double *__restrict__ result, bool nll)
213{
214 int thIdx = threadIdx.x;
215 int gthIdx = thIdx + blockIdx.x * blockSize;
216 int carry_index = threadIdx.x + blockDim.x;
217 const int nThreadsTotal = blockSize * gridDim.x;
218
219 // The first half of the shared memory is for storing the summation and the second half for the carry or compensation
220 extern __shared__ double shared[];
221
222 double sum = 0.0;
223 double carry = 0.0;
224
225 for (int i = gthIdx; i < n; i += nThreadsTotal) {
226 // Note: it does not make sense to use the nll option and provide at the
227 // same time external carries.
228 double val = nll == 1 ? -std::log(input[i]) : input[i];
229 kahanSumUpdate(sum, carry, val, carries ? carries[i] : 0.0);
230 }
231
232 shared[thIdx] = sum;
233 shared[carry_index] = carry;
234
235 // Wait until all threads in each block have loaded their elements
237
239}
240
241__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
242 const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
243{
244 int thIdx = threadIdx.x;
245 int gthIdx = thIdx + blockIdx.x * blockSize;
246 int carry_index = threadIdx.x + blockDim.x;
247 const int nThreadsTotal = blockSize * gridDim.x;
248
249 // The first half of the shared memory is for storing the summation and the second half for the carry or compensation
250 extern __shared__ double shared[];
251
252 double sum = 0.0;
253 double carry = 0.0;
254
255 for (int i = gthIdx; i < n; i += nThreadsTotal) {
256 // Note: it does not make sense to use the nll option and provide at the
257 // same time external carries.
258 double val = -std::log(probas[i]);
259 if (offsetProbas)
260 val += std::log(offsetProbas[i]);
261 if (weights)
262 val = weights[i] * val;
263 kahanSumUpdate(sum, carry, val, 0.0);
264 }
265
266 shared[thIdx] = sum;
267 shared[carry_index] = carry;
268
269 // Wait until all threads in each block have loaded their elements
271
273}
274
276{
277 if (n == 0)
278 return 0.0;
279 const int gridSize = getGridSize(n);
280 cudaStream_t stream = *cfg.cudaStream();
282 constexpr int shMemSize = 2 * blockSize * sizeof(double);
285 double tmp = 0.0;
286 CudaInterface::copyDeviceToHost(devOut.data(), &tmp, 1, cfg.cudaStream());
287 return tmp;
288}
289
291 std::span<const double> weights, std::span<const double> offsetProbas)
292{
293 ReduceNLLOutput out;
294 if (probas.empty()) {
295 return out;
296 }
297 const int gridSize = getGridSize(probas.size());
299 cudaStream_t stream = *cfg.cudaStream();
300 constexpr int shMemSize = 2 * blockSize * sizeof(double);
301
303 probas.data(), weights.size() == 1 ? nullptr : weights.data(),
304 offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.data());
305
307
308 double tmpSum = 0.0;
309 double tmpCarry = 0.0;
312
313 if (weights.size() == 1) {
314 tmpSum *= weights[0];
315 tmpCarry *= weights[0];
316 }
317
318 out.nllSum = tmpSum;
319 out.nllSumCarry = tmpCarry;
320 return out;
321}
322
323namespace {
324
325class ScalarBufferContainer {
326public:
327 ScalarBufferContainer() {}
328 ScalarBufferContainer(std::size_t size)
329 {
330 if (size != 1)
331 throw std::runtime_error("ScalarBufferContainer can only be of size 1");
332 }
333
334 double const *hostReadPtr() const { return &_val; }
335 double const *deviceReadPtr() const { return &_val; }
336
337 double *hostWritePtr() { return &_val; }
338 double *deviceWritePtr() { return &_val; }
339
340 void assignFromHost(std::span<const double> input) { _val = input[0]; }
341 void assignFromDevice(std::span<const double> input)
342 {
343 CudaInterface::copyDeviceToHost(input.data(), &_val, input.size(), nullptr);
344 }
345
346private:
347 double _val;
348};
349
350class CPUBufferContainer {
351public:
352 CPUBufferContainer(std::size_t size) : _vec(size) {}
353
354 double const *hostReadPtr() const { return _vec.data(); }
355 double const *deviceReadPtr() const
356 {
357 throw std::bad_function_call();
358 return nullptr;
359 }
360
361 double *hostWritePtr() { return _vec.data(); }
362 double *deviceWritePtr()
363 {
364 throw std::bad_function_call();
365 return nullptr;
366 }
367
368 void assignFromHost(std::span<const double> input) { _vec.assign(input.begin(), input.end()); }
369 void assignFromDevice(std::span<const double> input)
370 {
371 CudaInterface::copyDeviceToHost(input.data(), _vec.data(), input.size(), nullptr);
372 }
373
374private:
375 std::vector<double> _vec;
376};
377
378class GPUBufferContainer {
379public:
380 GPUBufferContainer(std::size_t size) : _arr(size) {}
381
382 double const *hostReadPtr() const
383 {
384 throw std::bad_function_call();
385 return nullptr;
386 }
387 double const *deviceReadPtr() const { return _arr.data(); }
388
389 double *hostWritePtr() const
390 {
391 throw std::bad_function_call();
392 return nullptr;
393 }
394 double *deviceWritePtr() const { return const_cast<double *>(_arr.data()); }
395
396 void assignFromHost(std::span<const double> input)
397 {
398 CudaInterface::copyHostToDevice(input.data(), deviceWritePtr(), input.size(), nullptr);
399 }
400 void assignFromDevice(std::span<const double> input)
401 {
402 CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), nullptr);
403 }
404
405private:
406 CudaInterface::DeviceArray<double> _arr;
407};
408
409class PinnedBufferContainer {
410public:
411 PinnedBufferContainer(std::size_t size) : _arr{size}, _gpuBuffer{size} {}
412 std::size_t size() const { return _arr.size(); }
413
414 void setCudaStream(CudaInterface::CudaStream *stream) { _cudaStream = stream; }
415
416 double const *hostReadPtr() const
417 {
418
419 if (_lastAccess == LastAccessType::GPU_WRITE) {
420 CudaInterface::copyDeviceToHost(_gpuBuffer.deviceReadPtr(), const_cast<double *>(_arr.data()), size(),
421 _cudaStream);
422 }
423
424 _lastAccess = LastAccessType::CPU_READ;
425 return const_cast<double *>(_arr.data());
426 }
427 double const *deviceReadPtr() const
428 {
429
430 if (_lastAccess == LastAccessType::CPU_WRITE) {
431 CudaInterface::copyHostToDevice(_arr.data(), _gpuBuffer.deviceWritePtr(), size(), _cudaStream);
432 }
433
434 _lastAccess = LastAccessType::GPU_READ;
435 return _gpuBuffer.deviceReadPtr();
436 }
437
438 double *hostWritePtr()
439 {
440 _lastAccess = LastAccessType::CPU_WRITE;
441 return _arr.data();
442 }
443 double *deviceWritePtr()
444 {
445 _lastAccess = LastAccessType::GPU_WRITE;
446 return _gpuBuffer.deviceWritePtr();
447 }
448
449 void assignFromHost(std::span<const double> input) { std::copy(input.begin(), input.end(), hostWritePtr()); }
450 void assignFromDevice(std::span<const double> input)
451 {
452 CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), _cudaStream);
453 }
454
455private:
456 enum class LastAccessType { CPU_READ, GPU_READ, CPU_WRITE, GPU_WRITE };
457
458 CudaInterface::PinnedHostArray<double> _arr;
459 GPUBufferContainer _gpuBuffer;
460 CudaInterface::CudaStream *_cudaStream = nullptr;
461 mutable LastAccessType _lastAccess = LastAccessType::CPU_READ;
462};
463
464template <class Container>
465class BufferImpl : public AbsBuffer {
466public:
467 using Queue = std::queue<std::unique_ptr<Container>>;
468
469 BufferImpl(std::size_t size, Queue &queue) : _queue{queue}
470 {
471 if (_queue.empty()) {
472 _vec = std::make_unique<Container>(size);
473 } else {
474 _vec = std::move(_queue.front());
475 _queue.pop();
476 }
477 }
478
479 ~BufferImpl() override { _queue.emplace(std::move(_vec)); }
480
481 double const *hostReadPtr() const override { return _vec->hostReadPtr(); }
482 double const *deviceReadPtr() const override { return _vec->deviceReadPtr(); }
483
484 double *hostWritePtr() override { return _vec->hostWritePtr(); }
485 double *deviceWritePtr() override { return _vec->deviceWritePtr(); }
486
487 void assignFromHost(std::span<const double> input) override { _vec->assignFromHost(input); }
488 void assignFromDevice(std::span<const double> input) override { _vec->assignFromDevice(input); }
489
490 Container &vec() { return *_vec; }
491
492private:
493 std::unique_ptr<Container> _vec;
494 Queue &_queue;
495};
496
501
502struct BufferQueuesMaps {
503 std::map<std::size_t, ScalarBuffer::Queue> scalarBufferQueuesMap;
504 std::map<std::size_t, CPUBuffer::Queue> cpuBufferQueuesMap;
505 std::map<std::size_t, GPUBuffer::Queue> gpuBufferQueuesMap;
506 std::map<std::size_t, PinnedBuffer::Queue> pinnedBufferQueuesMap;
507};
508
509class BufferManager : public AbsBufferManager {
510
511public:
512 BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
513
514 std::unique_ptr<AbsBuffer> makeScalarBuffer() override
515 {
516 return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);
517 }
518 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) override
519 {
520 return std::make_unique<CPUBuffer>(size, _queuesMaps->cpuBufferQueuesMap[size]);
521 }
522 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) override
523 {
524 return std::make_unique<GPUBuffer>(size, _queuesMaps->gpuBufferQueuesMap[size]);
525 }
526 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) override
527 {
528 auto out = std::make_unique<PinnedBuffer>(size, _queuesMaps->pinnedBufferQueuesMap[size]);
529 out->vec().setCudaStream(stream);
530 return out;
531 }
532
533private:
534 std::unique_ptr<BufferQueuesMaps> _queuesMaps;
535};
536
537} // namespace
538
539std::unique_ptr<AbsBufferManager> RooBatchComputeClass::createBufferManager() const
540{
541 return std::make_unique<BufferManager>();
542}
543
544/// Static object to trigger the constructor which overwrites the dispatch pointer.
546
547} // End namespace CUDA
548} // End namespace RooBatchCompute
#define a(i)
Definition RSha256.hxx:99
std::vector< double > _vec
double _val
CudaInterface::CudaStream * _cudaStream
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
Queue & _queue
CudaInterface::DeviceArray< double > _arr
std::map< std::size_t, PinnedBuffer::Queue > pinnedBufferQueuesMap
LastAccessType _lastAccess
GPUBufferContainer _gpuBuffer
std::unique_ptr< BufferQueuesMaps > _queuesMaps
std::map< std::size_t, GPUBuffer::Queue > gpuBufferQueuesMap
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void deleteCudaEvent(CudaInterface::CudaEvent *event) const override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *stream, CudaInterface::CudaEvent *event) const override
std::unique_ptr< AbsBufferManager > createBufferManager() const
void cudaEventRecord(CudaInterface::CudaEvent *event, CudaInterface::CudaStream *stream) const override
CudaInterface::CudaStream * newCudaStream() const override
bool cudaStreamIsActive(CudaInterface::CudaStream *stream) const override
void deleteCudaStream(CudaInterface::CudaStream *stream) const override
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
std::string architectureName() const override
const std::vector< void(*)(Batches &)> _computeFunctions
Architecture architecture() const override
CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const override
void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using cuda kernels.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
CudaInterface::CudaStream * cudaStream() const
bool isActive()
Checks if a CUDA stream is currently active.
void waitForEvent(CudaEvent &)
Makes a CUDA stream wait for a CUDA event.
The interface which should be implemented to provide optimised computation functions for implementati...
Double_t y[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
void copyDeviceToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the CUDA device.
void cudaEventRecord(CudaEvent &event, CudaStream &stream)
Records a CUDA event.
void copyHostToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the host to the CUDA device.
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::span< double > ArgSpan
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345
static void output()