Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RooBatchCompute.cu
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Emmanouil Michalainas, CERN, September 2020
5 *
6 * Copyright (c) 2021, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13/**
14\file RooBatchCompute.cu
15\class RbcClass
16\ingroup Roobatchcompute
17
This file contains the code for CUDA computations using the RooBatchCompute library.
19**/
20
#include "RooBatchCompute.h"
#include "Batches.h"

#include <ROOT/RConfig.hxx>
#include <TError.h>

#include <algorithm>
#include <cctype>
#include <cmath>
#include <vector>
29
30#ifndef RF_ARCH
31#error "RF_ARCH should always be defined"
32#endif
33
35
36namespace RooBatchCompute {
37namespace RF_ARCH {
38
39constexpr int blockSize = 512;
40
41namespace {
42
/// Populate the bookkeeping header of a Batches object.
/// \param batches The header struct to fill.
/// \param output Pointer to the array where results will be written.
/// \param nEvents Number of events in this computation.
/// \param nBatches Number of input variable spans.
/// \param nExtraArgs Number of extra scalar arguments.
void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
{
   batches.output = output;
   batches.nEvents = nEvents;
   batches.nBatches = nBatches;
   batches.nExtra = nExtraArgs;
}
50
/// Fill one Batch descriptor per input variable, staging scalar inputs in a
/// host buffer that will later be copied to the device.
/// \param arrays Array of Batch descriptors to fill (one per variable).
/// \param vars Input variable spans; a span is treated as per-event ("vector")
///        input if it is empty or has at least nEvents elements, otherwise as
///        a single scalar value.
/// \param buffer Host-side staging buffer with one double slot per variable.
/// \param bufferDevice Device-side address that `buffer` will be copied to,
///        used to pre-compute the device pointer for scalar inputs.
/// \param nEvents Number of events in this computation.
void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
{
   // Use std::size_t for the index to avoid a signed/unsigned comparison with
   // vars.size().
   for (std::size_t i = 0; i < vars.size(); i++) {
      const std::span<const double> &span = vars[i];
      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
      if (!arrays[i]._isVector) {
         // In the scalar case, the value is not on the GPU yet, so we have to
         // copy the value to the GPU buffer.
         buffer[i] = span[0];
         arrays[i]._array = bufferDevice + i;
      } else {
         // In the vector input case, the values are already on the GPU, so we
         // can fill the buffer with a dummy value and set the input span
         // directly.
         buffer[i] = 0.0;
         arrays[i]._array = span.data();
      }
   }
}
70
/// Compute the CUDA grid size (number of blocks) for `n` elements.
int getGridSize(std::size_t n)
{
   // The grid size should be not larger than the order of number of streaming
   // multiprocessors (SMs) in an Nvidia GPU. The number 84 was chosen because
   // the developers were using an Nvidia RTX A4500, which has 56 SMs. This was
   // multiplied by a factor of 1.5, as recommended by stackoverflow.
   //
   // But when there are not enough elements to load the GPU, the number should
   // be lower: that's why the block count is capped by the ceil-division of n
   // by the block size.
   //
   // Note: for grid sizes larger than 512, the Kahan summation kernels give
   // wrong results. This problem is not understood, but also not really worth
   // investigating further, as that number is unreasonably large anyway.
   constexpr std::size_t maxGridSize = 84;
   // Exact integer ceil-division: avoids the double round-trip of std::ceil
   // and any precision/overflow issues for very large n. Clamp before the
   // narrowing cast so the int conversion is always in range.
   const std::size_t nBlocks = (n + blockSize - 1) / blockSize;
   return static_cast<int>(std::min(nBlocks, maxGridSize));
}
87
88} // namespace
89
90std::vector<void (*)(Batches &)> getFunctions();
91
92/// This class overrides some RooBatchComputeInterface functions, for the
93/// purpose of providing a cuda specific implementation of the library.
95private:
96 const std::vector<void (*)(Batches &)> _computeFunctions;
97
98public:
100 {
101 dispatchCUDA = this; // Set the dispatch pointer to this instance of the library upon loading
102 }
103
104 Architecture architecture() const override { return Architecture::RF_ARCH; };
105 std::string architectureName() const override
106 {
107 // transform to lower case to match the original architecture name passed to the compiler
108 std::string out = _QUOTE_(RF_ARCH);
109 std::transform(out.begin(), out.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
110 return out;
111 };
112
   /** Compute multiple values using cuda kernels.
   This method creates a Batches object and passes it to the correct compute function.
   The compute function is launched as a cuda kernel.
   \param cfg Configuration carrying the CUDA stream on which to launch.
   \param computer An enum specifying the compute function to be used.
   \param output The array where the computation results are stored.
   \param vars A std::span containing pointers to the variables involved in the computation.
   \param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
   void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span<double> output, VarSpan vars,
                ArgSpan extraArgs) override
   {
      using namespace RooFit::Detail::CudaInterface;

      std::size_t nEvents = output.size();

      // One contiguous allocation holds, in order: the Batches header, one
      // Batch descriptor per variable, one scalar staging slot per variable,
      // and the extra arguments. The same layout is mirrored on host and
      // device so a single host-to-device copy transfers everything.
      const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
                                  extraArgs.size() * sizeof(double);

      // NOTE(review): the reinterpret_casts below assume the char buffer and
      // the struct sizes keep each sub-array suitably aligned for
      // Batches/Batch/double -- TODO confirm.
      std::vector<char> hostMem(memSize);
      auto batches = reinterpret_cast<Batches *>(hostMem.data());
      auto arrays = reinterpret_cast<Batch *>(batches + 1);
      auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
      auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());

      // Device mirror of the host staging area, partitioned identically.
      DeviceArray<char> deviceMem(memSize);
      auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
      auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
      auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
      auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());

      fillBatches(*batches, output.data(), nEvents, vars.size(), extraArgs.size());
      fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
      // The header must point at the *device* copy of the descriptors, since
      // it is dereferenced inside the kernel.
      batches->args = arraysDevice;

      if (!extraArgs.empty()) {
         std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
         batches->extra = extraArgsDevice;
      }

      // Single transfer of header + descriptors + scalars + extra args.
      copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());

      const int gridSize = getGridSize(nEvents);
      _computeFunctions[computer]<<<gridSize, blockSize, 0, *cfg.cudaStream()>>>(*batchesDevice);

      // The compute might have modified the mutable extra args, so we need to
      // copy them back. This can be optimized if necessary in the future by
      // flagging if the extra args were actually changed.
      if (!extraArgs.empty()) {
         copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
      }
   }
163 /// Return the sum of an input array
164 double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override;
165 ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
166 std::span<const double> weights, std::span<const double> offsetProbas) override;
167}; // End class RooBatchComputeClass
168
// One Kahan-compensated addition step: folds the value `a` (with an optional
// external compensation `otherCarry`) into the running pair (sum, carry).
// The exact floating-point statement order below is the algorithm itself --
// it must not be reordered or algebraically "simplified".
inline __device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
{
   // c is zero the first time around. Then is done a summation as the c variable is NEGATIVE
   const double y = a - (carry + otherCarry);
   const double t = sum + y; // Alas, sum is big, y small, so low-order digits of y are lost.

   // (t - sum) cancels the high-order part of y; subtracting y recovers NEGATIVE (low part of y)
   carry = (t - sum) - y;

   // Algebraically, c should always be zero. Beware overly-aggressive optimizing compilers!
   sum = t;
}
181
182// This is the same implementation of the ROOT::Math::KahanSum::operator+=(KahanSum) but in GPU
// This is the same implementation of the ROOT::Math::KahanSum::operator+=(KahanSum) but in GPU
// Block-wide tree reduction of per-thread Kahan partial sums held in shared
// memory: sums live in shared[0 .. blockDim.x), carries in the second half
// (indexed via carry_index). Thread 0 publishes the block result to
// result[blockIdx.x] and the matching carry to result[blockIdx.x + gridDim.x].
inline __device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
{
   // Stride in first iteration = half of the block dim. Then the half of the half...
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
      if (threadIdx.x < i && (threadIdx.x + i) < n) {
         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
      }
      // Barrier is outside the divergent branch so every thread reaches it.
      __syncthreads();
   } // Next time around, the lost low part will be added to y in a fresh attempt.
   // Wait until all threads of the block have finished its work

   if (threadIdx.x == 0) {
      result[blockIdx.x] = shared[0];
      result[blockIdx.x + gridDim.x] = shared[carry_index];
   }
}
199
/// Kahan-compensated sum reduction kernel.
/// Requires 2 * blockDim.x * sizeof(double) bytes of dynamic shared memory
/// (first half for sums, second half for carries).
/// \param input Values (or partial sums) to reduce.
/// \param carries Optional per-element carries matching `input`; may be nullptr.
/// \param n Number of elements in `input`.
/// \param result result[blockIdx.x] receives each block's sum,
///        result[blockIdx.x + gridDim.x] its carry.
/// \param nll If true, accumulate -log(input[i]) instead of input[i].
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
                         double *__restrict__ result, bool nll)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory is for storing the summation and the second half for the carry or compensation
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   // Grid-stride accumulation; use size_t for the index so the comparison
   // against the size_t count n is not signed/unsigned mismatched.
   for (size_t i = gthIdx; i < n; i += nThreadsTotal) {
      // Note: it does not make sense to use the nll option and provide at the
      // same time external carries.
      double val = nll ? -std::log(input[i]) : input[i];
      kahanSumUpdate(sum, carry, val, carries ? carries[i] : 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads in each block have loaded their elements
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}
229
/// Kernel computing per-block Kahan partial sums of the negative log-likelihood.
/// Requires 2 * blockDim.x * sizeof(double) bytes of dynamic shared memory
/// (first half for sums, second half for carries).
/// \param probas Probability values; each contributes -log(probas[i]).
/// \param weights Optional per-event weights (nullptr means unweighted here;
///        a single constant weight is applied later on the host side).
/// \param offsetProbas Optional offset probabilities; when given,
///        +log(offsetProbas[i]) is added to each term.
/// \param n Number of events.
/// \param result result[blockIdx.x] receives each block's sum,
///        result[blockIdx.x + gridDim.x] its carry.
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
                             const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory is for storing the summation and the second half for the carry or compensation
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   // Grid-stride accumulation of this thread's partial Kahan sum.
   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      // Note: it does not make sense to use the nll option and provide at the
      // same time external carries.
      double val = -std::log(probas[i]);
      if (offsetProbas)
         val += std::log(offsetProbas[i]);
      if (weights)
         val = weights[i] * val;
      kahanSumUpdate(sum, carry, val, 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads in each block have loaded their elements
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}
263
{
   // Nothing to reduce; also avoids launching a kernel with grid size 0.
   if (n == 0)
      return 0.0;
   const int gridSize = getGridSize(n);
   cudaStream_t stream = *cfg.cudaStream();
   // devOut holds gridSize partial sums followed by gridSize carries.
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   constexpr int shMemSize = 2 * blockSize * sizeof(double);
   // First pass: one Kahan partial sum (and carry) per block.
   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, devOut.data(), 0);
   // Second pass: a single block folds the partial sums with their carries.
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);
   double tmp = 0.0;
   // NOTE(review): assumes copyDeviceToHost synchronizes with the stream
   // before returning, so tmp is valid here -- confirm.
   CudaInterface::copyDeviceToHost(devOut.data(), &tmp, 1, cfg.cudaStream());
   return tmp;
}
278
280 std::span<const double> weights, std::span<const double> offsetProbas)
{
   ReduceNLLOutput out;
   // No probabilities: return a default-constructed result and skip the launch.
   if (probas.empty()) {
      return out;
   }
   const int gridSize = getGridSize(probas.size());
   // devOut holds gridSize partial sums followed by gridSize carries.
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   cudaStream_t stream = *cfg.cudaStream();
   constexpr int shMemSize = 2 * blockSize * sizeof(double);

   // A single constant weight (weights.size() == 1) is not passed to the
   // kernel; it is applied to the final sum and carry on the host below.
   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
      probas.data(), weights.size() == 1 ? nullptr : weights.data(),
      offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.data());

   // Second pass: one block folds the partial sums with their carries; the
   // final sum lands in devOut[0] and its carry in devOut[1].
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);

   double tmpSum = 0.0;
   double tmpCarry = 0.0;
   // NOTE(review): assumes copyDeviceToHost synchronizes with the stream
   // before returning, so the values are valid here -- confirm.
   CudaInterface::copyDeviceToHost(devOut.data(), &tmpSum, 1, cfg.cudaStream());
   CudaInterface::copyDeviceToHost(devOut.data() + 1, &tmpCarry, 1, cfg.cudaStream());

   // Apply the deferred constant weight (see kernel launch above).
   if (weights.size() == 1) {
      tmpSum *= weights[0];
      tmpCarry *= weights[0];
   }

   out.nllSum = tmpSum;
   out.nllSumCarry = tmpCarry;
   return out;
}
311
312/// Static object to trigger the constructor which overwrites the dispatch pointer.
314
315} // End namespace RF_ARCH
316} // End namespace RooBatchCompute
#define _QUOTE_(name)
Definition RConfig.hxx:471
#define c(i)
Definition RSha256.hxx:101
#define a(i)
Definition RSha256.hxx:99
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
const double *__restrict _array
Definition Batches.h:32
std::size_t nEvents
Definition Batches.h:46
double *__restrict output
Definition Batches.h:49
std::size_t nBatches
Definition Batches.h:47
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
const std::vector< void(*)(Batches &)> _computeFunctions
void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using cuda kernels.
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
The interface which should be implemented to provide optimised computation functions for implementati...
A templated class for managing an array of data using a specified memory type.
Data_t * data()
Get a pointer to the start of the array.
Double_t y[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::span< double > ArgSpan
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345
static void output()