48 for (
int i = 0; i < vars.size(); i++) {
49 const std::span<const double> &span = vars[i];
50 arrays[i]._isVector = span.empty() || span.size() >= nEvents;
51 if (!
arrays[i]._isVector) {
61 arrays[i]._array = span.data();
110 using namespace CudaInterface;
112 std::size_t nEvents =
output.size();
120 auto scalarBuffer =
reinterpret_cast<double *
>(
arrays + vars.size());
121 auto extraArgsHost =
reinterpret_cast<double *
>(scalarBuffer + vars.size());
153 std::span<const double> weights, std::span<const double>
offsetProbas)
override;
184 const double t =
sum +
y;
187 carry = (t -
sum) -
y;
197 for (
int i =
blockDim.x / 2; i > 0; i >>= 1) {
228 double val = nll == 1 ? -std::log(
input[i]) :
input[i];
258 double val = -std::log(probas[i]);
262 val = weights[i] * val;
291 std::span<const double> weights, std::span<const double>
offsetProbas)
294 if (probas.empty()) {
303 probas.data(), weights.size() == 1 ? nullptr : weights.data(),
313 if (weights.size() == 1) {
325class ScalarBufferContainer {
327 ScalarBufferContainer() {}
328 ScalarBufferContainer(std::size_t
size)
331 throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
334 double const *hostReadPtr()
const {
return &
_val; }
335 double const *deviceReadPtr()
const {
return &
_val; }
337 double *hostWritePtr() {
return &
_val; }
338 double *deviceWritePtr() {
return &
_val; }
340 void assignFromHost(std::span<const double>
input) {
_val =
input[0]; }
341 void assignFromDevice(std::span<const double>
input)
350class CPUBufferContainer {
354 double const *hostReadPtr()
const {
return _vec.data(); }
355 double const *deviceReadPtr()
const
357 throw std::bad_function_call();
361 double *hostWritePtr() {
return _vec.data(); }
362 double *deviceWritePtr()
364 throw std::bad_function_call();
368 void assignFromHost(std::span<const double>
input) {
_vec.assign(
input.begin(),
input.end()); }
369 void assignFromDevice(std::span<const double>
input)
378class GPUBufferContainer {
382 double const *hostReadPtr()
const
384 throw std::bad_function_call();
387 double const *deviceReadPtr()
const {
return _arr.data(); }
389 double *hostWritePtr()
const
391 throw std::bad_function_call();
394 double *deviceWritePtr()
const {
return const_cast<double *
>(
_arr.data()); }
396 void assignFromHost(std::span<const double>
input)
400 void assignFromDevice(std::span<const double>
input)
406 CudaInterface::DeviceArray<double>
_arr;
409class PinnedBufferContainer {
412 std::size_t
size()
const {
return _arr.size(); }
414 void setCudaStream(CudaInterface::CudaStream *stream) {
_cudaStream = stream; }
416 double const *hostReadPtr()
const
419 if (_lastAccess == LastAccessType::GPU_WRITE) {
425 return const_cast<double *
>(
_arr.data());
427 double const *deviceReadPtr()
const
430 if (_lastAccess == LastAccessType::CPU_WRITE) {
438 double *hostWritePtr()
443 double *deviceWritePtr()
449 void assignFromHost(std::span<const double>
input) { std::copy(
input.begin(),
input.end(), hostWritePtr()); }
450 void assignFromDevice(std::span<const double>
input)
458 CudaInterface::PinnedHostArray<double>
_arr;
464template <
class Container>
465class BufferImpl :
public AbsBuffer {
467 using Queue = std::queue<std::unique_ptr<Container>>;
469 BufferImpl(std::size_t
size, Queue &queue) :
_queue{queue}
472 _vec = std::make_unique<Container>(
size);
481 double const *hostReadPtr()
const override {
return _vec->hostReadPtr(); }
482 double const *deviceReadPtr()
const override {
return _vec->deviceReadPtr(); }
484 double *hostWritePtr()
override {
return _vec->hostWritePtr(); }
485 double *deviceWritePtr()
override {
return _vec->deviceWritePtr(); }
487 void assignFromHost(std::span<const double>
input)
override {
_vec->assignFromHost(
input); }
488 void assignFromDevice(std::span<const double>
input)
override {
_vec->assignFromDevice(
input); }
493 std::unique_ptr<Container>
_vec;
502struct BufferQueuesMaps {
509class BufferManager :
public AbsBufferManager {
512 BufferManager() :
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
514 std::unique_ptr<AbsBuffer> makeScalarBuffer()
override
516 return std::make_unique<ScalarBuffer>(1,
_queuesMaps->scalarBufferQueuesMap[1]);
518 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t
size)
override
522 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t
size)
override
526 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t
size, CudaInterface::CudaStream *stream =
nullptr)
override
529 out->vec().setCudaStream(stream);
541 return std::make_unique<BufferManager>();
std::vector< double > _vec
CudaInterface::CudaStream * _cudaStream
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
CudaInterface::DeviceArray< double > _arr
std::map< std::size_t, PinnedBuffer::Queue > pinnedBufferQueuesMap
LastAccessType _lastAccess
GPUBufferContainer _gpuBuffer
std::unique_ptr< BufferQueuesMaps > _queuesMaps
std::map< std::size_t, GPUBuffer::Queue > gpuBufferQueuesMap
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void deleteCudaEvent(CudaInterface::CudaEvent *event) const override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *stream, CudaInterface::CudaEvent *event) const override
std::unique_ptr< AbsBufferManager > createBufferManager() const
void cudaEventRecord(CudaInterface::CudaEvent *event, CudaInterface::CudaStream *stream) const override
CudaInterface::CudaStream * newCudaStream() const override
bool cudaStreamIsActive(CudaInterface::CudaStream *stream) const override
void deleteCudaStream(CudaInterface::CudaStream *stream) const override
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
std::string architectureName() const override
const std::vector< void(*)(Batches &)> _computeFunctions
Architecture architecture() const override
CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const override
void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using cuda kernels.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
CudaInterface::CudaStream * cudaStream() const
bool isActive()
Checks if a CUDA stream is currently active.
void waitForEvent(CudaEvent &)
Makes a CUDA stream wait for a CUDA event.
The interface which should be implemented to provide optimised computation functions for implementati...
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
void copyDeviceToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the CUDA device.
void cudaEventRecord(CudaEvent &event, CudaStream &stream)
Records a CUDA event.
void copyHostToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the host to the CUDA device.
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::span< double > ArgSpan
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
static uint64_t sum(uint64_t i)