Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RooBatchCompute.h
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Emmanouil Michalainas, CERN 6 January 2021
5 *
6 * Copyright (c) 2021, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13#ifndef ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H
14#define ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H
15
16#include <ROOT/RSpan.hxx>
17
18#include <DllImport.h> //for R__EXTERN, needed for windows
19
20#include <cstddef>
21#include <initializer_list>
22#include <memory>
23#include <string>
24
25/**
26 * Namespace for dispatching RooFit computations to various backends.
27 *
28 * This namespace contains an interface for providing high-performance computation functions for use in
29 * RooAbsReal::doEval(), see RooBatchComputeInterface.
30 *
31 * Furthermore, several implementations of this interface can be created, which reside in RooBatchCompute::RF_ARCH,
32 * where RF_ARCH may be replaced by the architecture that this implementation targets, e.g. SSE, AVX, etc.
33 *
34 * Using the pointers RooBatchCompute::dispatchCPU and RooBatchCompute::dispatchCUDA, a computation request can be
35 * dispatched to the fastest backend that is available on a specific platform.
36 */
37namespace RooBatchCompute {
38
39namespace CudaInterface {
40class CudaEvent;
41class CudaStream;
42} // namespace CudaInterface
43
44typedef std::span<const std::span<const double>> VarSpan;
45typedef std::span<double> ArgSpan;
46typedef const double *__restrict InputArr;
47
48constexpr std::size_t bufferSize = 64;
49
50int initCPU();
51int initCUDA();
52
53/// Minimal configuration struct to steer the evaluation of a single node with
54/// the RooBatchCompute library.
55class Config {
56public:
57 bool useCuda() const { return _cudaStream != nullptr; }
60
61private:
63};
64
/// Identifies the backend that a RooBatchComputeInterface implementation
/// reports through its architecture() member function.
///
/// NOTE(review): the enumerators keep their original order and therefore their
/// implicit underlying values (AVX512 == 0 ... CUDA == 5); whether any caller
/// relies on those values cannot be seen from this header, so do not reorder.
enum class Architecture {
   AVX512,
   AVX2,
   AVX,
   SSE4,
   GENERIC,
   CUDA
};
73
115
117 double nllSum = 0.0;
118 double nllSumCarry = 0.0;
119 std::size_t nInfiniteValues = 0;
120 std::size_t nNonPositiveValues = 0;
121 std::size_t nNaNValues = 0;
122};
123
125public:
126 virtual ~AbsBuffer() = default;
127
128 virtual double const *hostReadPtr() const = 0;
129 virtual double const *deviceReadPtr() const = 0;
130
131 virtual double *hostWritePtr() = 0;
132 virtual double *deviceWritePtr() = 0;
133
134 virtual void assignFromHost(std::span<const double> input) = 0;
135 virtual void assignFromDevice(std::span<const double> input) = 0;
136};
137
139public:
140 virtual ~AbsBufferManager() = default;
141
142 virtual std::unique_ptr<AbsBuffer> makeScalarBuffer() = 0;
143 virtual std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) = 0;
144 virtual std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) = 0;
145 virtual std::unique_ptr<AbsBuffer>
146 makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) = 0;
147};
148
149/**
150 * \class RooBatchComputeInterface
151 * \ingroup roofit_dev_docs_batchcompute
152 * \brief The interface which should be implemented to provide optimised computation functions for implementations of
153 * RooAbsReal::doEval().
154 *
155 * The class RooBatchComputeInterface provides the mechanism for external modules (like RooFit) to call
156 * functions from the library. The power lies in the virtual functions that can resolve to different
157 * implementations for the functionality; for example, calling a function through dispatchCUDA
158 * will resolve to efficient CUDA implementations.
159 *
160 * This interface contains the signatures of the compute functions of every PDF that has an optimised implementation
161 * available. These are the functions that perform the actual computations in batches.
162 *
163 * Several implementations of this interface may be provided, e.g. SSE, AVX, AVX2 etc. At run time, the fastest
164 * implementation of this interface is selected, and using a virtual call, the computation is dispatched to the best
165 * backend.
166 *
167 * \see RooBatchCompute::dispatchCPU, RooBatchCompute::dispatchCUDA, RooBatchComputeClass, RF_ARCH
168 */
170public:
171 virtual ~RooBatchComputeInterface() = default;
172 virtual void compute(Config const &cfg, Computer, std::span<double> output, VarSpan, ArgSpan) = 0;
173
174 virtual double reduceSum(Config const &cfg, InputArr input, size_t n) = 0;
175 virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span<const double> probas, std::span<const double> weights,
176 std::span<const double> offsetProbas) = 0;
177
178 virtual Architecture architecture() const = 0;
179 virtual std::string architectureName() const = 0;
180
181 virtual std::unique_ptr<AbsBufferManager> createBufferManager() const = 0;
182
185 virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const = 0;
190};
191
192/**
193 * This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
194 * Using a virtual call, computation requests are dispatched to backends with architecture-specific functions
195 * such as SSE, AVX, AVX2, etc.
196 *
197 * \see RooBatchComputeInterface, RooBatchComputeClass, RF_ARCH
198 */
201
203{
204 return dispatchCPU->architecture();
205}
206
207inline std::string cpuArchitectureName()
208{
210}
211
212inline void compute(Config cfg, Computer comp, std::span<double> output, VarSpan vars, ArgSpan extraArgs = {})
213{
214 auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
215 dispatch->compute(cfg, comp, output, vars, extraArgs);
216}
217
218/// It is not possible to construct a std::span directly from an initializer
219/// list (probably it will be with C++26). That's why we need an explicit
220/// overload for this.
221inline void compute(Config cfg, Computer comp, std::span<double> output,
222 std::initializer_list<std::span<const double>> vars, ArgSpan extraArgs = {})
223{
224 compute(cfg, comp, output, VarSpan{vars.begin(), vars.end()}, extraArgs);
225}
226
227inline double reduceSum(Config cfg, InputArr input, size_t n)
228{
229 auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
230 return dispatch->reduceSum(cfg, input, n);
231}
232
233inline ReduceNLLOutput reduceNLL(Config cfg, std::span<const double> probas, std::span<const double> weights,
234 std::span<const double> offsetProbas)
235{
236 auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
237 return dispatch->reduceNLL(cfg, probas, weights, offsetProbas);
238}
239
240std::string getBatchComputeChoice();
241void setBatchComputeChoice(std::string const &value);
242
243} // End namespace RooBatchCompute
244
245#endif
#define R__EXTERN
Definition DllImport.h:26
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
virtual std::unique_ptr< AbsBuffer > makeScalarBuffer()=0
virtual ~AbsBufferManager()=default
virtual std::unique_ptr< AbsBuffer > makeCpuBuffer(std::size_t size)=0
virtual std::unique_ptr< AbsBuffer > makeGpuBuffer(std::size_t size)=0
virtual std::unique_ptr< AbsBuffer > makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream=nullptr)=0
virtual double const * deviceReadPtr() const =0
virtual ~AbsBuffer()=default
virtual void assignFromHost(std::span< const double > input)=0
virtual double const * hostReadPtr() const =0
virtual double * deviceWritePtr()=0
virtual void assignFromDevice(std::span< const double > input)=0
virtual double * hostWritePtr()=0
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
CudaInterface::CudaStream * _cudaStream
CudaInterface::CudaStream * cudaStream() const
The interface which should be implemented to provide optimised computation functions for implementati...
virtual double reduceSum(Config const &cfg, InputArr input, size_t n)=0
virtual std::string architectureName() const =0
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
virtual Architecture architecture() const =0
virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)=0
virtual void compute(Config const &cfg, Computer, std::span< double > output, VarSpan, ArgSpan)=0
const Int_t n
Definition legend1.C:16
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::span< double > ArgSpan
std::string cpuArchitectureName()
std::string getBatchComputeChoice()
void compute(Config cfg, Computer comp, std::span< double > output, VarSpan vars, ArgSpan extraArgs={})
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
void setBatchComputeChoice(std::string const &value)
constexpr std::size_t bufferSize
double reduceSum(Config cfg, InputArr input, size_t n)
ReduceNLLOutput reduceNLL(Config cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)
Architecture cpuArchitecture()
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
static void output()