28#ifdef ROOBATCHCOMPUTE_USE_IMT
44#error "RF_ARCH should always be defined"
63 for (std::size_t i = 0; i < vars.size(); i++) {
64 arrays[i]._array = vars[i].data();
65 arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents;
71 for (std::size_t i = 0; i <
batches.nBatches; i++) {
73 arg._array += arg._isVector * nEvents;
84class RooBatchComputeClass :
public RooBatchComputeInterface {
97#error "It's unexpected that _QUOTEVAL_ is defined at this point!"
99#define _QUOTEVAL_(x) _QUOTE_(x)
102 std::transform(out.begin(), out.end(), out.begin(), [](
unsigned char c) { return std::tolower(c); });
119 throw std::bad_function_call();
123 throw std::bad_function_call();
128#ifdef ROOBATCHCOMPUTE_USE_IMT
132 const std::vector<void (*)(
Batches &)> _computeFunctions;
135#ifdef ROOBATCHCOMPUTE_USE_IMT
138 std::size_t nEvents =
output.size();
150 auto task = [&](std::size_t idx) ->
int {
154 std::vector<Batch>
arrays(vars.size());
165 std::size_t events =
batches.nEvents;
167 while (events > bufferSize) {
170 events -= bufferSize;
177 std::vector<std::size_t> indices(
nThreads);
178 for (
unsigned int i = 1; i <
nThreads; i++) {
193void RooBatchComputeClass::compute(Config
const &, Computer
computer, std::span<double>
output, VarSpan vars,
214#ifdef ROOBATCHCOMPUTE_USE_IMT
220 std::size_t nEvents =
output.size();
225 std::vector<Batch>
arrays(vars.size());
230 std::size_t events =
batches.nEvents;
232 while (events > bufferSize) {
243inline std::pair<double, double>
getLog(
double prob, ReduceNLLOutput &out)
246 out.nNonPositiveValues++;
250 if (std::isinf(
prob)) {
251 out.nInfiniteValues++;
254 if (std::isnan(
prob)) {
259 return {std::log(
prob), 0.0};
264double RooBatchComputeClass::reduceSum(Config
const &, InputArr
input,
size_t n)
269ReduceNLLOutput RooBatchComputeClass::reduceNLL(Config
const &, std::span<const double> probas,
270 std::span<const double> weights, std::span<const double>
offsetProbas)
278 for (std::size_t i = 0; i < weights.size(); ++i) {
280 if (0. == weights[i])
296 out.nllSum = nllSum.
Sum();
302 out.nllSumCarry = 0.0;
310class ScalarBufferContainer {
312 ScalarBufferContainer() {}
313 ScalarBufferContainer(std::size_t
size)
316 throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
319 double const *hostReadPtr()
const {
return &
_val; }
320 double const *deviceReadPtr()
const {
return &
_val; }
322 double *hostWritePtr() {
return &
_val; }
323 double *deviceWritePtr() {
return &
_val; }
325 void assignFromHost(std::span<const double>
input) {
_val =
input[0]; }
326 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
332class CPUBufferContainer {
336 double const *hostReadPtr()
const {
return _vec.data(); }
337 double const *deviceReadPtr()
const
339 throw std::bad_function_call();
343 double *hostWritePtr() {
return _vec.data(); }
344 double *deviceWritePtr()
346 throw std::bad_function_call();
350 void assignFromHost(std::span<const double>
input) {
_vec.assign(
input.begin(),
input.end()); }
351 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
357template <
class Container>
358class BufferImpl :
public AbsBuffer {
360 using Queue = std::queue<std::unique_ptr<Container>>;
362 BufferImpl(std::size_t
size, Queue &queue) :
_queue{queue}
365 _vec = std::make_unique<Container>(
size);
374 double const *hostReadPtr()
const override {
return _vec->hostReadPtr(); }
375 double const *deviceReadPtr()
const override {
return _vec->deviceReadPtr(); }
377 double *hostWritePtr()
override {
return _vec->hostWritePtr(); }
378 double *deviceWritePtr()
override {
return _vec->deviceWritePtr(); }
380 void assignFromHost(std::span<const double>
input)
override {
_vec->assignFromHost(
input); }
381 void assignFromDevice(std::span<const double>
input)
override {
_vec->assignFromDevice(
input); }
386 std::unique_ptr<Container>
_vec;
393struct BufferQueuesMaps {
398class BufferManager :
public AbsBufferManager {
401 BufferManager() :
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
403 std::unique_ptr<AbsBuffer> makeScalarBuffer()
override
405 return std::make_unique<ScalarBuffer>(1,
_queuesMaps->scalarBufferQueuesMap[1]);
407 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t
size)
override
411 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t)
override {
throw std::bad_function_call(); }
412 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t, CudaInterface::CudaStream * =
nullptr)
override
414 throw std::bad_function_call();
423std::unique_ptr<AbsBufferManager> RooBatchComputeClass::createBufferManager()
const
425 return std::make_unique<BufferManager>();
std::vector< double > _vec
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
std::unique_ptr< BufferQueuesMaps > _queuesMaps
size_t size(const MatrixT &matrix)
Retrieve the size of a square matrix.
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
These classes encapsulate the necessary data for the computations.
This class implements the interface to execute the same task multiple times, sequentially or in paral...
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
void Add(T x)
Single-element accumulation. Will not vectorise.
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
double reduceSum(Config const &, InputArr input, size_t n) override
void deleteCudaStream(CudaInterface::CudaStream *) const override
CudaInterface::CudaStream * newCudaStream() const override
std::unique_ptr< AbsBufferManager > createBufferManager() const override
CudaInterface::CudaEvent * newCudaEvent(bool) const override
bool cudaStreamIsActive(CudaInterface::CudaStream *) const override
ReduceNLLOutput reduceNLL(Config const &, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const override
std::string architectureName() const override
void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const override
void compute(Config const &, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
void deleteCudaEvent(CudaInterface::CudaEvent *) const override
Architecture architecture() const override
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Namespace for dispatching RooFit computations to various backends.
std::span< double > ArgSpan
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
constexpr std::size_t bufferSize
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
void probas(TString dataset, TString fin="TMVA.root", Bool_t useTMVAStyle=kTRUE)
static double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
static float unpackNaN(double val)
If val is NaN and this NaN has been tagged as containing a payload, unpack the float from the mantissa...