38void fillBatches(
Batches &batches,
double *
output,
size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
42 batches.
nExtra = nExtraArgs;
46void fillArrays(
Batch *arrays,
VarSpan vars,
double *buffer,
double *bufferDevice, std::size_t nEvents)
48 for (
int i = 0;
i < vars.size();
i++) {
49 const std::span<const double> &span = vars[
i];
50 arrays[
i].
_isVector = span.empty() || span.size() >= nEvents;
51 if (!arrays[
i]._isVector) {
66int getGridSize(std::size_t
n)
79 constexpr int maxGridSize = 84;
80 return std::min(
int(std::ceil(
double(
n) /
blockSize)), maxGridSize);
112 std::size_t nEvents =
output.size();
114 const std::size_t memSize =
sizeof(
Batches) + vars.size() *
sizeof(
Batch) + vars.size() *
sizeof(
double) +
115 extraArgs.size() *
sizeof(
double);
117 std::vector<char> hostMem(memSize);
118 auto batches =
reinterpret_cast<Batches *
>(hostMem.data());
119 auto arrays =
reinterpret_cast<Batch *
>(batches + 1);
120 auto scalarBuffer =
reinterpret_cast<double *
>(arrays + vars.size());
121 auto extraArgsHost =
reinterpret_cast<double *
>(scalarBuffer + vars.size());
123 DeviceArray<char> deviceMem(memSize);
124 auto batchesDevice =
reinterpret_cast<Batches *
>(deviceMem.data());
125 auto arraysDevice =
reinterpret_cast<Batch *
>(batchesDevice + 1);
126 auto scalarBufferDevice =
reinterpret_cast<double *
>(arraysDevice + vars.size());
127 auto extraArgsDevice =
reinterpret_cast<double *
>(scalarBufferDevice + vars.size());
129 fillBatches(*batches,
output.data(), nEvents, vars.size(), extraArgs.size());
130 fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
131 batches->args = arraysDevice;
133 if (!extraArgs.empty()) {
134 std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
135 batches->extra = extraArgsDevice;
138 copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.
cudaStream());
140 const int gridSize = getGridSize(nEvents);
146 if (!extraArgs.empty()) {
147 copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.
cudaStream());
153 std::span<const double> weights, std::span<const double> offsetProbas)
override;
183 const double y =
a - (carry + otherCarry);
184 const double t =
sum +
y;
187 carry = (t -
sum) -
y;
197 for (
int i = blockDim.x / 2;
i > 0;
i >>= 1) {
198 if (threadIdx.x <
i && (threadIdx.x +
i) <
n) {
199 kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x +
i], shared[carry_index +
i]);
205 if (threadIdx.x == 0) {
206 result[blockIdx.x] = shared[0];
207 result[blockIdx.x + gridDim.x] = shared[carry_index];
211__global__
void kahanSum(
const double *__restrict__
input,
const double *__restrict__ carries,
size_t n,
212 double *__restrict__
result,
bool nll)
214 int thIdx = threadIdx.x;
215 int gthIdx = thIdx + blockIdx.x *
blockSize;
216 int carry_index = threadIdx.x + blockDim.x;
217 const int nThreadsTotal =
blockSize * gridDim.x;
220 extern __shared__
double shared[];
225 for (
int i = gthIdx;
i <
n;
i += nThreadsTotal) {
233 shared[carry_index] = carry;
241__global__
void nllSumKernel(
const double *__restrict__ probas,
const double *__restrict__ weights,
242 const double *__restrict__ offsetProbas,
size_t n,
double *__restrict__
result)
244 int thIdx = threadIdx.x;
245 int gthIdx = thIdx + blockIdx.x *
blockSize;
246 int carry_index = threadIdx.x + blockDim.x;
247 const int nThreadsTotal =
blockSize * gridDim.x;
250 extern __shared__
double shared[];
255 for (
int i = gthIdx;
i <
n;
i += nThreadsTotal) {
258 double val = -std::log(probas[
i]);
260 val += std::log(offsetProbas[
i]);
262 val = weights[
i] * val;
267 shared[carry_index] = carry;
279 const int gridSize = getGridSize(
n);
291 std::span<const double> weights, std::span<const double> offsetProbas)
294 if (probas.empty()) {
297 const int gridSize = getGridSize(probas.size());
303 probas.data(), weights.size() == 1 ?
nullptr : weights.data(),
304 offsetProbas.empty() ?
nullptr : offsetProbas.data(), probas.size(), devOut.
data());
309 double tmpCarry = 0.0;
313 if (weights.size() == 1) {
314 tmpSum *= weights[0];
315 tmpCarry *= weights[0];
319 out.nllSumCarry = tmpCarry;
325class ScalarBufferContainer {
327 ScalarBufferContainer() {}
328 ScalarBufferContainer(std::size_t
size)
331 throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
334 double const *hostReadPtr()
const {
return &_val; }
335 double const *deviceReadPtr()
const {
return &_val; }
337 double *hostWritePtr() {
return &_val; }
338 double *deviceWritePtr() {
return &_val; }
340 void assignFromHost(std::span<const double>
input) { _val =
input[0]; }
341 void assignFromDevice(std::span<const double>
input)
350class CPUBufferContainer {
352 CPUBufferContainer(std::size_t
size) : _vec(
size) {}
354 double const *hostReadPtr()
const {
return _vec.data(); }
355 double const *deviceReadPtr()
const
357 throw std::bad_function_call();
361 double *hostWritePtr() {
return _vec.data(); }
362 double *deviceWritePtr()
364 throw std::bad_function_call();
368 void assignFromHost(std::span<const double>
input) { _vec.assign(
input.begin(),
input.end()); }
369 void assignFromDevice(std::span<const double>
input)
375 std::vector<double> _vec;
378class GPUBufferContainer {
380 GPUBufferContainer(std::size_t
size) : _arr(
size) {}
382 double const *hostReadPtr()
const
384 throw std::bad_function_call();
387 double const *deviceReadPtr()
const {
return _arr.data(); }
389 double *hostWritePtr()
const
391 throw std::bad_function_call();
394 double *deviceWritePtr()
const {
return const_cast<double *
>(_arr.data()); }
396 void assignFromHost(std::span<const double>
input)
400 void assignFromDevice(std::span<const double>
input)
409class PinnedBufferContainer {
411 PinnedBufferContainer(std::size_t
size) : _arr{
size}, _gpuBuffer{
size} {}
412 std::size_t
size()
const {
return _arr.size(); }
414 void setCudaStream(CudaInterface::CudaStream *stream) { _cudaStream = stream; }
416 double const *hostReadPtr()
const
419 if (_lastAccess == LastAccessType::GPU_WRITE) {
424 _lastAccess = LastAccessType::CPU_READ;
425 return const_cast<double *
>(_arr.data());
427 double const *deviceReadPtr()
const
430 if (_lastAccess == LastAccessType::CPU_WRITE) {
434 _lastAccess = LastAccessType::GPU_READ;
435 return _gpuBuffer.deviceReadPtr();
438 double *hostWritePtr()
440 _lastAccess = LastAccessType::CPU_WRITE;
443 double *deviceWritePtr()
445 _lastAccess = LastAccessType::GPU_WRITE;
446 return _gpuBuffer.deviceWritePtr();
449 void assignFromHost(std::span<const double>
input) { std::copy(
input.begin(),
input.end(), hostWritePtr()); }
450 void assignFromDevice(std::span<const double>
input)
456 enum class LastAccessType { CPU_READ, GPU_READ, CPU_WRITE, GPU_WRITE };
459 GPUBufferContainer _gpuBuffer;
460 CudaInterface::CudaStream *_cudaStream =
nullptr;
461 mutable LastAccessType _lastAccess = LastAccessType::CPU_READ;
464template <
class Container>
465class BufferImpl :
public AbsBuffer {
467 using Queue = std::queue<std::unique_ptr<Container>>;
469 BufferImpl(std::size_t
size, Queue &queue) : _queue{queue}
471 if (_queue.empty()) {
472 _vec = std::make_unique<Container>(
size);
474 _vec = std::move(_queue.front());
479 ~BufferImpl()
override { _queue.emplace(std::move(_vec)); }
481 double const *hostReadPtr()
const override {
return _vec->hostReadPtr(); }
482 double const *deviceReadPtr()
const override {
return _vec->deviceReadPtr(); }
484 double *hostWritePtr()
override {
return _vec->hostWritePtr(); }
485 double *deviceWritePtr()
override {
return _vec->deviceWritePtr(); }
487 void assignFromHost(std::span<const double>
input)
override { _vec->assignFromHost(
input); }
488 void assignFromDevice(std::span<const double>
input)
override { _vec->assignFromDevice(
input); }
490 Container &vec() {
return *_vec; }
493 std::unique_ptr<Container> _vec;
497using ScalarBuffer = BufferImpl<ScalarBufferContainer>;
498using CPUBuffer = BufferImpl<CPUBufferContainer>;
499using GPUBuffer = BufferImpl<GPUBufferContainer>;
500using PinnedBuffer = BufferImpl<PinnedBufferContainer>;
502struct BufferQueuesMaps {
503 std::map<std::size_t, ScalarBuffer::Queue> scalarBufferQueuesMap;
504 std::map<std::size_t, CPUBuffer::Queue> cpuBufferQueuesMap;
505 std::map<std::size_t, GPUBuffer::Queue> gpuBufferQueuesMap;
506 std::map<std::size_t, PinnedBuffer::Queue> pinnedBufferQueuesMap;
509class BufferManager :
public AbsBufferManager {
512 BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
514 std::unique_ptr<AbsBuffer> makeScalarBuffer()
override
516 return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);
518 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t
size)
override
520 return std::make_unique<CPUBuffer>(
size, _queuesMaps->cpuBufferQueuesMap[
size]);
522 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t
size)
override
524 return std::make_unique<GPUBuffer>(
size, _queuesMaps->gpuBufferQueuesMap[
size]);
526 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t
size, CudaInterface::CudaStream *stream =
nullptr)
override
528 auto out = std::make_unique<PinnedBuffer>(
size, _queuesMaps->pinnedBufferQueuesMap[
size]);
529 out->vec().setCudaStream(stream);
534 std::unique_ptr<BufferQueuesMaps> _queuesMaps;
541 return std::make_unique<BufferManager>();