// CudaBuffers.cxx — host and device buffer classes for the CUDA backend of
// TMVA's deep-learning module, plus the data-loader specializations that
// stage training batches in pinned host memory.
// NOTE: signatures and initializers that the extraction dropped are restored
// here from the class documentation and should be checked against the ROOT
// sources before use.

#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h" // assumed header path
#include "TMVA/DataSetInfo.h"                        // DataSetInfo, Event
#include "cuda_runtime.h"

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
// Deleter for pinned host memory: releases the page-locked allocation and the
// pointer cell owned by the shared_ptr.
template <typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFreeHost(*devicePointer);
   delete[] devicePointer;
}
//______________________________________________________________________________
// Construct a host buffer of `size` elements in pinned (page-locked) memory,
// so that transfers to the device can run asynchronously.
template <typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMallocHost(pointer, size * sizeof(AFloat));
   // Hand ownership to the shared_ptr; TDestructor frees the pinned block.
   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}
//______________________________________________________________________________
// Implicit conversion to a raw pointer into the (possibly offset) buffer.
// Body reconstructed: only the template header survived extraction.
template <typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat *() const
{
   return *fHostPointer + fOffset;
}

//______________________________________________________________________________
// Return a sub-buffer of the current buffer: the copy shares the underlying
// allocation and only adjusts offset and size (body reconstructed from the
// documented semantics of fOffset).
template <typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaHostBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}
//______________________________________________________________________________
// Deleter for device memory: releases the CUDA allocation and the pointer
// cell owned by the shared_ptr.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFree(*devicePointer);
   delete[] devicePointer;
}
//______________________________________________________________________________
// Construct a device buffer of `size` elements on the GPU.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size) : fOffset(0), fSize(size), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
   // Each buffer gets its own stream for asynchronous transfers.
   cudaStreamCreate(&fComputeStream);
}
//______________________________________________________________________________
// Construct a device buffer that performs its transfers on an existing stream.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}
//______________________________________________________________________________
// Wrap an already-allocated device pointer in a buffer object.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   *pointer = devicePointer;
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}
//______________________________________________________________________________
// Return a sub-buffer of the current buffer; shares the allocation and
// adjusts only offset and size (body reconstructed from the documentation).
template <typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaDeviceBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}

//______________________________________________________________________________
// Implicit conversion to a raw device pointer, including the sub-buffer offset.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat *() const
{
   return *fDevicePointer + fOffset;
}
//______________________________________________________________________________
// Asynchronous host-to-device copy on the buffer's compute stream.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
}
//______________________________________________________________________________
// Asynchronous device-to-host copy. cudaMemcpyAsync takes (dst, src, ...),
// so for cudaMemcpyDeviceToHost the host buffer must be the first argument.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
}
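//______________________________________________________________________________
// Illustrative sketch (not from this file): typical interplay of the two
// buffer types. The size and variable names are invented for the example;
// only the constructors, operator[], CopyFrom and CopyTo come from above.
//
//    TCudaHostBuffer<float>   host(1024);    // pinned host allocation
//    TCudaDeviceBuffer<float> device(1024);  // GPU allocation + stream
//    for (size_t i = 0; i < 1024; i++) host[i] = 0.0f;
//    device.CopyFrom(host);                  // async H2D on device's stream
//    device.CopyTo(host);                    // async D2H on the same stream
//    cudaDeviceSynchronize();                // wait before reading host data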
//______________________________________________________________________________
// TDataLoader specializations for single-precision CUDA (TCuda<float>).
// Fill the host buffer with one batch of input variables; the signature and
// setup lines are reconstructed, only the loop body survived extraction.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
      }
      sampleIterator++; // advance to the next sample index
   }
}
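// The batch is stored column-major: all batchSize values of variable j are
// contiguous, so sample i = 2 of variable j = 3 in a batch of 32 lands at
// index 3 * 32 + 2 = 98, matching the column-major layout of the
// device-side matrices.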
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer,
                                                       IndexIterator_t sampleIterator, size_t batchSize)
{
   // The first event is used only to read the number of input variables.
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
      }
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                        IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event's class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
         }
      }
   }
}
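// The branches above cover the three training modes: no targets and a single
// output column is binary classification (one 0/1 signal flag per event); no
// targets with several columns is multiclass (one-hot encoding of the class
// index); events carrying targets are regression (raw target values copied).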
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<float>(event->GetWeight());
   }
}
//______________________________________________________________________________
// TDataLoader specializations for double-precision CUDA (TCuda<double>);
// identical in structure to the float versions above.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                            IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                        IndexIterator_t sampleIterator, size_t batchSize)
{
   // The first event is used only to read the number of input variables.
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = event->GetValue(j);
      }
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event's class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = event->GetTarget(j);
         }
      }
   }
}
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<double>(event->GetWeight());
   }
}
//______________________________________________________________________________
// TTensorDataLoader specializations (TCuda<float>): batch geometry comes from
// the member fields fBatchSize, fBatchDepth, fBatchHeight and fBatchWidth.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < fBatchHeight; j++) {
         for (size_t k = 0; k < fBatchWidth; k++) {
            size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
            buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
         }
      }
      sampleIterator++;
   }
}
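// Each sample is laid out as a column-major fBatchHeight x fBatchWidth matrix
// (index k * fBatchHeight + j), and samples are stacked contiguously at a
// stride of fBatchHeight * fBatchWidth elements.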
//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      // Flat layout: one row per event, one column per input variable.
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // Image-like layout: one fBatchHeight x fBatchWidth matrix per event.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event's class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
         }
      }
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}
//______________________________________________________________________________
// TTensorDataLoader specializations (TCuda<double>); identical in structure
// to the float versions above.
template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < fBatchHeight; j++) {
         for (size_t k = 0; k < fBatchWidth; k++) {
            size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
            buffer[bufferIndex] = inputTensor[sampleIndex](j, k);
         }
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      // Flat layout: one row per event, one column per input variable.
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // Image-like layout: one fBatchHeight x fBatchWidth matrix per event.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event's class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
         }
      }
   }
}
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}

} // namespace DNN
} // namespace TMVA
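//______________________________________________________________________________
// Illustrative sketch (not from this file) of how the copy routines and the
// buffers cooperate: a loader transposes one batch into a pinned host buffer,
// which is then shipped to the device asynchronously. `loader` and
// `indexIterator` stand in for a concrete
// TMVA::DNN::TDataLoader<MatrixInput_t, TCuda<float>> and its index iterator.
//
//    TMVA::DNN::TCudaHostBuffer<float>   host(batchSize * nVariables);
//    TMVA::DNN::TCudaDeviceBuffer<float> device(batchSize * nVariables);
//    loader.CopyInput(host, indexIterator, batchSize); // fill batch on host
//    device.CopyFrom(host);                            // async H2D transfer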