19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
// Check a cudnnStatus_t returned by a cuDNN call; on failure cudnnError()
// reports (and by default aborts) with the call site's file/line.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// stays safe inside un-braced if/else branches; call sites keep their
// trailing semicolon: CUDNNCHECK(cudnnCreate(&h));
#define CUDNNCHECK(ans) do { cudnnError((ans), __FILE__, __LINE__); } while (0)
45 namespace Experimental {
63 inline void cudnnError(cudnnStatus_t status,
const char *
file,
int line,
bool abort=
true)
65 if (status != CUDNN_STATUS_SUCCESS) {
66 fprintf(stderr,
"CUDNN Error: %s %s %d\n", cudnnGetErrorString(status),
file,
line);
82 template<
typename AFloat>
96 cudnnTensorDescriptor_t fCudnnDesc;
99 static std::vector<cudnnHandle_t> fCudnnHandle;
101 static cudnnDataType_t fDataType;
138 const std::vector<size_t> & shape,
140 int deviceIndx = 0,
int streamIndx = 0);
142 const std::vector<size_t> & shape,
144 int deviceIndx = 0,
int streamIndx = 0);
147 int deviceIndx = 0,
int streamIndx = 0);
149 TCudaTensor(
size_t bsize,
size_t csize,
size_t hwsize,
MemoryLayout memlayout = MemoryLayout::ColumnMajor,
int deviceIndx = 0,
int streamIndx = 0) :
151 deviceIndx, streamIndx)
154 TCudaTensor(
size_t bsize,
size_t csize,
size_t hsize,
size_t wsize,
MemoryLayout memlayout = MemoryLayout::ColumnMajor,
int deviceIndx = 0,
int streamIndx = 0) :
156 TCudaTensor( {bsize, csize, hsize, wsize}, memlayout, deviceIndx, streamIndx)
158 if (memlayout == MemoryLayout::ColumnMajor)
174 TCudaTensor( buffer, {
n,
m}, MemoryLayout::ColumnMajor ,0,0) {}
208 const cudnnHandle_t & GetCudnnHandle()
const {
return fCudnnHandle[
fStreamIndx];}
209 const cudnnTensorDescriptor_t & GetTensorDescriptor()
const {
return fTensorDescriptor->fCudnnDesc;}
210 static cudnnDataType_t
GetDataType() {
return fDataType; }
225 std::unique_ptr<AFloat[]> hostBufferThis(
new AFloat[
fSize]);
226 std::unique_ptr<AFloat[]> hostBufferOther(
new AFloat[
fSize]);
228 cudaMemcpyDeviceToHost);
230 cudaMemcpyDeviceToHost);
232 for (
size_t i = 0; i <
fSize; i++) {
233 if (hostBufferThis[i] != hostBufferOther[i])
return false;
238 bool isEqual (
const AFloat * hostBufferOther,
size_t otherSize) {
239 if (
fSize != otherSize)
return false;
242 std::unique_ptr<AFloat[]> hostBufferThis(
new AFloat[
fSize]);
244 cudaMemcpyDeviceToHost);
246 for (
size_t i = 0; i <
fSize; i++) {
247 if (hostBufferThis[i] != hostBufferOther[i])
return false;
253 void Print(
const char *
name =
"Tensor",
bool truncate =
false)
const;
280 if (
fNDim == 2)
return 1;
306 if (
GetLayout() == MemoryLayout::ColumnMajor &&
312 bool caseNM11 =
true;
313 for (
size_t i = 2; i <
fNDim; ++i) caseNM11 &=
fShape[i] == 1;
315 return (
GetLayout() == MemoryLayout::ColumnMajor ) ?
319 bool case11NM =
true;
320 for (
size_t i = 0; i <
fNDim-2; ++i) case11NM &=
fShape[i] == 1;
322 return (
GetLayout() == MemoryLayout::ColumnMajor ) ?
334 return At(i).GetMatrix();
340 bool rowmajorLayout);
370 size_t buffsize = (
GetLayout() == MemoryLayout::RowMajor) ?
373 size_t offset = i * buffsize;
387 size_t offset = (
GetLayout() == MemoryLayout::RowMajor) ?
388 i * ncols + j : j * nrows + i;
400 size_t offset = (
GetLayout() == MemoryLayout::RowMajor) ?
415 size_t offset = (
GetLayout() == MemoryLayout::RowMajor) ?