Logo ROOT   master
Reference Guide
CudaTensor.h
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 13/07/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ///////////////////////////////////////////////////////////////////////
13 // Contains the TCudaMatrix class for the representation of matrices //
14 // on CUDA devices as well as the TCudaDeviceReference class which //
15 // is a helper class to emulate lvalue references to floating point //
16 // values on the device. //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
21 
22 
23 #include <vector>
24 #include <cstring>
25 #include <cassert>
26 #include <iostream>
27 
28 #include "CudaMatrix.h"
29 #include "TMatrixT.h"
30 #include "CudaBuffers.h"
31 
32 //#include "TMVA/RTensor.hxx"
33 
34 #ifdef R__HAS_CUDNN
35 #include "cudnn.h"
36 #define CUDNNCHECK(ans) {cudnnError((ans), __FILE__, __LINE__); }
37 #endif
38 
39 namespace TMVA {
40 
41 
42 
43 #ifndef TMVA_RTENSOR
44 
45 namespace Experimental {
46 /// Memory layout type (copy from RTensor.hxx)
47 enum class MemoryLayout : uint8_t {
48  RowMajor = 0x01,
49  ColumnMajor = 0x02
50 };
51 }
52 #endif
53 
54 namespace DNN {
55 
57 
58 #ifdef R__HAS_CUDNN
59 /**
60  * Function to handle the status output of cuDNN function calls. See also
61  * CUDACHECK in CudaMatrix.h.
62  */
63 inline void cudnnError(cudnnStatus_t status, const char *file, int line, bool abort=true)
64 {
65  if (status != CUDNN_STATUS_SUCCESS) {
66  fprintf(stderr, "CUDNN Error: %s %s %d\n", cudnnGetErrorString(status), file, line);
67  if (abort)
68  exit(status);
69  }
70 }
71 #endif
72 //____________________________________________________________________________
73 //
74 // Cuda Tensor
75 //____________________________________________________________________________
76 
77 /** TCudaTensor Class
78  *
79  * The TCudaTensor class extends the TCudaMatrix class for dimensions > 2.
80  *
81  */
82 template<typename AFloat>
84 {
85 public:
86 
87  using Shape_t = std::vector<size_t>;
89  using Scalar_t = AFloat;
90 
91 
92 private:
93 
94 #ifdef R__HAS_CUDNN
95  struct TensorDescriptor {
96  cudnnTensorDescriptor_t fCudnnDesc;
97  };
98 
99  static std::vector<cudnnHandle_t> fCudnnHandle; ///< Holds the cuddn library context (one for every CUDA stream)
100 
101  static cudnnDataType_t fDataType; ///< Cudnn datatype used for the tensor
102 #else
104  };
105 #endif
106 
107  /** For each GPU device keep the CUDA streams in which tensors are used.
108  * Instances belonging to the same stream on the same deviceshare a
109  * cudnn library handel to keep cudnn contexts seperated */
110  //static std::vector<std::vector<int> > fInstances;
111  static std::vector<int> fInstances;
112 
113  /** The shape vector (size of dimensions) needs to be ordered as no. channels,
114  * image dimensions.
115  */
116  Shape_t fShape; ///< spatial subdimensions
117  Shape_t fStrides; ///< Strides between tensor dimensions (always assume dense, non overlapping tensor)
118  size_t fNDim; ///< Dimension of the tensor (first dimension is the batch size, second is the no. channels)
119  size_t fSize; ///< No. of elements
120  int fDevice; ///< Device associated with current tensor instance
121  int fStreamIndx; ///< Cuda stream associated with current instance
122 
123  std::shared_ptr<TensorDescriptor> fTensorDescriptor;
125 
127 
128 
129 
130 public:
131 
132 
133  //static AFloat * GetOnes() {return fOnes;}
134 
135  TCudaTensor();
136 
137  TCudaTensor(const AFloat * data,
138  const std::vector<size_t> & shape,
139  MemoryLayout memlayout = MemoryLayout::ColumnMajor,
140  int deviceIndx = 0, int streamIndx = 0);
142  const std::vector<size_t> & shape,
143  MemoryLayout memlayout = MemoryLayout::ColumnMajor,
144  int deviceIndx = 0, int streamIndx = 0);
145  TCudaTensor(const std::vector<size_t> & shape,
146  MemoryLayout memlayout = MemoryLayout::ColumnMajor,
147  int deviceIndx = 0, int streamIndx = 0);
148 
149  TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
150  TCudaTensor( (memlayout == MemoryLayout::ColumnMajor) ? Shape_t({ csize, hwsize, bsize}) : Shape_t({ bsize, csize, hwsize }) , memlayout,
151  deviceIndx, streamIndx)
152  {}
153 
154  TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
155 
156  TCudaTensor( {bsize, csize, hsize, wsize}, memlayout, deviceIndx, streamIndx)
157  {
158  if (memlayout == MemoryLayout::ColumnMajor)
159  *this = TCudaTensor(fElementBuffer, { csize, hsize, wsize, bsize}, memlayout, deviceIndx, streamIndx);
160  }
161 
162  TCudaTensor(size_t n, size_t m, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
163  // TCudaTensor( {n,m}, memlayout, deviceIndx, streamIndx) :
164  TCudaTensor( {n, m}, memlayout, deviceIndx, streamIndx)
165  {}
166 
167  TCudaTensor(const TCudaMatrix<AFloat> & m, size_t dim = 2);
168 
169  TCudaTensor(const TMatrixT<AFloat> & m, size_t dim = 2) :
170  TCudaTensor( TCudaMatrix<AFloat>(m), dim)
171  {}
172 
173  TCudaTensor(TCudaDeviceBuffer<AFloat> buffer, size_t n, size_t m) :
174  TCudaTensor( buffer, {n,m}, MemoryLayout::ColumnMajor ,0,0) {}
175 
176  TCudaTensor(const TCudaTensor &) = default;
177  TCudaTensor(TCudaTensor &&) = default;
178  TCudaTensor & operator=(const TCudaTensor &) = default;
179  TCudaTensor & operator=( TCudaTensor &&) = default;
180  ~TCudaTensor();
181 
182  /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
183  operator TMatrixT<AFloat>() const;
184 
185 
187 
188  const Shape_t & GetShape() const {return fShape;}
189  const Shape_t & GetStrides() const {return fStrides;}
190  size_t GetDimAt(size_t i) const {return fShape[i];}
191  size_t GetNDim() const {return fNDim;}
192  size_t GetSize() const {return fSize;}
193 
194  const AFloat * GetDataPointer() const {return fElementBuffer;}
195  AFloat * GetDataPointer() {return fElementBuffer;}
196  const AFloat * GetData() const {return fElementBuffer;}
197  AFloat * GetData() {return fElementBuffer;}
198 
199  const AFloat * GetDataPointerAt(size_t i ) const {
200  return (const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
201  AFloat * GetDataPointerAt(size_t i ) {return fElementBuffer.GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
202 
203 
206 
207 #ifdef R__HAS_CUDNN
208  const cudnnHandle_t & GetCudnnHandle() const {return fCudnnHandle[fStreamIndx];}
209  const cudnnTensorDescriptor_t & GetTensorDescriptor() const {return fTensorDescriptor->fCudnnDesc;}
210  static cudnnDataType_t GetDataType() { return fDataType; }
211 #endif
212 
213  cudaStream_t GetComputeStream() const {
214  return fElementBuffer.GetComputeStream();
215  }
216  void SetComputeStream(cudaStream_t stream) {
217  fElementBuffer.SetComputeStream(stream);
218  }
219 
220  bool isEqual (TCudaTensor<AFloat> & other) {
221 
222  if (fSize != other.GetSize()) return false;
223 
224 
225  std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
226  std::unique_ptr<AFloat[]> hostBufferOther(new AFloat[fSize]);
227  cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
228  cudaMemcpyDeviceToHost);
229  cudaMemcpy(hostBufferOther.get(), other.GetDeviceBuffer(), fSize * sizeof(AFloat),
230  cudaMemcpyDeviceToHost);
231 
232  for (size_t i = 0; i < fSize; i++) {
233  if (hostBufferThis[i] != hostBufferOther[i]) return false;
234  }
235  return true;
236  }
237 
238  bool isEqual (const AFloat * hostBufferOther, size_t otherSize) {
239  if (fSize != otherSize) return false;
240 
241 
242  std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
243  cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
244  cudaMemcpyDeviceToHost);
245 
246  for (size_t i = 0; i < fSize; i++) {
247  if (hostBufferThis[i] != hostBufferOther[i]) return false;
248  }
249 
250  return true;
251  }
252 
253  void Print(const char * name = "Tensor", bool truncate = false) const;
254 
255  void PrintShape(const char * name="Tensor") const;
256 
257  void Zero() {
258  cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetSize());
259  }
260 
261  void SetConstVal(const AFloat constVal) {
262  TCudaHostBuffer<AFloat> hostBuffer(fSize);
263  hostBuffer.SetConstVal(constVal);
264  fElementBuffer.CopyFrom(hostBuffer);
265  }
266 
267  // have this tensor representatrions
268  // 2-dimensional tensors : NW where N is batch size W is the feature size . Memory layout should be columnwise in this case
269  // 3 -dimensional tensor : represnetation is NHWC , tensor should be columnwise storage
270  // 4 -dimensional tensor : representation is NCHW ande tensor should be row wose
271  // a rowmajor tensor with dimension less than trhee should not exist but in case consider as a N, (CHW) for 2d, N, C, (HW) for 3d
272  // a columnmajor tensor for dimension >=4 should not exist but in case consider as a N,H,W,C (i.e. with shape C,W,H,N)
273 
274  size_t GetFirstSize() const {
275  return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fShape.front(); } // CM order
276  size_t GetFirstStride() const {
277  return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fStrides.front(); } // CM order
278 
279  size_t GetCSize() const {
280  if (fNDim == 2) return 1;
281  return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.front() : fShape[1] ; //assume NHWC
282  }
283  size_t GetHSize() const {
284  if (fNDim == 2) return fShape[0];
285  if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[0] : fShape[1] ;// same as C
286  if (fNDim >= 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[2] : fShape[2] ;
287  return 0;
288  }
289  size_t GetWSize() const {
290  if (fNDim == 2) return fShape[1];
291  if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[1] : fShape[2] ;
292  if (fNDim == 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[3] : fShape[3] ;
293  return 0;
294  }
295 
296  // for backward compatibility (assume column-major
297  // for backward compatibility : for CM tensor (n1,n2,n3,n4) -> ( n1*n2*n3, n4)
298  // for RM tensor (n1,n2,n3,n4) -> ( n2*n3*n4, n1 ) ???
299  size_t GetNrows() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fShape.front();}
300  size_t GetNcols() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fStrides.front(); }
301 
302 
303  // Matrix conversion for tensors of shape 2
305  // remember TCudaMatrix is always column-major
306  if ( GetLayout() == MemoryLayout::ColumnMajor &&
307  (fNDim == 2 || (fNDim == 3 && GetFirstSize() == 1) ) )
309 
310 
311  //case of N,M,1,1,..
312  bool caseNM11 = true;
313  for (size_t i = 2; i < fNDim; ++i) caseNM11 &= fShape[i] == 1;
314  if (caseNM11) {
315  return (GetLayout() == MemoryLayout::ColumnMajor ) ?
318  }
319  bool case11NM = true;
320  for (size_t i = 0; i < fNDim-2; ++i) case11NM &= fShape[i] == 1;
321  if (case11NM) {
322  return (GetLayout() == MemoryLayout::ColumnMajor ) ?
325  }
326 
327  assert(false);
328  return TCudaMatrix<AFloat>();
329  }
330 
331  // for backeard compatibility with old tensor
332  TCudaMatrix<AFloat> operator[](size_t i) const {
333  //assert(GetLayout() == MemoryLayout::ColumnMajor );
334  return At(i).GetMatrix();
335  }
336 
337 
338 
339  static inline std::vector<std::size_t> ComputeStridesFromShape(const std::vector<std::size_t> &shape,
340  bool rowmajorLayout);
341 
342  void ReshapeInPlace(const Shape_t & newShape) {
343  fShape = newShape;
344  fStrides = ComputeStridesFromShape(fShape, fMemoryLayout == MemoryLayout::RowMajor);
345  fNDim = fShape.size();
346  // in principle reshape should not change tensor size
347  size_t newSize = (fMemoryLayout == MemoryLayout::RowMajor) ? fStrides.front() * fShape.front() : fStrides.back() * fShape.back();
348  R__ASSERT(newSize <= fSize);
349  fSize = newSize;
350  // reset the descritor for Cudnn
352  }
353 
354  TCudaTensor<AFloat> Reshape(const Shape_t & newShape) const {
355  TCudaTensor<AFloat> tmp(this->GetDeviceBuffer(), newShape, this->GetLayout(), fDevice, fStreamIndx);
356  return tmp;
357  }
358 
359  void SetTensorDescriptor();
360 
361  // return slice of tensor
362  // return slices in the first dimension (if row wise) or last dimension if colun wise
363  // so single event slides
364  TCudaTensor<AFloat> At(size_t i) const {
365  Shape_t sliced_shape = (GetLayout() == MemoryLayout::RowMajor)
366  ? Shape_t(fShape.begin() + 1, fShape.end()) :
367  Shape_t(fShape.begin(), fShape.end() - 1);
368 
369 
370  size_t buffsize = (GetLayout() == MemoryLayout::RowMajor) ?
371  fStrides.front() : fStrides.back();
372 
373  size_t offset = i * buffsize;
374 
375  return TCudaTensor<AFloat>((const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(offset, buffsize), sliced_shape, GetLayout());
376  }
377 
378 
379  // element access ( for debugging)
380  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const
381  {
382  // like this works also for multi-dim tensors
383  // and consider the tensor as a multidim one
384  size_t nrows = GetNrows();
385  size_t ncols = GetNcols();
386 
387  size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
388  i * ncols + j : j * nrows + i;
389 
390  AFloat * elementPointer = fElementBuffer + offset;
391  return TCudaDeviceReference<AFloat>(elementPointer);
392  }
393  // element access ( for debugging)
394  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k) const
395  {
396  // k is B, i is C, j is HW :
397  assert( fNDim >= 3); // || ( k==0 && fNDim == 2 ) );
398  //note for larger dimension k is all other dims collapsed !!!
399 
400  size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
401  i * fStrides[0] + j * fStrides[1] + k :
402  i * fStrides[2] + k * fStrides[1] + j;
403 
404  AFloat * elementPointer = fElementBuffer + offset;
405 
406  return TCudaDeviceReference<AFloat>(elementPointer);
407  }
408 
409  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k, size_t l) const
410  {
411  // for rowsise
412  //assert(GetLayout() == MemoryLayout::RowMajor);
413  assert( fNDim == 4); // || ( k==0 && fNDim == 2 ) );
414 
415  size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
416  i * fStrides[0] + j * fStrides[1] + k * fStrides[2] + l:
417  l * fStrides[3] + k * fStrides[2] + j * fStrides[1] + i;
418 
419  AFloat * elementPointer = fElementBuffer + offset;
420 
421  return TCudaDeviceReference<AFloat>(elementPointer);
422  }
423 
424 private:
425 
426  /** Initializes all shared devices resource and makes sure that a sufficient
427  * number of curand states are allocated on the device and initialized as
428  * well as that the one-vector for the summation over columns has the right
429  * size. */
430  void InitializeCuda();
431  void InitializeCurandStates();
432 
433 };
434 
435 
436 
437 
438 } // namespace DNN
439 } // namespace TMVA
440 
441 #endif
TMVA::Experimental::MemoryLayout MemoryLayout
Definition: CudaTensor.h:56
TCudaDeviceReference.
Definition: CudaMatrix.h:61
size_t GetFirstStride() const
Definition: CudaTensor.h:276
size_t GetHSize() const
Definition: CudaTensor.h:283
MemoryLayout GetLayout() const
Definition: CudaTensor.h:186
auto * m
Definition: textangle.C:8
cudaStream_t GetComputeStream() const
Definition: CudaTensor.h:213
TCudaTensor< AFloat > Reshape(const Shape_t &newShape) const
Definition: CudaTensor.h:354
TLine * line
TCudaTensor & operator=(const TCudaTensor &)=default
std::vector< size_t > Shape_t
Definition: CudaTensor.h:87
const AFloat * GetData() const
Definition: CudaTensor.h:196
TCudaDeviceBuffer.
Definition: CudaBuffers.h:28
#define R__ASSERT(e)
Definition: TError.h:96
int fStreamIndx
Cuda stream associated with current instance.
Definition: CudaTensor.h:121
const Shape_t & GetShape() const
Definition: CudaTensor.h:188
void ReshapeInPlace(const Shape_t &newShape)
Definition: CudaTensor.h:342
TMatrixT.
Definition: TMatrixDfwd.h:22
bool isEqual(const AFloat *hostBufferOther, size_t otherSize)
Definition: CudaTensor.h:238
Shape_t fShape
The shape vector (size of dimensions) needs to be ordered as no.
Definition: CudaTensor.h:116
size_t GetFirstSize() const
Definition: CudaTensor.h:274
MemoryLayout
Memory layout type (copy from RTensor.hxx)
Definition: CudaTensor.h:47
size_t GetNrows() const
Definition: CudaTensor.h:299
AFloat * GetDataPointer()
Definition: CudaTensor.h:195
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k) const
Definition: CudaTensor.h:394
Shape_t fStrides
Strides between tensor dimensions (always assume dense, non overlapping tensor)
Definition: CudaTensor.h:117
void PrintShape(const char *name="Tensor") const
TCudaHostBuffer.
Definition: CudaBuffers.h:42
size_t fSize
No. of elements.
Definition: CudaTensor.h:119
void SetConstVal(const AFloat constVal)
Sets the entire buffer to a constant value.
Definition: CudaBuffers.cxx:70
TCudaTensor< AFloat > At(size_t i) const
Definition: CudaTensor.h:364
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaTensor.h:124
BinData::ErrorType GetDataType(const TGraph *gr, DataOptions &fitOpt)
static std::vector< int > fInstances
For each GPU device keep the CUDA streams in which tensors are used.
Definition: CudaTensor.h:111
void SetComputeStream(cudaStream_t stream)
Definition: CudaTensor.h:216
AFloat * GetDataPointerAt(size_t i)
Definition: CudaTensor.h:201
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k, size_t l) const
Definition: CudaTensor.h:409
const TCudaDeviceBuffer< AFloat > & GetDeviceBuffer() const
Definition: CudaTensor.h:204
TCudaMatrix< AFloat > GetMatrix() const
Definition: CudaTensor.h:304
void SetConstVal(const AFloat constVal)
Definition: CudaTensor.h:261
TCudaMatrix< AFloat > operator[](size_t i) const
Definition: CudaTensor.h:332
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Definition: CudaTensor.h:380
size_t GetNcols() const
Definition: CudaTensor.h:300
void Print(const char *name="Tensor", bool truncate=false) const
size_t GetNDim() const
Definition: CudaTensor.h:191
TCudaDeviceBuffer< AFloat > & GetDeviceBuffer()
Definition: CudaTensor.h:205
std::shared_ptr< TensorDescriptor > fTensorDescriptor
Definition: CudaTensor.h:123
TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition: CudaTensor.h:149
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
TCudaTensor(size_t n, size_t m, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition: CudaTensor.h:162
create variable transformations
auto * l
Definition: textangle.C:4
int fDevice
Device associated with current tensor instance.
Definition: CudaTensor.h:120
Definition: file.py:1
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, size_t n, size_t m)
Definition: CudaTensor.h:173
size_t GetDimAt(size_t i) const
Definition: CudaTensor.h:190
const AFloat * GetDataPointer() const
Definition: CudaTensor.h:194
static std::vector< std::size_t > ComputeStridesFromShape(const std::vector< std::size_t > &shape, bool rowmajorLayout)
TMVA::Experimental::MemoryLayout MemoryLayout
Definition: CudaTensor.h:88
TCudaTensor(const TMatrixT< AFloat > &m, size_t dim=2)
Definition: CudaTensor.h:169
MemoryLayout fMemoryLayout
Definition: CudaTensor.h:126
bool isEqual(TCudaTensor< AFloat > &other)
Definition: CudaTensor.h:220
size_t GetWSize() const
Definition: CudaTensor.h:289
TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition: CudaTensor.h:154
const Shape_t & GetStrides() const
Definition: CudaTensor.h:189
const AFloat * GetDataPointerAt(size_t i) const
Definition: CudaTensor.h:199
const Int_t n
Definition: legend1.C:16
TCudaTensor Class.
Definition: CudaTensor.h:83
size_t GetCSize() const
Definition: CudaTensor.h:279
TCudaMatrix Class.
Definition: CudaMatrix.h:105
char name[80]
Definition: TGX11.cxx:109
size_t GetSize() const
Definition: CudaTensor.h:192
size_t fNDim
Dimension of the tensor (first dimension is the batch size, second is the no. channels) ...
Definition: CudaTensor.h:118