Logo ROOT  
Reference Guide
CudaTensor.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////////
13// Contains the TCudaMatrix class for the representation of matrices //
14// on CUDA devices as well as the TCudaDeviceReference class which //
15// is a helper class to emulate lvalue references to floating point //
16// values on the device. //
17///////////////////////////////////////////////////////////////////////
18
19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
21
22
23#include <vector>
24#include <cstring>
25#include <cassert>
26#include <iostream>
27
28#include "CudaMatrix.h"
29#include "TMatrixT.h"
30#include "CudaBuffers.h"
31
32//#include "TMVA/RTensor.hxx"
33
34#ifdef R__HAS_CUDNN
35#include "cudnn.h"
36#define CUDNNCHECK(ans) {cudnnError((ans), __FILE__, __LINE__); }
37#endif
38
39namespace TMVA {
40
41
42
43#ifndef TMVA_RTENSOR
44
45namespace Experimental {
46/// Memory layout type (copy from RTensor.hxx)
47enum class MemoryLayout : uint8_t {
48 RowMajor = 0x01,
49 ColumnMajor = 0x02
50};
51}
52#endif
53
54namespace DNN {
55
57
58#ifdef R__HAS_CUDNN
59/**
60 * Function to handle the status output of cuDNN function calls. See also
61 * CUDACHECK in CudaMatrix.h.
62 */
63inline void cudnnError(cudnnStatus_t status, const char *file, int line, bool abort=true)
64{
65 if (status != CUDNN_STATUS_SUCCESS) {
66 fprintf(stderr, "CUDNN Error: %s %s %d\n", cudnnGetErrorString(status), file, line);
67 if (abort)
68 exit(status);
69 }
70}
71#endif
72//____________________________________________________________________________
73//
74// Cuda Tensor
75//____________________________________________________________________________
76
77/** TCudaTensor Class
78 *
79 * The TCudaTensor class extends the TCudaMatrix class for dimensions > 2.
80 *
81 */
82template<typename AFloat>
84{
85public:
86
87 using Shape_t = std::vector<size_t>;
89 using Scalar_t = AFloat;
90
91
92private:
93
94#ifdef R__HAS_CUDNN
95 struct TensorDescriptor {
96 cudnnTensorDescriptor_t fCudnnDesc;
97 };
98
99 static std::vector<cudnnHandle_t> fCudnnHandle; ///< Holds the cuddn library context (one for every CUDA stream)
100
101 static cudnnDataType_t fDataType; ///< Cudnn datatype used for the tensor
102#else
104 };
105#endif
106
107 /** For each GPU device keep the CUDA streams in which tensors are used.
108 * Instances belonging to the same stream on the same deviceshare a
109 * cudnn library handel to keep cudnn contexts seperated */
110 //static std::vector<std::vector<int> > fInstances;
111 static std::vector<int> fInstances;
112
113 /** The shape vector (size of dimensions) needs to be ordered as no. channels,
114 * image dimensions.
115 */
116 Shape_t fShape; ///< spatial subdimensions
117 Shape_t fStrides; ///< Strides between tensor dimensions (always assume dense, non overlapping tensor)
118 size_t fNDim; ///< Dimension of the tensor (first dimension is the batch size, second is the no. channels)
119 size_t fSize; ///< No. of elements
120 int fDevice; ///< Device associated with current tensor instance
121 int fStreamIndx; ///< Cuda stream associated with current instance
122
123 std::shared_ptr<TensorDescriptor> fTensorDescriptor;
125
127
128
129
130public:
131
132
133 //static AFloat * GetOnes() {return fOnes;}
134
136
137 TCudaTensor(const AFloat * data,
138 const std::vector<size_t> & shape,
139 MemoryLayout memlayout = MemoryLayout::ColumnMajor,
140 int deviceIndx = 0, int streamIndx = 0);
142 const std::vector<size_t> & shape,
143 MemoryLayout memlayout = MemoryLayout::ColumnMajor,
144 int deviceIndx = 0, int streamIndx = 0);
145 TCudaTensor(const std::vector<size_t> & shape,
146 MemoryLayout memlayout = MemoryLayout::ColumnMajor,
147 int deviceIndx = 0, int streamIndx = 0);
148
149 TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
150 TCudaTensor( (memlayout == MemoryLayout::ColumnMajor) ? Shape_t({ csize, hwsize, bsize}) : Shape_t({ bsize, csize, hwsize }) , memlayout,
151 deviceIndx, streamIndx)
152 {}
153
154 TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
155
156 TCudaTensor( {bsize, csize, hsize, wsize}, memlayout, deviceIndx, streamIndx)
157 {
158 if (memlayout == MemoryLayout::ColumnMajor)
159 *this = TCudaTensor(fElementBuffer, { csize, hsize, wsize, bsize}, memlayout, deviceIndx, streamIndx);
160 }
161
162 TCudaTensor(size_t n, size_t m, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
163 // TCudaTensor( {n,m}, memlayout, deviceIndx, streamIndx) :
164 TCudaTensor( {n, m}, memlayout, deviceIndx, streamIndx)
165 {}
166
167 TCudaTensor(const TCudaMatrix<AFloat> & m, size_t dim = 2);
168
169 TCudaTensor(TCudaDeviceBuffer<AFloat> buffer, size_t n, size_t m) :
170 TCudaTensor( buffer, {n,m}, MemoryLayout::ColumnMajor ,0,0) {}
171
172 TCudaTensor(const TCudaTensor &) = default;
174 TCudaTensor & operator=(const TCudaTensor &) = default;
177
178 /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
179 operator TMatrixT<AFloat>() const;
180
181
183
184 const Shape_t & GetShape() const {return fShape;}
185 const Shape_t & GetStrides() const {return fStrides;}
186 size_t GetDimAt(size_t i) const {return fShape[i];}
187 size_t GetNDim() const {return fNDim;}
188 size_t GetSize() const {return fSize;}
189
190 const AFloat * GetDataPointer() const {return fElementBuffer;}
191 AFloat * GetDataPointer() {return fElementBuffer;}
192 const AFloat * GetData() const {return fElementBuffer;}
193 AFloat * GetData() {return fElementBuffer;}
194
195 const AFloat * GetDataPointerAt(size_t i ) const {
196 return (const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
197 AFloat * GetDataPointerAt(size_t i ) {return fElementBuffer.GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
198
199
202
203#ifdef R__HAS_CUDNN
204 const cudnnHandle_t & GetCudnnHandle() const {return fCudnnHandle[fStreamIndx];}
205 const cudnnTensorDescriptor_t & GetTensorDescriptor() const {return fTensorDescriptor->fCudnnDesc;}
206 static cudnnDataType_t GetDataType() { return fDataType; }
207#endif
208
209 cudaStream_t GetComputeStream() const {
210 return fElementBuffer.GetComputeStream();
211 }
212 void SetComputeStream(cudaStream_t stream) {
213 fElementBuffer.SetComputeStream(stream);
214 }
215
217
218 if (fSize != other.GetSize()) return false;
219
220
221 std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
222 std::unique_ptr<AFloat[]> hostBufferOther(new AFloat[fSize]);
223 cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
224 cudaMemcpyDeviceToHost);
225 cudaMemcpy(hostBufferOther.get(), other.GetDeviceBuffer(), fSize * sizeof(AFloat),
226 cudaMemcpyDeviceToHost);
227
228 for (size_t i = 0; i < fSize; i++) {
229 if (hostBufferThis[i] != hostBufferOther[i]) return false;
230 }
231 return true;
232 }
233
234 bool isEqual (const AFloat * hostBufferOther, size_t otherSize) {
235 if (fSize != otherSize) return false;
236
237
238 std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
239 cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
240 cudaMemcpyDeviceToHost);
241
242 for (size_t i = 0; i < fSize; i++) {
243 if (hostBufferThis[i] != hostBufferOther[i]) return false;
244 }
245
246 return true;
247 }
248
249 void Print(const char * name = "Tensor", bool truncate = false) const;
250
251 void PrintShape(const char * name="Tensor") const;
252
253 void Zero() {
254 cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetSize());
255 }
256
257 void SetConstVal(const AFloat constVal) {
258 TCudaHostBuffer<AFloat> hostBuffer(fSize);
259 hostBuffer.SetConstVal(constVal);
260 fElementBuffer.CopyFrom(hostBuffer);
261 }
262
263 // have this tensor representatrions
264 // 2-dimensional tensors : NW where N is batch size W is the feature size . Memory layout should be columnwise in this case
265 // 3 -dimensional tensor : represnetation is NHWC , tensor should be columnwise storage
266 // 4 -dimensional tensor : representation is NCHW ande tensor should be row wose
267 // a rowmajor tensor with dimension less than trhee should not exist but in case consider as a N, (CHW) for 2d, N, C, (HW) for 3d
268 // a columnmajor tensor for dimension >=4 should not exist but in case consider as a N,H,W,C (i.e. with shape C,W,H,N)
269
270 size_t GetFirstSize() const {
271 return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fShape.front(); } // CM order
272 size_t GetFirstStride() const {
273 return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fStrides.front(); } // CM order
274
275 size_t GetCSize() const {
276 if (fNDim == 2) return 1;
277 return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.front() : fShape[1] ; //assume NHWC
278 }
279 size_t GetHSize() const {
280 if (fNDim == 2) return fShape[0];
281 if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[0] : fShape[1] ;// same as C
282 if (fNDim >= 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[2] : fShape[2] ;
283 return 0;
284 }
285 size_t GetWSize() const {
286 if (fNDim == 2) return fShape[1];
287 if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[1] : fShape[2] ;
288 if (fNDim == 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[3] : fShape[3] ;
289 return 0;
290 }
291
292 // for backward compatibility (assume column-major
293 // for backward compatibility : for CM tensor (n1,n2,n3,n4) -> ( n1*n2*n3, n4)
294 // for RM tensor (n1,n2,n3,n4) -> ( n2*n3*n4, n1 ) ???
295 size_t GetNrows() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fShape.front();}
296 size_t GetNcols() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fStrides.front(); }
297
298
299 // Matrix conversion for tensors of shape 2
301 if (fNDim == 2 || (fNDim == 3 && GetFirstSize() == 1))
303
304
305 // remember TCudaMatrix is always column-major
306 //case of N,M,1,1,..
307 bool caseNM11 = true;
308 for (size_t i = 2; i < fNDim; ++i) caseNM11 &= fShape[i] == 1;
309 if (caseNM11) {
310 return (GetLayout() == MemoryLayout::ColumnMajor ) ?
313 }
314 bool case11NM = true;
315 for (size_t i = 0; i < fNDim-2; ++i) case11NM &= fShape[i] == 1;
316 if (case11NM) {
317 return (GetLayout() == MemoryLayout::ColumnMajor ) ?
320 }
321
322 assert(false);
323 return TCudaMatrix<AFloat>();
324 }
325
326
327
328 static inline std::vector<std::size_t> ComputeStridesFromShape(const std::vector<std::size_t> &shape,
329 bool rowmajorLayout);
330
331 void ReshapeInPlace(const Shape_t & newShape) {
332 fShape = newShape;
333 fStrides = ComputeStridesFromShape(fShape, fMemoryLayout == MemoryLayout::RowMajor);
334 fNDim = fShape.size();
335 // in principle reshape should not change tensor size
336 size_t newSize = (fMemoryLayout == MemoryLayout::RowMajor) ? fStrides.front() * fShape.front() : fStrides.back() * fShape.back();
337 R__ASSERT(newSize <= fSize);
338 fSize = newSize;
339 // reset the descritor for Cudnn
341 }
342
343 TCudaTensor<AFloat> Reshape(const Shape_t & newShape) const {
344 TCudaTensor<AFloat> tmp(*this);
345 // have a new descriptor for reshaped tensor !!!
346#ifdef R__HAS_CUDNN
347 tmp.fTensorDescriptor.reset( new TensorDescriptor() );
348 // t.b.d. need to check if we delete the cudnn object
349 CUDNNCHECK(cudnnCreateTensorDescriptor(&(tmp.fTensorDescriptor->fCudnnDesc)));
350#endif
351 tmp.ReshapeInPlace(newShape);
352 return tmp;
353 }
354
356
357 // return slice of tensor
358 // return slices in the first dimension (if row wise) or last dimension if colun wise
359 // so single event slides
360 TCudaTensor<AFloat> At(size_t i) const {
361 Shape_t sliced_shape = (GetLayout() == MemoryLayout::RowMajor)
362 ? Shape_t(fShape.begin() + 1, fShape.end()) :
363 Shape_t(fShape.begin(), fShape.end() - 1);
364
365
366 size_t buffsize = (GetLayout() == MemoryLayout::RowMajor) ?
367 fStrides.front() : fStrides.back();
368
369 size_t offset = i * buffsize;
370
371 return TCudaTensor<AFloat>((const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(offset, buffsize), sliced_shape, GetLayout());
372 }
373
374
375 // element access ( for debugging)
376 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const
377 {
378 // like this works also for multi-dim tensors
379 // and consider the tensor as a multidim one
380 size_t nrows = GetNrows();
381 size_t ncols = GetNcols();
382
383 size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
384 i * ncols + j : j * nrows + i;
385
386 AFloat * elementPointer = fElementBuffer + offset;
387 return TCudaDeviceReference<AFloat>(elementPointer);
388 }
389 // element access ( for debugging)
390 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k) const
391 {
392 // k is B, i is C, j is HW :
393 assert( fNDim >= 3); // || ( k==0 && fNDim == 2 ) );
394 //note for larger dimension k is all other dims collapsed !!!
395
396 size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
397 i * fStrides[0] + j * fStrides[1] + k :
398 i * fStrides[2] + k * fStrides[1] + j;
399
400 AFloat * elementPointer = fElementBuffer + offset;
401
402 return TCudaDeviceReference<AFloat>(elementPointer);
403 }
404
405 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k, size_t l) const
406 {
407 // for rowsise
408 //assert(GetLayout() == MemoryLayout::RowMajor);
409 assert( fNDim == 4); // || ( k==0 && fNDim == 2 ) );
410
411 size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
412 i * fStrides[0] + j * fStrides[1] + k * fStrides[2] + l:
413 l * fStrides[3] + k * fStrides[2] + j * fStrides[1] + i;
414
415 AFloat * elementPointer = fElementBuffer + offset;
416
417 return TCudaDeviceReference<AFloat>(elementPointer);
418 }
419
420
421
422
423private:
424
425 /** Initializes all shared devices resource and makes sure that a sufficient
426 * number of curand states are allocated on the device and initialized as
427 * well as that the one-vector for the summation over columns has the right
428 * size. */
431
432};
433
434
435
436
437} // namespace DNN
438} // namespace TMVA
439
440#endif
#define R__ASSERT(e)
Definition: TError.h:96
char name[80]
Definition: TGX11.cxx:109
TCudaDeviceBuffer.
Definition: CudaBuffers.h:100
TCudaDeviceReference.
Definition: CudaMatrix.h:62
TCudaHostBuffer.
Definition: CudaBuffers.h:43
void SetConstVal(const AFloat constVal)
Sets the entire buffer to a constant value.
Definition: CudaBuffers.cxx:70
TCudaMatrix Class.
Definition: CudaMatrix.h:106
TCudaTensor Class.
Definition: CudaTensor.h:84
TCudaTensor< AFloat > At(size_t i) const
Definition: CudaTensor.h:360
const AFloat * GetDataPointerAt(size_t i) const
Definition: CudaTensor.h:195
const Shape_t & GetShape() const
Definition: CudaTensor.h:184
TCudaTensor(const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
size_t GetWSize() const
Definition: CudaTensor.h:285
AFloat * GetDataPointer()
Definition: CudaTensor.h:191
std::vector< size_t > Shape_t
Definition: CudaTensor.h:87
const AFloat * GetData() const
Definition: CudaTensor.h:192
size_t GetDimAt(size_t i) const
Definition: CudaTensor.h:186
static std::vector< int > fInstances
For each GPU device keep the CUDA streams in which tensors are used.
Definition: CudaTensor.h:111
Shape_t fStrides
Strides between tensor dimensions (always assume dense, non overlapping tensor)
Definition: CudaTensor.h:117
int fDevice
Device associated with current tensor instance.
Definition: CudaTensor.h:120
bool isEqual(TCudaTensor< AFloat > &other)
Definition: CudaTensor.h:216
TCudaTensor & operator=(TCudaTensor &&)=default
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Definition: CudaTensor.h:376
TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition: CudaTensor.h:154
size_t GetNrows() const
Definition: CudaTensor.h:295
size_t fNDim
Dimension of the tensor (first dimension is the batch size, second is the no. channels)
Definition: CudaTensor.h:118
TCudaTensor & operator=(const TCudaTensor &)=default
cudaStream_t GetComputeStream() const
Definition: CudaTensor.h:209
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
MemoryLayout GetLayout() const
Definition: CudaTensor.h:182
TCudaTensor(const TCudaMatrix< AFloat > &m, size_t dim=2)
TCudaDeviceBuffer< AFloat > & GetDeviceBuffer()
Definition: CudaTensor.h:201
size_t GetNcols() const
Definition: CudaTensor.h:296
TCudaTensor(TCudaTensor &&)=default
TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition: CudaTensor.h:149
TCudaTensor(const AFloat *data, const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
TCudaTensor(const TCudaTensor &)=default
Shape_t fShape
The shape vector (size of dimensions) needs to be ordered as no.
Definition: CudaTensor.h:116
bool isEqual(const AFloat *hostBufferOther, size_t otherSize)
Definition: CudaTensor.h:234
AFloat * GetDataPointerAt(size_t i)
Definition: CudaTensor.h:197
size_t GetCSize() const
Definition: CudaTensor.h:275
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, size_t n, size_t m)
Definition: CudaTensor.h:169
void PrintShape(const char *name="Tensor") const
TCudaTensor< AFloat > Reshape(const Shape_t &newShape) const
Definition: CudaTensor.h:343
size_t fSize
No. of elements.
Definition: CudaTensor.h:119
TCudaMatrix< AFloat > GetMatrix() const
Definition: CudaTensor.h:300
TCudaTensor(size_t n, size_t m, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition: CudaTensor.h:162
size_t GetNDim() const
Definition: CudaTensor.h:187
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
const AFloat * GetDataPointer() const
Definition: CudaTensor.h:190
const Shape_t & GetStrides() const
Definition: CudaTensor.h:185
void ReshapeInPlace(const Shape_t &newShape)
Definition: CudaTensor.h:331
size_t GetHSize() const
Definition: CudaTensor.h:279
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaTensor.h:124
size_t GetFirstStride() const
Definition: CudaTensor.h:272
const TCudaDeviceBuffer< AFloat > & GetDeviceBuffer() const
Definition: CudaTensor.h:200
MemoryLayout fMemoryLayout
Definition: CudaTensor.h:126
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k, size_t l) const
Definition: CudaTensor.h:405
void SetComputeStream(cudaStream_t stream)
Definition: CudaTensor.h:212
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k) const
Definition: CudaTensor.h:390
size_t GetFirstSize() const
Definition: CudaTensor.h:270
void SetConstVal(const AFloat constVal)
Definition: CudaTensor.h:257
size_t GetSize() const
Definition: CudaTensor.h:188
int fStreamIndx
Cuda stream associated with current instance.
Definition: CudaTensor.h:121
void Print(const char *name="Tensor", bool truncate=false) const
static std::vector< std::size_t > ComputeStridesFromShape(const std::vector< std::size_t > &shape, bool rowmajorLayout)
std::shared_ptr< TensorDescriptor > fTensorDescriptor
Definition: CudaTensor.h:123
TMatrixT.
Definition: TMatrixT.h:39
TLine * line
const Int_t n
Definition: legend1.C:16
BinData::ErrorType GetDataType(const TGraph *gr, DataOptions &fitOpt)
MemoryLayout
Memory layout type (copy from RTensor.hxx)
Definition: CudaTensor.h:47
create variable transformations
Definition: file.py:1
auto * m
Definition: textangle.C:8
auto * l
Definition: textangle.C:4