CudaMatrix.h
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 13/07/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh                               *
6  * All rights reserved.                                                 *
7  *                                                                      *
8  * For the licensing terms see $ROOTSYS/LICENSE.                        *
9  * For the list of contributors see $ROOTSYS/README/CREDITS.            *
10  *************************************************************************/
11 
12 ///////////////////////////////////////////////////////////////////////
13 // Contains the TCudaMatrix class for the representation of matrices //
14 // on CUDA devices as well as the TCudaDeviceReference class which   //
15 // is a helper class to emulate lvalue references to floating point  //
16 // values on the device.                                             //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21 
22 // in case the C++ code is compiled with C++17 and the CUDA code with a lower standard
23 #include "RConfigure.h"
24 #ifdef R__HAS_STD_STRING_VIEW
25 #undef R__HAS_STD_STRING_VIEW
26 #define R__HAS_STD_EXPERIMENTAL_STRING_VIEW
27 #endif
28 
29 #include "cuda.h"
30 #include "cuda_runtime.h"
31 #include "cublas_v2.h"
32 #include "curand_kernel.h"
33 
34 #include "TMatrixT.h"
35 #include "CudaBuffers.h"
36 
37 #define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
38 
39 namespace TMVA {
40 namespace DNN {
41 
42 /** Function to check CUDA return codes. Taken from
43  * http://stackoverflow.com/questions/14038589/
44  */
45 inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
46 
47 //____________________________________________________________________________
48 //
49 // Cuda Device Reference
50 //____________________________________________________________________________
51 
52 /** TCudaDeviceReference
53  *
54  * Helper class emulating lvalue references for AFloat values that are
55  * physically on the device. It allows, for example, assigning to individual matrix elements.
56  * Note that device access through a TCudaDeviceReference enforces synchronization
57  * with all streams and is therefore a performance killer. It is only used for
58  * testing; see the usage sketch below the class declaration.
59  */
60 template<typename AFloat>
61 class TCudaDeviceReference
62 {
63 private:
64 
65  AFloat * fDevicePointer;
66 
67 public:
68 
69  TCudaDeviceReference(AFloat * devicePointer);
70 
71  operator AFloat();
72 
73  void operator=(const TCudaDeviceReference &other);
74  void operator=(AFloat value);
75  void operator+=(AFloat value);
76  void operator-=(AFloat value);
77 };
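A minimal usage sketch (illustrative, not part of CudaMatrix.h; the matrix dimensions and values are arbitrary) of element access through TCudaDeviceReference, as returned by TCudaMatrix::operator(); every access copies a single value between host and device:

   TMVA::DNN::TCudaMatrix<float> M(2, 3);   // 2 x 3 matrix on the device
   M(0, 1) = 3.0f;      // operator=(AFloat): one host -> device copy
   M(0, 1) += 1.0f;     // operator+=: device -> host read, add, write back
   float x = M(0, 1);   // operator AFloat(): one device -> host copy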
78 
79 //____________________________________________________________________________
80 //
81 // Cuda Matrix
82 //____________________________________________________________________________
83 
84 /** TCudaMatrix Class
85  *
86  * The TCudaMatrix class represents matrices on a CUDA device. The elements
87  * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
88  * the allocation and freeing of the device memory. TCudaMatrix objects are
89  * lightweight: assignment and copy construction perform only a shallow copy,
90  * so no new element buffer is allocated. To perform a deep copy, use the
91  * static Copy method of the TCuda architecture class.
92  *
93  * The TCudaDeviceBuffer has an associated CUDA stream, on which the data is
94  * transferred to the device. This stream can be accessed through the
95  * GetComputeStream member function and used to synchronize computations.
96  *
97  * The TCudaMatrix class also holds static references to CUDA resources.
98  * Those are the cuBLAS handle, a buffer of curand states for the generation
99  * of random numbers, and a vector of ones, which is used for summing the
100  * columns of a matrix via matrix-vector multiplication. The class also
101  * has a static buffer for returning results from the device. A usage sketch follows the class declaration below.
102  *
103  */
104 template<typename AFloat>
105 class TCudaMatrix
106 {
107 public:
108 
109 private:
110 
111  static size_t fInstances; ///< Current number of matrix instances.
112  static cublasHandle_t fCublasHandle;
113  static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
114  static AFloat * fOnes; ///< Vector used for summations of columns.
115  static size_t fNOnes; ///< Current length of the one vector.
116  static curandState_t * fCurandStates;
117  static size_t fNCurandStates;
118 
119 
120  size_t fNRows;
121  size_t fNCols;
122  TCudaDeviceBuffer<AFloat> fElementBuffer;
123 
124 public:
125 
126  static Bool_t gInitializeCurand;
127 
128  static AFloat * GetOnes() {return fOnes;}
129 
130  TCudaMatrix();
131  TCudaMatrix(size_t i, size_t j);
132  TCudaMatrix(const TMatrixT<AFloat> &);
133  TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
134 
135  TCudaMatrix(const TCudaMatrix &) = default;
136  TCudaMatrix( TCudaMatrix &&) = default;
137  TCudaMatrix & operator=(const TCudaMatrix &) = default;
138  TCudaMatrix & operator=( TCudaMatrix &&) = default;
139  ~TCudaMatrix() = default;
140 
141  /** Convert the CUDA matrix to a ROOT TMatrix. Performs a synchronous data transfer. */
142  operator TMatrixT<AFloat>() const;
143 
144  inline cudaStream_t GetComputeStream() const;
145  inline void SetComputeStream(cudaStream_t stream);
146  /** Set the return buffer on the device to the specified value. This is
147  * required, for example, by reductions in order to initialize the
148  * accumulator. */
149  inline static void ResetDeviceReturn(AFloat value = 0.0);
150  /** Transfer the value in the device return buffer to the host. This
151  * transfer is synchronous. */
152  inline static AFloat GetDeviceReturn();
153  /** Return device pointer to the device return buffer */
154  inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
155  inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
156 
157  /** Blocking synchronization with the associated compute stream, if it's
158  * not the default stream. */
159  inline void Synchronize(const TCudaMatrix &) const;
160 
161  static size_t GetNDim() {return 2;}
162  size_t GetNrows() const {return fNRows;}
163  size_t GetNcols() const {return fNCols;}
164  size_t GetNoElements() const {return fNRows * fNCols;}
165 
166  const AFloat * GetDataPointer() const {return fElementBuffer;}
167  AFloat * GetDataPointer() {return fElementBuffer;}
168  const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
169 
170  TCudaDeviceBuffer<AFloat> GetDeviceBuffer() const {return fElementBuffer;}
171 
172  /** Access to elements of device matrices provided through TCudaDeviceReference
173  * class. Note that access is synchronous and enforces device synchronization
174  * on all streams. Only used for testing. */
175  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
176 
177  void Print() const {
178  TMatrixT<AFloat> mat(*this);
179  mat.Print();
180  }
181 
182  void Zero() {
183  cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
184  }
185 
186 
187 private:
188 
189  /** Initializes all shared device resources and makes sure that a sufficient
190  * number of curand states are allocated on the device and initialized, and
191  * that the one-vector used for the summation over columns has the right
192  * size. */
193  void InitializeCuda();
194  void InitializeCurandStates();
195 
196 };
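An illustrative sketch of the copy semantics described in the class comment (not part of CudaMatrix.h; the dimensions and values are arbitrary, and the deep copy via the TCuda architecture class is only indicated in a comment):

   TMatrixT<float> h(2, 2);                 // host-side ROOT matrix
   h(0, 0) = 1.0f;
   h(1, 1) = 2.0f;

   TMVA::DNN::TCudaMatrix<float> A(h);      // copies h to the device
   TMVA::DNN::TCudaMatrix<float> B(A);      // shallow copy: B shares A's element buffer
   TMVA::DNN::TCudaMatrix<float> C(2, 2);   // separate device allocation
   // Deep copy, per the class comment: TMVA::DNN::TCuda<float>::Copy(C, A);

   TMatrixT<float> back(A);                 // synchronous device -> host transfer
   back.Print();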
197 
198 //
199 // Inline Functions.
200 //______________________________________________________________________________
201 inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
202 {
203  if (code != cudaSuccess)
204  {
205  fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
206  if (abort) exit(code);
207  }
208 }
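A minimal sketch of how CUDACHECK and cudaError are meant to be used (illustrative, not part of CudaMatrix.h; AllocateScratch is a hypothetical helper). Because the macro expands to an unqualified call to cudaError, it is intended for code inside the TMVA::DNN namespace:

   namespace TMVA {
   namespace DNN {

   void AllocateScratch(float **devPtr, size_t n)
   {
      // Any cudaError_t-returning runtime call can be wrapped; on failure
      // cudaError() prints the message, file and line, and calls exit().
      CUDACHECK(cudaMalloc((void **) devPtr, n * sizeof(float)));
      CUDACHECK(cudaMemset(*devPtr, 0, n * sizeof(float)));
   }

   } // namespace DNN
   } // namespace TMVA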
209 
210 //______________________________________________________________________________
211 template<typename AFloat>
212 TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer)
213  : fDevicePointer(devicePointer)
214 {
215  // Nothing to do here.
216 }
217 
218 //______________________________________________________________________________
219 template<typename AFloat>
220 TCudaDeviceReference<AFloat>::operator AFloat()
221 {
222  AFloat buffer;
223  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
224  cudaMemcpyDeviceToHost);
225  return buffer;
226 }
227 
228 //______________________________________________________________________________
229 template<typename AFloat>
230 void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other)
231 {
232  cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
233  cudaMemcpyDeviceToDevice);
234 }
235 
236 //______________________________________________________________________________
237 template<typename AFloat>
238 void TCudaDeviceReference<AFloat>::operator=(AFloat value)
239 {
240  AFloat buffer = value;
241  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
242  cudaMemcpyHostToDevice);
243 }
244 
245 //______________________________________________________________________________
246 template<typename AFloat>
247 void TCudaDeviceReference<AFloat>::operator+=(AFloat value)
248 {
249  AFloat buffer;
250  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
251  cudaMemcpyDeviceToHost);
252  buffer += value;
253  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
254  cudaMemcpyHostToDevice);
255 }
256 
257 //______________________________________________________________________________
258 template<typename AFloat>
259 void TCudaDeviceReference<AFloat>::operator-=(AFloat value)
260 {
261  AFloat buffer;
262  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
263  cudaMemcpyDeviceToHost);
264  buffer -= value;
265  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
266  cudaMemcpyHostToDevice);
267 }
268 
269 //______________________________________________________________________________
270 template<typename AFloat>
271 inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
272 {
273  return fElementBuffer.GetComputeStream();
274 }
275 
276 //______________________________________________________________________________
277 template<typename AFloat>
278 inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
279 {
280  return fElementBuffer.SetComputeStream(stream);
281 }
282 
283 //______________________________________________________________________________
284 template<typename AFloat>
285 inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const
286 {
287  cudaEvent_t event;
288  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
289  cudaEventRecord(event, A.GetComputeStream());
290  cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
291  cudaEventDestroy(event);
292 }
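An illustrative sketch of how Synchronize is used (not part of CudaMatrix.h; the kernel is hypothetical): B.Synchronize(A) records an event on A's compute stream and makes B's stream wait for it, so work subsequently enqueued on B's stream sees the results of kernels already queued on A's stream.

   TMVA::DNN::TCudaMatrix<float> A(64, 64), B(64, 64);
   // ... kernels writing A have been launched on A.GetComputeStream() ...

   B.Synchronize(A);   // B's compute stream now waits on A's queued work
   // Hypothetical kernel reading A and writing B, launched on B's stream:
   //    myKernel<<<grid, block, 0, B.GetComputeStream()>>>(
   //       A.GetDataPointer(), B.GetDataPointer());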
293 
294 //______________________________________________________________________________
295 template<typename AFloat>
296 inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
297 {
298  AFloat buffer = value;
299  cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
300 }
301 
302 //______________________________________________________________________________
303 template<typename AFloat>
304 inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn()
305 {
306  AFloat buffer;
307  cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
308  return buffer;
309 }
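A sketch of the reduction pattern that ResetDeviceReturn and GetDeviceReturn support (illustrative, not part of CudaMatrix.h; SumKernel and SumElements are hypothetical): reset the device return buffer to the identity of the reduction, let a kernel accumulate into it, then fetch the result synchronously.

   __global__ void SumKernel(const float *data, int n, float *result)
   {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) atomicAdd(result, data[i]);   // accumulate into the return buffer
   }

   float SumElements(const TMVA::DNN::TCudaMatrix<float> &A)
   {
      using Matrix = TMVA::DNN::TCudaMatrix<float>;
      Matrix::ResetDeviceReturn(0.0f);                  // initialize the accumulator
      int n = static_cast<int>(A.GetNoElements());
      SumKernel<<<(n + 255) / 256, 256>>>(A.GetDataPointer(), n,
                                          Matrix::GetDeviceReturnPointer());
      return Matrix::GetDeviceReturn();                 // synchronous device -> host copy
   }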
310 
311 //______________________________________________________________________________
312 template<typename AFloat>
313 TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const
314 {
315  AFloat * elementPointer = fElementBuffer;
316  elementPointer += j * fNRows + i;
317  return TCudaDeviceReference<AFloat>(elementPointer);
318 }
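The offset computation above implies column-major storage, matching the cuBLAS convention: element (i, j) of a matrix with fNRows rows sits at offset j * fNRows + i in the element buffer. A short worked illustration for a 3 x 2 matrix (fNRows = 3):

   buffer offset:   0      1      2      3      4      5
   matrix element: (0,0)  (1,0)  (2,0)  (0,1)  (1,1)  (2,1)

so M(2, 1) resolves to fElementBuffer + 1 * 3 + 2 = fElementBuffer + 5.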
319 
320 } // namespace DNN
321 } // namespace TMVA
322 
323 #endif