Logo ROOT  
Reference Guide
CudaMatrix.h
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 13/07/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ///////////////////////////////////////////////////////////////////////
13 // Contains the TCudaMatrix class for the representation of matrices //
14 // on CUDA devices as well as the TCudaDeviceReference class which //
15 // is a helper class to emulate lvalue references to floating point //
16 // values on the device. //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21 
22 // in case we compile C++ code with std-17 and cuda with lower standard
23 // use experimental string_view, otherwise keep as is
24 #include "RConfigure.h"
25 #ifdef R__HAS_STD_STRING_VIEW
26 #ifndef R__CUDA_HAS_STD_STRING_VIEW
27 #undef R__HAS_STD_STRING_VIEW
28 #define R__HAS_STD_EXPERIMENTAL_STRING_VIEW
29 #endif
30 #endif
31 
32 #include "cuda.h"
33 #include "cuda_runtime.h"
34 #include "cublas_v2.h"
35 #include "curand_kernel.h"
36 
37 #include "TMatrixT.h"
38 #include "CudaBuffers.h"
39 
40 #define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
41 
42 namespace TMVA {
43 namespace DNN {
44 
45 /** Function to check cuda return code. Taken from
46  * http://stackoverflow.com/questions/14038589/
47  */
48 inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
49 
50 //____________________________________________________________________________
51 //
52 // Cuda Device Reference
53 //____________________________________________________________________________
54 
55 /** TCudaDeviceReference
56  *
57  * Helper class emulating lvalue references for AFloat values that are
58  * physically on the device. Allows for example to assign to matrix elements.
59  * Note that device access through CudaDeviceReferences enforces synchronization
60  * with all streams and thus qualifies as performance killer. Only used for
61  * testing.
62  */
63 template<typename AFloat>
65 {
66 private:
67 
68  AFloat * fDevicePointer;
69 
70 public:
71 
72  TCudaDeviceReference(AFloat * devicePointer);
73 
74  operator AFloat();
75 
76  void operator=(const TCudaDeviceReference &other);
77  void operator=(AFloat value);
78  void operator+=(AFloat value);
79  void operator-=(AFloat value);
80 };
81 
82 //____________________________________________________________________________
83 //
84 // Cuda Matrix
85 //____________________________________________________________________________
86 
87 /** TCudaMatrix Class
88  *
89  * The TCudaMatrix class represents matrices on a CUDA device. The elements
90  * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
91  * the allocation and freeing of the device memory. TCudaMatrices are lightweight
92  * object, that means on assignment and copy creation only a shallow copy is
93  * performed and no new element buffer allocated. To perform a deep copy use
94  * the static Copy method of the TCuda architecture class.
95  *
96  * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
97  * transferred to the device. This stream can be accessed through the
98  * GetComputeStream member function and used to synchronize computations.
99  *
100  * The TCudaMatrix class also holds static references to CUDA resources.
101  * Those are the cublas handle, a buffer of curand states for the generation
102  * of random numbers as well as a vector containing ones, which is used for
103  * summing column matrices using matrix-vector multiplication. The class also
104  * has a static buffer for returning results from the device.
105  *
106  */
107 template<typename AFloat>
109 {
110 public:
111 
112 private:
113 
114  static size_t fInstances; ///< Current number of matrix instances.
115  static cublasHandle_t fCublasHandle;
116  static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
117  static AFloat * fOnes; ///< Vector used for summations of columns.
118  static size_t fNOnes; ///< Current length of the one vector.
119  static curandState_t * fCurandStates;
120  static size_t fNCurandStates;
121 
122 
123  size_t fNRows;
124  size_t fNCols;
126 
127 public:
128 
130 
131  static AFloat * GetOnes() {return fOnes;}
132 
134  TCudaMatrix(size_t i, size_t j);
136  TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
137 
138  TCudaMatrix(const TCudaMatrix &) = default;
139  TCudaMatrix( TCudaMatrix &&) = default;
140  TCudaMatrix & operator=(const TCudaMatrix &) = default;
142  ~TCudaMatrix() = default;
143 
144  /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
145  operator TMatrixT<AFloat>() const;
146 
147  inline cudaStream_t GetComputeStream() const;
148  inline void SetComputeStream(cudaStream_t stream);
149  /** Set the return buffer on the device to the specified value. This is
150  * required for example for reductions in order to initialize the
151  * accumulator. */
152  inline static void ResetDeviceReturn(AFloat value = 0.0);
153  /** Transfer the value in the device return buffer to the host. This
154  * tranfer is synchronous */
155  inline static AFloat GetDeviceReturn();
156  /** Return device pointer to the device return buffer */
157  inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
158  inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
159 
160  /** Blocking synchronization with the associated compute stream, if it's
161  * not the default stream. */
162  inline void Synchronize(const TCudaMatrix &) const;
163 
164  static size_t GetNDim() {return 2;}
165  size_t GetNrows() const {return fNRows;}
166  size_t GetNcols() const {return fNCols;}
167  size_t GetNoElements() const {return fNRows * fNCols;}
168 
169  const AFloat * GetDataPointer() const {return fElementBuffer;}
170  AFloat * GetDataPointer() {return fElementBuffer;}
171  const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
172 
174 
175  /** Access to elements of device matrices provided through TCudaDeviceReference
176  * class. Note that access is synchronous end enforces device synchronization
177  * on all streams. Only used for testing. */
178  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
179 
180  void Print() const {
181  TMatrixT<AFloat> mat(*this);
182  mat.Print();
183  }
184 
185  void Zero() {
186  cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
187  }
188 
189 
190 private:
191 
192  /** Initializes all shared devices resource and makes sure that a sufficient
193  * number of curand states are allocated on the device and initialized as
194  * well as that the one-vector for the summation over columns has the right
195  * size. */
198 
199 };
200 
201 //
202 // Inline Functions.
203 //______________________________________________________________________________
204 inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
205 {
206  if (code != cudaSuccess)
207  {
208  fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
209  if (abort) exit(code);
210  }
211 }
212 
213 //______________________________________________________________________________
214 template<typename AFloat>
216  : fDevicePointer(devicePointer)
217 {
218  // Nothing to do here.
219 }
220 
221 //______________________________________________________________________________
222 template<typename AFloat>
224 {
225  AFloat buffer;
226  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
227  cudaMemcpyDeviceToHost);
228  return buffer;
229 }
230 
231 //______________________________________________________________________________
232 template<typename AFloat>
234 {
235  cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
236  cudaMemcpyDeviceToDevice);
237 }
238 
239 //______________________________________________________________________________
240 template<typename AFloat>
242 {
243  AFloat buffer = value;
244  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
245  cudaMemcpyHostToDevice);
246 }
247 
248 //______________________________________________________________________________
249 template<typename AFloat>
251 {
252  AFloat buffer;
253  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
254  cudaMemcpyDeviceToHost);
255  buffer += value;
256  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
257  cudaMemcpyHostToDevice);
258 }
259 
260 //______________________________________________________________________________
261 template<typename AFloat>
263 {
264  AFloat buffer;
265  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
266  cudaMemcpyDeviceToHost);
267  buffer -= value;
268  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
269  cudaMemcpyHostToDevice);
270 }
271 
272 //______________________________________________________________________________
273 template<typename AFloat>
274 inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
275 {
276  return fElementBuffer.GetComputeStream();
277 }
278 
279 //______________________________________________________________________________
280 template<typename AFloat>
281 inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
282 {
283  return fElementBuffer.SetComputeStream(stream);
284 }
285 
286 //______________________________________________________________________________
287 template<typename AFloat>
289 {
290  cudaEvent_t event;
291  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
292  cudaEventRecord(event, A.GetComputeStream());
293  cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
294  cudaEventDestroy(event);
295 }
296 
297 //______________________________________________________________________________
298 template<typename AFloat>
299 inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
300 {
301  AFloat buffer = value;
302  cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
303 }
304 
305 //______________________________________________________________________________
306 template<typename AFloat>
308 {
309  AFloat buffer;
310  cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
311  return buffer;
312 }
313 
314 //______________________________________________________________________________
315 template<typename AFloat>
317 {
318  AFloat * elementPointer = fElementBuffer;
319  elementPointer += j * fNRows + i;
320  return TCudaDeviceReference<AFloat>(elementPointer);
321 }
322 
323 } // namespace DNN
324 } // namespace TMVA
325 
326 #endif
TMVA::DNN::TCudaMatrix::GetNoElements
size_t GetNoElements() const
Definition: CudaMatrix.h:167
m
auto * m
Definition: textangle.C:8
TMVA::DNN::TCudaDeviceBuffer
TCudaDeviceBuffer.
Definition: CudaBuffers.h:100
n
const Int_t n
Definition: legend1.C:16
TMVA::DNN::TCudaMatrix::GetDataPointer
AFloat * GetDataPointer()
Definition: CudaMatrix.h:170
TMVA::DNN::TCudaDeviceReference::operator+=
void operator+=(AFloat value)
Definition: CudaMatrix.h:250
TMVA::DNN::TCudaMatrix::GetNDim
static size_t GetNDim()
Definition: CudaMatrix.h:164
TMVA::DNN::TCudaMatrix::fCublasHandle
static cublasHandle_t fCublasHandle
Definition: CudaMatrix.h:115
TMVA::DNN::TCudaMatrix::InitializeCuda
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
TMVA::DNN::TCudaMatrix::GetCublasHandle
const cublasHandle_t & GetCublasHandle() const
Definition: CudaMatrix.h:171
TMVA::DNN::TCudaMatrix::GetDataPointer
const AFloat * GetDataPointer() const
Definition: CudaMatrix.h:169
TMVA::DNN::TCudaDeviceReference::fDevicePointer
AFloat * fDevicePointer
Definition: CudaMatrix.h:68
TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(TCudaDeviceBuffer< AFloat > buffer, size_t m, size_t n)
TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(size_t i, size_t j)
TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(const TMatrixT< AFloat > &)
TMVA::DNN::TCudaMatrix::~TCudaMatrix
~TCudaMatrix()=default
TMVA::DNN::TCudaMatrix::GetDeviceReturnPointer
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition: CudaMatrix.h:157
TMVA::DNN::TCudaMatrix::fInstances
static size_t fInstances
Current number of matrix instances.
Definition: CudaMatrix.h:114
TMVA::DNN::TCudaMatrix::GetNrows
size_t GetNrows() const
Definition: CudaMatrix.h:165
TMVA::DNN::TCudaMatrix::fCurandStates
static curandState_t * fCurandStates
Definition: CudaMatrix.h:119
TMVA::DNN::TCudaMatrix::operator=
TCudaMatrix & operator=(TCudaMatrix &&)=default
ROOT::Math::Cephes::A
static double A[]
Definition: SpecFuncCephes.cxx:170
TMatrixT.h
TMVA::DNN::TCudaMatrix::fOnes
static AFloat * fOnes
Vector used for summations of columns.
Definition: CudaMatrix.h:117
TMatrixT
TMatrixT.
Definition: TMatrixT.h:39
TMVA::DNN::TCudaMatrix::GetDeviceBuffer
TCudaDeviceBuffer< AFloat > GetDeviceBuffer() const
Definition: CudaMatrix.h:173
bool
TMVA::DNN::TCudaMatrix::fNCols
size_t fNCols
Definition: CudaMatrix.h:124
TMVA::DNN::TCudaMatrix::GetCurandStatesPointer
static curandState_t * GetCurandStatesPointer()
Definition: CudaMatrix.h:158
TMVA::DNN::TCudaMatrix::SetComputeStream
void SetComputeStream(cudaStream_t stream)
Definition: CudaMatrix.h:281
TMVA::DNN::TCudaMatrix::GetComputeStream
cudaStream_t GetComputeStream() const
Definition: CudaMatrix.h:274
TMVA::DNN::TCudaMatrix::Synchronize
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
Definition: CudaMatrix.h:288
TMVA::DNN::TCudaMatrix::fDeviceReturn
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition: CudaMatrix.h:116
TMVA::DNN::TCudaMatrix::operator()
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition: CudaMatrix.h:316
TMVA::DNN::TCudaMatrix::operator=
TCudaMatrix & operator=(const TCudaMatrix &)=default
TMVA::DNN::TCudaDeviceReference::operator=
void operator=(const TCudaDeviceReference &other)
Definition: CudaMatrix.h:233
TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(TCudaMatrix &&)=default
TMVA::DNN::TCudaMatrix::Zero
void Zero()
Definition: CudaMatrix.h:185
TMVA::DNN::TCudaMatrix::GetNcols
size_t GetNcols() const
Definition: CudaMatrix.h:166
line
TLine * line
Definition: entrylistblock_figure1.C:235
TMVA::DNN::TCudaDeviceReference::TCudaDeviceReference
TCudaDeviceReference(AFloat *devicePointer)
Definition: CudaMatrix.h:215
TMVA::DNN::TCudaMatrix::ResetDeviceReturn
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition: CudaMatrix.h:299
TMVA::DNN::TCudaMatrix::gInitializeCurand
static Bool_t gInitializeCurand
Definition: CudaMatrix.h:129
TMVA::DNN::TCudaMatrix::fNCurandStates
static size_t fNCurandStates
Definition: CudaMatrix.h:120
TMVA::DNN::TCudaMatrix::GetDeviceReturn
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition: CudaMatrix.h:307
TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix()
TMVA::DNN::cudaError
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition: CudaMatrix.h:204
file
Definition: file.py:1
TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(const TCudaMatrix &)=default
TMVA::DNN::TCudaMatrix::fNOnes
static size_t fNOnes
Current length of the one vector.
Definition: CudaMatrix.h:118
TMVA::DNN::TCudaMatrix::GetOnes
static AFloat * GetOnes()
Definition: CudaMatrix.h:131
TMVA::DNN::TCudaMatrix
TCudaMatrix Class.
Definition: CudaMatrix.h:109
TMVA::DNN::TCudaDeviceReference
TCudaDeviceReference.
Definition: CudaMatrix.h:65
TMVA::DNN::TCudaMatrix::fElementBuffer
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaMatrix.h:125
TMVA::DNN::TCudaMatrix::InitializeCurandStates
void InitializeCurandStates()
TMVA::DNN::TCudaMatrix::fNRows
size_t fNRows
Definition: CudaMatrix.h:123
TMVA::DNN::TCudaDeviceReference::operator-=
void operator-=(AFloat value)
Definition: CudaMatrix.h:262
TMVA
create variable transformations
Definition: GeneticMinimizer.h:22
CudaBuffers.h
TMVA::DNN::TCudaMatrix::Print
void Print() const
Definition: CudaMatrix.h:180