Logo ROOT   6.08/07
Reference Guide
CudaMatrix.h
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 13/07/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ///////////////////////////////////////////////////////////////////////
13 // Contains the TCudaMatrix class for the representation of matrices //
14 // on CUDA devices as well as the TCudaDeviceReference class which //
15 // is a helper class to emulate lvalue references to floating point //
16 // values on the device. //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21 
22 #include "cuda.h"
23 #include "cuda_runtime.h"
24 #include "cublas_v2.h"
25 #include "curand_kernel.h"
26 
27 #include "TMatrixT.h"
28 #include "CudaBuffers.h"
29 
30 #define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
31 
32 namespace TMVA {
33 namespace DNN {
34 
35 /** Function to check cuda return code. Taken from
36  * http://stackoverflow.com/questions/14038589/
37  */
38 inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
39 
40 //____________________________________________________________________________
41 //
42 // Cuda Device Reference
43 //____________________________________________________________________________
44 
45 /** TCudaDeviceReference
46  *
47  * Helper class emulating lvalue references for AFloat values that are
48  * physically on the device. Allows for example to assign to matrix elements.
49  * Note that device access through CudaDeviceReferences enforces synchronization
50  * with all streams and thus qualifies as performance killer. Only used for
51  * testing.
52  */
53 template<typename AFloat>
55 {
56 private:
57 
58  AFloat * fDevicePointer;
59 
60 public:
61 
62  TCudaDeviceReference(AFloat * devicePointer);
63 
64  operator AFloat();
65 
66  void operator=(const TCudaDeviceReference &other);
67  void operator=(AFloat value);
68  void operator+=(AFloat value);
69  void operator-=(AFloat value);
70 };
71 
72 //____________________________________________________________________________
73 //
74 // Cuda Matrix
75 //____________________________________________________________________________
76 
77 /** TCudaMatrix Class
78  *
79  * The TCudaMatrix class represents matrices on a CUDA device. The elements
80  * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
81  * the allocation and freeing of the device memory. TCudaMatrices are lightweight
82  * object, that means on assignment and copy creation only a shallow copy is
83  * performed and no new element buffer allocated. To perform a deep copy use
84  * the static Copy method of the TCuda architecture class.
85  *
86  * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
87  * transferred to the device. This stream can be accessed through the
88  * GetComputeStream member function and used to synchronize computations.
89  *
90  * The TCudaMatrix class also holds static references to CUDA resources.
91  * Those are the cublas handle, a buffer of curand states for the generation
92  * of random numbers as well as a vector containing ones, which is used for
93  * summing column matrices using matrix-vector multiplication. The class also
94  * has a static buffer for returning results from the device.
95  *
96  */
97 template<typename AFloat>
99 {
100 public:
101 
102 private:
103 
104  static size_t fInstances; ///< Current number of matrix instances.
105  static cublasHandle_t fCublasHandle;
106  static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
107  static AFloat * fOnes; ///< Vector used for summations of columns.
108  static size_t fNOnes; ///< Current length of the one vector.
109  static curandState_t * fCurandStates;
110  static size_t fNCurandStates;
111 
112  size_t fNRows;
113  size_t fNCols;
115 
116 public:
117 
118  static AFloat * GetOnes() {return fOnes;}
119 
120  TCudaMatrix();
121  TCudaMatrix(size_t i, size_t j);
123  TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
124 
125  TCudaMatrix(const TCudaMatrix &) = default;
126  TCudaMatrix( TCudaMatrix &&) = default;
127  TCudaMatrix & operator=(const TCudaMatrix &) = default;
128  TCudaMatrix & operator=( TCudaMatrix &&) = default;
129  ~TCudaMatrix() = default;
130 
131  /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
132  operator TMatrixT<Double_t>() const;
133 
134  inline cudaStream_t GetComputeStream() const;
135  inline void SetComputeStream(cudaStream_t stream);
136  /** Set the return buffer on the device to the specified value. This is
137  * required for example for reductions in order to initialize the
138  * accumulator. */
139  inline static void ResetDeviceReturn(AFloat value = 0.0);
140  /** Transfer the value in the device return buffer to the host. This
141  * tranfer is synchronous */
142  inline static AFloat GetDeviceReturn();
143  /** Return device pointer to the device return buffer */
144  inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
145  inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
146 
147  /** Blocking synchronization with the associated compute stream, if it's
148  * not the default stream. */
149  inline void Synchronize(const TCudaMatrix &) const;
150 
151  size_t GetNrows() const {return fNRows;}
152  size_t GetNcols() const {return fNCols;}
153  size_t GetNoElements() const {return fNRows * fNCols;}
154  const AFloat * GetDataPointer() const {return fElementBuffer;}
155  AFloat * GetDataPointer() {return fElementBuffer;}
156  const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
157 
158  /** Access to elements of device matrices provided through TCudaDeviceReference
159  * class. Note that access is synchronous end enforces device synchronization
160  * on all streams. Only used for testing. */
161  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
162 
163 private:
164 
165  /** Initializes all shared devices resource and makes sure that a sufficient
166  * number of curand states are allocated on the device and initialized as
167  * well as that the one-vector for the summation over columns has the right
168  * size. */
169  void InitializeCuda();
170  void InitializeCurandStates();
171 
172 };
173 
174 //
175 // Inline Functions.
176 //______________________________________________________________________________
177 inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
178 {
179  if (code != cudaSuccess)
180  {
181  fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
182  if (abort) exit(code);
183  }
184 }
185 
186 //______________________________________________________________________________
187 template<typename AFloat>
189  : fDevicePointer(devicePointer)
190 {
191  // Nothing to do here.
192 }
193 
194 //______________________________________________________________________________
195 template<typename AFloat>
197 {
198  AFloat buffer;
199  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
200  cudaMemcpyDeviceToHost);
201  return buffer;
202 }
203 
204 //______________________________________________________________________________
205 template<typename AFloat>
207 {
208  cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
209  cudaMemcpyDeviceToDevice);
210 }
211 
212 //______________________________________________________________________________
213 template<typename AFloat>
215 {
216  AFloat buffer = value;
217  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
218  cudaMemcpyHostToDevice);
219 }
220 
221 //______________________________________________________________________________
222 template<typename AFloat>
224 {
225  AFloat buffer;
226  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
227  cudaMemcpyDeviceToHost);
228  buffer += value;
229  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
230  cudaMemcpyHostToDevice);
231 }
232 
233 //______________________________________________________________________________
234 template<typename AFloat>
236 {
237  AFloat buffer;
238  cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
239  cudaMemcpyDeviceToHost);
240  buffer -= value;
241  cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
242  cudaMemcpyHostToDevice);
243 }
244 
245 //______________________________________________________________________________
246 template<typename AFloat>
247 inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
248 {
249  return fElementBuffer.GetComputeStream();
250 }
251 
252 //______________________________________________________________________________
253 template<typename AFloat>
254 inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
255 {
256  return fElementBuffer.SetComputeStream(stream);
257 }
258 
259 //______________________________________________________________________________
260 template<typename AFloat>
262 {
263  cudaEvent_t event;
264  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
265  cudaEventRecord(event, A.GetComputeStream());
266  cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
267  cudaEventDestroy(event);
268 }
269 
270 //______________________________________________________________________________
271 template<typename AFloat>
272 inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
273 {
274  AFloat buffer = value;
275  cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
276 }
277 
278 //______________________________________________________________________________
279 template<typename AFloat>
281 {
282  AFloat buffer;
283  cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
284  return buffer;
285 }
286 
287 //______________________________________________________________________________
288 template<typename AFloat>
290 {
291  AFloat * elementPointer = fElementBuffer;
292  elementPointer += j * fNRows + i;
293  return TCudaDeviceReference<AFloat>(elementPointer);
294 }
295 
296 } // namespace DNN
297 } // namespace TMVA
298 
299 #endif
static curandState_t * GetCurandStatesPointer()
Definition: CudaMatrix.h:145
static cublasHandle_t fCublasHandle
Definition: CudaMatrix.h:105
TCudaDeviceReference.
Definition: CudaMatrix.h:54
void operator-=(AFloat value)
Definition: CudaMatrix.h:235
TLine * line
TCudaDeviceBuffer.
Definition: CudaBuffers.h:27
void SetComputeStream(cudaStream_t stream)
Definition: CudaMatrix.h:254
static double A[]
static curandState_t * fCurandStates
Definition: CudaMatrix.h:109
size_t GetNoElements() const
Definition: CudaMatrix.h:153
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition: CudaMatrix.h:177
size_t GetNcols() const
Definition: CudaMatrix.h:152
static AFloat * fOnes
Vector used for summations of columns.
Definition: CudaMatrix.h:107
AFloat * GetDataPointer()
Definition: CudaMatrix.h:155
size_t GetNrows() const
Definition: CudaMatrix.h:151
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaMatrix.h:114
static size_t fNCurandStates
Definition: CudaMatrix.h:110
TMarker * m
Definition: textangle.C:8
static size_t fInstances
Current number of matrix instances.
Definition: CudaMatrix.h:104
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it&#39;s not the default stream...
Definition: CudaMatrix.h:261
void operator=(const TCudaDeviceReference &other)
Definition: CudaMatrix.h:206
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition: CudaMatrix.h:289
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition: CudaMatrix.h:106
TCudaDeviceReference(AFloat *devicePointer)
Definition: CudaMatrix.h:188
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition: CudaMatrix.h:280
const cublasHandle_t & GetCublasHandle() const
Definition: CudaMatrix.h:156
Abstract ClassifierFactory template that handles arbitrary types.
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition: CudaMatrix.h:144
Definition: file.py:1
static AFloat * GetOnes()
Definition: CudaMatrix.h:118
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition: CudaMatrix.h:272
const AFloat * GetDataPointer() const
Definition: CudaMatrix.h:154
const Int_t n
Definition: legend1.C:16
TCudaMatrix Class.
Definition: CudaMatrix.h:98
cudaStream_t GetComputeStream() const
Definition: CudaMatrix.h:247
static size_t fNOnes
Current length of the one vector.
Definition: CudaMatrix.h:108
void operator+=(AFloat value)
Definition: CudaMatrix.h:223