Logo ROOT  
Reference Guide
CudaMatrix.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////////
13// Contains the TCudaMatrix class for the representation of matrices //
14// on CUDA devices as well as the TCudaDeviceReference class which //
15// is a helper class to emulate lvalue references to floating point //
16// values on the device. //
17///////////////////////////////////////////////////////////////////////
18
19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21
22// in case we compile C++ code with std-17 and cuda with lower standard
23#include "RConfigure.h"
24#ifdef R__HAS_STD_STRING_VIEW
25#undef R__HAS_STD_STRING_VIEW
26#define R__HAS_STD_EXPERIMENTAL_STRING_VIEW
27#endif
28
29#include "cuda.h"
30#include "cuda_runtime.h"
31#include "cublas_v2.h"
32#include "curand_kernel.h"
33
34#include "TMatrixT.h"
35#include "CudaBuffers.h"
36
37#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
38
39namespace TMVA {
40namespace DNN {
41
42/** Function to check cuda return code. Taken from
43 * http://stackoverflow.com/questions/14038589/
44 */
45inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
46
47//____________________________________________________________________________
48//
49// Cuda Device Reference
50//____________________________________________________________________________
51
52/** TCudaDeviceReference
53 *
54 * Helper class emulating lvalue references for AFloat values that are
55 * physically on the device. Allows for example to assign to matrix elements.
56 * Note that device access through CudaDeviceReferences enforces synchronization
57 * with all streams and thus qualifies as performance killer. Only used for
58 * testing.
59 */
60template<typename AFloat>
62{
63private:
64
66
67public:
68
69 TCudaDeviceReference(AFloat * devicePointer);
70
71 operator AFloat();
72
73 void operator=(const TCudaDeviceReference &other);
74 void operator=(AFloat value);
75 void operator+=(AFloat value);
76 void operator-=(AFloat value);
77};
78
79//____________________________________________________________________________
80//
81// Cuda Matrix
82//____________________________________________________________________________
83
84/** TCudaMatrix Class
85 *
86 * The TCudaMatrix class represents matrices on a CUDA device. The elements
87 * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
88 * the allocation and freeing of the device memory. TCudaMatrices are lightweight
89 * object, that means on assignment and copy creation only a shallow copy is
90 * performed and no new element buffer allocated. To perform a deep copy use
91 * the static Copy method of the TCuda architecture class.
92 *
93 * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
94 * transferred to the device. This stream can be accessed through the
95 * GetComputeStream member function and used to synchronize computations.
96 *
97 * The TCudaMatrix class also holds static references to CUDA resources.
98 * Those are the cublas handle, a buffer of curand states for the generation
99 * of random numbers as well as a vector containing ones, which is used for
100 * summing column matrices using matrix-vector multiplication. The class also
101 * has a static buffer for returning results from the device.
102 *
103 */
104template<typename AFloat>
106{
107public:
108
109private:
110
111 static size_t fInstances; ///< Current number of matrix instances.
112 static cublasHandle_t fCublasHandle;
113 static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
114 static AFloat * fOnes; ///< Vector used for summations of columns.
115 static size_t fNOnes; ///< Current length of the one vector.
116 static curandState_t * fCurandStates;
117 static size_t fNCurandStates;
118
119
120 size_t fNRows;
121 size_t fNCols;
123
124public:
125
127
128 static AFloat * GetOnes() {return fOnes;}
129
131 TCudaMatrix(size_t i, size_t j);
133 TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
134
135 TCudaMatrix(const TCudaMatrix &) = default;
136 TCudaMatrix( TCudaMatrix &&) = default;
137 TCudaMatrix & operator=(const TCudaMatrix &) = default;
139 ~TCudaMatrix() = default;
140
141 /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
142 operator TMatrixT<AFloat>() const;
143
144 inline cudaStream_t GetComputeStream() const;
145 inline void SetComputeStream(cudaStream_t stream);
146 /** Set the return buffer on the device to the specified value. This is
147 * required for example for reductions in order to initialize the
148 * accumulator. */
149 inline static void ResetDeviceReturn(AFloat value = 0.0);
150 /** Transfer the value in the device return buffer to the host. This
151 * tranfer is synchronous */
152 inline static AFloat GetDeviceReturn();
153 /** Return device pointer to the device return buffer */
154 inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
155 inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
156
157 /** Blocking synchronization with the associated compute stream, if it's
158 * not the default stream. */
159 inline void Synchronize(const TCudaMatrix &) const;
160
161 static size_t GetNDim() {return 2;}
162 size_t GetNrows() const {return fNRows;}
163 size_t GetNcols() const {return fNCols;}
164 size_t GetNoElements() const {return fNRows * fNCols;}
165
166 const AFloat * GetDataPointer() const {return fElementBuffer;}
167 AFloat * GetDataPointer() {return fElementBuffer;}
168 const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
169
171
172 /** Access to elements of device matrices provided through TCudaDeviceReference
173 * class. Note that access is synchronous end enforces device synchronization
174 * on all streams. Only used for testing. */
175 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
176
177 void Print() const {
178 TMatrixT<AFloat> mat(*this);
179 mat.Print();
180 }
181
182 void Zero() {
183 cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
184 }
185
186
187private:
188
189 /** Initializes all shared devices resource and makes sure that a sufficient
190 * number of curand states are allocated on the device and initialized as
191 * well as that the one-vector for the summation over columns has the right
192 * size. */
195
196};
197
198//
199// Inline Functions.
200//______________________________________________________________________________
201inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
202{
203 if (code != cudaSuccess)
204 {
205 fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
206 if (abort) exit(code);
207 }
208}
209
210//______________________________________________________________________________
211template<typename AFloat>
213 : fDevicePointer(devicePointer)
214{
215 // Nothing to do here.
216}
217
218//______________________________________________________________________________
219template<typename AFloat>
221{
222 AFloat buffer;
223 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
224 cudaMemcpyDeviceToHost);
225 return buffer;
226}
227
228//______________________________________________________________________________
229template<typename AFloat>
231{
232 cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
233 cudaMemcpyDeviceToDevice);
234}
235
236//______________________________________________________________________________
237template<typename AFloat>
239{
240 AFloat buffer = value;
241 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
242 cudaMemcpyHostToDevice);
243}
244
245//______________________________________________________________________________
246template<typename AFloat>
248{
249 AFloat buffer;
250 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
251 cudaMemcpyDeviceToHost);
252 buffer += value;
253 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
254 cudaMemcpyHostToDevice);
255}
256
257//______________________________________________________________________________
258template<typename AFloat>
260{
261 AFloat buffer;
262 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
263 cudaMemcpyDeviceToHost);
264 buffer -= value;
265 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
266 cudaMemcpyHostToDevice);
267}
268
269//______________________________________________________________________________
270template<typename AFloat>
271inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
272{
273 return fElementBuffer.GetComputeStream();
274}
275
276//______________________________________________________________________________
277template<typename AFloat>
278inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
279{
280 return fElementBuffer.SetComputeStream(stream);
281}
282
283//______________________________________________________________________________
284template<typename AFloat>
286{
287 cudaEvent_t event;
288 cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
289 cudaEventRecord(event, A.GetComputeStream());
290 cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
291 cudaEventDestroy(event);
292}
293
294//______________________________________________________________________________
295template<typename AFloat>
297{
298 AFloat buffer = value;
299 cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
300}
301
302//______________________________________________________________________________
303template<typename AFloat>
305{
306 AFloat buffer;
307 cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
308 return buffer;
309}
310
311//______________________________________________________________________________
312template<typename AFloat>
314{
315 AFloat * elementPointer = fElementBuffer;
316 elementPointer += j * fNRows + i;
317 return TCudaDeviceReference<AFloat>(elementPointer);
318}
319
320} // namespace DNN
321} // namespace TMVA
322
323#endif
bool Bool_t
Definition: RtypesCore.h:59
TCudaDeviceBuffer.
Definition: CudaBuffers.h:100
TCudaDeviceReference.
Definition: CudaMatrix.h:62
void operator-=(AFloat value)
Definition: CudaMatrix.h:259
TCudaDeviceReference(AFloat *devicePointer)
Definition: CudaMatrix.h:212
void operator=(const TCudaDeviceReference &other)
Definition: CudaMatrix.h:230
void operator+=(AFloat value)
Definition: CudaMatrix.h:247
TCudaMatrix Class.
Definition: CudaMatrix.h:106
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaMatrix.h:122
size_t GetNcols() const
Definition: CudaMatrix.h:163
TCudaMatrix & operator=(const TCudaMatrix &)=default
static curandState_t * fCurandStates
Definition: CudaMatrix.h:116
void Print() const
Definition: CudaMatrix.h:177
TCudaMatrix(const TMatrixT< AFloat > &)
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition: CudaMatrix.h:304
void SetComputeStream(cudaStream_t stream)
Definition: CudaMatrix.h:278
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition: CudaMatrix.h:113
cudaStream_t GetComputeStream() const
Definition: CudaMatrix.h:271
size_t GetNoElements() const
Definition: CudaMatrix.h:164
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
static Bool_t gInitializeCurand
Definition: CudaMatrix.h:126
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition: CudaMatrix.h:313
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition: CudaMatrix.h:154
const cublasHandle_t & GetCublasHandle() const
Definition: CudaMatrix.h:168
static AFloat * fOnes
Vector used for summations of columns.
Definition: CudaMatrix.h:114
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition: CudaMatrix.h:296
const AFloat * GetDataPointer() const
Definition: CudaMatrix.h:166
TCudaMatrix(TCudaDeviceBuffer< AFloat > buffer, size_t m, size_t n)
static size_t fNCurandStates
Definition: CudaMatrix.h:117
static size_t GetNDim()
Definition: CudaMatrix.h:161
TCudaMatrix(const TCudaMatrix &)=default
TCudaDeviceBuffer< AFloat > GetDeviceBuffer() const
Definition: CudaMatrix.h:170
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
Definition: CudaMatrix.h:285
static AFloat * GetOnes()
Definition: CudaMatrix.h:128
static cublasHandle_t fCublasHandle
Definition: CudaMatrix.h:112
static size_t fInstances
Current number of matrix instances.
Definition: CudaMatrix.h:111
TCudaMatrix(size_t i, size_t j)
size_t GetNrows() const
Definition: CudaMatrix.h:162
TCudaMatrix & operator=(TCudaMatrix &&)=default
AFloat * GetDataPointer()
Definition: CudaMatrix.h:167
static size_t fNOnes
Current length of the one vector.
Definition: CudaMatrix.h:115
TCudaMatrix(TCudaMatrix &&)=default
static curandState_t * GetCurandStatesPointer()
Definition: CudaMatrix.h:155
void Print(Option_t *name="") const
Print the matrix as a table of elements.
TMatrixT.
Definition: TMatrixT.h:39
TLine * line
const Int_t n
Definition: legend1.C:16
static double A[]
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition: CudaMatrix.h:201
create variable transformations
Definition: file.py:1
auto * m
Definition: textangle.C:8