Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
CudaMatrix.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////////
13// Contains the TCudaMatrix class for the representation of matrices //
14// on CUDA devices as well as the TCudaDeviceReference class which //
15// is a helper class to emulate lvalue references to floating point //
16// values on the device. //
17///////////////////////////////////////////////////////////////////////
18
19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21
22// in case we compile C++ code with std-17 and cuda with lower standard
23// use experimental string_view, otherwise keep as is
24#include "RConfigure.h"
25#ifdef R__HAS_STD_STRING_VIEW
26#ifndef R__CUDA_HAS_STD_STRING_VIEW
27#undef R__HAS_STD_STRING_VIEW
28#define R__HAS_STD_EXPERIMENTAL_STRING_VIEW
29#endif
30#endif
31
32#include "cuda.h"
33#include "cuda_runtime.h"
34#include "cublas_v2.h"
35#include "curand_kernel.h"
36
37#include "TMatrixT.h"
38#include "CudaBuffers.h"
39
40#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
41
42namespace TMVA {
43namespace DNN {
44
45/** Function to check cuda return code. Taken from
46 * http://stackoverflow.com/questions/14038589/
47 */
48inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
49
50//____________________________________________________________________________
51//
52// Cuda Device Reference
53//____________________________________________________________________________
54
55/** TCudaDeviceReference
56 *
57 * Helper class emulating lvalue references for AFloat values that are
58 * physically on the device. Allows for example to assign to matrix elements.
59 * Note that device access through CudaDeviceReferences enforces synchronization
60 * with all streams and thus qualifies as performance killer. Only used for
61 * testing.
62 */
63template<typename AFloat>
65{
66private:
67
69
70public:
71
72 TCudaDeviceReference(AFloat * devicePointer);
73
74 operator AFloat();
75
76 void operator=(const TCudaDeviceReference &other);
77 void operator=(AFloat value);
78 void operator+=(AFloat value);
79 void operator-=(AFloat value);
80};
81
82//____________________________________________________________________________
83//
84// Cuda Matrix
85//____________________________________________________________________________
86
87/** TCudaMatrix Class
88 *
89 * The TCudaMatrix class represents matrices on a CUDA device. The elements
90 * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
91 * the allocation and freeing of the device memory. TCudaMatrices are lightweight
92 * object, that means on assignment and copy creation only a shallow copy is
93 * performed and no new element buffer allocated. To perform a deep copy use
94 * the static Copy method of the TCuda architecture class.
95 *
96 * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
97 * transferred to the device. This stream can be accessed through the
98 * GetComputeStream member function and used to synchronize computations.
99 *
100 * The TCudaMatrix class also holds static references to CUDA resources.
101 * Those are the cublas handle, a buffer of curand states for the generation
102 * of random numbers as well as a vector containing ones, which is used for
103 * summing column matrices using matrix-vector multiplication. The class also
104 * has a static buffer for returning results from the device.
105 *
106 */
107template<typename AFloat>
109{
110public:
111
112private:
113
114 static size_t fInstances; ///< Current number of matrix instances.
115 static cublasHandle_t fCublasHandle;
116 static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
117 static AFloat * fOnes; ///< Vector used for summations of columns.
118 static size_t fNOnes; ///< Current length of the one vector.
119 static curandState_t * fCurandStates;
120 static size_t fNCurandStates;
121
122
123 size_t fNRows;
124 size_t fNCols;
126
127public:
128
130
131 static AFloat * GetOnes() {return fOnes;}
132
134 TCudaMatrix(size_t i, size_t j);
136 TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
137
138 TCudaMatrix(const TCudaMatrix &) = default;
139 TCudaMatrix( TCudaMatrix &&) = default;
140 TCudaMatrix & operator=(const TCudaMatrix &) = default;
142 ~TCudaMatrix() = default;
143
144 /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
145 operator TMatrixT<AFloat>() const;
146
147 inline cudaStream_t GetComputeStream() const;
148 inline void SetComputeStream(cudaStream_t stream);
149 /** Set the return buffer on the device to the specified value. This is
150 * required for example for reductions in order to initialize the
151 * accumulator. */
152 inline static void ResetDeviceReturn(AFloat value = 0.0);
153 /** Transfer the value in the device return buffer to the host. This
154 * tranfer is synchronous */
155 inline static AFloat GetDeviceReturn();
156 /** Return device pointer to the device return buffer */
157 inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
158 inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
159
160 /** Blocking synchronization with the associated compute stream, if it's
161 * not the default stream. */
162 inline void Synchronize(const TCudaMatrix &) const;
163
164 static size_t GetNDim() {return 2;}
165 size_t GetNrows() const {return fNRows;}
166 size_t GetNcols() const {return fNCols;}
167 size_t GetNoElements() const {return fNRows * fNCols;}
168
169 const AFloat * GetDataPointer() const {return fElementBuffer;}
170 AFloat * GetDataPointer() {return fElementBuffer;}
171 const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
172
174
175 /** Access to elements of device matrices provided through TCudaDeviceReference
176 * class. Note that access is synchronous end enforces device synchronization
177 * on all streams. Only used for testing. */
178 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
179
180 void Print() const {
181 TMatrixT<AFloat> mat(*this);
182 mat.Print();
183 }
184
185 void Zero() {
186 cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
187 }
188
189
190private:
191
192 /** Initializes all shared devices resource and makes sure that a sufficient
193 * number of curand states are allocated on the device and initialized as
194 * well as that the one-vector for the summation over columns has the right
195 * size. */
198
199};
200
201//
202// Inline Functions.
203//______________________________________________________________________________
204inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
205{
206 if (code != cudaSuccess)
207 {
208 fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
209 if (abort) exit(code);
210 }
211}
212
213//______________________________________________________________________________
214template<typename AFloat>
216 : fDevicePointer(devicePointer)
217{
218 // Nothing to do here.
219}
220
221//______________________________________________________________________________
222template<typename AFloat>
224{
225 AFloat buffer;
226 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
227 cudaMemcpyDeviceToHost);
228 return buffer;
229}
230
231//______________________________________________________________________________
232template<typename AFloat>
234{
235 cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
236 cudaMemcpyDeviceToDevice);
237}
238
239//______________________________________________________________________________
240template<typename AFloat>
242{
243 AFloat buffer = value;
244 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
245 cudaMemcpyHostToDevice);
246}
247
248//______________________________________________________________________________
249template<typename AFloat>
251{
252 AFloat buffer;
253 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
254 cudaMemcpyDeviceToHost);
255 buffer += value;
256 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
257 cudaMemcpyHostToDevice);
258}
259
260//______________________________________________________________________________
261template<typename AFloat>
263{
264 AFloat buffer;
265 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
266 cudaMemcpyDeviceToHost);
267 buffer -= value;
268 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
269 cudaMemcpyHostToDevice);
270}
271
272//______________________________________________________________________________
273template<typename AFloat>
274inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
275{
276 return fElementBuffer.GetComputeStream();
277}
278
279//______________________________________________________________________________
280template<typename AFloat>
281inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
282{
283 return fElementBuffer.SetComputeStream(stream);
284}
285
286//______________________________________________________________________________
287template<typename AFloat>
289{
290 cudaEvent_t event;
291 cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
292 cudaEventRecord(event, A.GetComputeStream());
293 cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
294 cudaEventDestroy(event);
295}
296
297//______________________________________________________________________________
298template<typename AFloat>
300{
301 AFloat buffer = value;
302 cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
303}
304
305//______________________________________________________________________________
306template<typename AFloat>
308{
309 AFloat buffer;
310 cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
311 return buffer;
312}
313
314//______________________________________________________________________________
315template<typename AFloat>
317{
318 AFloat * elementPointer = fElementBuffer;
319 elementPointer += j * fNRows + i;
320 return TCudaDeviceReference<AFloat>(elementPointer);
321}
322
323} // namespace DNN
324} // namespace TMVA
325
326#endif
TCudaDeviceReference.
Definition CudaMatrix.h:65
void operator-=(AFloat value)
Definition CudaMatrix.h:262
TCudaDeviceReference(AFloat *devicePointer)
Definition CudaMatrix.h:215
void operator=(const TCudaDeviceReference &other)
Definition CudaMatrix.h:233
void operator+=(AFloat value)
Definition CudaMatrix.h:250
TCudaMatrix Class.
Definition CudaMatrix.h:109
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition CudaMatrix.h:125
size_t GetNcols() const
Definition CudaMatrix.h:166
TCudaMatrix & operator=(const TCudaMatrix &)=default
static curandState_t * fCurandStates
Definition CudaMatrix.h:119
TCudaMatrix(const TMatrixT< AFloat > &)
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition CudaMatrix.h:307
void SetComputeStream(cudaStream_t stream)
Definition CudaMatrix.h:281
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition CudaMatrix.h:116
cudaStream_t GetComputeStream() const
Definition CudaMatrix.h:274
size_t GetNoElements() const
Definition CudaMatrix.h:167
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
static Bool_t gInitializeCurand
Definition CudaMatrix.h:129
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition CudaMatrix.h:316
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition CudaMatrix.h:157
const cublasHandle_t & GetCublasHandle() const
Definition CudaMatrix.h:171
static AFloat * fOnes
Vector used for summations of columns.
Definition CudaMatrix.h:117
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition CudaMatrix.h:299
const AFloat * GetDataPointer() const
Definition CudaMatrix.h:169
TCudaMatrix(TCudaDeviceBuffer< AFloat > buffer, size_t m, size_t n)
static size_t fNCurandStates
Definition CudaMatrix.h:120
static size_t GetNDim()
Definition CudaMatrix.h:164
TCudaMatrix(const TCudaMatrix &)=default
TCudaDeviceBuffer< AFloat > GetDeviceBuffer() const
Definition CudaMatrix.h:173
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
Definition CudaMatrix.h:288
static AFloat * GetOnes()
Definition CudaMatrix.h:131
static cublasHandle_t fCublasHandle
Definition CudaMatrix.h:115
static size_t fInstances
Current number of matrix instances.
Definition CudaMatrix.h:114
TCudaMatrix(size_t i, size_t j)
size_t GetNrows() const
Definition CudaMatrix.h:165
TCudaMatrix & operator=(TCudaMatrix &&)=default
AFloat * GetDataPointer()
Definition CudaMatrix.h:170
static size_t fNOnes
Current length of the one vector.
Definition CudaMatrix.h:118
TCudaMatrix(TCudaMatrix &&)=default
static curandState_t * GetCurandStatesPointer()
Definition CudaMatrix.h:158
void Print(Option_t *name="") const
Print the matrix as a table of elements.
TMatrixT.
Definition TMatrixT.h:39
TLine * line
const Int_t n
Definition legend1.C:16
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition CudaMatrix.h:204
create variable transformations
Definition file.py:1
auto * m
Definition textangle.C:8