Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
CudaMatrix.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////////
13// Contains the TCudaMatrix class for the representation of matrices //
14// on CUDA devices as well as the TCudaDeviceReference class which //
15// is a helper class to emulate lvalue references to floating point //
16// values on the device. //
17///////////////////////////////////////////////////////////////////////
18
19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21
22// in case we compile C++ code with std-17 and cuda with lower standard
23// use experimental string_view, otherwise keep as is
24#include "RConfigure.h"
25
26#include "cuda.h"
27#include "cuda_runtime.h"
28#include "cublas_v2.h"
29#include "curand_kernel.h"
30
31#include "TMatrixT.h"
32#include "CudaBuffers.h"
33
34#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
35
36namespace TMVA {
37namespace DNN {
38
39/** Function to check cuda return code. Taken from
40 * http://stackoverflow.com/questions/14038589/
41 */
42inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
43
44//____________________________________________________________________________
45//
46// Cuda Device Reference
47//____________________________________________________________________________
48
49/** TCudaDeviceReference
50 *
51 * Helper class emulating lvalue references for AFloat values that are
52 * physically on the device. Allows for example to assign to matrix elements.
53 * Note that device access through CudaDeviceReferences enforces synchronization
54 * with all streams and thus qualifies as performance killer. Only used for
55 * testing.
56 */
57template<typename AFloat>
59{
60private:
61
63
64public:
65
66 TCudaDeviceReference(AFloat * devicePointer);
67
68 operator AFloat();
69
70 void operator=(const TCudaDeviceReference &other);
71 void operator=(AFloat value);
72 void operator+=(AFloat value);
73 void operator-=(AFloat value);
74};
75
76//____________________________________________________________________________
77//
78// Cuda Matrix
79//____________________________________________________________________________
80
81/** TCudaMatrix Class
82 *
83 * The TCudaMatrix class represents matrices on a CUDA device. The elements
84 * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
85 * the allocation and freeing of the device memory. TCudaMatrices are lightweight
86 * object, that means on assignment and copy creation only a shallow copy is
87 * performed and no new element buffer allocated. To perform a deep copy use
88 * the static Copy method of the TCuda architecture class.
89 *
90 * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
91 * transferred to the device. This stream can be accessed through the
92 * GetComputeStream member function and used to synchronize computations.
93 *
94 * The TCudaMatrix class also holds static references to CUDA resources.
95 * Those are the cublas handle, a buffer of curand states for the generation
96 * of random numbers as well as a vector containing ones, which is used for
97 * summing column matrices using matrix-vector multiplication. The class also
98 * has a static buffer for returning results from the device.
99 *
100 */
101template<typename AFloat>
103{
104public:
105
106private:
107
108 static size_t fInstances; ///< Current number of matrix instances.
109 static cublasHandle_t fCublasHandle;
110 static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
111 static AFloat * fOnes; ///< Vector used for summations of columns.
112 static size_t fNOnes; ///< Current length of the one vector.
113 static curandState_t * fCurandStates;
114 static size_t fNCurandStates;
115
116
117 size_t fNRows;
118 size_t fNCols;
120
121public:
122
124
125 static AFloat * GetOnes() {return fOnes;}
126
127 TCudaMatrix();
128 TCudaMatrix(size_t i, size_t j);
130 TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
131
132 TCudaMatrix(const TCudaMatrix &) = default;
133 TCudaMatrix( TCudaMatrix &&) = default;
134 TCudaMatrix & operator=(const TCudaMatrix &) = default;
136 ~TCudaMatrix() = default;
137
138 /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
139 operator TMatrixT<AFloat>() const;
140
141 inline cudaStream_t GetComputeStream() const;
142 inline void SetComputeStream(cudaStream_t stream);
143 /** Set the return buffer on the device to the specified value. This is
144 * required for example for reductions in order to initialize the
145 * accumulator. */
146 inline static void ResetDeviceReturn(AFloat value = 0.0);
147 /** Transfer the value in the device return buffer to the host. This
148 * transfer is synchronous */
149 inline static AFloat GetDeviceReturn();
150 /** Return device pointer to the device return buffer */
151 inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
152 inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
153
154 /** Blocking synchronization with the associated compute stream, if it's
155 * not the default stream. */
156 inline void Synchronize(const TCudaMatrix &) const;
157
158 static size_t GetNDim() {return 2;}
159 size_t GetNrows() const {return fNRows;}
160 size_t GetNcols() const {return fNCols;}
161 size_t GetNoElements() const {return fNRows * fNCols;}
162
163 const AFloat * GetDataPointer() const {return fElementBuffer.data();}
164 AFloat * GetDataPointer() {return fElementBuffer.data();}
165 const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
166
168
169 /** Access to elements of device matrices provided through TCudaDeviceReference
170 * class. Note that access is synchronous end enforces device synchronization
171 * on all streams. Only used for testing. */
172 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
173
174 void Print() const {
175 TMatrixT<AFloat> mat(*this);
176 mat.Print();
177 }
178
179 void Zero() {
180 cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
181 }
182
183
184private:
185
186 /** Initializes all shared devices resource and makes sure that a sufficient
187 * number of curand states are allocated on the device and initialized as
188 * well as that the one-vector for the summation over columns has the right
189 * size. */
190 void InitializeCuda();
192
193};
194
195//
196// Inline Functions.
197//______________________________________________________________________________
198inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
199{
200 if (code != cudaSuccess)
201 {
202 fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
203 if (abort) exit(code);
204 }
205}
206
207//______________________________________________________________________________
208template<typename AFloat>
210 : fDevicePointer(devicePointer)
211{
212 // Nothing to do here.
213}
214
215//______________________________________________________________________________
216template<typename AFloat>
218{
219 AFloat buffer;
220 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
221 cudaMemcpyDeviceToHost);
222 return buffer;
223}
224
225//______________________________________________________________________________
226template<typename AFloat>
228{
229 cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
230 cudaMemcpyDeviceToDevice);
231}
232
233//______________________________________________________________________________
234template<typename AFloat>
236{
237 AFloat buffer = value;
238 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
239 cudaMemcpyHostToDevice);
240}
241
242//______________________________________________________________________________
243template<typename AFloat>
245{
246 AFloat buffer;
247 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
248 cudaMemcpyDeviceToHost);
249 buffer += value;
250 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
251 cudaMemcpyHostToDevice);
252}
253
254//______________________________________________________________________________
255template<typename AFloat>
257{
258 AFloat buffer;
259 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
260 cudaMemcpyDeviceToHost);
261 buffer -= value;
262 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
263 cudaMemcpyHostToDevice);
264}
265
266//______________________________________________________________________________
267template<typename AFloat>
268inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
269{
270 return fElementBuffer.GetComputeStream();
271}
272
273//______________________________________________________________________________
274template<typename AFloat>
275inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
276{
277 return fElementBuffer.SetComputeStream(stream);
278}
279
280//______________________________________________________________________________
281template<typename AFloat>
283{
284 cudaEvent_t event;
285 cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
286 cudaEventRecord(event, A.GetComputeStream());
287 cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
288 cudaEventDestroy(event);
289}
290
291//______________________________________________________________________________
292template<typename AFloat>
294{
295 AFloat buffer = value;
296 cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
297}
298
299//______________________________________________________________________________
300template<typename AFloat>
302{
303 AFloat buffer;
304 cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
305 return buffer;
306}
307
308//______________________________________________________________________________
309template<typename AFloat>
311{
312 AFloat * elementPointer = fElementBuffer.data();
313 elementPointer += j * fNRows + i;
314 return TCudaDeviceReference<AFloat>(elementPointer);
315}
316
317} // namespace DNN
318} // namespace TMVA
319
320#endif
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
TCudaDeviceReference.
Definition CudaMatrix.h:59
void operator-=(AFloat value)
Definition CudaMatrix.h:256
TCudaDeviceReference(AFloat *devicePointer)
Definition CudaMatrix.h:209
void operator=(const TCudaDeviceReference &other)
Definition CudaMatrix.h:227
void operator+=(AFloat value)
Definition CudaMatrix.h:244
TCudaMatrix Class.
Definition CudaMatrix.h:103
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition CudaMatrix.h:119
size_t GetNcols() const
Definition CudaMatrix.h:160
TCudaMatrix & operator=(const TCudaMatrix &)=default
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition CudaMatrix.h:301
void SetComputeStream(cudaStream_t stream)
Definition CudaMatrix.h:275
cudaStream_t GetComputeStream() const
Definition CudaMatrix.h:268
size_t GetNoElements() const
Definition CudaMatrix.h:161
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
static Bool_t gInitializeCurand
Definition CudaMatrix.h:123
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition CudaMatrix.h:310
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition CudaMatrix.h:151
static curandState_t * fCurandStates
Definition CudaMatrix.h:113
const cublasHandle_t & GetCublasHandle() const
Definition CudaMatrix.h:165
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition CudaMatrix.h:293
const AFloat * GetDataPointer() const
Definition CudaMatrix.h:163
static size_t fNCurandStates
Definition CudaMatrix.h:114
static size_t GetNDim()
Definition CudaMatrix.h:158
TCudaMatrix(const TCudaMatrix &)=default
TCudaDeviceBuffer< AFloat > GetDeviceBuffer() const
Definition CudaMatrix.h:167
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition CudaMatrix.h:110
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
Definition CudaMatrix.h:282
static AFloat * GetOnes()
Definition CudaMatrix.h:125
static AFloat * fOnes
Vector used for summations of columns.
Definition CudaMatrix.h:111
static cublasHandle_t fCublasHandle
Definition CudaMatrix.h:109
static size_t fInstances
Current number of matrix instances.
Definition CudaMatrix.h:108
size_t GetNrows() const
Definition CudaMatrix.h:159
TCudaMatrix & operator=(TCudaMatrix &&)=default
AFloat * GetDataPointer()
Definition CudaMatrix.h:164
static size_t fNOnes
Current length of the one vector.
Definition CudaMatrix.h:112
TCudaMatrix(TCudaMatrix &&)=default
static curandState_t * GetCurandStatesPointer()
Definition CudaMatrix.h:152
void Print(Option_t *name="") const override
Print the matrix as a table of elements.
TMatrixT.
Definition TMatrixT.h:40
TLine * line
const Int_t n
Definition legend1.C:16
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition CudaMatrix.h:198
create variable transformations
TMarker m
Definition textangle.C:8