19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX 20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX 23 #include "cuda_runtime.h" 24 #include "cublas_v2.h" 25 #include "curand_kernel.h" 30 #define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); } 38 inline void cudaError(cudaError_t code,
const char *
file,
int line,
bool abort=
true);
53 template<
typename AFloat>
97 template<
typename AFloat>
134 inline cudaStream_t GetComputeStream()
const;
135 inline void SetComputeStream(cudaStream_t stream);
139 inline static void ResetDeviceReturn(AFloat value = 0.0);
142 inline static AFloat GetDeviceReturn();
149 inline void Synchronize(
const TCudaMatrix &)
const;
169 void InitializeCuda();
170 void InitializeCurandStates();
177 inline void cudaError(cudaError_t code,
const char *file,
int line,
bool abort)
179 if (code != cudaSuccess)
181 fprintf(stderr,
"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
182 if (abort) exit(code);
187 template<
typename AFloat>
195 template<
typename AFloat>
200 cudaMemcpyDeviceToHost);
205 template<
typename AFloat>
209 cudaMemcpyDeviceToDevice);
213 template<
typename AFloat>
216 AFloat buffer = value;
218 cudaMemcpyHostToDevice);
222 template<
typename AFloat>
227 cudaMemcpyDeviceToHost);
230 cudaMemcpyHostToDevice);
234 template<
typename AFloat>
239 cudaMemcpyDeviceToHost);
242 cudaMemcpyHostToDevice);
246 template<
typename AFloat>
249 return fElementBuffer.GetComputeStream();
253 template<
typename AFloat>
256 return fElementBuffer.SetComputeStream(stream);
260 template<
typename AFloat>
264 cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
266 cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
267 cudaEventDestroy(event);
271 template<
typename AFloat>
274 AFloat buffer = value;
275 cudaMemcpy(fDeviceReturn, & buffer,
sizeof(AFloat), cudaMemcpyHostToDevice);
279 template<
typename AFloat>
283 cudaMemcpy(& buffer, fDeviceReturn,
sizeof(AFloat), cudaMemcpyDeviceToHost);
288 template<
typename AFloat>
291 AFloat * elementPointer = fElementBuffer;
292 elementPointer += j * fNRows + i;
static curandState_t * GetCurandStatesPointer()
static cublasHandle_t fCublasHandle
void operator-=(AFloat value)
void SetComputeStream(cudaStream_t stream)
static curandState_t * fCurandStates
size_t GetNoElements() const
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
static AFloat * fOnes
Vector used for summations of columns.
AFloat * GetDataPointer()
TCudaDeviceBuffer< AFloat > fElementBuffer
static size_t fNCurandStates
static size_t fInstances
Current number of matrix instances.
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream...
void operator=(const TCudaDeviceReference &other)
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
static AFloat * fDeviceReturn
Buffer for kernel return values.
TCudaDeviceReference(AFloat *devicePointer)
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
const cublasHandle_t & GetCublasHandle() const
Abstract ClassifierFactory template that handles arbitrary types.
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
static AFloat * GetOnes()
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
const AFloat * GetDataPointer() const
cudaStream_t GetComputeStream() const
static size_t fNOnes
Current length of the one vector.
void operator+=(AFloat value)