Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
CudaMatrix.cu
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12/////////////////////////////////////////////
13// Implementation of the TCudaMatrix class. //
14/////////////////////////////////////////////
15
18
19#include <iostream>
20
21namespace TMVA {
22namespace DNN {
23
24
25//____________________________________________________________________________
26__global__ void CurandInitializationKernel(unsigned long long seed,
27 curandState_t *state)
28{
29 int i = blockDim.y * blockIdx.y + threadIdx.y;
30 int j = blockDim.x * blockIdx.x + threadIdx.x;
31 int tid = i * gridDim.x + j;
32 curand_init(seed + tid, 0, tid, state + tid);
33}
34
35// Static members.
36//____________________________________________________________________________
37template<typename AFloat>
39template<typename AFloat>
41template<typename AFloat>
42AFloat * TCudaMatrix<AFloat>::fDeviceReturn = nullptr;
43template<typename AFloat>
44AFloat * TCudaMatrix<AFloat>::fOnes = nullptr;
45template<typename AFloat>
47template<typename AFloat>
49template<typename AFloat>
51template <typename AFloat>
53
54// Constructors.
55//____________________________________________________________________________
56template<typename AFloat>
58 : fNRows(0), fNCols(0), fElementBuffer()
59{
61}
62
63//____________________________________________________________________________
64template<typename AFloat>
66 : fNRows(m), fNCols(n), fElementBuffer(m * n, 0)
67{
69}
70
71//____________________________________________________________________________
72template<typename AFloat>
74 : fNRows(Host.GetNrows()), fNCols(Host.GetNcols()),
75 fElementBuffer(Host.GetNoElements(), 0)
76{
78
79 AFloat * buffer = new AFloat[fNRows * fNCols];
80 size_t index = 0;
81 for (size_t j = 0; j < fNCols; j++) {
82 for (size_t i = 0; i < fNRows; i++) {
83 buffer[index] = static_cast<AFloat>(Host(i, j));
84 index++;
85 }
86 }
87
88 cudaMemcpy(fElementBuffer.data(), buffer, fNRows * fNCols * sizeof(AFloat),
90}
91
92//____________________________________________________________________________
93template<typename AFloat>
95 size_t m, size_t n)
96 : fNRows(m), fNCols(n), fElementBuffer(buffer)
97{
99}
100
101//____________________________________________________________________________
102template <typename AFloat>
104{
105 if (fInstances == 0) {
106 cublasCreate(&fCublasHandle);
107 CUDACHECK(cudaMalloc(& fDeviceReturn, sizeof(AFloat)));
108 CUDACHECK(cudaMalloc(& fCurandStates, TDevice::NThreads(*this)));
109 }
110 if (gInitializeCurand && TDevice::NThreads(*this) > (int) fNCurandStates) {
111 fNCurandStates = TDevice::NThreads(*this);
112 if (fNCurandStates > 10000000)
113 std::cout << "***** Warning - initialize a BIG curandstate for matrix " << fNRows << "," << fNCols << " nstate "
114 << fNCurandStates << std::endl;
115 //R__ASSERT( fNRows*fNCols <= 8*8*128*128);
116 if (fCurandStates) {
117 cudaFree(fCurandStates);
118 }
119 cudaMalloc(&fCurandStates, TDevice::NThreads(*this) * sizeof(curandState_t));
120 InitializeCurandStates();
121 }
122 if (fNRows > fNOnes) {
123 fNOnes = fNRows;
124 if (fOnes) {
125 cudaFree(fOnes);
126 }
127 cudaMalloc(&fOnes, fNRows * sizeof(AFloat));
128 AFloat * buffer = new AFloat[fNRows];
129 for (size_t i = 0; i < fNRows; i++) {
130 buffer[i] = 1.0;
131 }
132 cudaMemcpy(fOnes, buffer, fNRows * sizeof(AFloat),
134 }
135 fInstances++;
136}
137
138//____________________________________________________________________________
139template<typename AFloat>
146
147// Conversion to TMatrixT.
148//____________________________________________________________________________
149template<typename AFloat>
151{
152 TMatrixT<AFloat> hostMatrix(GetNrows(), GetNcols());
153
154 AFloat * buffer = new AFloat[fNRows * fNCols];
155 cudaMemcpy(buffer, fElementBuffer.data(), fNRows * fNCols * sizeof(AFloat),
157
158 size_t index = 0;
159 for (size_t j = 0; j < fNCols; j++) {
160 for (size_t i = 0; i < fNRows; i++) {
161 hostMatrix(i, j) = static_cast<Double_t>(buffer[index]);
162 index++;
163 }
164 }
165
166 delete[] buffer;
167 return hostMatrix;
168}
169
170// Explicit Instantiations.
171
172template class TCudaMatrix<float>;
173template class TCudaMatrix<double>;
174
175} // namespace DNN
176} // namespace TMVA
#define CUDACHECK(ans)
Definition CudaMatrix.h:34
bool Bool_t
Definition RtypesCore.h:63
constexpr Bool_t kFALSE
Definition RtypesCore.h:94
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
TCudaMatrix Class.
Definition CudaMatrix.h:103
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition CudaMatrix.h:119
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
static dim3 BlockDims2D()
Definition Device.h:55
static dim3 GridDims2D(int nrows, int ncols)
Definition Device.h:74
static int NThreads(const AMatrix &A)
Definition Device.h:101
const Int_t n
Definition legend1.C:16
__global__ void CurandInitializationKernel(unsigned long long seed, curandState_t *state)
Definition CudaMatrix.cu:26
create variable transformations
TMarker m
Definition textangle.C:8