18#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
19#define TMVA_DNN_ARCHITECTURES_CUDNN
21#include "RConfigure.h"
24#error This file can be compiled only when cudnn is available in ROOT
66template<
typename AFloat = Float_t>
73 using Scalar_t = AFloat;
94#if (CUDNN_VERSION >= 8000)
104 using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
105 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
106 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
107 using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
108 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
109 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
111 using RNNLayer_t = RNN::TBasicRNNLayer<TCudnn<AFloat>>;
112 using RNNDescriptors_t = RNN::TRNNDescriptors<TCudnn<AFloat>>;
113 using RNNWorkspace_t = RNN::TRNNWorkspace<TCudnn<AFloat>>;
115 using LSTMLayer_t = RNN::TBasicLSTMLayer<TCudnn<AFloat>>;
119 using GRULayer_t = RNN::TBasicGRULayer<TCudnn<AFloat>>;
140 static Tensor_t CreateTensor(
size_t n,
size_t c,
size_t h,
size_t w) {
141 return Tensor_t( {
n,
c,
h,
w}, GetTensorLayout(), 0, 0);
144 static Tensor_t CreateTensor(DeviceBuffer_t buffer,
size_t n,
size_t c,
size_t h,
size_t w) {
145 return Tensor_t( buffer, {
n,
c,
h,
w}, GetTensorLayout(), 0, 0);
148 static Tensor_t CreateTensor(
size_t n,
size_t c,
size_t w)
150 return Tensor_t({
n,
c,
w}, GetTensorLayout(), 0, 0);
153 static Tensor_t CreateTensor(DeviceBuffer_t buffer,
size_t n,
size_t c,
size_t w)
155 return Tensor_t(buffer, {
n,
c,
w}, GetTensorLayout(), 0, 0);
158 static bool IsCudnn() {
return true; }
162 static void CreateWeightTensors( std::vector<Matrix_t> &
newWeights,
const std::vector<Matrix_t> & weights) {
164 size_t n = weights.size();
165 for (
size_t i = 0; i <
n; ++i)
166 newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
173 static void InitializeBNormDescriptors(TDescriptors * &
descriptors,
174 BNormLayer_t *L =
nullptr);
176 static void InitializeConvDescriptors(TDescriptors * &
descriptors,
177 ConvLayer_t *L =
nullptr);
179 static void InitializePoolDescriptors(TDescriptors * &
descriptors,
180 PoolingLayer_t *L =
nullptr);
192 template<
typename RNNLayer>
197 static void InitializeActivationDescriptor(ActivationDescriptor_t &
descriptors, EActivationFunction
activFunc,
double coef = 0.0);
199 static void ReleaseConvDescriptors(TDescriptors *
descriptors );
200 static void ReleasePoolDescriptors(TDescriptors *
descriptors );
201 static void ReleaseRNNDescriptors(TDescriptors *
descriptors);
202 static void ReleaseBNormDescriptors(TDescriptors *
descriptors );
203 static void ReleaseDescriptor(EmptyDescriptor_t &
emptyDescr) {}
204 static void ReleaseDescriptor(ActivationDescriptor_t &
activationDescr);
206 static void ReleaseDescriptor(DropoutDescriptor_t &
dropoutDescr);
207 static void ReleaseDescriptor(FilterDescriptor_t &
filterDescr);
208 static void ReleaseDescriptor(PoolingDescriptor_t &
poolingDescr);
209 static void ReleaseDescriptor(TensorDescriptor_t &
tensorDescr);
212 static void InitializeConvWorkspace(TWorkspace * & workspace,
214 const DNN::CNN::TConvParams & params,
215 ConvLayer_t *L =
nullptr);
216 static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
218 const DNN::CNN::TConvParams & params,
219 PoolingLayer_t *L =
nullptr);
233 template<
typename RNNLayer>
237 static void FreeConvWorkspace(TWorkspace * workspace);
238 static void FreePoolDropoutWorkspace(TWorkspace * workspace);
239 static void FreeRNNWorkspace(TWorkspace *workspace);
245 template <
typename RNNLayer>
260 static void MultiplyTranspose(Tensor_t &
output,
const Tensor_t &
input,
const Matrix_t &weights);
283 const Matrix_t & weights,
287 static void ScaleAdd(Tensor_t & A,
const Tensor_t & B,
288 Scalar_t alpha = 1.0,
289 Scalar_t beta = 1.0);
292 static void Copy(Tensor_t & A,
const Tensor_t & B);
295 template<
typename ATensor_t>
296 static void CopyDiffArch(Tensor_t & A,
299 template <
typename ATensor_t>
303 static void CopyDiffArch(Tensor_t A,
const Tensor_t & B ) {
Copy(A,B); }
306 template<
typename AMatrix_t>
307 static void CopyDiffArch(std::vector<Tensor_t> & A,
308 const std::vector<AMatrix_t> & B);
325 Tensor_t & Y, Tensor_t &
dY,
327 const AFloat alpha = 1,
328 const AFloat beta = 1) {}
330 static void ActivationFunctionForward(Tensor_t &
X, EActivationFunction
activFunct,
332 const double coef = 0.0,
const AFloat alpha = 1,
333 const AFloat beta = 0);
336 static void ActivationFunctionForward(Tensor_t &Y,
const Tensor_t &
X, EActivationFunction
activFunct,
338 const AFloat alpha = 1,
const AFloat beta = 0);
341 static void ActivationFunctionBackward(Tensor_t &
dX,
const Tensor_t & Y,
342 const Tensor_t &
dY,
const Tensor_t &
X,
345 const AFloat alpha = 1,
346 const AFloat beta = 0);
354 static void Relu(Tensor_t &) {}
355 static void Sigmoid(Tensor_t &) {}
356 static void Tanh(Tensor_t &) {}
357 static void FastTanh(Tensor_t &) {}
360 static void Gauss(Tensor_t &) {}
366 static void FastTanhDerivative(Tensor_t &,
const Tensor_t &) {}
387 const Matrix_t &weights);
389 const Matrix_t &
output,
const Matrix_t &weights);
394 const Matrix_t &weights);
397 const Matrix_t &
output,
const Matrix_t &weights);
402 const Matrix_t &weights);
404 const Matrix_t &
output,
const Matrix_t &weights);
439 static void DropoutForward(Tensor_t & A,
441 TWorkspace * workspace,
444 static void DropoutBackward(Tensor_t & A,
446 TWorkspace * workspace);
464 static void BatchNormLayerForwardTraining(
int axis,
const Tensor_t &
x, Tensor_t &
y, Matrix_t &gamma, Matrix_t &beta,
472 static void BatchNormLayerForwardInference(
int axis,
const Tensor_t &
x, Matrix_t &gamma, Matrix_t &beta,
475 const TensorDescriptor_t &);
477 static void BatchNormLayerBackward(
int axis,
const Tensor_t &
x,
const Tensor_t &
dy, Tensor_t &
dx,
480 const Matrix_t &
iVariance, Scalar_t epsilon,
const TensorDescriptor_t &);
495 static Scalar_t L1Regularization(
const Matrix_t &
W)
498 return TCuda<AFloat>::L1Regularization(
mW);
504 return TCuda<AFloat>::AddL1RegularizationGradients(
mA,
mW, weightDecay);
507 static Scalar_t L2Regularization(
const Matrix_t &
W)
510 return TCuda<AFloat>::L2Regularization(
mW);
516 return TCuda<AFloat>::AddL1RegularizationGradients(
mA,
mW, weightDecay);
532 static void InitializeGauss(Matrix_t &A);
533 static void InitializeUniform(Matrix_t &A);
534 static void InitializeIdentity(Matrix_t &A);
535 static void InitializeZero(Matrix_t &A);
536 static void InitializeGlorotNormal(Matrix_t &A);
537 static void InitializeGlorotUniform(Matrix_t &A);
541 static TRandom &GetRandomGenerator();
544 static void SetRandomSeed(
size_t seed);
558 static void Dropout(Tensor_t &A, Scalar_t
p) {}
572 static void AddConvBiases(Matrix_t &
output,
const Matrix_t &
biases);
576 static void PrepareInternals(Tensor_t &) {}
579 static void ConvLayerForward(Tensor_t &
output,
581 const Tensor_t &
input,
const Matrix_t &weights,
const Matrix_t &
biases,
582 const DNN::CNN::TConvParams ¶ms, EActivationFunction
activFunc,
584 ConvWorkspace_t &workspace);
604 const ConvDescriptors_t &
descriptors, ConvWorkspace_t &workspace,
size_t ,
605 size_t ,
size_t ,
size_t ,
size_t ,
606 size_t ,
size_t ,
size_t ,
622 static void Downsample(Tensor_t &A, Tensor_t & ,
const Tensor_t &C,
const PoolingDescriptors_t &
descriptors,
624 size_t fltWidth,
size_t strideRows,
size_t strideCols);
655 static void Flatten(Tensor_t &A,
const Tensor_t &B);
659 static void Deflatten(Tensor_t &A,
const Tensor_t &B);
662 static void Rearrange(Tensor_t &out,
const Tensor_t &in);
665 static void RNNForward(
const Tensor_t &
x,
const Tensor_t &
hx,
const Tensor_t &
cx,
const Tensor_t &weights,
666 Tensor_t &
y, Tensor_t &
hy, Tensor_t &
cy,
const RNNDescriptors_t &
descr,
669 static void RNNBackward(
const Tensor_t &
x,
const Tensor_t &
hx,
const Tensor_t &
cx,
const Tensor_t &
y,
const Tensor_t &
dy,
670 const Tensor_t &
dhy,
const Tensor_t &
dcy,
const Tensor_t &weights, Tensor_t &
dx, Tensor_t &
dhx,
671 Tensor_t &
dcx, Tensor_t &
dw,
const RNNDescriptors_t &desc, RNNWorkspace_t &workspace);
678 Matrix_t & , Matrix_t & ,
688 static Matrix_t &LSTMLayerBackward(
690 Matrix_t & , Matrix_t & ,
691 Matrix_t & , Matrix_t & ,
692 Matrix_t & , Matrix_t & ,
694 Matrix_t & , Matrix_t & ,
695 Matrix_t & , Matrix_t & ,
696 Matrix_t & , Matrix_t & , Matrix_t & ,
697 Matrix_t & , Matrix_t & ,
698 const Matrix_t & ,
const Matrix_t & ,
699 const Matrix_t & ,
const Matrix_t & ,
700 const Matrix_t & ,
const Matrix_t & ,
701 const Matrix_t & ,
const Matrix_t & ,
702 const Matrix_t & ,
const Matrix_t & ,
703 const Matrix_t & ,
const Matrix_t & ,
704 const Matrix_t & ,
const Matrix_t & ,
705 const Matrix_t & , Matrix_t & ,
706 Matrix_t & , Matrix_t & )
712 static Matrix_t &GRULayerBackward(
714 Matrix_t & , Matrix_t & ,
715 Matrix_t & , Matrix_t & ,
716 Matrix_t & , Matrix_t & ,
717 Matrix_t & , Matrix_t & ,
718 Matrix_t & , Matrix_t & , Matrix_t & ,
719 const Matrix_t & ,
const Matrix_t & ,
720 const Matrix_t & ,
const Matrix_t & ,
721 const Matrix_t & ,
const Matrix_t & ,
722 const Matrix_t & ,
const Matrix_t & ,
723 const Matrix_t & ,
const Matrix_t & ,
724 const Matrix_t & , Matrix_t & ,
bool)
745 static void Hadamard(Tensor_t &A,
const Tensor_t &B)
749 assert(A.GetSize() == B.GetSize());
750 TCuda<AFloat>::Hadamard(
tmpA,
tmpB);
761 static Scalar_t
Sum(
const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
769 static void ConstAdd(Matrix_t &A, Scalar_t beta) {
771 TCuda<AFloat>::ConstAdd(tmp,beta);
777 static void ConstMult(Matrix_t &A, Scalar_t beta) {
779 TCuda<AFloat>::ConstMult(tmp,beta);
787 TCuda<AFloat>::ReciprocalElementWise(tmp);
795 TCuda<AFloat>::SquareElementWise(tmp);
804 TCuda<AFloat>::SqrtElementWise(tmp);
808 static void AdamUpdate(Matrix_t & A,
const Matrix_t & M,
const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
817 TCuda<AFloat>::AdamUpdateFirstMom(
tmpA,
tmpB, beta);
822 TCuda<AFloat>::AdamUpdateSecondMom(
tmpA,
tmpB, beta);
826 static void PrintTensor(
const Tensor_t & A,
const std::string
name =
"tensor",
bool =
true);
838 static void SumRows(Matrix_t &B,
const Matrix_t &A);
843template <
typename AFloat>
844template <
typename ATensor>
852 if (B.GetLayout() == GetTensorLayout()) {
853 if ( B.GetShape().size() == 4) {
854 assert(B.GetShape().size() == 4);
855 size_t firstSize = (A.GetLayout() == GetTensorLayout()) ? A.GetShape()[0] : A.GetShape().back();
882template <
typename AFloat>
883template <
typename AMatrix>
890 if (B.GetLayout() == GetTensorLayout() ) {
901template <
typename AFloat>
902template <
typename AMatrix_t>
904 const std::vector<AMatrix_t> &A)
906 for (
size_t i = 0; i < B.size(); ++i) {
911template <
typename AFloat>
914 std::cout <<
name <<
" size = " << A.GetSize() <<
" shape = { ";
915 auto shape = A.GetShape();
916 for (
size_t k = 0; k < shape.size()-1; ++k)
917 std::cout << shape[k] <<
" , ";
918 std::cout << shape.back() <<
" } ";
919 std::cout <<
" strides = { ";
920 auto strides = A.GetStrides();
921 for (
size_t k = 0; k < strides.size()-1; ++k)
922 std::cout << strides[k] <<
" , ";
923 std::cout << strides.back() <<
" }\n ";
924 if (A.GetShape().size() == 1 ) {
925 size_t n = A.GetShape()[0];
927 for (
size_t j = 0;
j <
n; ++
j) {
928 std::cout << A(0,
j) <<
" ";
930 if (
truncate &&
n < A.GetShape()[0]) std::cout <<
" ...... ";
931 std::cout <<
" } " << std::endl;
932 }
else if (A.GetShape().size() == 2 ) {
933 size_t n1 = A.GetShape()[0];
934 size_t n2 = A.GetShape()[1];
936 for (
size_t i = 0; i <
n1; ++i) {
939 for (
size_t j = 0;
j <
n2; ++
j) {
940 std::cout << A(i,
j) <<
" ";
942 if (
truncate &&
n2 < A.GetShape()[1]) std::cout <<
" ...... ";
943 std::cout <<
" } " << std::endl;
945 if (
truncate &&
n1 < A.GetShape()[0]) std::cout <<
" ...............\n";
946 }
else if (A.GetShape().size() == 3 ) {
947 size_t n1 = A.GetFirstSize();
948 size_t n2 = A.GetHSize();
949 size_t n3 = A.GetWSize();
953 for (
size_t i = 0; i <
n1; ++i) {
955 for (
size_t j = 0;
j <
n2; ++
j) {
957 for (
size_t k = 0; k <
n3; ++k) {
958 std::cout << A(i,
j,k) <<
" ";
960 if (
truncate &&
n3 < A.GetWSize()) std::cout <<
" ...... ";
961 std::cout <<
" } " << std::endl;
963 if (
truncate &&
n2 < A.GetHSize()) std::cout <<
".................\n";
964 std::cout <<
" } " << std::endl;
966 if (
truncate &&
n1 < A.GetFirstSize()) std::cout <<
"...................\n";
967 }
else if (A.GetShape().size() == 4 ) {
968 for (
size_t i = 0; i < A.GetShape()[0]; ++i) {
970 for (
size_t j = 0;
j < A.GetShape()[1]; ++
j) {
972 for (
size_t k = 0; k < A.GetShape()[2]; ++k) {
973 size_t n = A.GetShape()[3];
975 for (
size_t l = 0;
l <
n; ++
l) {
976 std::cout << A(i,
j,k,
l) <<
" ";
978 if (
truncate &&
n < A.GetShape()[3]) std::cout <<
" ...... ";
979 std::cout <<
" } " << std::endl;
981 std::cout <<
" } " << std::endl;
983 std::cout <<
" } " << std::endl;
987 for (
size_t l = 0;
l < A.GetSize(); ++
l) {
988 std::cout << A.GetData()[
l] <<
" ";
994template <
typename AFloat>
1000 std::cout <<
"Descriptor for 4d tensor of shape { " <<
n <<
" , " <<
c <<
" , " <<
h <<
" , " <<
w <<
" }"
1001 <<
" and strides { " <<
s1 <<
" , " <<
s2 <<
" , " <<
s3 <<
" , " <<
s4 <<
" }" << std::endl;
1003template <
typename AFloat>
1007 std::vector<int>
dims(ndim);
1008 std::vector<int> strides(ndim);
1013 std::cout <<
"Descriptor for Nd tensor of dim = " <<
n <<
" shape { ";
1015 std::cout <<
d <<
" , ";
1016 std::cout <<
"} and strides { ";
1017 for (
auto s : strides)
1018 std::cout << s <<
" , ";
1019 std::cout <<
" }" << std::endl;
1034template <
typename AFloat>
1036template <
typename AFloat>
1038template <
typename AFloat>
1040template <
typename AFloat>
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
void PrintTensor(RTensor< T > &t)
This is the base class for the ROOT Random number generators.
T Sum(const RVec< T > &v, const T zero=T(0))
Sum elements of an RVec.
void Copy(void *source, void *dest)
__global__ void SymmetricRelu(AFloat *A, int m, int n)
__global__ void SigmoidDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Dropout(AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state)
__global__ void SoftmaxCrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void IdentityDerivative(AFloat *A, int m, int n)
__global__ void SqrtElementWise(AFloat *A, int m, int n)
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
__global__ void SoftmaxCrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void AddL1RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
__global__ void MeanSquaredErrorGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void Relu(AFloat *A, int m, int n)
__global__ void ReluDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void AddL2RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
__global__ void AddRowWise(AFloat *W, const AFloat *theta, int m, int n)
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
__global__ void GaussDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Deflatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Deflatten a 2D-array into an array of 2D-arrays.
__global__ void CrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void Softmax(AFloat *B, const AFloat *A, int m, int n)
__global__ void TanhDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void CrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
__global__ void SymmetricReluDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void MeanSquaredError(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void SquareElementWise(AFloat *A, int m, int n)
__global__ void SoftSignDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
__global__ void Downsample(AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
std::shared_ptr< std::function< double(double)> > Tanh
std::shared_ptr< std::function< double(double)> > Gauss
std::shared_ptr< std::function< double(double)> > Sigmoid
std::shared_ptr< std::function< double(double)> > SoftSign
MemoryLayout
Memory layout type (copy from RTensor.hxx)
create variable transformations