#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
#define TMVA_DNN_ARCHITECTURES_CUDNN

#include "RConfigure.h" // for the definition of R__HAS_CUDNN

#ifndef R__HAS_CUDNN
#error This file can be compiled only when cudnn is available in ROOT
#endif

#include "cudnn.h"
#include "TMatrixT.h"

#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// (further TMVA-internal includes elided in this excerpt)

namespace TMVA {
namespace DNN {
struct TCudnnEmptyDescriptor {};
template <typename AFloat = Float_t>
class TCudnn
{
public:
   using Scalar_t       = AFloat;
   using Matrix_t       = TCudaTensor<AFloat>;
   using Tensor_t       = TCudaTensor<AFloat>;
   using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
   using HostBuffer_t   = TCudaHostBuffer<AFloat>;
   // Descriptor types from the cuDNN library
   using ActivationDescriptor_t   = cudnnActivationDescriptor_t;
   using ConvolutionDescriptor_t  = cudnnConvolutionDescriptor_t;
   using DropoutDescriptor_t      = cudnnDropoutDescriptor_t;
   using FilterDescriptor_t       = cudnnFilterDescriptor_t;
   using PoolingDescriptor_t      = cudnnPoolingDescriptor_t;

   using AlgorithmForward_t  = cudnnConvolutionFwdAlgo_t;
   using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
   using AlgorithmHelper_t   = cudnnConvolutionBwdFilterAlgo_t;
   using AlgorithmDataType_t = cudnnDataType_t;
   using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
   using TensorDescriptor_t    = cudnnTensorDescriptor_t;
   using RecurrentDescriptor_t = cudnnRNNDescriptor_t;
#if (CUDNN_VERSION >= 8000)
   using RNNDataDescriptor_t = cudnnRNNDataDescriptor_t;
#else
   using RNNDataDescriptor_t = TCudnnEmptyDescriptor;
#endif

   using EmptyDescriptor_t = TCudnnEmptyDescriptor;
   using BNormLayer_t       = TBatchNormLayer<TCudnn<AFloat>>;
   using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;

   using ConvLayer_t       = CNN::TConvLayer<TCudnn<AFloat>>;
   using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
   using ConvWorkspace_t   = CNN::TCNNWorkspace<ConvLayer_t>;
   using PoolingLayer_t       = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
   using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
   using PoolingWorkspace_t   = CNN::TCNNWorkspace<PoolingLayer_t>;

   using RNNLayer_t       = RNN::TBasicRNNLayer<TCudnn<AFloat>>;
   using RNNDescriptors_t = RNN::TRNNDescriptors<TCudnn<AFloat>>;
   using RNNWorkspace_t   = RNN::TRNNWorkspace<TCudnn<AFloat>>;

   using LSTMLayer_t = RNN::TBasicLSTMLayer<TCudnn<AFloat>>;
   using GRULayer_t  = RNN::TBasicGRULayer<TCudnn<AFloat>>;
   // Global options controlling the cuDNN convolution algorithms; a value of
   // -1 leaves the choice to cuDNN (the defaults are defined at the end of
   // this file)
   struct CNNOptions {
      static int ConvFwdAlgorithm;
      static int ConvBwdDataAlgorithm;
      static int ConvBwdFilterAlgorithm;
      // maximum workspace size in bytes (-1: let cuDNN decide)
      static Long_t ConvMaxWorkspaceSize;
   };
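   // Usage sketch (illustrative, not part of the original header): the
   // algorithm indices map onto the cuDNN enums, so a specific forward
   // algorithm and a workspace cap could be requested globally before the
   // layers are initialized.
   //
   //    TCudnn<float>::CNNOptions::ConvFwdAlgorithm =
   //       CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
   //    TCudnn<float>::CNNOptions::ConvMaxWorkspaceSize = 256 * 1024 * 1024; // 256 MB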
   static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w)
   {
      return Tensor_t({n, c, h, w}, GetTensorLayout(), 0, 0);
   }

   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w)
   {
      return Tensor_t(buffer, {n, c, h, w}, GetTensorLayout(), 0, 0);
   }

   static Tensor_t CreateTensor(size_t n, size_t c, size_t w)
   {
      return Tensor_t({n, c, w}, GetTensorLayout(), 0, 0);
   }

   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t w)
   {
      return Tensor_t(buffer, {n, c, w}, GetTensorLayout(), 0, 0);
   }

   static bool IsCudnn() { return true; }
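   // Usage sketch (illustrative, not from the original header): create a
   // tensor for a batch of 32 single-channel 28x28 images, or wrap an existing
   // device buffer (`buf` is hypothetical) without copying.
   //
   //    auto batch = TCudnn<float>::CreateTensor(32, 1, 28, 28);
   //    auto view  = TCudnn<float>::CreateTensor(buf, 32, 1, 28, 28);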
   static void CreateWeightTensors(std::vector<Matrix_t> &newWeights, const std::vector<Matrix_t> &weights)
   {
      if (!newWeights.empty()) newWeights.clear();
      size_t n = weights.size();
      for (size_t i = 0; i < n; ++i)
         newWeights.emplace_back(weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
   }
   // Initialization of the layer descriptors
   static void InitializeBNormDescriptors(TDescriptors *&descriptors, BNormLayer_t *L = nullptr);

   static void InitializeConvDescriptors(TDescriptors *&descriptors, ConvLayer_t *L = nullptr);

   static void InitializePoolDescriptors(TDescriptors *&descriptors, PoolingLayer_t *L = nullptr);

   static void InitializeRNNDescriptors(TDescriptors *&descriptors, RNNLayer_t *layer)
   {
      InitializeRecurrentDescriptors<RNNLayer_t>(descriptors, layer);
   }
   static void InitializeLSTMDescriptors(TDescriptors *&descriptors, LSTMLayer_t *layer)
   {
      InitializeRecurrentDescriptors<LSTMLayer_t>(descriptors, layer);
   }
   static void InitializeGRUDescriptors(TDescriptors *&descriptors, GRULayer_t *layer)
   {
      InitializeRecurrentDescriptors<GRULayer_t>(descriptors, layer);
   }
   template <typename RNNLayer>
   static void InitializeRecurrentDescriptors(TDescriptors *&descriptors, RNNLayer *L);

   static void InitializeActivationDescriptor(ActivationDescriptor_t &descriptors, EActivationFunction activFunc,
                                              double coef = 0.0);
   static void ReleaseConvDescriptors(TDescriptors *descriptors);
   static void ReleasePoolDescriptors(TDescriptors *descriptors);
   static void ReleaseRNNDescriptors(TDescriptors *descriptors);
   static void ReleaseBNormDescriptors(TDescriptors *descriptors);
   static void ReleaseDescriptor(EmptyDescriptor_t &emptyDescr) {}
   static void ReleaseDescriptor(ActivationDescriptor_t &activationDescr);
   static void ReleaseDescriptor(ConvolutionDescriptor_t &convolutionDescr);
   static void ReleaseDescriptor(DropoutDescriptor_t &dropoutDescr);
   static void ReleaseDescriptor(FilterDescriptor_t &filterDescr);
   static void ReleaseDescriptor(PoolingDescriptor_t &poolingDescr);
   static void ReleaseDescriptor(TensorDescriptor_t &tensorDescr);
   static void InitializeConvWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors,
                                       const DNN::CNN::TConvParams &params, ConvLayer_t *L = nullptr);
   static void InitializePoolDropoutWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors,
                                              const DNN::CNN::TConvParams &params, PoolingLayer_t *L = nullptr);

   static void InitializeRNNWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer_t *layer)
   {
      InitializeRecurrentWorkspace<RNNLayer_t>(workspace, descriptors, layer);
   }
   static void InitializeLSTMWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, LSTMLayer_t *layer)
   {
      InitializeRecurrentWorkspace<LSTMLayer_t>(workspace, descriptors, layer);
   }
   static void InitializeGRUWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, GRULayer_t *layer)
   {
      InitializeRecurrentWorkspace<GRULayer_t>(workspace, descriptors, layer);
   }
   template <typename RNNLayer>
   static void InitializeRecurrentWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer *layer);

   static void FreeConvWorkspace(TWorkspace *workspace);
   static void FreePoolDropoutWorkspace(TWorkspace *workspace);
   static void FreeRNNWorkspace(TWorkspace *workspace);

   static void InitializeRNNTensors(RNNLayer_t *layer) { InitializeRecurrentTensors<RNNLayer_t>(layer); }
   static void InitializeLSTMTensors(LSTMLayer_t *layer) { InitializeRecurrentTensors<LSTMLayer_t>(layer); }
   static void InitializeGRUTensors(GRULayer_t *layer) { InitializeRecurrentTensors<GRULayer_t>(layer); }
   template <typename RNNLayer>
   static void InitializeRecurrentTensors(RNNLayer *layer);
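   // Lifecycle sketch (illustrative, not from the original header): descriptors
   // and workspaces are created per layer and must be freed by the caller;
   // `convLayer` and `params` are hypothetical objects of the matching types.
   //
   //    TDescriptors *descr = nullptr;
   //    TWorkspace   *ws    = nullptr;
   //    TCudnn<float>::InitializeConvDescriptors(descr, convLayer);
   //    TCudnn<float>::InitializeConvWorkspace(ws, descr, params, convLayer);
   //    // ... forward / backward passes ...
   //    TCudnn<float>::FreeConvWorkspace(ws);
   //    TCudnn<float>::ReleaseConvDescriptors(descr);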
   // Multiply the input by the transpose of the weight matrix and write the
   // result into output
   static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);
   // Complete backward propagation step for a fully connected layer
   static void Backward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients,
                        Tensor_t &df, const Tensor_t &activationGradients, const Matrix_t &weights,
                        const Tensor_t &activationBackward);
   // Element-wise scaled addition: add the elements of B, scaled by alpha, to
   // the elements of A, scaled by beta
   static void ScaleAdd(Tensor_t &A, const Tensor_t &B, Scalar_t alpha = 1.0, Scalar_t beta = 1.0);

   static void Copy(Tensor_t &A, const Tensor_t &B);
   // copy from a tensor of a different architecture
   template <typename ATensor_t>
   static void CopyDiffArch(Tensor_t &A, const ATensor_t &B);

   template <typename ATensor_t>
   static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);

   // same architecture: reduces to a plain copy
   static void CopyDiffArch(Tensor_t A, const Tensor_t &B) { Copy(A, B); }

   template <typename AMatrix_t>
   static void CopyDiffArch(std::vector<Tensor_t> &A, const std::vector<AMatrix_t> &B);
   // (declaration head truncated in the original source)
   //    ... Tensor_t &Y, Tensor_t &dY, ActivationDescriptor_t activationDescr,
   //        const AFloat alpha = 1, const AFloat beta = 1) {}
   // Apply the activation function activFunct in place on the tensor X
   static void ActivationFunctionForward(Tensor_t &X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr, const double coef = 0.0,
                                         const AFloat alpha = 1, const AFloat beta = 0);

   // Same, but writing the result into the output tensor Y
   static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t &X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr, const double coef = 0.0,
                                         const AFloat alpha = 1, const AFloat beta = 0);

   // Compute the gradient of the activation function
   static void ActivationFunctionBackward(Tensor_t &dX, const Tensor_t &Y, const Tensor_t &dY, const Tensor_t &X,
                                          EActivationFunction activFunct,
                                          const ActivationDescriptor_t activationDescr, const AFloat alpha = 1,
                                          const AFloat beta = 0);
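   // Usage sketch (illustrative, not from the original header): an in-place
   // ReLU forward pass on a hypothetical tensor X.
   //
   //    TCudnn<float>::ActivationDescriptor_t reluDescr;
   //    TCudnn<float>::InitializeActivationDescriptor(reluDescr, EActivationFunction::kRelu);
   //    TCudnn<float>::ActivationFunctionForward(X, EActivationFunction::kRelu, reluDescr);
   //    TCudnn<float>::ReleaseDescriptor(reluDescr);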
   // Empty stubs: with cuDNN the activations and their derivatives are applied
   // through ActivationFunctionForward/Backward rather than these kernels
   static void Relu(Tensor_t &) {}
   static void Sigmoid(Tensor_t &) {}
   static void Tanh(Tensor_t &) {}
   static void FastTanh(Tensor_t &) {}
   static void Gauss(Tensor_t &) {}
   static void FastTanhDerivative(Tensor_t &, const Tensor_t &) {}
   // Loss functions. Each returns the loss value; the companion *Gradients
   // function computes the gradients with respect to the output activations
   static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights);
   static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output,
                                         const Matrix_t &weights);

   // The sigmoid transformation is implicitly applied, so output should hold
   // the linear activations of the last layer in the network
   static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights);
   static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output,
                                     const Matrix_t &weights);

   // The softmax transformation is implicitly applied, so output should hold
   // the linear activations of the last layer in the network
   static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights);
   static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output,
                                            const Matrix_t &weights);
   // Output functions transforming the activations of the last layer into a
   // valid prediction YHat
   static void Sigmoid(Matrix_t &YHat, const Matrix_t &);
   static void Softmax(Matrix_t &YHat, const Matrix_t &);
   // Apply dropout with activation probability p to the tensor A, scaling the
   // surviving elements accordingly
   static void DropoutForward(Tensor_t &A, TDescriptors *descriptors, TWorkspace *workspace, Scalar_t p);

   static void DropoutBackward(Tensor_t &A, TDescriptors *descriptors, TWorkspace *workspace);
   // Batch normalization, forward pass (training): the batch inputs are
   // normalized to zero mean and unit variance and then scaled by gamma and
   // shifted by beta; the running mean and variance are updated with the given
   // momentum
   static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
                                             Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
                                             Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
                                             Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
   // Batch normalization, forward pass (inference): the inputs are normalized
   // using the previously accumulated running mean and variance
   static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
                                              Tensor_t &y, const Matrix_t &runningMeans, const Matrix_t &runningVars,
                                              Scalar_t epsilon, const TensorDescriptor_t &);
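   // For reference (standard batch normalization, not stated in the original
   // header): the inference transformation applied per input feature is
   //
   //    y = gamma * (x - runningMean) / sqrt(runningVar + epsilon) + beta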
   // Batch normalization, backward pass
   static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx, Matrix_t &gamma,
                                      Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean,
                                      const Matrix_t &variance, const Matrix_t &iVariance, Scalar_t epsilon,
                                      const TensorDescriptor_t &);
   // Regularization functions, delegated to the generic TCuda implementation
   // by viewing each tensor buffer as a matrix
   static Scalar_t L1Regularization(const Matrix_t &W)
   {
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      return TCuda<AFloat>::L1Regularization(mW);
   }
   static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
   {
      TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
   }
   static Scalar_t L2Regularization(const Matrix_t &W)
   {
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      return TCuda<AFloat>::L2Regularization(mW);
   }
   static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
   {
      TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      TCuda<AFloat>::AddL2RegularizationGradients(mA, mW, weightDecay);
   }
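   // Usage sketch (illustrative, not from the original header): add the L2
   // penalty of a weight matrix to the loss and its gradient to the weight
   // gradients; `W`, `dW`, `loss` and `lambda` are hypothetical.
   //
   //    loss += lambda * TCudnn<float>::L2Regularization(W);
   //    TCudnn<float>::AddL2RegularizationGradients(dW, W, lambda);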
   // Weight and bias initialization
   static void InitializeGauss(Matrix_t &A);
   static void InitializeUniform(Matrix_t &A);
   static void InitializeIdentity(Matrix_t &A);
   static void InitializeZero(Matrix_t &A);
   static void InitializeGlorotNormal(Matrix_t &A);
   static void InitializeGlorotUniform(Matrix_t &A);

   // Return the static random generator used for initialization, creating it
   // if it does not yet exist
   static TRandom &GetRandomGenerator();
   // Set the seed of the static random generator
   static void SetRandomSeed(size_t seed);
   // Empty stub: with cuDNN dropout is applied via DropoutForward/DropoutBackward
   static void Dropout(Tensor_t &A, Scalar_t p) {}

   // Add the biases to the output of a convolutional layer
   static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);

   static void PrepareInternals(Tensor_t &) {}
   // Forward propagation in a convolutional layer
   static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc, const Tensor_t &input,
                                const Matrix_t &weights, const Matrix_t &biases, const DNN::CNN::TConvParams &params,
                                EActivationFunction activFunc, Tensor_t &, const ConvDescriptors_t &descriptors,
                                ConvWorkspace_t &workspace);
   // Complete backward propagation step in a convolutional layer; the trailing
   // geometry arguments are unused, since cuDNN takes the dimensions from the
   // descriptors
   static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
                                 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
                                 const Matrix_t &weights, const Tensor_t &activationBackward,
                                 const Tensor_t &outputTensor, EActivationFunction activFunc,
                                 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace,
                                 size_t /*batchSize*/, size_t /*inputHeight*/, size_t /*inputWidth*/, size_t /*depth*/,
                                 size_t /*height*/, size_t /*width*/, size_t /*filterDepth*/, size_t /*filterHeight*/,
                                 size_t /*filterWidth*/, size_t /*nLocalViews*/);
   // Downsampling of the tensor C into A, used as the forward propagation step
   // of a max-pooling layer
   static void Downsample(Tensor_t &A, Tensor_t &, const Tensor_t &C, const PoolingDescriptors_t &descriptors,
                          PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
                          size_t fltWidth, size_t strideRows, size_t strideCols);
   // Complete backward propagation step in a max-pooling layer
   static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
                                    const Tensor_t &, const Tensor_t &inputActivation, const Tensor_t &outputTensor,
                                    const PoolingDescriptors_t &descriptors, PoolingWorkspace_t &workspace,
                                    size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
                                    size_t strideRows, size_t strideCols, size_t nLocalViews);
   // Transform the tensor B into a flattened 2D tensor A
   static void Flatten(Tensor_t &A, const Tensor_t &B);

   // Inverse of Flatten: restore the multi-dimensional shape
   static void Deflatten(Tensor_t &A, const Tensor_t &B);

   // Rearrange the tensor elements between batch-major and time-major ordering
   static void Rearrange(Tensor_t &out, const Tensor_t &in);
   static void RNNForward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &weights,
                          Tensor_t &y, Tensor_t &hy, Tensor_t &cy, const RNNDescriptors_t &descr,
                          RNNWorkspace_t &workspace, bool isTraining);
   static void RNNBackward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &y,
                           const Tensor_t &dy, const Tensor_t &dhy, const Tensor_t &dcy, const Tensor_t &weights,
                           Tensor_t &dx, Tensor_t &dhx, Tensor_t &dcx, Tensor_t &dw, const RNNDescriptors_t &desc,
                           RNNWorkspace_t &workspace);
   // Recurrent backward stubs: with cuDNN the recurrent backward step goes
   // through RNNBackward, so these only return the first argument to satisfy
   // the generic low-level interface
   static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, Matrix_t &, Matrix_t &, Matrix_t &,
                                           Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &,
                                           const Matrix_t &, Matrix_t &)
   {
      return state_gradients_backward;
   }
   static Matrix_t &LSTMLayerBackward(
      Matrix_t &state_gradients_backward,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &)
   {
      return state_gradients_backward;
   }
   static Matrix_t &GRULayerBackward(
      Matrix_t &state_gradients_backward,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      Matrix_t &, bool)
   {
      return state_gradients_backward;
   }
   // In-place element-wise (Hadamard) product of A with B
   static void Hadamard(Tensor_t &A, const Tensor_t &B)
   {
      assert(A.GetSize() == B.GetSize());
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
      TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
      TCuda<AFloat>::Hadamard(tmpA, tmpB);
   }
   // Compute the sum of all elements of A
   static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
   // Add the constant beta to all elements of A
   static void ConstAdd(Matrix_t &A, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::ConstAdd(tmp, beta);
   }

   // Multiply all elements of A by the constant beta
   static void ConstMult(Matrix_t &A, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::ConstMult(tmp, beta);
   }

   // Replace each element of A by its reciprocal
   static void ReciprocalElementWise(Matrix_t &A)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::ReciprocalElementWise(tmp);
   }

   // Square each element of A in place
   static void SquareElementWise(Matrix_t &A)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::SquareElementWise(tmp);
   }

   // Replace each element of A by its square root
   static void SqrtElementWise(Matrix_t &A)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::SqrtElementWise(tmp);
   }
   // Optimizer functions
   static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
   {
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(), 1);
      TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(), 1);
      TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV, alpha, eps);
   }
   static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(), 1);
      TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
   }
   static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(), 1);
      TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
   }
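   // For reference (standard Adam update rules, an assumption about the
   // delegated TCuda kernels):
   //
   //    m <- beta1 * m + (1 - beta1) * g         (AdamUpdateFirstMom)
   //    v <- beta2 * v + (1 - beta2) * g^2       (AdamUpdateSecondMom)
   //    w <- w - alpha * m / (sqrt(v) + eps)     (AdamUpdate)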
   // Printing utilities for tensors and cuDNN descriptors
   static void PrintTensor(const Tensor_t &A, const std::string name = "tensor", bool truncate = true);

   static void PrintTensor4dDescriptor(TensorDescriptor_t descriptor);
   static void PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int n = 10);
   // Sum the rows of the matrix A and write the results into the first
   // elements of B
   static void SumRows(Matrix_t &B, const Matrix_t &A);
};
//____________________________________________________________________________
template <typename AFloat>
template <typename ATensor>
void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &B, const ATensor &A)
{
   // copy a tensor coming from a different architecture
   if (B.GetLayout() == GetTensorLayout()) {
      if (B.GetShape().size() == 4) {
         assert(B.GetShape().size() == 4);
         size_t firstSize = (A.GetLayout() == GetTensorLayout()) ? A.GetShape()[0] : A.GetShape().back();
         for (size_t i = 0; i < firstSize; ++i) {
            // convert each sub-tensor to a host matrix and copy it buffer-wise
            TMatrixT<AFloat> matIn = A.At(i).GetMatrix();
            TCudaTensor<AFloat> tmpOut = B.At(i);
            TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
            Copy(tmpOut, tmpIn);
         }
      } else {
         // same layout: copy directly through a temporary host matrix
         TMatrixT<AFloat> tmp = A;
         TCudaMatrix<AFloat> tmp2(tmp);
         TCudaTensor<AFloat> tA(tmp2);
         B = tA;
      }
   } else {
      // different layout: the temporary host matrix performs the transposition
      TMatrixT<AFloat> tmp = A;
      TCudaMatrix<AFloat> tmp2(tmp);
      TCudaTensor<AFloat> tA(tmp2);
      B = tA;
   }
}
//____________________________________________________________________________
template <typename AFloat>
template <typename AMatrix>
void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &B, const AMatrix &A)
{
   // copy weights from a different architecture; not very efficient since it
   // goes through temporary host objects
   TMatrixT<AFloat> tmp = A;
   if (B.GetLayout() == GetTensorLayout()) {
      // transpose to match the device tensor layout
      tmp.T();
   }
   TCudaMatrix<AFloat> tmp2(tmp);
   TCudaTensor<AFloat> tA(tmp2);
   B = tA;
}
//____________________________________________________________________________
template <typename AFloat>
template <typename AMatrix_t>
void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B, const std::vector<AMatrix_t> &A)
{
   for (size_t i = 0; i < B.size(); ++i) {
      CopyWeightsDiffArch(B[i], A[i]);
   }
}
//____________________________________________________________________________
template <typename AFloat>
void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t &A, const std::string name, bool truncate)
{
   std::cout << name << " size = " << A.GetSize() << " shape = { ";
   auto shape = A.GetShape();
   for (size_t k = 0; k < shape.size() - 1; ++k)
      std::cout << shape[k] << " , ";
   std::cout << shape.back() << " } ";
   std::cout << " strides = { ";
   auto strides = A.GetStrides();
   for (size_t k = 0; k < strides.size() - 1; ++k)
      std::cout << strides[k] << " , ";
   std::cout << strides.back() << " }\n ";

   if (A.GetShape().size() == 1) {
      size_t n = A.GetShape()[0];
      if (truncate) n = std::min(n, size_t(10));
      for (size_t j = 0; j < n; ++j) {
         std::cout << A(0, j) << " ";
      }
      if (truncate && n < A.GetShape()[0]) std::cout << " ...... ";
      std::cout << " } " << std::endl;
   } else if (A.GetShape().size() == 2) {
      size_t n1 = A.GetShape()[0];
      size_t n2 = A.GetShape()[1];
      if (truncate) n1 = std::min(n1, size_t(10));
      for (size_t i = 0; i < n1; ++i) {
         std::cout << "{ ";
         if (truncate) n2 = std::min(n2, size_t(10));
         for (size_t j = 0; j < n2; ++j) {
            std::cout << A(i, j) << " ";
         }
         if (truncate && n2 < A.GetShape()[1]) std::cout << " ...... ";
         std::cout << " } " << std::endl;
      }
      if (truncate && n1 < A.GetShape()[0]) std::cout << " ...............\n";
   } else if (A.GetShape().size() == 3) {
      size_t n1 = A.GetFirstSize();
      size_t n2 = A.GetHSize();
      size_t n3 = A.GetWSize();
      if (truncate) n1 = std::min(n1, size_t(10));
      if (truncate) n2 = std::min(n2, size_t(10));
      if (truncate) n3 = std::min(n3, size_t(10));
      for (size_t i = 0; i < n1; ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < n2; ++j) {
            std::cout << "{ ";
            for (size_t k = 0; k < n3; ++k) {
               std::cout << A(i, j, k) << " ";
            }
            if (truncate && n3 < A.GetWSize()) std::cout << " ...... ";
            std::cout << " } " << std::endl;
         }
         if (truncate && n2 < A.GetHSize()) std::cout << ".................\n";
         std::cout << " } " << std::endl;
      }
      if (truncate && n1 < A.GetFirstSize()) std::cout << "...................\n";
   } else if (A.GetShape().size() == 4) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetShape()[1]; ++j) {
            std::cout << "{ ";
            for (size_t k = 0; k < A.GetShape()[2]; ++k) {
               size_t n = A.GetShape()[3];
               if (truncate) n = std::min(n, size_t(10));
               for (size_t l = 0; l < n; ++l) {
                  std::cout << A(i, j, k, l) << " ";
               }
               if (truncate && n < A.GetShape()[3]) std::cout << " ...... ";
               std::cout << " } " << std::endl;
            }
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   } else {
      for (size_t l = 0; l < A.GetSize(); ++l) {
         std::cout << A.GetData()[l] << " ";
      }
      std::cout << std::endl;
   }
}
//____________________________________________________________________________
template <typename AFloat>
void TCudnn<AFloat>::PrintTensor4dDescriptor(TensorDescriptor_t descriptor)
{
   int n, c, h, w = 0;
   int s1, s2, s3, s4 = 0;
   cudnnDataType_t dataType;
   cudnnGetTensor4dDescriptor(descriptor, &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
   std::cout << "Descriptor for 4d tensor of shape { " << n << " , " << c << " , " << h << " , " << w << " }"
             << " and strides { " << s1 << " , " << s2 << " , " << s3 << " , " << s4 << " }" << std::endl;
}
//____________________________________________________________________________
template <typename AFloat>
void TCudnn<AFloat>::PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int ndim)
{
   int n = 0;
   std::vector<int> dims(ndim);
   std::vector<int> strides(ndim);
   cudnnDataType_t dataType;
   cudnnGetTensorNdDescriptor(descriptor, ndim, &dataType, &n, dims.data(), strides.data());
   std::cout << "Descriptor for Nd tensor of dim = " << n << " shape { ";
   for (auto d : dims)
      std::cout << d << " , ";
   std::cout << "} and strides { ";
   for (auto s : strides)
      std::cout << s << " , ";
   std::cout << " }" << std::endl;
}
// Instantiation of the CNNOptions static members: -1 leaves the algorithm
// selection and workspace size to cuDNN
template <typename AFloat>
int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
template <typename AFloat>
int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
template <typename AFloat>
int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
template <typename AFloat>
Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1;

} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_ARCHITECTURES_CUDNN