#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
#define TMVA_DNN_ARCHITECTURES_CUDNN

#include "RConfigure.h" // for the definition of R__HAS_CUDNN

#ifndef R__HAS_CUDNN
#error This file can be compiled only when cudnn is available in ROOT
#else

// interfaces and layer types used by this backend
#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/CNN/ContextHandles.h"
#include "TMVA/DNN/BatchNormLayer.h"
#include "TMVA/DNN/CNN/ConvLayer.h"
#include "TMVA/DNN/CNN/MaxPoolLayer.h"
#include "TMVA/DNN/RNN/RNNLayer.h"
#include "TMVA/DNN/RNN/LSTMLayer.h"
#include "TMVA/DNN/RNN/GRULayer.h"

#include "cudnn.h"
#include "Cuda/CudaBuffers.h"
#include "Cuda/CudaTensor.h"

// several element-wise operations are delegated to the plain CUDA backend
#include "TMVA/DNN/Architectures/Cuda.h"

#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

class TRandom;
namespace TMVA {
namespace DNN {

struct TCudnnEmptyDescriptor {};
/** The TCudnn architecture class.
 *
 * Low-level interface class for CUDA computing architectures using the cuDNN
 * library. It contains as public types the declaration of the scalar, matrix
 * and tensor types used by the architecture, together with the wrapped cuDNN
 * descriptor, workspace and algorithm types.
 */
template <typename AFloat = Float_t>
class TCudnn
{
private:
   static TRandom *fgRandomGen; // random generator used by the initialization methods

public:
   using Scalar_t       = AFloat;
   using Matrix_t       = TCudaTensor<AFloat>;
   using Tensor_t       = TCudaTensor<AFloat>;
   using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
   using HostBuffer_t   = TCudaHostBuffer<AFloat>;

   // wrapped cuDNN descriptor and algorithm types
   using ActivationDescriptor_t   = cudnnActivationDescriptor_t;
   using ConvolutionDescriptor_t  = cudnnConvolutionDescriptor_t;
   using DropoutDescriptor_t      = cudnnDropoutDescriptor_t;
   using FilterDescriptor_t       = cudnnFilterDescriptor_t;
   using PoolingDescriptor_t      = cudnnPoolingDescriptor_t;
   using AlgorithmForward_t       = cudnnConvolutionFwdAlgo_t;
   using AlgorithmBackward_t      = cudnnConvolutionBwdDataAlgo_t;
   using AlgorithmHelper_t        = cudnnConvolutionBwdFilterAlgo_t;
   using AlgorithmDataType_t      = cudnnDataType_t;
   using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
   using TensorDescriptor_t       = cudnnTensorDescriptor_t;
   using RecurrentDescriptor_t    = cudnnRNNDescriptor_t;

   using EmptyDescriptor_t = TCudnnEmptyDescriptor; // used when a class does not need a descriptor

   using BNormLayer_t       = TBatchNormLayer<TCudnn<AFloat>>;
   using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;

   using ConvLayer_t       = CNN::TConvLayer<TCudnn<AFloat>>;
   using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
   using ConvWorkspace_t   = CNN::TCNNWorkspace<ConvLayer_t>;

   using PoolingLayer_t       = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
   using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
   using PoolingWorkspace_t   = CNN::TCNNWorkspace<PoolingLayer_t>;

   using RNNLayer_t       = RNN::TBasicRNNLayer<TCudnn<AFloat>>;
   using RNNDescriptors_t = RNN::TRNNDescriptors<TCudnn<AFloat>>;
   using RNNWorkspace_t   = RNN::TRNNWorkspace<TCudnn<AFloat>>;

   using LSTMLayer_t = RNN::TBasicLSTMLayer<TCudnn<AFloat>>;
   using GRULayer_t  = RNN::TBasicGRULayer<TCudnn<AFloat>>;
   // Options for the cuDNN convolution implementation. The default value -1
   // leaves the choice of algorithm and workspace size to cuDNN.
   struct CNNOptions {
      static int ConvFwdAlgorithm;
      static int ConvBwdDataAlgorithm;
      static int ConvBwdFilterAlgorithm;
      static Long_t ConvMaxWorkspaceSize;
   };
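   // Illustrative sketch (not part of this header): the CNNOptions fields are
   // static, so a user can override the defaults before building a network.
   // The integer values are assumed to map onto the corresponding cuDNN
   // algorithm enumerations.
   //
   //    TMVA::DNN::TCudnn<float>::CNNOptions::ConvFwdAlgorithm = 1;     // pin a forward algorithm
   //    TMVA::DNN::TCudnn<float>::CNNOptions::ConvMaxWorkspaceSize = 0; // no extra workspace memory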
   static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::RowMajor; }

   static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w)
   {
      return Tensor_t({n, c, h, w}, GetTensorLayout(), 0, 0);
   }

   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w)
   {
      return Tensor_t(buffer, {n, c, h, w}, GetTensorLayout(), 0, 0);
   }

   static Tensor_t CreateTensor(size_t n, size_t c, size_t w)
   {
      return Tensor_t({n, c, w}, GetTensorLayout(), 0, 0);
   }

   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t w)
   {
      return Tensor_t(buffer, {n, c, w}, GetTensorLayout(), 0, 0);
   }
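   // Illustrative sketch: CreateTensor builds a row-major (NCHW) TCudaTensor.
   // For example, for a batch of 32 RGB images of size 28x28 (the sizes are
   // assumptions of the example) one would write
   //
   //    auto t = TMVA::DNN::TCudnn<float>::CreateTensor(32, 3, 28, 28);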
   static bool IsCudnn() { return true; }
   // used by the layers to allocate memory for the weights
   static void CreateWeightTensors(std::vector<Matrix_t> &newWeights, const std::vector<Matrix_t> &weights)
   {
      if (!newWeights.empty()) newWeights.clear();
      size_t n = weights.size();
      for (size_t i = 0; i < n; ++i)
         newWeights.emplace_back(weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
   }
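   // Illustrative sketch: this is typically used when mirroring a network on
   // the GPU ("layer" is a hypothetical layer object of this example, with a
   // GetWeights() accessor returning std::vector<Matrix_t>):
   //
   //    std::vector<TMVA::DNN::TCudnn<float>::Matrix_t> devWeights;
   //    TMVA::DNN::TCudnn<float>::CreateWeightTensors(devWeights, layer.GetWeights());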
   static void InitializeBNormDescriptors(TDescriptors *&descriptors, BNormLayer_t *L = nullptr);

   static void InitializeConvDescriptors(TDescriptors *&descriptors, ConvLayer_t *L = nullptr);

   static void InitializePoolDescriptors(TDescriptors *&descriptors, PoolingLayer_t *L = nullptr);

   static void InitializeRNNDescriptors(TDescriptors *&descriptors, RNNLayer_t *layer)
   {
      InitializeRecurrentDescriptors<RNNLayer_t>(descriptors, layer);
   }
   static void InitializeLSTMDescriptors(TDescriptors *&descriptors, LSTMLayer_t *layer)
   {
      InitializeRecurrentDescriptors<LSTMLayer_t>(descriptors, layer);
   }
   static void InitializeGRUDescriptors(TDescriptors *&descriptors, GRULayer_t *layer)
   {
      InitializeRecurrentDescriptors<GRULayer_t>(descriptors, layer);
   }

   template <typename RNNLayer>
   static void InitializeRecurrentDescriptors(TDescriptors *&descriptors, RNNLayer *L);

   static void InitializeActivationDescriptor(ActivationDescriptor_t &descriptors, EActivationFunction activFunc,
                                              double coef = 0.0);
   static void ReleaseConvDescriptors(TDescriptors *descriptors);
   static void ReleasePoolDescriptors(TDescriptors *descriptors);
   static void ReleaseRNNDescriptors(TDescriptors *descriptors);
   static void ReleaseBNormDescriptors(TDescriptors *descriptors);
   static void ReleaseDescriptor(EmptyDescriptor_t &emptyDescr) {} // does nothing
   static void ReleaseDescriptor(ActivationDescriptor_t &activationDescr);
   static void ReleaseDescriptor(ConvolutionDescriptor_t &convolutionDescr);
   static void ReleaseDescriptor(DropoutDescriptor_t &dropoutDescr);
   static void ReleaseDescriptor(FilterDescriptor_t &filterDescr);
   static void ReleaseDescriptor(PoolingDescriptor_t &poolingDescr);
   static void ReleaseDescriptor(TensorDescriptor_t &tensorDescr);
   static void InitializeConvWorkspace(TWorkspace *&workspace,
                                       TDescriptors *&descriptors,
                                       const DNN::CNN::TConvParams &params,
                                       ConvLayer_t *L = nullptr);
   static void InitializePoolDropoutWorkspace(TWorkspace *&workspace,
                                              TDescriptors *&descriptors,
                                              const DNN::CNN::TConvParams &params,
                                              PoolingLayer_t *L = nullptr);

   static void InitializeRNNWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer_t *layer)
   {
      InitializeRecurrentWorkspace<RNNLayer_t>(workspace, descriptors, layer);
   }
   static void InitializeLSTMWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, LSTMLayer_t *layer)
   {
      InitializeRecurrentWorkspace<LSTMLayer_t>(workspace, descriptors, layer);
   }
   static void InitializeGRUWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, GRULayer_t *layer)
   {
      InitializeRecurrentWorkspace<GRULayer_t>(workspace, descriptors, layer);
   }

   template <typename RNNLayer>
   static void InitializeRecurrentWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer *layer);

   static void FreeConvWorkspace(TWorkspace *workspace);
   static void FreePoolDropoutWorkspace(TWorkspace *workspace);
   static void FreeRNNWorkspace(TWorkspace *workspace);
   static void InitializeRNNTensors(RNNLayer_t *layer) { InitializeRecurrentTensors<RNNLayer_t>(layer); }
   static void InitializeLSTMTensors(LSTMLayer_t *layer) { InitializeRecurrentTensors<LSTMLayer_t>(layer); }
   static void InitializeGRUTensors(GRULayer_t *layer) { InitializeRecurrentTensors<GRULayer_t>(layer); }

   template <typename RNNLayer>
   static void InitializeRecurrentTensors(RNNLayer *layer);
   /** Matrix-multiply \p input with the transpose of \p weights and
    *  write the result into \p output. */
   static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);

   /** Add the vector \p biases row-wise to the matrix \p output. */
   static void AddRowWise(Tensor_t &output, const Matrix_t &biases);

   /** Perform the complete backward propagation step of a dense layer. */
   static void Backward(Tensor_t &activationGradientsBackward,
                        Matrix_t &weightGradients,
                        Matrix_t &biasGradients,
                        Tensor_t &df,
                        const Tensor_t &activationGradients,
                        const Matrix_t &weights,
                        const Tensor_t &activationBackward);
   /** Compute A = beta * A + alpha * B element-wise. */
   static void ScaleAdd(Tensor_t &A, const Tensor_t &B,
                        Scalar_t alpha = 1.0,
                        Scalar_t beta = 1.0);

   /** Deep copy from \p B into \p A. */
   static void Copy(Tensor_t &A, const Tensor_t &B);

   // copy from a tensor of a different architecture
   template <typename ATensor_t>
   static void CopyDiffArch(Tensor_t &A, const ATensor_t &B);

   // copy weights from a tensor of a different architecture
   template <typename ATensor_t>
   static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);

   // same-architecture overload: a plain copy
   static void CopyDiffArch(Tensor_t A, const Tensor_t &B) { Copy(A, B); }

   // copy from a vector of matrices of a different architecture
   template <typename AMatrix_t>
   static void CopyDiffArch(std::vector<Tensor_t> &A, const std::vector<AMatrix_t> &B);
   static void Identity(Tensor_t &X) {}
   static void IdentityDerivative(Tensor_t &dX, Tensor_t &X,
                                  Tensor_t &Y, Tensor_t &dY,
                                  ActivationDescriptor_t activationDescr,
                                  const AFloat alpha = 1,
                                  const AFloat beta = 1) {}

   /** Apply the activation function in place on \p X. */
   static void ActivationFunctionForward(Tensor_t &X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr,
                                         const double coef = 0.0, const AFloat alpha = 1,
                                         const AFloat beta = 0);

   /** Out-of-place version: compute Y = f(X). */
   static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t &X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr,
                                         const double coef = 0.0, const AFloat alpha = 1,
                                         const AFloat beta = 0);

   /** Compute the activation gradient \p dX from \p dY, \p Y and \p X. */
   static void ActivationFunctionBackward(Tensor_t &dX, const Tensor_t &Y,
                                          const Tensor_t &dY, const Tensor_t &X,
                                          EActivationFunction activFunct,
                                          const ActivationDescriptor_t activationDescr,
                                          const AFloat alpha = 1,
                                          const AFloat beta = 0);
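   // Illustrative sketch of the expected call sequence (X, Y, dX, dY are
   // caller-owned tensors; the names are assumptions of the example):
   //
   //    ActivationDescriptor_t actDescr;
   //    InitializeActivationDescriptor(actDescr, EActivationFunction::kRelu);
   //    ActivationFunctionForward(Y, X, EActivationFunction::kRelu, actDescr);        // Y = relu(X)
   //    ActivationFunctionBackward(dX, Y, dY, X, EActivationFunction::kRelu, actDescr);
   //    ReleaseDescriptor(actDescr);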
   // The per-function activations and derivatives below are not needed by the
   // cuDNN backend: everything is handled by ActivationFunctionForward and
   // ActivationFunctionBackward. The empty implementations only satisfy the
   // generic architecture interface.
   static void Relu(Tensor_t &) {}
   static void Sigmoid(Tensor_t &) {}
   static void Tanh(Tensor_t &) {}
   static void FastTanh(Tensor_t &) {}
   static void SymmetricRelu(Tensor_t &) {}
   static void SoftSign(Tensor_t &) {}
   static void Gauss(Tensor_t &) {}

   static void IdentityDerivative(Tensor_t &, const Tensor_t &) {}
   static void ReluDerivative(Tensor_t &, const Tensor_t &) {}
   static void SigmoidDerivative(Tensor_t &, const Tensor_t &) {}
   static void TanhDerivative(Tensor_t &, const Tensor_t &) {}
   static void FastTanhDerivative(Tensor_t &, const Tensor_t &) {}
   static void SymmetricReluDerivative(Tensor_t &, const Tensor_t &) {}
   static void SoftSignDerivative(Tensor_t &, const Tensor_t &) {}
   static void GaussDerivative(Tensor_t &, const Tensor_t &) {}
   /** Squared error between the targets \p Y and the predictions \p output,
    *  weighted per event by \p weights. */
   static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
                                    const Matrix_t &weights);
   static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
                                         const Matrix_t &output, const Matrix_t &weights);

   /** Sigmoid transformation is implicitly applied, thus \p output should
    *  hold the linear activations of the last layer. */
   static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                const Matrix_t &weights);
   static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                     const Matrix_t &output, const Matrix_t &weights);

   /** Softmax transformation is implicitly applied, thus \p output should
    *  hold the linear activations of the last layer. */
   static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                       const Matrix_t &weights);
   static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                            const Matrix_t &output, const Matrix_t &weights);
   // output functions, applied to the activations of the last layer
   static void Sigmoid(Matrix_t &YHat, const Matrix_t &);
   static void Softmax(Matrix_t &YHat, const Matrix_t &);
   static void DropoutForward(Tensor_t &A,
                              TDescriptors *descriptors,
                              TWorkspace *workspace,
                              Scalar_t p);

   static void DropoutBackward(Tensor_t &A,
                               TDescriptors *descriptors,
                               TWorkspace *workspace);
   /** During training, the inputs of each batch are normalized to zero mean
    *  and unit variance, then scaled and shifted by the learned parameters
    *  \p gamma and \p beta. */
   static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
                                             Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
                                             Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
                                             Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);

   /** During inference the inputs are normalized using the running means
    *  accumulated during training, not the batch statistics. */
   static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
                                              Tensor_t &y, const Matrix_t &runningMeans,
                                              const Matrix_t &runningVars, Scalar_t epsilon,
                                              const TensorDescriptor_t &);

   static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
                                      Matrix_t &dgamma, Matrix_t &dbeta,
                                      const Matrix_t &mean, const Matrix_t &variance,
                                      const Matrix_t &iVariance, Scalar_t epsilon,
                                      const TensorDescriptor_t &);
   /** Compute the L1 regularization term for the weight matrix \p W. */
   static Scalar_t L1Regularization(const Matrix_t &W)
   {
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      return TCuda<AFloat>::L1Regularization(mW);
   }
   static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
   {
      TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
   }

   /** Compute the L2 regularization term for the weight matrix \p W. */
   static Scalar_t L2Regularization(const Matrix_t &W)
   {
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      return TCuda<AFloat>::L2Regularization(mW);
   }
   static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
   {
      TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
      // delegate to the L2 (not L1) implementation of the plain CUDA backend
      TCuda<AFloat>::AddL2RegularizationGradients(mA, mW, weightDecay);
   }
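   // The pattern above recurs throughout this class: a TCudaTensor is viewed
   // as a TCudaMatrix over the same device buffer (no copy is made), so the
   // element-wise kernels of the plain CUDA backend can be reused. A minimal
   // sketch of the idea, where t is an assumed Tensor_t of this example:
   //
   //    TCudaMatrix<AFloat> view(t.GetDeviceBuffer(), t.GetSize(), 1); // shares device memory
   //    TCuda<AFloat>::SquareElementWise(view);                        // modifies t in place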
   static void InitializeGauss(Matrix_t &A);
   static void InitializeUniform(Matrix_t &A);
   static void InitializeIdentity(Matrix_t &A);
   static void InitializeZero(Matrix_t &A);
   static void InitializeGlorotNormal(Matrix_t &A);
   static void InitializeGlorotUniform(Matrix_t &A);

   // return the static random generator used for weight initialization;
   // if it does not exist yet, it is created
   static TRandom &GetRandomGenerator();
   // set the seed of the static random generator
   static void SetRandomSeed(size_t seed);
   /** Apply dropout with activation probability \p p to the tensor \p A.
    *  A no-op here: the cuDNN backend does the work in DropoutForward. */
   static void Dropout(Tensor_t &A, Scalar_t p) {}

   /** Add the biases in the convolutional layer. */
   static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);

   /** Dummy placeholder: preparation is only required by the reference
    *  architecture. */
   static void PrepareInternals(Tensor_t &) {}
   /** Forward propagation in the convolutional layer. */
   static void ConvLayerForward(Tensor_t &output,
                                Tensor_t &inputActivationFunc, // output of the conv layer before the activation
                                const Tensor_t &input,
                                const Matrix_t &weights, const Matrix_t &biases,
                                const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
                                Tensor_t &, const ConvDescriptors_t &descriptors,
                                ConvWorkspace_t &workspace);

   /** Perform the complete backward propagation step in a convolutional
    *  layer. The unnamed size_t parameters of the generic interface (batch
    *  size, input and filter geometry) are unused here: cuDNN obtains this
    *  information from the descriptors. */
   static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
                                 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
                                 const Matrix_t &weights, const Tensor_t &activationBackward,
                                 const Tensor_t &outputTensor, EActivationFunction activFunc,
                                 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace,
                                 size_t, size_t, size_t, size_t, size_t,
                                 size_t, size_t, size_t, size_t, size_t);
   /** Downsample the tensor \p C using max-pooling and write the result
    *  into tensor \p A. */
   static void Downsample(Tensor_t &A, Tensor_t &, const Tensor_t &C,
                          const PoolingDescriptors_t &descriptors,
                          PoolingWorkspace_t &workspace,
                          size_t imgHeight, size_t imgWidth,
                          size_t fltHeight, size_t fltWidth,
                          size_t strideRows, size_t strideCols);

   /** Perform the complete backward propagation step in a max-pooling layer. */
   static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
                                    const Tensor_t &activationGradients,
                                    const Tensor_t &, const Tensor_t &inputActivation,
                                    const Tensor_t &outputTensor,
                                    const PoolingDescriptors_t &descriptors,
                                    PoolingWorkspace_t &workspace,
                                    size_t imgHeight, size_t imgWidth,
                                    size_t fltHeight, size_t fltWidth,
                                    size_t strideRows, size_t strideCols,
                                    size_t nLocalViews);
   /** Flatten the tensor \p B into the matrix \p A. */
   static void Flatten(Tensor_t &A, const Tensor_t &B);

   /** Transform each row of \p B into a matrix and store it in the tensor \p A. */
   static void Deflatten(Tensor_t &A, const Tensor_t &B);

   /** Rearrange data: fill the B x T x D output tensor from a T x B x D input. */
   static void Rearrange(Tensor_t &out, const Tensor_t &in);
   static void RNNForward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &weights,
                          Tensor_t &y, Tensor_t &hy, Tensor_t &cy,
                          const RNNDescriptors_t &descr, RNNWorkspace_t &workspace, bool isTraining);

   static void RNNBackward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &y,
                           const Tensor_t &dy, const Tensor_t &dhy, const Tensor_t &dcy, const Tensor_t &weights,
                           Tensor_t &dx, Tensor_t &dhx, Tensor_t &dcx, Tensor_t &dw,
                           const RNNDescriptors_t &desc, RNNWorkspace_t &workspace);
   // The recurrent backward passes are implemented directly with cuDNN in
   // RNNBackward; the generic-interface overloads below only need to return
   // the state gradients unchanged.
   static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
                                           Matrix_t &, Matrix_t &, // input and state weight gradients
                                           Matrix_t &,             // bias gradients
                                           Matrix_t &,             // df, DxH
                                           const Matrix_t &,       // state, BxH
                                           const Matrix_t &,       // input weights, HxD
                                           const Matrix_t &,       // state weights, HxH
                                           const Matrix_t &,       // input, BxD
                                           Matrix_t &)             // input gradient
   {
      return state_gradients_backward;
   }
   static Matrix_t &LSTMLayerBackward(
      Matrix_t &state_gradients_backward, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &)
   {
      return state_gradients_backward;
   }
   static Matrix_t &GRULayerBackward(
      Matrix_t &state_gradients_backward, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &,
      Matrix_t &, Matrix_t &, Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, const Matrix_t &,
      const Matrix_t &, Matrix_t &,
      bool)
   {
      return state_gradients_backward;
   }
   /** In-place element-wise (Hadamard) product of \p A and \p B, with the
    *  result written into \p A. Both tensors are viewed as flat matrices
    *  over their device buffers. */
   static void Hadamard(Tensor_t &A, const Tensor_t &B)
   {
      assert(A.GetSize() == B.GetSize());
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
      TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
      TCuda<AFloat>::Hadamard(tmpA, tmpB);
   }
   /** Compute the sum of all elements in \p A. */
   static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
   /** Add the constant \p beta to all elements of matrix \p A. */
   static void ConstAdd(Matrix_t &A, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::ConstAdd(tmp, beta);
   }

   /** Multiply all elements of matrix \p A by the constant \p beta. */
   static void ConstMult(Matrix_t &A, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::ConstMult(tmp, beta);
   }

   /** Replace each element of matrix \p A by its reciprocal. */
   static void ReciprocalElementWise(Matrix_t &A)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::ReciprocalElementWise(tmp);
   }

   /** Replace each element of matrix \p A by its square. */
   static void SquareElementWise(Matrix_t &A)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::SquareElementWise(tmp);
   }

   /** Replace each element of matrix \p A by its square root. */
   static void SqrtElementWise(Matrix_t &A)
   {
      TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
      TCuda<AFloat>::SqrtElementWise(tmp);
   }

   // optimizer functions
   static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
   {
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(), 1);
      TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(), 1);
      TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV, alpha, eps);
   }
   static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(), 1);
      TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
   }
   static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
   {
      TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(), 1);
      TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(), 1);
      TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
   }
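   // Illustrative sketch of one Adam step built from the helpers above.
   // The weights w, moments m and v, gradient g, and the hyper-parameters
   // beta1, beta2, stepSize, eps are assumptions of the example:
   //
   //    AdamUpdateFirstMom(m, g, beta1);    // m = beta1*m + (1-beta1)*g
   //    AdamUpdateSecondMom(v, g, beta2);   // v = beta2*v + (1-beta2)*g*g
   //    AdamUpdate(w, m, v, stepSize, eps); // w -= stepSize * m / (sqrt(v) + eps)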
   // printing of tensor
   static void PrintTensor(const Tensor_t &A, const std::string name = "tensor", bool truncate = false);
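   // Illustrative usage sketch:
   //
   //    auto t = TMVA::DNN::TCudnn<float>::CreateTensor(2, 3, 4, 4);
   //    TMVA::DNN::TCudnn<float>::PrintTensor(t, "conv input", /*truncate=*/true);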
   static void PrintTensor4dDescriptor(TensorDescriptor_t descriptor);
   static void PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int n = 10);

   /** Sum rows of the (m x n) matrix \p A and write the results into the
    *  first m elements of \p B. */
   static void SumRows(Matrix_t &B, const Matrix_t &A);

}; // class TCudnn
template <typename AFloat>
template <typename ATensor>
void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &B, const ATensor &A)
{
   // copy a tensor from a different architecture
   if (B.GetLayout() == GetTensorLayout()) {
      if (B.GetShape().size() == 4) {
         assert(B.GetShape().size() == 4);
         size_t firstSize = (A.GetLayout() == GetTensorLayout()) ? A.GetShape()[0] : A.GetShape().back();
         for (size_t i = 0; i < firstSize; ++i) {
            // copy slice by slice via a temporary host matrix
            TMatrixT<AFloat> matIn = A.At(i).GetMatrix();
            TCudaTensor<AFloat> tmpOut = B.At(i); // view of the i-th slice of B
            TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
            Copy(tmpOut, tmpIn);
         }
      } else {
         // tensor of dimension different from 4: copy via a temporary host matrix
         TMatrixT<AFloat> tmp = A;
         TCudaMatrix<AFloat> tmp2(tmp);
         TCudaTensor<AFloat> tA(tmp2);
         Copy(B, tA);
      }
   } else {
      // column-major layout: copy via a temporary host matrix
      TMatrixT<AFloat> tmp = A;
      TCudaMatrix<AFloat> tmp2(tmp);
      TCudaTensor<AFloat> tA(tmp2);
      Copy(B, tA);
   }
}
template <typename AFloat>
template <typename AMatrix>
void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &B, const AMatrix &A)
{
   // copy weights from a different architecture; not very efficient since
   // temporary host objects are created
   TMatrixT<AFloat> tmp = A;
   if (B.GetLayout() == GetTensorLayout()) {
      // row-major (BLAS) layout: transpose the host matrix first
      tmp.T();
   }
   TCudaMatrix<AFloat> tmp2(tmp);
   TCudaTensor<AFloat> tA(tmp2);
   Copy(B, tA);
}
template <typename AFloat>
template <typename AMatrix_t>
void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B, const std::vector<AMatrix_t> &A)
{
   for (size_t i = 0; i < B.size(); ++i) {
      CopyWeightsDiffArch(B[i], A[i]);
   }
}
template <typename AFloat>
void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t &A, const std::string name, bool truncate)
{
   std::cout << name << " size = " << A.GetSize() << " shape = { ";
   auto shape = A.GetShape();
   for (size_t k = 0; k < shape.size() - 1; ++k)
      std::cout << shape[k] << " , ";
   std::cout << shape.back() << " } ";

   std::cout << " strides = { ";
   auto strides = A.GetStrides();
   for (size_t k = 0; k < strides.size() - 1; ++k)
      std::cout << strides[k] << " , ";
   std::cout << strides.back() << " }\n ";

   if (A.GetShape().size() == 2) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         size_t n = A.GetShape()[1];
         if (truncate)
            n = std::min(n, size_t(10));
         for (size_t j = 0; j < n; ++j) {
            std::cout << A(i, j) << " ";
         }
         if (truncate && n < A.GetShape()[1])
            std::cout << " ...... ";
         std::cout << " } " << std::endl;
      }
   } else if (A.GetShape().size() == 3) {
      for (size_t i = 0; i < A.GetFirstSize(); ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetHSize(); ++j) {
            std::cout << "{ ";
            size_t n = A.GetWSize();
            if (truncate)
               n = std::min(n, size_t(10));
            for (size_t k = 0; k < n; ++k) {
               std::cout << A(i, j, k) << " ";
            }
            if (truncate && n < A.GetWSize())
               std::cout << " ...... ";
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   } else if (A.GetShape().size() == 4) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetShape()[1]; ++j) {
            std::cout << "{ ";
            for (size_t k = 0; k < A.GetShape()[2]; ++k) {
               size_t n = A.GetShape()[3];
               if (truncate)
                  n = std::min(n, size_t(10));
               for (size_t l = 0; l < n; ++l) {
                  std::cout << A(i, j, k, l) << " ";
               }
               if (truncate && n < A.GetShape()[3])
                  std::cout << " ...... ";
               std::cout << " } " << std::endl;
            }
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   } else {
      for (size_t l = 0; l < A.GetSize(); ++l) {
         std::cout << A.GetData()[l] << " ";
      }
      std::cout << "\n";
   }
}
template <typename AFloat>
void TCudnn<AFloat>::PrintTensor4dDescriptor(TensorDescriptor_t descriptor)
{
   int n = 0, c = 0, h = 0, w = 0;
   int s1 = 0, s2 = 0, s3 = 0, s4 = 0;
   cudnnDataType_t dataType;
   cudnnGetTensor4dDescriptor(descriptor, &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
   std::cout << "Descriptor for 4d tensor of shape { " << n << " , " << c << " , " << h << " , " << w << " }"
             << " and strides { " << s1 << " , " << s2 << " , " << s3 << " , " << s4 << " }" << std::endl;
}
template <typename AFloat>
void TCudnn<AFloat>::PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int ndim)
{
   int n = 0;
   std::vector<int> dims(ndim);
   std::vector<int> strides(ndim);
   cudnnDataType_t dataType;
   cudnnGetTensorNdDescriptor(descriptor, ndim, &dataType, &n, dims.data(), strides.data());
   // n is the number of dimensions actually stored in the descriptor
   dims.resize(n);
   strides.resize(n);
   std::cout << "Descriptor for Nd tensor of dim = " << n << " shape { ";
   for (auto d : dims)
      std::cout << d << " , ";
   std::cout << "} and strides { ";
   for (auto s : strides)
      std::cout << s << " , ";
   std::cout << " }" << std::endl;
}
// initialize the CNN options: the default value -1 leaves the choice of
// algorithm and workspace size to cuDNN
template <typename AFloat>
int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
template <typename AFloat>
int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
template <typename AFloat>
int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
template <typename AFloat>
Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1;

} // namespace DNN
} // namespace TMVA

#endif // R__HAS_CUDNN
#endif // TMVA_DNN_ARCHITECTURES_CUDNN