Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Kernels.cuh
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12/////////////////////////////////////////////////////////////////////////
13// Implementation of the device kernels for the CUDA implementation of //
14// the low-level interface. //
15/////////////////////////////////////////////////////////////////////////
16
17#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
18#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
19
22#include "cuda.h"
23#include "math.h"
24
25namespace TMVA {
26namespace DNN {
27namespace Cuda {
28
29//____________________________________________________________________________
/// Atomic floating-point addition. Only the two specializations below are
/// defined; the generic template is declared so callers can be generic in
/// AFloat (float or double).
template<typename AFloat>
__device__ AFloat AtomicAdd(AFloat* address, AFloat val);

/// Double-precision atomic add implemented as a compare-and-swap loop on the
/// 64-bit integer representation — the canonical fallback pattern from the
/// CUDA C Programming Guide for devices without native atomicAdd(double*)
/// (native support requires SM60+). Returns the old value at `address`.
template<>
__device__ double AtomicAdd(double* address, double val)
{
   unsigned long long int* address_as_ull = (unsigned long long int*)address;
   unsigned long long int old = *address_as_ull, assumed;
   do {
      assumed = old;
      // Retry until no other thread modified *address between our read and
      // the CAS: the swap only succeeds when *address still equals `assumed`.
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val +
                                           __longlong_as_double(assumed)));
   } while (assumed != old);
   return __longlong_as_double(old);
}

/// Single-precision atomic add: delegate to the hardware atomicAdd.
template<>
__device__ float AtomicAdd(float* address, float val)
{
   return atomicAdd(address, val);
}
52
53//____________________________________________________________________________
/// Sum the columns of the shared-memory tile `sdata` (blockDim.x columns by
/// blockDim.y rows) and atomically accumulate each column total into `result`.
/// Must be reached by every thread of the block — it contains barriers.
/// \param[out] result Per-column accumulators in global memory.
/// \param[in,out] sdata Shared-memory tile; clobbered during the reduction.
/// \param[in] n Total number of valid columns; threads mapped past column n
///              skip the final write.
template<typename AFloat>
__device__ void ReduceSumVertical(AFloat *result,
                                  AFloat * sdata,
                                  int n)
{
   // i,j are block row and column indices.
   int i = threadIdx.y;
   int j = threadIdx.x;
   int index = i * blockDim.x + j;

   // Tree reduction along y: each step folds the upper half of the active
   // rows onto the lower half. The barrier precedes every step so the
   // previous step's writes are visible to the whole block.
   for (int stride = 512; stride > 0; stride /= 2) {
      __syncthreads();
      if ((blockDim.y > stride) && (i < stride)) {
         if ((i + stride) < blockDim.y) {
            sdata[index] += sdata[index + stride * blockDim.x];
         }
      }
   }
   __syncthreads();
   // Row 0 now holds the per-column sums; publish with atomics because
   // several blocks may accumulate into the same result columns.
   if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) {
      AtomicAdd(result + j, sdata[index]);
   }
   __syncthreads();
}
131
132//____________________________________________________________________________
/// Reduce the shared-memory buffer `sdata` (TDevice::BlockSize elements) to a
/// single sum and atomically accumulate it into *result. Must be reached by
/// every thread of the block — it contains barriers.
/// \param[out] result Global accumulator for the block-wide sum.
/// \param[in,out] sdata Shared-memory buffer; clobbered during the reduction.
template<typename AFloat>
__device__ void ReduceSum(AFloat *result, AFloat * sdata)
{
   int tid = threadIdx.x + threadIdx.y * blockDim.x;

   // Tree reduction: halve the number of active threads each step, with a
   // barrier before each step so prior partial sums are visible.
   for (int stride = 512; stride > 0; stride /= 2) {
      __syncthreads();
      if ((TDevice::BlockSize > stride) && (tid < stride)) {
         if ((tid + stride) < TDevice::BlockSize) {
            sdata[tid] += sdata[tid + stride];
         }
      }
   }
   // Thread 0 holds the block total; combine across blocks atomically.
   if (tid == 0) {
      AtomicAdd(result, sdata[0]);
   }

   __syncthreads();
}
205
/// Return the larger of x and y (ties and NaN-in-x fall through to x,
/// matching the `x < y` comparison).
template<typename AFloat>
__device__ AFloat max(AFloat x, AFloat y)
{
   return (x < y) ? y : x;
}
212
213////////////////////////////////////////////////////////////////////////////////////
214/// \brief Calculate the dimension of an output volume, given the sliding parameters
215/// and the input shape.
216/// \param[in] imgDim The size of the input tensor in a spatial dimension.
217/// \param[in] fltDim The size of the sliding filter in the same dimension.
218/// \param[in] padding Number of zeroes to pad the input with.
219/// \param[in] stride Number of pixels the kernel is sliding in each iteration.
220/// \returns The output dimension.
221///
222/// Note that no checks are performed to assert validity of the input parameters.
223/// We are allowed to assume them valid because those checks have already been
224/// performed prior to the invocation of the kernel.
225////////////////////////////////////////////////////////////////////////////////////
__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
{
   // Standard convolution output-size formula. Parameters reaching this
   // point were validated host-side, so no checks are performed here.
   int paddedRange = imgDim - fltDim + 2 * padding;
   return paddedRange / stride + 1;
}
231
232////////////////////////////////////////////////////////////////////////////////////
233/// \brief A kernel that re-arranges image regions of the input matrix \B, into
234/// column vectors in matrix \A.
235///
236/// \param[out] A The output matrix. Each row corresponds to a receptive field.
237/// \param[in] B The input matrix. Each row corresponds to a row in the image view.
238/// \param[in] depth The depth of the input tensor.
239/// \param[in] imgHeight The height of the input tensor.
240/// \param[in] imgWidth The width of the input tensor.
241/// \param[in] fltHeight Height of the filter.
242/// \param[in] fltWidth Width of the filter.
243/// \param[in] strideRows stride size in the horizontal dimension.
244/// \param[in] strideCols stride size in the vertical dimension.
245/// \param[in] zeroPaddingHeight The padding in the horizontal dimension.
246/// \param[in] zeroPaddingWidth The padding in the vertical dimension.
247///
248/// The kernel should be invoked with one thread per output element. Note that
249/// matrices \A and \B have different shapes. Each thread in this kernel is
250/// responsible for filling one cell of the output matrix \A. It does so by computing
251/// the correct element to copy from the input matrix \B. We therefore never need to
252/// block. When reading this kernel it is important to keep in mind that TCudaMatrix
253/// objects are saved in column major order for compatibility with cuBLAS.
254////////////////////////////////////////////////////////////////////////////////////
template<typename AFloat>
__global__ void Im2Col(AFloat * A,
                       const AFloat * B,
                       int depth,
                       int imgHeight,
                       int imgWidth,
                       int fltHeight,
                       int fltWidth,
                       int strideRows,
                       int strideCols,
                       int zeroPaddingHeight,
                       int zeroPaddingWidth)
{
   // The row of the output matrix.
   int i = blockDim.y * blockIdx.y + threadIdx.y;

   // The column of the output matrix.
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   // Number of columns in matrix A: one per filter element and channel.
   int NLocalViewPixels = fltHeight * fltWidth * depth;

   // Number of rows in matrix A: one per receptive-field position.
   int NLocalViews = calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols) *
                     calculateDimension(imgHeight, fltHeight, zeroPaddingHeight, strideRows);

   // One thread per output element; threads past the matrix bounds exit.
   if (i >= NLocalViews || j >= NLocalViewPixels) return;

   // Column-major linear index of element (i, j) in A.
   int index = j * NLocalViews + i;

   // Number of filter positions that fit across one image row.
   int numSlidesPerRow = calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols);

   // Which image channel of B?
   int bz = j / (fltHeight * fltWidth);

   // Which row in matrix B? Receptive-field origin plus the row offset of
   // this filter element, shifted back by the zero padding.
   int by = (i / numSlidesPerRow) * strideRows - zeroPaddingHeight + (j - bz * fltHeight * fltWidth) / fltWidth;

   // Which column in matrix B? Same construction in the horizontal direction.
   int bx = (i % numSlidesPerRow) * strideCols - zeroPaddingWidth + (j - bz * fltHeight * fltWidth) % fltWidth;

   if (bx < 0 || by < 0 || bx >= imgWidth || by >= imgHeight) {
      // Coordinates fall outside the image: this is a padding element.
      A[index] = 0;
   }
   else {
      A[index] = B[(bx + by * imgWidth) * depth + bz];
   }
}
304
305//____________________________________________________________________________
/// Add the bias vector theta (length n) to every row of the m x n matrix W
/// (column-major storage). One thread per matrix element.
template<typename AFloat>
__global__ void AddRowWise(AFloat * W,
                           const AFloat * theta,
                           int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   W[col * m + row] += theta[col];
}
318
319//____________________________________________________________________________
/// Element-wise (Hadamard) product: B *= A for two m x n column-major
/// matrices. One thread per element.
template<typename AFloat>
__global__ void Hadamard(AFloat * B,
                         const AFloat * A,
                         int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   B[idx] *= A[idx];
}
332
333//____________________________________________________________________________
/// Add the scalar beta to every element of the m x n column-major matrix A,
/// in place. One thread per element.
template<typename AFloat>
__global__ void ConstAdd(AFloat * A, AFloat beta,
                         int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = A[idx] + beta;
}
346
347//____________________________________________________________________________
/// Multiply every element of the m x n column-major matrix A by the scalar
/// beta, in place. One thread per element.
template<typename AFloat>
__global__ void ConstMult(AFloat * A, AFloat beta,
                          int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = A[idx] * beta;
}
360
361//____________________________________________________________________________
/// Replace every element of the m x n column-major matrix A by its
/// reciprocal, in place. One thread per element.
template<typename AFloat>
__global__ void ReciprocalElementWise(AFloat * A,
                                      int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = 1.0 / A[idx];
}
374
375//____________________________________________________________________________
/// Square every element of the m x n column-major matrix A, in place.
/// One thread per element.
template<typename AFloat>
__global__ void SquareElementWise(AFloat * A,
                                  int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = A[idx] * A[idx];
}
388
389//____________________________________________________________________________
/// Replace every element of the m x n column-major matrix A by its square
/// root, in place. One thread per element.
template<typename AFloat>
__global__ void SqrtElementWise(AFloat * A,
                                int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = sqrt(A[idx]);
}
402
403
404/// optimizer kernel functions
405
406//____________________________________________________________________________
/// ADAM parameter update: A -= alpha * M / (sqrt(V) + eps), element-wise on
/// m x n column-major matrices. M and V are the (bias-corrected) first and
/// second moment estimates; eps guards against division by zero.
template<typename AFloat>
__global__ void AdamUpdate(AFloat * A, const AFloat * M, const AFloat * V,
                           int m, int n, AFloat alpha, AFloat eps)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = A[idx] - alpha * M[idx] / ( sqrt(V[idx]) + eps);
}
419
420//____________________________________________________________________________
/// ADAM first-moment update: A = beta * A + (1 - beta) * B, element-wise on
/// m x n column-major matrices (exponential moving average of the gradient).
template<typename AFloat>
__global__ void AdamUpdateFirstMom(AFloat * A, const AFloat * B,
                                   int m, int n, AFloat beta)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = beta * A[idx] + (1.-beta) * B[idx];
}
433
434//____________________________________________________________________________
/// ADAM second-moment update: A = beta * A + (1 - beta) * B * B, element-wise
/// on m x n column-major matrices (moving average of the squared gradient).
template<typename AFloat>
__global__ void AdamUpdateSecondMom(AFloat * A, const AFloat * B,
                                    int m, int n, AFloat beta)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = beta * A[idx] + (1.-beta) * B[idx] * B[idx];
}
447
448//____________________________________________________________________________
/// Derivative of the identity activation: set every element of the m x n
/// column-major matrix A to 1. One thread per element.
template<typename AFloat>
__global__ void IdentityDerivative(AFloat * A,
                                   int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   A[col * m + row] = 1.0;
}
460
461//____________________________________________________________________________
/// ReLU activation applied in place to the m x n column-major matrix A:
/// negative entries become 0, non-negative entries are unchanged.
template<typename AFloat>
__global__ void Relu(AFloat * A,
                     int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat value = A[idx];
   A[idx] = (value < 0.0) ? 0.0 : value;
}
475
476//____________________________________________________________________________
/// Derivative of ReLU evaluated at A, written to B (both m x n, column
/// major): 0 where A is negative, 1 elsewhere.
template<typename AFloat>
__global__ void ReluDerivative(AFloat * B,
                               const AFloat * A, int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   B[idx] = (A[idx] < 0.0) ? 0.0 : 1.0;
}
490
491//____________________________________________________________________________
/// Logistic sigmoid 1 / (1 + exp(-x)) applied in place to the m x n
/// column-major matrix A. One thread per element.
template<typename AFloat>
__global__ void Sigmoid(AFloat * A,
                        int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = 1.0 / (1.0 + exp(-A[idx]));
}
505
506//____________________________________________________________________________
/// Out-of-place logistic sigmoid: B = 1 / (1 + exp(-A)) element-wise on
/// m x n column-major matrices. One thread per element.
template<typename AFloat>
__global__ void Sigmoid(AFloat * B,
                        const AFloat * A,
                        int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   B[idx] = 1.0 / (1.0 + exp(-A[idx]));
}
521
522//____________________________________________________________________________
/// Derivative of the sigmoid evaluated at A, written to B: s * (1 - s) with
/// s = sigmoid(A). Both matrices are m x n, column major.
template<typename AFloat>
__global__ void SigmoidDerivative(AFloat * B,
                                  const AFloat * A,
                                  int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat s = 1.0 / (1.0 + exp(-A[idx]));
   B[idx] = s * (1.0 - s);
}
537
538//____________________________________________________________________________
/// Row-wise softmax of the m x n matrix A written to B. Matrices are stored
/// column-major (element (i,j) at linear index i + j * m), so one thread per
/// row loops over the n columns with stride m.
///
/// Fix: the original indexed the matrices with stride n (`A[i + j * n]`) and
/// even mixed in a row-major form (`A[i * n + j]`), which reads the wrong
/// elements whenever m != n. Index with i + j * m throughout, consistent
/// with the other row-wise kernels in this file (e.g. SoftmaxCrossEntropy).
template<typename AFloat>
__global__ void Softmax(AFloat * B,
                        const AFloat * A,
                        int m, int n)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;

   if (i < m) {
      // Normalization constant for this row.
      AFloat sum = 0.0;
      for (int j = 0; j < n; j++) {
         sum += exp(A[i + j * m]);
      }
      for (int j = 0; j < n; j++) {
         B[i + j * m] = exp(A[i + j * m]) / sum;
      }
   }
}
556
557//____________________________________________________________________________
/// Hyperbolic tangent applied in place to the m x n column-major matrix A.
/// One thread per element.
template<typename AFloat>
__global__ void Tanh(AFloat * A,
                     int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   A[idx] = ::tanh(A[idx]);
}
571
572//____________________________________________________________________________
/// Derivative of tanh evaluated at A, written to B: 1 - tanh(A)^2.
/// Both matrices are m x n, column major.
template<typename AFloat>
__global__ void TanhDerivative(AFloat * B,
                               const AFloat * A,
                               int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat th = ::tanh(A[idx]);
   B[idx] = 1 - th * th;
}
587
588//____________________________________________________________________________
/// Symmetric ReLU (absolute value) applied in place to the m x n column-major
/// matrix A. One thread per element.
///
/// Fix: use fabs instead of abs — plain abs can resolve to the integer
/// overload and silently truncate floating-point values; fabs guarantees the
/// floating-point behavior and matches SoftSignDerivative in this file.
template<typename AFloat>
__global__ void SymmetricRelu(AFloat * A,
                              int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = fabs(A[index]);
   }
}
601
602//____________________________________________________________________________
/// Derivative of the symmetric ReLU evaluated at A, written to B: the sign of
/// A (-1 for negative entries, +1 otherwise). Matrices are m x n, column major.
template<typename AFloat>
__global__ void SymmetricReluDerivative(AFloat * B,
                                        const AFloat * A,
                                        int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   B[idx] = (A[idx] < 0.0) ? -1.0 : 1.0;
}
616
617//____________________________________________________________________________
/// Soft-sign activation x / (1 + |x|) applied in place to the m x n
/// column-major matrix A. One thread per element.
///
/// Fix: use fabs instead of abs — plain abs can resolve to the integer
/// overload and silently truncate floating-point values; fabs guarantees the
/// floating-point behavior and matches SoftSignDerivative in this file.
template<typename AFloat>
__global__ void SoftSign(AFloat * A,
                         int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = A[index];
      A[index] = x / (1.0 + fabs(x));
   }
}
631
632//____________________________________________________________________________
/// Derivative of the soft-sign activation evaluated at A, written to B:
/// 1 / (1 + |A|)^2. Matrices are m x n, column major.
template<typename AFloat>
__global__ void SoftSignDerivative(AFloat * B,
                                   const AFloat * A,
                                   int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat denom = 1.0 + fabs(A[idx]);
   B[idx] = 1 / (denom * denom);
}
647
648//____________________________________________________________________________
/// Gaussian activation exp(-x^2) applied in place to the m x n column-major
/// matrix A. One thread per element.
template<typename AFloat>
__global__ void Gauss(AFloat * A,
                      int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat x = A[idx];
   A[idx] = exp(- x * x);
}
662
663//____________________________________________________________________________
/// Derivative of the Gaussian activation evaluated at A, written to B:
/// -2 * x * exp(-x^2). Matrices are m x n, column major.
template<typename AFloat>
__global__ void GaussDerivative(AFloat * B,
                                const AFloat * A,
                                int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat x = A[idx];
   B[idx] = - 2.0 * x * exp(- x * x);
}
678
679//____________________________________________________________________________
/// Weighted mean squared error between Y and output (m x n, column major),
/// accumulated into *result via a block reduction. weights has one entry per
/// row (event). All threads must reach ReduceSum (it contains barriers), so
/// out-of-range threads contribute zero instead of returning early.
template<typename AFloat>
__global__ void MeanSquaredError(AFloat * result,
                                 const AFloat * Y,
                                 const AFloat * output,
                                 const AFloat * weights,
                                 int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   int localIdx = blockDim.x * threadIdx.y + threadIdx.x;

   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((row < m) && (col < n)) {
      int index = col * m + row;
      AFloat w = weights[row];
      AFloat norm = 1 / ((AFloat) (m * n));
      AFloat e = Y[index] - output[index];
      sdata[localIdx] = w * norm * e * e;
   } else {
      sdata[localIdx] = 0.0;
   }
   ReduceSum(result, sdata);
}
704
705//____________________________________________________________________________
/// Sum of squared elements of the m x n column-major matrix A, accumulated
/// into *result via a block reduction. All threads must reach ReduceSum, so
/// out-of-range threads contribute zero instead of returning early.
template<typename AFloat>
__global__ void SquaredSum(AFloat * result,
                           const AFloat * A,
                           int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   int localIdx = blockDim.x * threadIdx.y + threadIdx.x;

   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((row < m) && (col < n)) {
      AFloat value = A[col * m + row];
      sdata[localIdx] = value * value;
   } else {
      sdata[localIdx] = 0.0;
   }
   ReduceSum(result, sdata);
}
726
727//____________________________________________________________________________
/// Sum of absolute values of the m x n column-major matrix A, accumulated
/// into *result via a block reduction. All threads must reach ReduceSum
/// (it contains barriers), so out-of-range threads contribute zero.
///
/// Fix: use fabs instead of abs — plain abs can resolve to the integer
/// overload and silently truncate floating-point values; fabs guarantees the
/// floating-point behavior and matches SoftSignDerivative in this file.
template<typename AFloat>
__global__ void AbsoluteSum(AFloat * result,
                            const AFloat * A,
                            int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = blockDim.x * threadIdx.y + threadIdx.x;
   int index = j * m + i;

   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      sdata[tid] = fabs(A[index]);
   } else {
      sdata[tid] = 0.0;
   }
   ReduceSum(result, sdata);
}
747
748//____________________________________________________________________________
/// Gradient of the weighted mean squared error with respect to the network
/// output: dY = w * 2/(m*n) * (output - Y), element-wise on m x n
/// column-major matrices. weights has one entry per row.
template<typename AFloat>
__global__ void MeanSquaredErrorGradients(AFloat * dY,
                                          const AFloat * Y,
                                          const AFloat * output,
                                          const AFloat * weights,
                                          int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   dY[idx] = weights[row] * 2.0 / ((AFloat) (m * n)) * (output[idx] - Y[idx]);
}
764
765//____________________________________________________________________________
/// Add the gradient of the L1 regularization term to A:
/// A += weightDecay * sign(B), element-wise on m x n column-major matrices.
template<typename AFloat>
__global__ void AddL1RegularizationGradients(AFloat * A,
                                             const AFloat * B,
                                             AFloat weightDecay,
                                             int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat sign = (B[idx] < 0.0) ? -1.0 : 1.0;
   A[idx] += sign * weightDecay;
}
781
782//____________________________________________________________________________
/// Add the gradient of the L2 regularization term to A:
/// A += 2 * weightDecay * B, element-wise on m x n column-major matrices.
template<typename AFloat>
__global__ void AddL2RegularizationGradients(AFloat * A,
                                             const AFloat * B,
                                             AFloat weightDecay,
                                             int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   A[col * m + row] += 2.0 * weightDecay * B[col * m + row];
}
797
798//____________________________________________________________________________
/// Weighted sigmoid cross-entropy loss between targets Y and raw network
/// outputs (logits), accumulated into *result via a block reduction. Matrices
/// are m x n, column major; weights has one entry per row. All threads reach
/// ReduceSum (it contains barriers), contributing zero when out of range.
template<typename AFloat>
__global__ void CrossEntropy(AFloat * result,
                             const AFloat * Y,
                             const AFloat * output,
                             const AFloat * weights,
                             int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = blockDim.x * threadIdx.y + threadIdx.x;
   int index = j * m + i;

   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      AFloat norm = 1 / ((AFloat) (m * n));
      AFloat x = output[index];
      // lr = log(1 + exp(-x)); clamp to the asymptotic values for large |x|
      // to avoid overflow of exp(-x) / loss of precision in the log.
      AFloat lr = std::log(1. + exp(-x));
      if (x < -75.) lr = -x;
      else if (x > 75.) lr = exp(-x);

      // Cross entropy written in terms of lr:
      // -y*log(sig(x)) - (1-y)*log(1-sig(x)) = y*lr + (1-y)*(x+lr).
      AFloat ce = Y[index] * lr + (1.0 - Y[index]) * (x + lr);
      sdata[tid] = weights[i] * norm * ce;
   } else {
      sdata[tid] = 0.0;
   }

   ReduceSum(result, sdata);
}
828
829//____________________________________________________________________________
/// Gradient of the sigmoid cross entropy with respect to the network output:
/// dY = w/(m*n) * (sigmoid(output) - Y), element-wise on m x n column-major
/// matrices. weights has one entry per row.
template<typename AFloat>
__global__ void CrossEntropyGradients(AFloat * dY,
                                      const AFloat * Y,
                                      const AFloat * output,
                                      const AFloat * weights,
                                      int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if ((row >= m) || (col >= n)) return;

   int idx = col * m + row;
   AFloat norm = 1 / ((AFloat) (m * n));
   AFloat target = Y[idx];
   AFloat sig = 1.0 / (1.0 + exp(-output[idx]));
   dY[idx] = weights[row] * norm * (sig - target);
}
848
849//____________________________________________________________________________
/// Weighted softmax cross-entropy loss, one thread per row (event) of the
/// m x n column-major matrices Y and output, accumulated into *result via a
/// block reduction. weights has one entry per row.
/// NOTE(review): sdata is indexed by threadIdx.y alone, which appears to
/// assume blockDim.x == 1 — confirm against the host-side launch configuration.
template<typename AFloat>
__global__ void SoftmaxCrossEntropy(AFloat * result,
                                    const AFloat * Y,
                                    const AFloat * output,
                                    const AFloat * weights,
                                    int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int tid = threadIdx.y;

   __shared__ AFloat sdata[TDevice::BlockSize];
   // Normalize by the number of events only (not by n, unlike CrossEntropy).
   AFloat norm = 1.0 / ((AFloat) m);

   sdata[tid] = 0.0;
   if (i < m) {
      // Softmax normalization constant for this row.
      AFloat sum = 0.0;
      for (int j = 0; j < n; j++) {
         sum += exp(output[i + j * m]);
      }
      // Accumulate sum_j y_ij * log(softmax(output)_ij), then apply the
      // negative sign and the event weight.
      for (int j = 0; j < n; j++) {
         sdata[tid] += Y[i + j * m] * log(exp(output[i + j * m]) / sum);
      }
      sdata[tid] *= -weights[i] * norm;
   } else {
      sdata[tid] = 0.0;
   }

   // All threads must reach the barrier-containing reduction.
   ReduceSum(result, sdata);
}
879
880//____________________________________________________________________________
/// Gradient of the softmax cross entropy with respect to the network output,
/// one thread per row (event) of the m x n column-major matrices:
/// dY_ij = w_i/m * (sum_k(Y_ik) * softmax(output)_ij - Y_ij).
template<typename AFloat>
__global__ void SoftmaxCrossEntropyGradients(AFloat * dY,
                                             const AFloat * Y,
                                             const AFloat * output,
                                             const AFloat * weights,
                                             int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   AFloat norm = 1.0 / ((AFloat) m);

   if (row < m) {
      // Softmax normalization constant and target-row sum for this event.
      AFloat expSum = 0.0;
      AFloat targetSum = 0.0;
      for (int j = 0; j < n; j++) {
         expSum += exp(output[row + j * m]);
         targetSum += Y[row + j * m];
      }
      for (int j = 0; j < n; j++) {
         int idx = row + j * m;
         dY[idx] = targetSum * exp(output[idx]) / expSum - Y[idx];
         dY[idx] *= weights[row] * norm;
      }
   }
}
904
905//____________________________________________________________________________
/// Sum all elements of the m x n column-major matrix A into *result via a
/// block reduction. All threads must reach ReduceSum (it contains barriers),
/// so out-of-range threads contribute zero instead of returning early.
template<typename AFloat>
__global__ void ReduceMatrix(AFloat *result,
                             const AFloat *A,
                             int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   int localIdx = threadIdx.y * blockDim.x + threadIdx.x;

   __shared__ AFloat smem[TDevice::BlockSize];
   if ((row < m) && (col < n))
      smem[localIdx] = A[col * m + row];
   else
      smem[localIdx] = 0.0;

   ReduceSum(result, smem);
}
924
925//____________________________________________________________________________
/// Sum each column of the m x n column-major matrix A into the vector B via a
/// vertical block reduction. Each block reduces its tile's columns and
/// accumulates into the slice of B covered by its blockIdx.x. All threads
/// must reach ReduceSumVertical (it contains barriers).
template<typename AFloat>
__global__ void SumColumns(AFloat *B,
                           const AFloat *A,
                           int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   int localIdx = blockDim.x * threadIdx.y + threadIdx.x;

   __shared__ AFloat smem[TDevice::BlockSize];

   if ((row < m) && (col < n)) {
      smem[localIdx] = A[col * m + row];
   } else {
      smem[localIdx] = 0.0;
   }

   ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n);
}
947
/// Element-wise approximate comparison of two m x n column-major matrices:
/// any element pair differing by more than epsilon sets result[0] to false.
/// The caller is expected to initialize result[0] to true beforehand.
template<typename AFloat>
__global__ void AlmostEquals(bool * result, const AFloat * A, const AFloat * B, double epsilon, int m, int n)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if (row >= m || col >= n) return;

   int idx = col * m + row;
   // Benign race: many threads may write false concurrently, but they all
   // write the same value, so the outcome is still correct.
   if (fabs(A[idx] - B[idx]) > epsilon) result[0] = false;
}
961
962//____________________________________________________________________________
/// Apply dropout in place to the m x n column-major matrix A. For each
/// element a uniform random number r is drawn from the per-thread curand
/// state; elements with r > dropoutProbability are zeroed and the survivors
/// are rescaled by 1 / dropoutProbability (inverted dropout).
/// NOTE(review): given the comparison direction and the rescaling,
/// dropoutProbability acts as the KEEP probability here, despite its name —
/// confirm against the host-side caller.
/// \param[in,out] A Matrix to apply dropout to.
/// \param[in] m Number of rows.
/// \param[in] n Number of columns.
/// \param[in] dropoutProbability Probability that an element survives (see note).
/// \param[in] state Per-thread curand states, indexed by tid below.
template<typename AFloat>
__global__ void Dropout(AFloat *A,
                        int m, int n,
                        AFloat dropoutProbability,
                        curandState_t *state)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   // Linear thread id used to select this thread's RNG state.
   int tid = i * gridDim.x + j;
   if ((i < m) && (j < n)) {
      float r = curand_uniform(state + tid);
      if (r > dropoutProbability) {
         A[j * m + i] = 0.0;
      } else {
         // Rescale survivors so the expected activation is unchanged.
         A[j * m + i] /= dropoutProbability;
      }
   }
}
981
982//____________________________________________________________________________
983//////////////////////////////////////////////////////////////////////////////////////////////
984/// \brief Downsampling kernel used as the forward propagation step of a
985/// Max-Pooling layer.
986///
987/// \param[out] A The output matrix. Each row corresponds to a slice and each element
988/// is the max within a receptive field.
989/// \param[out] B The winning indices matrix. Each element is the index of the max element.
990/// \param[in] C The input matrix. Each row is a slice.
991/// \param[in] imgHeight The height of the input.
992/// \param[in] imgWidth The width of the input.
993/// \param[in] fltHeight Height of the kernel.
994/// \param[in] fltWidth Width of the kernel.
995/// \param[in] strideRows stride size in the horizontal dimension.
996/// \param[in] strideCols stride size in the vertical dimension.
997///
998/// Each output element is the maximum of the receptive field. The caller launches one thread
999/// per output element in order to eliminate shared write access.
1000///////////////////////////////////////////////////////////////////////////////////////////////
template<typename AFloat>
__global__ void Downsample(AFloat * output, AFloat * indexMatrix, const AFloat * input, int depth, int imgHeight,
                           int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
{
   // The row of the output matrix.
   int i = blockDim.y * blockIdx.y + threadIdx.y;

   // The column of the output matrix.
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   // Number of columns in matrix A: one per pooling window position.
   int NLocalViews = calculateDimension(imgWidth, fltWidth, 0, strideCols) *
                     calculateDimension(imgHeight, fltHeight, 0, strideRows);

   // One thread per output element; threads past the bounds exit.
   if (i >= depth || j >= NLocalViews) return;

   // Column-major linear index of element (i, j) in the output.
   int outputIndex = j * depth + i;

   // Number of pooling windows that fit across one image row.
   int numSlidesPerRow = calculateDimension(imgWidth, fltWidth, 0, strideCols);

   int rowMin = (j / numSlidesPerRow) * strideRows; // First row of B that this thread should look at.
   int colMin = (j % numSlidesPerRow) * strideCols; // First column of B that this thread should look at.
   int bz = i;                                      // Slice of B that this thread should look at.

   AFloat value = 0;
   AFloat maxIndex = 0;  // Flat spatial index of the winner, stored as AFloat.
   bool first = true; // The first element should write to `value` no matter what.

   // Scan the fltHeight x fltWidth pooling window, tracking the maximum
   // element and its position (needed for the backward pass).
   for (size_t by = rowMin; by < rowMin + fltHeight; by++) {
      for (size_t bx = colMin; bx < colMin + fltWidth; bx++) {
         int inputIndex = (bx + by * imgWidth) * depth + bz;
         if (input[inputIndex] > value || first) {
            first = false;
            maxIndex = bx + by * imgWidth;
            value = input[inputIndex];
         }
      }
   }
   indexMatrix[outputIndex] = maxIndex;
   output[outputIndex] = value;

}
1043
1044/////////////////////////////////////////////////////////////////////////////////////////////////
1045/// \brief Back-propagate the gradients through a max-pooling layer.
1046///
1047/// \param[out] gradientsBackward The gradients to be written. One gradient for each neuron at the layers's input.
1048/// \param[in] gradients The gradients coming from the next layer. One gradient for each receptive field.
1049/// \param[in] indexMatrix Winning indices. One index for each receptive field.
1050/// \param[in] depth The depth of the input tensor.
1051/// \param[in] imgHeight The height of the input tensor.
1052/// \param[in] imgWidth The width of the input tensor.
1053/// \param[in] fltHeight Height of the filter.
1054/// \param[in] fltWidth Width of the filter.
1055/// \param[in] strideRows stride size in the vertical dimension.
1056/// \param[in] strideCols stride size in the horizontal dimension.
1057/////////////////////////////////////////////////////////////////////////////////////////////////
template<typename AFloat>
__global__ void MaxPoolBackward(AFloat * activationGradientsBackward,
                                const AFloat * activationGradients,
                                const AFloat * indexMatrix,
                                int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth,
                                int strideRows, int strideCols)
{
   int slice = blockDim.y * blockIdx.y + threadIdx.y; // row of the gradientsBackward matrix.
   int j = blockDim.x * blockIdx.x + threadIdx.x; // column of the gradientsBackward matrix.

   if (slice >= depth || j >= imgHeight * imgWidth) return;

   // Dimensions of the pooling-layer output grid (receptive fields per direction).
   int height = calculateDimension(imgHeight, fltHeight, 0, strideRows);
   int width = calculateDimension(imgWidth, fltWidth, 0, strideCols);

   // Which gradientsBackward element should this thread write to?
   int backRow = j % imgHeight;
   int backCol = j / imgHeight;

   // Which gradient and indexMatrix elements should this thread read?
   // These are the first receptive-field coordinates (per direction) whose
   // window can still contain the input pixel (backRow, backCol); they may be
   // negative and are clipped inside the loop below.
   int nextRowMin = floor((backRow - fltHeight) / (AFloat) strideRows) + 1;
   int nextColMin = floor((backCol - fltWidth) / (AFloat) strideCols) + 1;

   int outputIndex = 0;
   AFloat grad = 0;

   // Iterate over all output elements that were the outcome of receptive fields I was part of.
   for (int row = nextRowMin; row <= nextRowMin + fltHeight - strideRows; row++) {
      for (int col = nextColMin; col <= nextColMin + fltWidth - strideCols; col++) {

         // Skip receptive fields that fall outside the output grid.
         if (row >= height || col >= width || col < 0 || row < 0) continue;

         outputIndex = (row * width + col) * depth + slice;

         // Was I the winning index within this receptive field?
         if (indexMatrix[outputIndex] == backCol + backRow * imgWidth) {
            grad += activationGradients[outputIndex];
         }
      }
   }
   // One thread per input element, so this write is race-free.
   activationGradientsBackward[(backCol + backRow * imgWidth) * depth + slice] = grad;
}
1100
///////////////////////////////////////////////////////////////////////////////
/// \brief Rotate each filter of the weight matrix B by 180 degrees and store
///        the transposed result in A.
///
/// \param[out] A Output matrix, one column per filter element, `filterDepth` rows.
/// \param[in] B Input weights, one row per filter element, `numFilters` columns.
/// \param[in] filterDepth Depth (number of input channels) of each filter.
/// \param[in] filterHeight Height of each filter.
/// \param[in] filterWidth Width of each filter.
/// \param[in] numFilters Number of filters.
///
/// The caller launches one thread per (filter, element) pair.
///////////////////////////////////////////////////////////////////////////////
template<typename AFloat>
__global__ void RotateWeights(AFloat * A, const AFloat * B, int filterDepth, int filterHeight, int filterWidth,
                              int numFilters)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   // Bounds guard. NOTE: the upper bound on j must be exclusive (>=); the
   // previous `>` let j == filterDepth * filterHeight * filterWidth through,
   // reading B and writing A one element past the valid range.
   if (i >= numFilters || j >= filterDepth * filterHeight * filterWidth) return;

   int jump = filterHeight * filterWidth;    // elements per depth slice of one filter
   int row = j / jump;                       // depth slice this element belongs to
   int col = i * jump + jump - j % jump - 1; // 180-degree mirrored position within filter i

   A[col * filterDepth + row] = B[j * numFilters + i];
}
1116
template<typename AFloat>
__global__ void AddBiases(AFloat * A, const AFloat * B, int nRows, int nCols)
{
   // Coordinates of the element this thread updates; A is column-major.
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if (row >= nRows || col >= nCols) return;

   // Every element of row `row` receives the same bias B[row].
   A[row + col * nRows] += B[row];
}
1126
template<typename AFloat>
__global__ void UpdateWeights(AFloat * A, const AFloat ** B, int batchSize, int nRows, int nCols)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;

   if (row >= nRows || col >= nCols) return;

   // Row-major flat index of this thread's element; it is the same for every
   // event, so it is computed once outside the accumulation loop.
   size_t index = row * nCols + col;

   // Accumulate the per-event contributions into A.
   for (size_t event = 0; event < batchSize; event++) {
      A[index] += B[event][index];
   }
}
1140
template<typename AFloat>
__global__ void Reshape(AFloat * A, const AFloat * B, int nRowsA, int nColsA, int nRowsB, int nColsB)
{
   int row = blockDim.y * blockIdx.y + threadIdx.y;
   int col = blockDim.x * blockIdx.x + threadIdx.x;
   if (row >= nRowsA || col >= nColsA) return;

   // Position of (row, col) in a row-major enumeration of A's elements.
   size_t flat = row * nColsA + col;

   // Map that row-major position onto B, which is stored column-major.
   size_t indexB = (flat % nColsB) * nRowsB + flat / nColsB;

   // A is column-major as well.
   size_t indexA = col * nRowsA + row;
   A[indexA] = B[indexB];
}
1155
1156////////////////////////////////////////////////////////////////////////////////
1157/// \brief Flatten an array of 2D-arrays into a single 2D-array.
1158///
1159/// \param[out] A Output 2D-array saved in column major order.
1160/// \param[in] B Input array of 2D-arrays. Each element is a matrix to be concatenated.
1161/// \param[in] size Number of 2D-arrays in the input.
1162/// \param[in] nRows Number of rows in each matrix of the input.
1163/// \param[in] nCols Number of columns on each matrix of the input.
1164///
1165/// B is a pointer to `size` raw `TCudaMatrix` pointers. Each of those contains
1166/// elements saved on column major order. However the concatenation is performed
1167/// row wise. Each thread writes a single output element by locating the
1168/// appropriate input index.
1169//////////////////////////////////////////////////////////////////////////////////
template<typename AFloat>
__global__ void Flatten(AFloat * A, const AFloat *B, int size, int nRows, int nCols)
{
   int matrix = blockDim.y * blockIdx.y + threadIdx.y; // which input matrix
   int col    = blockDim.x * blockIdx.x + threadIdx.x; // which output column

   int nColsA = nRows * nCols;
   if (matrix >= size || col >= nColsA) return;

   // The output column enumerates each input matrix row-wise, but the input
   // matrices are stored column-major: transpose the coordinates to read.
   int srcRow = col / nCols;
   int srcCol = col % nCols;
   AFloat element = B[ matrix * nColsA + srcCol * nRows + srcRow ];

   // A is column-major with `size` rows.
   size_t index = col * size + matrix;
   A[index] = element;
}
1188
1189// row major version of flatten (keep rows before columns in memory): used by Cudnn
template<typename AFloat>
__global__ void FlattenRM(AFloat * A, const AFloat *B, int size, int nRows, int nCols)
{
   int matrix = blockDim.y * blockIdx.y + threadIdx.y; // which input matrix
   int col    = blockDim.x * blockIdx.x + threadIdx.x; // which output column

   int nColsA = nRows * nCols;
   if (matrix >= size || col >= nColsA) return;

   // Row-major variant: the input matrices are stored row-major, so the
   // output column maps straight onto the flat row-major offset.
   int srcRow = col / nCols;
   int srcCol = col % nCols;
   AFloat element = B[ matrix * nColsA + srcRow * nCols + srcCol ];

   // A is column-major with `size` rows.
   size_t index = col * size + matrix;
   A[index] = element;
}
1208
1209
1210////////////////////////////////////////////////////////////////////////////////
1211/// \brief Deflatten a 2D-array into an array of 2D-arrays.
1212///
1213/// \param[out] A Output array of 2D-arrays, each of which is column-major.
1214/// \param[in] B Input 2D-array to be split into `size` parts.
1215/// \param[in] size Number of 2D-arrays in the output.
1216/// \param[in] nRows Number of rows in each matrix of the output.
1217/// \param[in] nCols Number of columns on each matrix of the output.
1218///
1219/// A is a pointer to `size` raw `TCudaMatrix` pointers. Each of those will
1220/// contain elements saved on column major order. However the concatenation
1221/// is performed row wise. Each thread writes a single output element
1222/// by locating the appropriate input index.
1223//////////////////////////////////////////////////////////////////////////////////
template<typename AFloat>
__global__ void Deflatten(AFloat * A, const AFloat * B, int size, int nRows, int nCols)
{
   int matrix = blockDim.y * blockIdx.y + threadIdx.y; // which output matrix
   int col    = blockDim.x * blockIdx.x + threadIdx.x; // which input column

   int nColsB = nRows * nCols;
   if (matrix >= size || col >= nColsB) return;

   // B is column-major with `size` rows.
   AFloat element = B[col * size + matrix];

   // The input column enumerates each output matrix row-wise, but the output
   // matrices are stored column-major: transpose the coordinates to write.
   int dstRow = col / nCols;
   int dstCol = col % nCols;
   A[ matrix * nColsB + dstCol * nRows + dstRow ] = element;
}
1240
1241// row major of flatten (used by Cudnn)
template<typename AFloat>
__global__ void DeflattenRM(AFloat * A, const AFloat * B, int size, int nRows, int nCols)
{
   int matrix = blockDim.y * blockIdx.y + threadIdx.y; // which output matrix
   int col    = blockDim.x * blockIdx.x + threadIdx.x; // which input column

   int nColsB = nRows * nCols;
   if (matrix >= size || col >= nColsB) return;

   // B is column-major with `size` rows.
   AFloat element = B[col * size + matrix];

   // Row-major variant: the output matrices are stored row-major, so the
   // input column maps straight onto the flat row-major offset.
   int dstRow = col / nCols;
   int dstCol = col % nCols;
   A[ matrix * nColsB + dstRow * nCols + dstCol ] = element;
}
1258
1259} // namespace Cuda
1260} // namespace DNN
1261} // namespace TMVA
1262
1263#endif
#define e(i)
Definition RSha256.hxx:103
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t width
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t height
Implementation of the CrossEntropy as separation criterion.
static constexpr int BlockSize
Definition Device.h:44
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
__global__ void SymmetricRelu(AFloat *A, int m, int n)
Definition Kernels.cuh:590
__global__ void UpdateWeights(AFloat *A, const AFloat **B, int batchSize, int nRows, int nCols)
Definition Kernels.cuh:1128
__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
Calculate the dimension of an output volume, given the sliding parameters and the input shape.
Definition Kernels.cuh:226
__global__ void SigmoidDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:524
__device__ AFloat AtomicAdd(AFloat *address, AFloat val)
__global__ void Dropout(AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state)
Definition Kernels.cuh:964
__global__ void SoftmaxCrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:882
__global__ void SumColumns(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:927
__global__ void IdentityDerivative(AFloat *A, int m, int n)
Definition Kernels.cuh:450
__global__ void SqrtElementWise(AFloat *A, int m, int n)
Definition Kernels.cuh:391
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
Definition Kernels.cuh:408
__global__ void SoftmaxCrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:851
__global__ void AddL1RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
Definition Kernels.cuh:767
__device__ void ReduceSumVertical(AFloat *result, AFloat *sdata, int n)
Definition Kernels.cuh:55
__global__ void MeanSquaredErrorGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:750
__global__ void Relu(AFloat *A, int m, int n)
Definition Kernels.cuh:463
__global__ void ReluDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:478
__global__ void AbsoluteSum(AFloat *result, const AFloat *A, int m, int n)
Definition Kernels.cuh:729
__global__ void AddL2RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
Definition Kernels.cuh:784
__device__ AFloat max(AFloat x, AFloat y)
Definition Kernels.cuh:207
__global__ void AddRowWise(AFloat *W, const AFloat *theta, int m, int n)
Definition Kernels.cuh:307
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
Definition Kernels.cuh:349
__global__ void GaussDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:665
__global__ void Deflatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Deflatten a 2D-array into an array of 2D-arrays.
Definition Kernels.cuh:1225
__global__ void Flatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Flatten an array of 2D-arrays into a single 2D-array.
Definition Kernels.cuh:1171
__global__ void Softmax(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:540
__global__ void RotateWeights(AFloat *A, const AFloat *B, int filterDepth, int filterHeight, int filterWidth, int numFilters)
Definition Kernels.cuh:1102
__global__ void TanhDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:574
__global__ void CrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:831
__global__ void ReduceMatrix(AFloat *result, const AFloat *A, int m, int n)
Definition Kernels.cuh:907
__global__ void Im2Col(AFloat *A, const AFloat *B, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols, int zeroPaddingHeight, int zeroPaddingWidth)
A kernel that re-arranges image regions of the input matrix \B, into column vectors in matrix \A.
Definition Kernels.cuh:256
__global__ void DeflattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Definition Kernels.cuh:1243
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
Definition Kernels.cuh:335
__global__ void SymmetricReluDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:604
__global__ void MeanSquaredError(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:681
__global__ void SquareElementWise(AFloat *A, int m, int n)
Definition Kernels.cuh:377
__global__ void SoftSignDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:634
__global__ void Reshape(AFloat *A, const AFloat *B, int nRowsA, int nColsA, int nRowsB, int nColsB)
Definition Kernels.cuh:1142
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:321
__global__ void AlmostEquals(bool *result, const AFloat *A, const AFloat *B, double epsilon, int m, int n)
Definition Kernels.cuh:949
__global__ void FlattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Definition Kernels.cuh:1191
__global__ void SquaredSum(AFloat *result, const AFloat *A, int m, int n)
Definition Kernels.cuh:707
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
Definition Kernels.cuh:422
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
Definition Kernels.cuh:363
__device__ void ReduceSum(AFloat *result, AFloat *sdata)
Definition Kernels.cuh:134
__global__ void MaxPoolBackward(AFloat *activationGradientsBackward, const AFloat *activationGradients, const AFloat *indexMatrix, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Back-propagate the gradients through a max-pooling layer.
Definition Kernels.cuh:1059
__global__ void Downsample(AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
Definition Kernels.cuh:1002
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
Definition Kernels.cuh:436
__global__ void AddBiases(AFloat *A, const AFloat *B, int nRows, int nCols)
Definition Kernels.cuh:1118
std::shared_ptr< std::function< double(double)> > Tanh
Definition NeuralNet.cxx:29
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
std::shared_ptr< std::function< double(double)> > Gauss
Definition NeuralNet.cxx:12
std::shared_ptr< std::function< double(double)> > Sigmoid
Definition NeuralNet.cxx:26
std::shared_ptr< std::function< double(double)> > SoftSign
Definition NeuralNet.cxx:32
create variable transformations
TMarker m
Definition textangle.C:8
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345
static void output()