#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <chrono>
#include <iostream>
#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________

/** \class TGradientDescent
 *
 *  Generic implementation of gradient descent minimization, parametrized by
 *  the architecture type \p Architecture_t, which provides the scalar and
 *  matrix types as well as the basic linear-algebra kernels (Copy, ScaleAdd,
 *  ...) used below.
 */
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate.
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the training set.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to initial state. */
   void Reset()
   {
      fMinimumError     = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount        = 0;
   }

   /** Train the given net using the given training input data (events),
    *  training output data (labels), test input data (events) and test
    *  output data (labels). Returns the minimum test error achieved. */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Same as Step(...) but also evaluate the loss on the given training
    *  data, which requires a synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Perform multiple optimization steps simultaneously, one per batch/net
    *  pair, and combine the results in the master net. */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net,
                                   Matrix_t &input,
                                   const Matrix_t &output);

   /** Uses the current internal value of the test error to determine if the
    *  minimization has converged. */
   bool HasConverged();
   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the given test error value to determine if the
    *  minimization has converged. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }
};
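// Illustrative usage sketch, not part of the original file: it shows how the
// minimizer is typically configured and invoked. The architecture tag
// TReference<double>, the net object `net` and the data objects are
// assumptions; any types satisfying the Architecture_t / Net_t / Data_t
// requirements above work the same way.
//
// \code
// using Architecture_t = TMVA::DNN::TReference<double>;
//
// TMVA::DNN::TGradientDescent<Architecture_t> minimizer(
//     0.001, // learning rate
//     10,    // convergence steps
//     5);    // test interval (in epochs)
//
// auto minError = minimizer.Train(trainingData, nTrainingSamples,
//                                 testData, nTestSamples, net,
//                                 /* nThreads = */ 4);
// \endcode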
//______________________________________________________________________________
//
// Implementation
//______________________________________________________________________________

template<typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0),
     fConvergenceCount(0), fTestInterval(0), fTrainingError(0),
     fTestError(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
//______________________________________________________________________________
template<typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
     fConvergenceCount(0), fTestInterval(testInterval), fTrainingError(0),
     fTestError(0), fLearningRate(learningRate),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data.
   bool converged = false;

   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // Create one clone of the net per thread and copy the master weights.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   std::chrono::time_point<std::chrono::system_clock> start, end;
   start = std::chrono::system_clock::now();

   while (!converged)
   {
      fStepCount++;

      // Perform one epoch: process nThreads batches per parallel step.
      trainLoader.Shuffle();
      std::vector<TBatch<Architecture_t>> batches{};
      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
         batches.clear();
         batches.reserve(nThreads);
         for (size_t j = 0; j < nThreads; j++) {
            batches.push_back(trainLoader.GetBatch());
         }
         Step(net, nets, batches);
      }

      // Evaluate the test error every fTestInterval epochs.
      if ((fStepCount % fTestInterval) == 0) {

         // Report the achieved throughput in GFlop/s.
         end   = std::chrono::system_clock::now();
         std::chrono::duration<double> elapsed_seconds = end - start;
         start = std::chrono::system_clock::now();
         double seconds        = elapsed_seconds.count();
         double batchesInEpoch = (double) (nTrainingSamples / net.GetBatchSize());
         double nFlops         = (double) (fTestInterval * batchesInEpoch);
         nFlops *= net.GetNFlops();
         std::cout << "Elapsed time for " << fTestInterval << " Epochs: "
                   << seconds << " [s] => " << nFlops * 1e-9 / seconds
                   << " GFlop/s" << std::endl;

         auto b            = *testLoader.begin();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         Scalar_t loss = testNet.Loss(inputMatrix, outputMatrix);

         // Note: despite the label, this is the loss on the first test batch.
         std::cout << "Step " << fStepCount << ": Training Error = "
                   << loss << std::endl;

         fTestError = loss;
         converged  = HasConverged();
      }
   }
   return fMinimumError;
}
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data.
   bool converged = false;

   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();

   // Create one clone of the net per thread and copy the master weights.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   std::chrono::time_point<std::chrono::system_clock> start, end;
   start = std::chrono::system_clock::now();

   while (!converged)
   {
      fStepCount++;

      // Perform one epoch.
      trainLoader.Shuffle();
      std::vector<TBatch<Architecture_t>> batches{};
      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
         batches.clear();
         batches.reserve(nThreads);
         for (size_t j = 0; j < nThreads; j++) {
            batches.push_back(trainLoader.GetBatch());
         }
         if (momentum != 0.0) {
            StepMomentum(net, nets, batches, momentum);
         } else {
            Step(net, nets, batches);
         }
      }

      // Evaluate the test error every fTestInterval epochs.
      if ((fStepCount % fTestInterval) == 0) {
         fTestError = 0.0;
         for (size_t i = 0; i < nTestSamples / net.GetBatchSize(); i += nThreads) {
            auto b            = testLoader.GetBatch();
            auto inputMatrix  = b.GetInput();
            auto outputMatrix = b.GetOutput();
            fTestError += testNet.Loss(inputMatrix, outputMatrix);
         }
         fTestError /= (Scalar_t) (nTestSamples / net.GetBatchSize());
         converged = HasConverged();
      }
   }
   return fMinimumError;
}
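// Note on the test pass above: the reported test error is the batch-averaged
// loss
//
//    fTestError = (1 / N_b) * sum_{b=1..N_b} L(batch_b),
//
// with N_b = nTestSamples / batchSize test batches, so it is directly
// comparable across different test set sizes.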
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
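// In matrix notation the loop above performs plain stochastic gradient
// descent on every layer l:
//
//    W_l <- W_l - alpha * dL/dW_l
//    b_l <- b_l - alpha * dL/db_l
//
// where alpha is fLearningRate and the gradients are those accumulated by
// net.Backward() for the current batch.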
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation through the first layer, one batch per worker net.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   // Forward propagation through the remaining layers, interleaved over the
   // worker nets to expose batch-level parallelism.
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients of the loss function with respect to the net output.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
          nets[j].GetLayer(depth-1).GetActivationGradients(),
          nets[j].GetLossFunction(),
          batches[j].GetOutput(),
          nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply the worker gradients to the master net and copy the updated
   // weights back into the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer       = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
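// Sketch of the aggregation performed above: with per-worker gradients g_j
// the master weights receive the synchronous SGD update
//
//    W <- W - alpha * sum_j g_j
//
// and the updated W is then copied back into every worker net, so all
// workers enter the next step with identical weights.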
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the worker nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients of the loss function with respect to the net output.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
          nets[j].GetLayer(depth-1).GetActivationGradients(),
          nets[j].GetLossFunction(),
          batches[j].GetOutput(),
          nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward propagation; accumulate the worker gradients into the master
   // gradient buffers with the momentum update.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  -fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  -fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               -fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               -fLearningRate / momentum);
   }
   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the accumulated momentum gradients to the master weights and
   // synchronize the workers.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
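// The two ScaleAdd passes above realize the classical momentum recurrence on
// the master gradient buffers G:
//
//    G <- mu * G - alpha * sum_j g_j
//    W <- W + G
//
// implemented as G += (-alpha / mu) * sum_j g_j followed by
// G += (mu - 1) * G, which equals mu * G_old - alpha * sum_j g_j. Note that
// this form divides by mu and therefore requires momentum != 0;
// TrainMomentum falls back to the plain Step(...) in that case.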
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the worker nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients of the loss function with respect to the net output.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
          nets[j].GetLayer(depth-1).GetActivationGradients(),
          nets[j].GetLossFunction(),
          batches[j].GetOutput(),
          nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Nesterov update: advance each worker to the look-ahead point, then
   // accumulate the worker gradients into the master gradient buffers and
   // apply the update to the master weights.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  momentum);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  momentum);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
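// Sketch of the Nesterov variant above: at the end of each step the workers
// are advanced to the look-ahead point W + mu * G, so the gradients of the
// following step are evaluated there:
//
//    g_j = grad L_j(W + mu * G)
//    G  <- mu * G - alpha * sum_j g_j
//    W  <- W + G
//
// which is the standard Nesterov accelerated gradient update.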
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the bias terms of the first layer are updated.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the bias terms of the first layer are updated.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}
//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}
//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   fStepCount += fTestInterval;

   return (fConvergenceCount >= fConvergenceSteps);
}
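// Numeric sketch of the criterion above (assuming the 0.1% relative
// improvement threshold used here): with fMinimumError = 1.0, a new test
// error of 0.9985 beats 1.0 * 0.999 and therefore resets fConvergenceCount,
// while 0.9995 does not and the counter grows; convergence is declared once
// the counter reaches fConvergenceSteps.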
} // namespace DNN
} // namespace TMVA

#endif