#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>

namespace TMVA {
namespace DNN {

/** Generic gradient descent minimizer for the deep neural networks in
 *  TMVA::DNN, templated on the architecture (backend) type. */
template <typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError     = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount        = 0;
   }
   /** Train the given net using the given training input data (events),
    *  training output data (labels), test input data (events), and test
    *  output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t &TrainingDataIn, size_t nTrainingSamples,
                  const Data_t &TestDataIn, size_t nTestSamples,
                  Net_t &net, size_t nThreads = 1);
   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t &TrainingDataIn, size_t nTrainingSamples,
                          const Data_t &TestDataIn, size_t nTestSamples,
                          Net_t &net, Scalar_t momentum, size_t nThreads = 1);
   /** Perform a single optimization step on a given batch. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
   /** Same as Step(...) but also evaluate the loss on the given training
    *  data. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output);
   /** Perform multiple optimization steps simultaneously, one per
    *  network/batch pair. */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);
   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);
   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);
   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);
   /** Similar to StepReducedWeights(...) but also evaluates the loss. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine whether the minimization has converged. */
   bool HasConverged();
   /** Same as above, but uses the given test error value. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable decrease in the test error required for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the test set during the current training session.
};
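// Usage sketch (illustrative only, not part of the original interface): how a
// training session with this minimizer might look. The backend alias and the
// prepared `net`, `trainData` and `testData` objects are assumptions here;
// any TMVA::DNN architecture and a matching TNet/TDataLoader-compatible data
// container can take their place.
//
//    using Arch_t = TCpu<Double_t>;             // assumed backend
//    TGradientDescent<Arch_t> minimizer(0.001,  // learning rate
//                                       10,     // convergence steps
//                                       5);     // test interval (epochs)
//    Double_t minTestError = minimizer.Train(trainData, nTrainSamples,
//                                            testData, nTestSamples,
//                                            net, 2 /* nThreads */);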
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0),
     fTestInterval(0), fTrainingError(0), fTestError(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
     fConvergenceCount(0), fTestInterval(testInterval), fTrainingError(0),
     fTestError(0), fLearningRate(learningRate),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t &trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t &testData,
                                             size_t nTestSamples,
                                             Net_t &net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training and test data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One net per thread, sharing the master net's initial weights and biases.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++) {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs, then evaluate the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b            = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      fTestError = testNet.Loss(inputMatrix, outputMatrix);
   } while (!HasConverged());

   return fMinimumError;
}
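// Control-flow sketch of the training loop above, with plain scalars and with
// trainOneEpoch() and computeTestLoss() as hypothetical stand-ins for the
// batch loop and the testNet.Loss(...) evaluation:
//
//    double minimumError     = std::numeric_limits<double>::infinity();
//    size_t convergenceCount = 0;
//    do {
//       for (size_t step = 0; step < testInterval; step++)
//          trainOneEpoch();                      // batches, nThreads at a time
//       double testError = computeTestLoss();
//       if (testError < minimumError * 0.999) {  // 0.1% relative improvement,
//          convergenceCount = 0;                 // see HasConverged() below
//          minimumError     = testError;
//       } else {
//          convergenceCount++;
//       }
//    } while (convergenceCount < convergenceSteps);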
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t &trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t &testData,
                                                     size_t nTestSamples,
                                                     Net_t &net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training and test data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++) {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      // Average the test loss over the batches of one epoch.
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b            = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         fTestError += testNet.Loss(inputMatrix, outputMatrix);
      }
      fTestError /= (Scalar_t) batchesInEpoch;
   } while (!HasConverged());

   return fMinimumError;
}
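// TrainMomentum(...) differs from Train(...) in two ways: each batch group is
// dispatched to StepMomentum(...) when a nonzero momentum is given (plain
// Step(...) otherwise), and the test error is averaged over an epoch's worth
// of test batches instead of a single one. Usage sketch with the same
// schematic objects as above; the momentum value 0.9 is just a common choice:
//
//    auto minTestError = minimizer.TrainMomentum(trainData, nTrainSamples,
//                                                testData, nTestSamples,
//                                                net, 0.9, 2 /* nThreads */);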
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
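// Architecture_t::ScaleAdd(A, B, beta) computes A += beta * B, so each update
// above is the plain SGD rule W -> W - alpha * dL/dW with alpha the learning
// rate. Scalar analogue:
//
//    double w = 1.0, g = 0.25, alpha = 0.1;
//    w += -alpha * g;   // as ScaleAdd(W, dW, -fLearningRate): w -> 0.975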
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(
   Net_t &master,
   std::vector<Net_t> &nets,
   std::vector<TBatch<Architecture_t>> &batches)
{
   typename Architecture_t::Matrix_t dummy(0, 0);
   size_t depth = master.GetDepth();

   // Forward pass, interleaved over the batch-parallel nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
         nets[j].GetLayer(depth-1).GetActivationGradients(),
         nets[j].GetLossFunction(),
         batches[j].GetOutput(),
         nets[j].GetLayer(depth-1).GetOutput());
   }

   // Backward pass.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply the averaged gradients to the master net and synchronize the
   // workers back to the master weights.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++) {
         auto &masterLayer = master.GetLayer(i);
         auto &layer       = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate / nets.size());
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate / nets.size());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
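// Each worker net j contributes its batch gradient with weight -alpha/n, so
// the master performs one averaged SGD step,
//    W  <-  W - (alpha/n) * sum_j dL_j/dW,
// after which the workers are re-synchronized to the master weights. Scalar
// analogue for n = 2 batches:
//
//    double w = 1.0, g0 = 0.2, g1 = 0.4, alpha = 0.1;
//    w += -alpha / 2 * g0;   // contribution of net 0
//    w += -alpha / 2 * g1;   // contribution of net 1
//    // w == 1.0 - alpha * 0.3, a step along the batch-averaged gradient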
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(
   Net_t &master,
   std::vector<Net_t> &nets,
   std::vector<TBatch<Architecture_t>> &batches,
   Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0, 0);
   size_t depth = master.GetDepth();

   // Forward pass.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
         nets[j].GetLayer(depth-1).GetActivationGradients(),
         nets[j].GetLossFunction(),
         batches[j].GetOutput(),
         nets[j].GetLayer(depth-1).GetOutput());
   }

   // Backward pass; accumulate the momentum terms in the master's gradient
   // matrices.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  -fLearningRate / momentum / nets.size());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  -fLearningRate / momentum / nets.size());
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               -fLearningRate / momentum / nets.size());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               -fLearningRate / momentum / nets.size());
   }
   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the accumulated momentum terms and synchronize the workers.
   for (size_t i = 0; i < depth; i++) {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
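// The bookkeeping above stores the momentum term v in the master's gradient
// matrices and realizes v <- mu*v - alpha*g (g the batch-averaged gradient)
// followed by W <- W + v: gradients are accumulated with factor -alpha/(mu*n),
// and ScaleAdd(G, G, mu - 1.0) then multiplies G by mu, giving
//    G <- mu * (G - alpha/(mu*n) * sum_j g_j) = mu*G - (alpha/n) * sum_j g_j.
// Scalar check with mu = 0.9, alpha = 0.1, n = 1:
//
//    double v = 0.2, g = 0.5, alpha = 0.1, mu = 0.9;
//    v += -alpha / mu * g;   // accumulate: v = 0.2 - 0.0555...
//    v += (mu - 1.0) * v;    // scale by mu: v = 0.13
//    // identical to v = mu*0.2 - alpha*0.5 = 0.18 - 0.05 = 0.13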
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(
   Net_t &master,
   std::vector<Net_t> &nets,
   std::vector<TBatch<Architecture_t>> &batches,
   Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0, 0);
   size_t depth = master.GetDepth();

   // Forward pass.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
         nets[j].GetLayer(depth-1).GetActivationGradients(),
         nets[j].GetLossFunction(),
         batches[j].GetOutput(),
         nets[j].GetLayer(depth-1).GetOutput());
   }

   // Backward pass.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Move the workers to the lookahead position, then update the momentum
   // terms and apply them to the master net.
   for (size_t i = 0; i < depth; i++) {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate / momentum / nets.size());
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate / momentum / nets.size());
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
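// Nesterov variant: the gradients entering this step were evaluated at the
// workers' lookahead position (master weights plus momentum term), whereas
// plain StepMomentum evaluates them at the master weights. Scalar sketch of
// one such step; gradAt(...) is a hypothetical gradient evaluation:
//
//    double w = 1.0, v = 0.2, mu = 0.9, alpha = 0.1;
//    double wLook = w + v;        // lookahead point held by the workers
//    double g = gradAt(wLook);    // gradient at the lookahead point
//    v = mu * v - alpha * g;      // momentum update (accumulate-then-scale)
//    w += v;                      // master weight update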
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(
   Net_t &net,
   Matrix_t &input,
   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the first layer's bias terms are updated, for compatibility with
      // the previous implementation.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
   Net_t &net,
   Matrix_t &input,
   const Matrix_t &output)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}
//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount++;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   fStepCount += fTestInterval;
   return (fConvergenceCount >= fConvergenceSteps);
}
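// Convergence criterion in numbers: with convergenceSteps = 10, training
// stops once the test error has failed to improve by at least 0.1% (the
// 0.999 factor) for 10 consecutive test evaluations. Minimal sketch:
//
//    double minimumError = 1.00;
//    double testError    = 0.9995;  // only 0.05% better than the minimum
//    bool improved = testError < minimumError * 0.999;  // false: counts
//                                                       // toward convergence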
} // namespace DNN
} // namespace TMVA

#endif