#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include <limits>
#include <vector>

#include "DataLoader.h"
#include "Functions.h"
namespace TMVA {
namespace DNN {

/// Generic gradient descent minimizer, parametrized by the architecture
/// backend on which all matrix operations are performed.
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of epochs without considerable test-error decrease required for convergence.
   size_t   fConvergenceCount; ///< Current number of epochs without considerable test-error decrease.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;     ///< The minimum test loss achieved during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval);

   /// Reset minimizer object to default state.
   void Reset()
   {
      fMinimumError     = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount        = 0;
   }

   /// Train the given net using the given training input data (events) and
   /// training output data (labels), evaluating the test error on the given
   /// test data at regular intervals.
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t &TrainingDataIn, size_t nTrainingSamples,
                  const Data_t &TestDataIn, size_t nTestSamples,
                  Net_t &net, size_t nThreads = 1);

   /// Same as Train(...) but uses the given momentum.
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t &TrainingDataIn, size_t nTrainingSamples,
                          const Data_t &TestDataIn, size_t nTestSamples,
                          Net_t &net, Scalar_t momentum, size_t nThreads = 1);

   /// Perform a single optimization step on a given batch.
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /// Same as Step(...) but also evaluate the loss on the given training data.
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /// Perform multiple optimization steps simultaneously, one per batch/net pair.
   template <typename Net_t>
   void Step(Net_t &master, std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /// Same as the multi-batch Step(...) but uses momentum.
   template <typename Net_t>
   void StepMomentum(Net_t &master, std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches, Scalar_t momentum);

   /// Same as the multi-batch Step(...) but uses Nesterov momentum.
   template <typename Net_t>
   void StepNesterov(Net_t &master, std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches, Scalar_t momentum);

   /// Does not evaluate the loss and therefore does not trigger a possible
   /// synchronization with the device.
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /// Similar to StepReducedWeights(...) but also evaluates the loss.
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                   const Matrix_t &output, const Matrix_t &weights);

   /// Increases the minimization step counter by the test error evaluation period
   /// and uses the current internal test error to decide whether training converged.
   bool HasConverged();
   /// Same as HasConverged() but uses the given test error value.
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }
};
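
/* Minimal usage sketch (illustrative, not taken from the original file): the
 * backend, net and data types below are placeholders; any Net_t/Data_t
 * combination supported by TDataLoader is used the same way.
 *
 * \code
 * using Architecture_t = TReference<Double_t>;   // reference backend (placeholder choice)
 * using Net_t          = TNet<Architecture_t>;
 *
 * TGradientDescent<Architecture_t> minimizer(0.001,  // learning rate
 *                                            10,     // convergence steps
 *                                            5);     // test interval (epochs)
 * auto minTestError = minimizer.Train(trainingData, nTrainingSamples,
 *                                     testData, nTestSamples, net, 4);
 * \endcode
 */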
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0),
     fTestInterval(0), fLearningRate(0), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
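
// The default-constructed minimizer is not usable until the learning rate,
// convergence steps and test interval are set. A sketch of the two equivalent
// ways to configure it (values illustrative):
//
// \code
// TGradientDescent<Architecture_t> a(0.01, 10, 5);
//
// TGradientDescent<Architecture_t> b;
// b.SetLearningRate(0.01);
// b.SetConvergenceSteps(10);
// b.SetTestInterval(5);
// \endcode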
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data. The test net processes all test
   // samples in a single batch.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(), net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One clone of the net per thread, with weights and biases copied from the
   // master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs, then evaluate the test error once.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
   } while (!HasConverged());

   return fMinimumError;
}
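
// A note on the epoch arithmetic above (numbers illustrative): batchesInEpoch
// uses integer division, and the inner loop advances in strides of nThreads.
// With nTrainingSamples = 1000, a batch size of 32 and nThreads = 4,
// batchesInEpoch is 1000 / 32 = 31, so each epoch performs 8 parallel steps
// of 4 batches each.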
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data. Here the test net uses the same
   // batch size as the training net, so the test error is accumulated over
   // batches below.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(), net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}
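
// TrainMomentum with momentum = 0.0 falls back to the plain parallel Step, so
// a single call site can switch between the two optimizers via one parameter.
// A sketch (variable names illustrative):
//
// \code
// auto minTestError = minimizer.TrainMomentum(trainingData, nTrainingSamples,
//                                             testData, nTestSamples,
//                                             net, 0.9, 4);
// \endcode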
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
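
// The update performed above is plain stochastic gradient descent: for each
// layer l the weights and biases are moved against the gradient of the batch
// loss,
//
// \f[ W^{(l)} \leftarrow W^{(l)} - \alpha \frac{\partial L}{\partial W^{(l)}}, \qquad
//     \theta^{(l)} \leftarrow \theta^{(l)} - \alpha \frac{\partial L}{\partial \theta^{(l)}} \f]
//
// with \f$\alpha\f$ = fLearningRate.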
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output,
                                                       const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & master,
                                                   std::vector<Net_t> & nets,
                                                   std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, layer by layer across all nets, so that independent
   // batches can be processed concurrently.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply each worker net's gradients to the master net in sequence and copy
   // the updated weights back into the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }
}
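
// Since ScaleAdd(A, B, beta) computes A += beta * B, the final loop above
// applies each worker's gradients to the master weights in sequence, i.e. per
// layer
//
// \f[ W \leftarrow W - \alpha \sum_{j=1}^{n} \frac{\partial L_j}{\partial W} \f]
//
// where \f$L_j\f$ is the loss on batch j and n = nets.size(); the gradients of
// the n batches are summed, not averaged.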
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(Net_t & master,
                                                           std::vector<Net_t> & nets,
                                                           std::vector<TBatch<Architecture_t>> & batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, layer by layer across all nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation: accumulate the new gradients into the master
   // gradients, which act as the momentum velocity.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Update the master net with the accumulated momentum gradients and copy
   // the updated weights back into the worker nets.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }
}
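
// Written out, the accumulation above maintains the master gradients as a
// momentum velocity v. Per layer, with \f$\mu\f$ = momentum and n = nets.size():
//
// \f[ v \leftarrow \mu \, v - \alpha \sum_{j=1}^{n} \frac{\partial L_j}{\partial W},
//     \qquad W \leftarrow W + v \f]
//
// since adding the per-batch gradients scaled by \f$-\alpha/\mu\f$ and then
// computing v += (mu - 1) * v (i.e. scaling the total by mu) reproduces the
// standard momentum update.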
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(Net_t & master,
                                                           std::vector<Net_t> & nets,
                                                           std::vector<TBatch<Architecture_t>> & batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, layer by layer across all nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Update the master net and move the worker nets to the look-ahead point
   // at which the next gradients will be evaluated.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  momentum);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  momentum);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
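
// The Copy/ScaleAdd pair at the top of the final loop places each worker net
// at the look-ahead point \f$W + \mu v\f$, so the next call evaluates the
// gradients there, while the master weights follow the same velocity recursion
// as in StepMomentum:
//
// \f[ v \leftarrow \mu \, v - \alpha \sum_{j} \frac{\partial L_j}{\partial (W + \mu v)},
//     \qquad W \leftarrow W + v \f]
//
// which is the usual formulation of Nesterov accelerated gradient descent.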
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(Net_t & net,
                                                                 Matrix_t &input,
                                                                 const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t & net,
                                                                     Matrix_t &input,
                                                                     const Matrix_t &output,
                                                                     const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}
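
// In both reduced-weight steps above the weights of every layer are updated,
// but only the first layer's bias terms (the i == 0 branch), for compatibility
// with the previous implementation.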
//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount++;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
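
// Convergence bookkeeping, by way of a numeric example (values illustrative):
// with fConvergenceSteps = 10 and fTestInterval = 5, the no-argument overload
// (called once per test interval by Train) requires 10 consecutive test
// evaluations, i.e. 50 epochs, without the test error dropping below 99.9% of
// the best value seen so far. The overload taking the error directly advances
// the counter by fTestInterval per call and therefore converges after 2 such
// calls without improvement.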
} // namespace DNN
} // namespace TMVA

#endif