# Source code for netket.optimizer

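# Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.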

from . import qgt, solver

from .sr import SR

from .linear_operator import LinearOperator
from .preconditioner import (
LinearPreconditioner,
PreconditionerT,
identity_preconditioner,
)

## Optimisers

from netket.utils import _hide_submodules

def Sgd(learning_rate: float):
    r"""Stochastic Gradient Descent optimizer.

    The `Stochastic Gradient Descent <https://en.wikipedia.org/wiki/Stochastic_gradient_descent>`_
    is one of the most popular optimizers in machine learning applications.
    Given a stochastic estimate of the gradient of the cost function
    (:math:`G(\mathbf{p})`), it performs the update:

    .. math:: p^\prime_k = p_k -\eta G_k(\mathbf{p}),

    where :math:`\eta` is the so-called learning rate.

    .. note::
        Earlier versions of this documentation mentioned two extensions to
        simple SGD: :math:`L_2` regularization, and a decay factor
        :math:`\gamma \leq 1` such that at iteration :math:`n` the learning
        rate is :math:`\eta \gamma^n`. This function exposes neither of them
        as a parameter: only ``learning_rate`` is forwarded to ``optax.sgd``.

    Args:
        learning_rate: The learning rate :math:`\eta`.

    Returns:
        The optimizer object produced by ``optax.sgd(learning_rate)``.

    Examples:
        Simple SGD optimizer.

        >>> from netket.optimizer import Sgd
        >>> op = Sgd(learning_rate=0.05)
    """
    # Imported lazily, inside the function, like every other wrapper in this
    # module.
    from optax import sgd

    return sgd(learning_rate)

def Momentum(learning_rate: float, beta: float = 0.9, nesterov: bool = False):
    r"""Momentum-based optimizer.

    The momentum update incorporates an exponentially weighted moving average
    over previous gradients to speed up descent
    `Qian, N. (1999) <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.5612&rep=rep1&type=pdf>`_.
    The momentum vector :math:`\mathbf{m}` is initialized to zero.
    Given a stochastic estimate of the gradient of the cost function
    :math:`G(\mathbf{p})`, the updates for the parameter :math:`p_k` and
    corresponding component of the momentum :math:`m_k` are

    .. math::

        m^\prime_k &= \beta m_k + (1-\beta)G_k(\mathbf{p})\\
        p^\prime_k &= p_k - \eta m^\prime_k

    Args:
        learning_rate: The learning rate :math:`\eta`.
        beta: Momentum exponential decay rate, should be in [0,1].
        nesterov: Flag to use nesterov momentum correction.

    Returns:
        The optimizer object produced by
        ``optax.sgd(learning_rate, momentum=beta, nesterov=nesterov)``.

    Examples:
        Momentum optimizer.

        >>> from netket.optimizer import Momentum
        >>> op = Momentum(learning_rate=0.01)
    """
    from optax import sgd

    # NOTE(review): the accumulation itself is implemented by optax.sgd;
    # confirm that its normalization matches the (1 - beta) form documented
    # above.
    return sgd(learning_rate, momentum=beta, nesterov=nesterov)

# NOTE(review): the `def` line, the optax import and the head of the return
# statement were missing from this block; the name `AdaGrad` and the call to
# `optax.adagrad` are reconstructed from the surviving parameters and the
# sibling wrappers in this module — confirm against the upstream file.
def AdaGrad(
    learning_rate: float = 0.001,
    epscut: float = 1.0e-7,
    initial_accumulator_value: float = 0.1,
):
    r"""AdaGrad optimizer.

    In many cases, in Sgd the learning rate :math:`\eta` should
    decay as a function of training iteration to prevent overshooting
    as the optimum is approached. AdaGrad is an adaptive learning
    rate algorithm that automatically scales the learning rate with a sum
    over past gradients. The vector :math:`\mathbf{g}` is initialized to zero.
    Given a stochastic estimate of the gradient of the cost function
    :math:`G(\mathbf{p})`, the updates for :math:`g_k` and the parameter
    :math:`p_k` are

    .. math::

        g^\prime_k &= g_k + G_k(\mathbf{p})^2\\
        p^\prime_k &= p_k - \frac{\eta}{\sqrt{g_k + \epsilon}}G_k(\mathbf{p})

    AdaGrad tends to perform well when
    the gradients are sparse, but the learning rate may become too small
    after many updates because the sum over the squares of past gradients
    is cumulative.

    Args:
        learning_rate: Learning rate :math:`\eta`.
        epscut: Small :math:`\epsilon` cutoff.
        initial_accumulator_value: initial value of the accumulator

    Returns:
        The optimizer object produced by ``optax.adagrad``.

    Examples:
        Simple AdaGrad optimizer.

        >>> from netket.optimizer import AdaGrad
        >>> op = AdaGrad()
    """
    from optax import adagrad

    # `epscut` is this module's name for what optax calls `eps`.
    return adagrad(
        learning_rate, eps=epscut, initial_accumulator_value=initial_accumulator_value
    )

def Adam(
    learning_rate: float = 0.001,
    b1: float = 0.9,
    b2: float = 0.999,
    eps: float = 1e-08,
):
    r"""Adam optimizer.

    Args:
        learning_rate: Learning rate :math:`\eta`.
        b1: Decay rate for the exponentially weighted average of grads.
        b2: Decay rate for the exponentially weighted average of squared norm of grads.
        eps: Term added to the denominator to improve numerical stability.

    Returns:
        The optimizer object produced by
        ``optax.adam(learning_rate, b1=b1, b2=b2, eps=eps)``.
    """
    # The function body was missing; restored following the same
    # "lazy optax import, then forward the arguments" pattern as the other
    # wrappers in this module.
    from optax import adam

    return adam(learning_rate, b1=b1, b2=b2, eps=eps)

def RmsProp(
    learning_rate: float = 0.001,
    beta: float = 0.9,
    epscut: float = 1.0e-7,
    centered: bool = False,
):
    r"""RMSProp optimizer.

    RMSProp is a well-known update algorithm proposed by Geoff Hinton
    in his `Neural Networks course notes
    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
    After initializing the vector :math:`\mathbf{s}` to zero, :math:`s_k` and
    the parameters :math:`p_k` are updated as

    .. math::

        s^\prime_k &= \beta s_k + (1-\beta) G_k(\mathbf{p})^2 \\
        p^\prime_k &= p_k - \frac{\eta}{\sqrt{s_k}+\epsilon} G_k(\mathbf{p})

    Constructs a new ``RmsProp`` optimizer.

    Args:
        learning_rate: The learning rate :math:`\eta`
        beta: Exponential decay rate.
        epscut: Small cutoff value.
        centered: whether to center the moving average.

    Returns:
        The optimizer object produced by
        ``optax.rmsprop(learning_rate, decay=beta, eps=epscut, centered=centered)``.

    Examples:
        RmsProp optimizer.

        >>> from netket.optimizer import RmsProp
        >>> op = RmsProp(learning_rate=0.02)
    """
    from optax import rmsprop

    # `beta` and `epscut` are this module's names for optax's `decay` and
    # `eps`.
    # NOTE(review): the exact placement of epsilon (inside or outside the
    # square root) is decided by optax.rmsprop — confirm that the formula in
    # the docstring matches the installed optax version.
    return rmsprop(learning_rate, decay=beta, eps=epscut, centered=centered)

# Runs once, when this module is imported, with this module's own name.
# NOTE(review): judging by the helper's name this presumably removes the
# imported submodules from the package's public namespace — confirm against
# the implementation in netket.utils.
_hide_submodules(__name__)