Source code for netket.optimizer

# Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import qgt, solver

from .sr import SR

from .linear_operator import LinearOperator
from .preconditioner import (
    LinearPreconditioner,
    PreconditionerT,
    identity_preconditioner,
)

## Optimisers

from netket.utils import _hide_submodules


def Sgd(learning_rate: float):
    r"""Stochastic Gradient Descent Optimizer.

    The `Stochastic Gradient Descent
    <https://en.wikipedia.org/wiki/Stochastic_gradient_descent>`_ is one of
    the most popular optimizers in machine learning applications. Given a
    stochastic estimate of the gradient of the cost function
    (:math:`G(\mathbf{p})`), it performs the update:

    .. math:: p^\prime_k = p_k - \eta G_k(\mathbf{p}),

    where :math:`\eta` is the so-called learning rate.
    NetKet also implements two extensions to the simple SGD: the first one is
    :math:`L_2` regularization, and the second one is the possibility to set a
    decay factor :math:`\gamma \leq 1` for the learning rate, such that at
    iteration :math:`n` the learning rate is :math:`\eta \gamma^n`.

    Args:
        learning_rate: The learning rate :math:`\eta`.

    Examples:
        Simple SGD optimizer.

        >>> from netket.optimizer import Sgd
        >>> op = Sgd(learning_rate=0.05)
    """
    from optax import sgd

    return sgd(learning_rate)
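# A minimal usage sketch (not part of the module): the function above returns a
# plain optax GradientTransformation, so it can also be driven by hand through
# optax's standard init/update/apply_updates interface. The parameter tree and
# gradient below are illustrative placeholders only.
#
# >>> import jax.numpy as jnp
# >>> import optax
# >>> op = Sgd(learning_rate=0.05)
# >>> params = {"w": jnp.ones(3)}
# >>> grads = {"w": jnp.full(3, 0.1)}                 # stand-in for G(p)
# >>> opt_state = op.init(params)
# >>> updates, opt_state = op.update(grads, opt_state, params)
# >>> params = optax.apply_updates(params, updates)   # p' = p - eta * G(p)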
def Momentum(learning_rate: float, beta: float = 0.9, nesterov: bool = False):
    r"""Momentum-based Optimizer.

    The momentum update incorporates an exponentially weighted moving average
    over previous gradients to speed up descent
    `Qian, N. (1999) <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.5612&rep=rep1&type=pdf>`_.
    The momentum vector :math:`\mathbf{m}` is initialized to zero.
    Given a stochastic estimate of the gradient of the cost function
    :math:`G(\mathbf{p})`, the updates for the parameter :math:`p_k` and the
    corresponding component of the momentum :math:`m_k` are

    .. math:: m^\prime_k &= \beta m_k + (1-\beta)G_k(\mathbf{p})\\
              p^\prime_k &= p_k - \eta m^\prime_k

    Args:
        learning_rate: The learning rate :math:`\eta`
        beta: Momentum exponential decay rate, should be in [0,1].
        nesterov: Flag to use nesterov momentum correction

    Examples:
        Momentum optimizer.

        >>> from netket.optimizer import Momentum
        >>> op = Momentum(learning_rate=0.01)
    """
    from optax import sgd

    return sgd(learning_rate, momentum=beta, nesterov=nesterov)
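# A worked sketch of the two update equations in the docstring above, written
# out with jax.numpy so the recurrence is explicit; the names p, m, G, eta and
# beta are illustrative and not part of the API.
#
# >>> import jax.numpy as jnp
# >>> eta, beta = 0.01, 0.9
# >>> p = jnp.array([1.0, -0.5])
# >>> m = jnp.zeros_like(p)                     # momentum starts at zero
# >>> G = jnp.array([0.2, 0.1])                 # stochastic gradient estimate
# >>> m = beta * m + (1 - beta) * G             # m'_k = beta m_k + (1 - beta) G_k
# >>> p = p - eta * m                           # p'_k = p_k - eta m'_k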
def AdaGrad(
    learning_rate: float = 0.001,
    epscut: float = 1.0e-7,
    initial_accumulator_value: float = 0.1,
):
    r"""AdaGrad Optimizer.

    In many cases, in Sgd the learning rate :math:`\eta` should decay as a
    function of the training iteration to prevent overshooting as the optimum
    is approached. AdaGrad is an adaptive learning rate algorithm that
    automatically scales the learning rate with a sum over past gradients.
    The vector :math:`\mathbf{g}` is initialized to zero. Given a stochastic
    estimate of the gradient of the cost function :math:`G(\mathbf{p})`, the
    updates for :math:`g_k` and the parameter :math:`p_k` are

    .. math:: g^\prime_k &= g_k + G_k(\mathbf{p})^2\\
              p^\prime_k &= p_k - \frac{\eta}{\sqrt{g_k + \epsilon}}G_k(\mathbf{p})

    AdaGrad has been shown to perform particularly well when the gradients are
    sparse, but the learning rate may become too small after many updates
    because the sum over the squares of past gradients is cumulative.

    Args:
        learning_rate: Learning rate :math:`\eta`.
        epscut: Small :math:`\epsilon` cutoff.
        initial_accumulator_value: initial value of the accumulator

    Examples:
        Simple AdaGrad optimizer.

        >>> from netket.optimizer import AdaGrad
        >>> op = AdaGrad()
    """
    from optax import adagrad

    return adagrad(
        learning_rate, eps=epscut, initial_accumulator_value=initial_accumulator_value
    )
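# A worked sketch of the accumulator update described above; again the names
# g, p, G, eta and eps are only illustrative.
#
# >>> import jax.numpy as jnp
# >>> eta, eps = 0.001, 1.0e-7
# >>> p = jnp.array([1.0, -0.5])
# >>> g = jnp.zeros_like(p)                     # accumulator of squared gradients
# >>> G = jnp.array([0.2, 0.1])                 # stochastic gradient estimate
# >>> g = g + G**2                              # g'_k = g_k + G_k^2
# >>> p = p - eta / jnp.sqrt(g + eps) * G       # p'_k = p_k - eta / sqrt(g_k + eps) G_k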
def Adam(learning_rate: float = 0.001, b1: float = 0.9, b2: float = 0.999, eps=1e-08):
    r"""Adam Optimizer.

    Args:
        learning_rate: Learning rate :math:`\eta`.
        b1: Decay rate for the exponentially weighted average of grads.
        b2: Decay rate for the exponentially weighted average of squared norm of grads.
        eps: Term added to the denominator to improve numerical stability.
    """
    from optax import adam

    return adam(learning_rate, b1=b1, b2=b2, eps=eps)
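# The Adam docstring above carries no Examples section; a minimal construction
# in the same style as the other optimizers, with illustrative hyperparameters:
#
# >>> from netket.optimizer import Adam
# >>> op = Adam(learning_rate=0.001, b1=0.9, b2=0.999)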
def RmsProp(
    learning_rate: float = 0.001,
    beta: float = 0.9,
    epscut: float = 1.0e-7,
    centered: bool = False,
):
    r"""RMSProp optimizer.

    RMSProp is a well-known update algorithm proposed by Geoff Hinton in his
    `Neural Networks course notes
    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
    It corrects the problem with AdaGrad by using an exponentially weighted
    moving average over past squared gradients instead of a cumulative sum.
    After initializing the vector :math:`\mathbf{s}` to zero, :math:`s_k` and
    the parameters :math:`p_k` are updated as

    .. math:: s^\prime_k = \beta s_k + (1-\beta) G_k(\mathbf{p})^2 \\
              p^\prime_k = p_k - \frac{\eta}{\sqrt{s_k}+\epsilon} G_k(\mathbf{p})

    Constructs a new ``RmsProp`` optimizer.

    Args:
        learning_rate: The learning rate :math:`\eta`
        beta: Exponential decay rate.
        epscut: Small cutoff value.
        centered: whether to center the moving average.

    Examples:
        RmsProp optimizer.

        >>> from netket.optimizer import RmsProp
        >>> op = RmsProp(learning_rate=0.02)
    """
    from optax import rmsprop

    return rmsprop(learning_rate, decay=beta, eps=epscut, centered=centered)
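# A worked sketch mirroring the moving-average equations in the docstring above;
# s, p, G, eta, beta and eps are illustrative names only.
#
# >>> import jax.numpy as jnp
# >>> eta, beta, eps = 0.02, 0.9, 1.0e-7
# >>> p = jnp.array([1.0, -0.5])
# >>> s = jnp.zeros_like(p)                     # moving average of squared gradients
# >>> G = jnp.array([0.2, 0.1])                 # stochastic gradient estimate
# >>> s = beta * s + (1 - beta) * G**2          # s'_k = beta s_k + (1 - beta) G_k^2
# >>> p = p - eta / (jnp.sqrt(s) + eps) * G     # p'_k = p_k - eta / (sqrt(s_k) + eps) G_k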
_hide_submodules(__name__)