Source code for chemprop.schedulers

from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR



[docs]
def build_NoamLike_LRSched(
    optimizer: Optimizer,
    warmup_steps: int,
    cooldown_steps: int,
    init_lr: float,
    max_lr: float,
    final_lr: float,
) -> LambdaLR:
    r"""Build a Noam-like learning rate scheduler: a linear warmup followed by an exponential
    decay and then a constant learning rate.

    The learning rate increases linearly from ``init_lr`` to ``max_lr`` over the course of
    the first ``warmup_steps`` steps, then decreases exponentially to ``final_lr`` over the
    course of the next ``cooldown_steps`` steps, and is held at ``final_lr`` for every step
    after that. This is roughly based on the learning rate schedule from [1]_, section 5.3.

    Formally, the learning rate schedule is defined as:

    .. math::
        \mathtt{lr}(i) &=
            \begin{cases}
                \mathtt{init\_lr} + \delta \cdot i
                    &\text{if } i < \mathtt{warmup\_steps} \\
                \mathtt{max\_lr} \cdot \left( \frac{\mathtt{final\_lr}}{\mathtt{max\_lr}} \right)^{\gamma(i)}
                    &\text{if } \mathtt{warmup\_steps} \le i < \mathtt{warmup\_steps} + \mathtt{cooldown\_steps} \\
                \mathtt{final\_lr} &\text{otherwise} \\
            \end{cases}
        \\
        \delta &\mathrel{:=}
            \frac{\mathtt{max\_lr} - \mathtt{init\_lr}}{\mathtt{warmup\_steps}} \\
        \gamma(i) &\mathrel{:=}
            \frac{i - \mathtt{warmup\_steps}}{\mathtt{cooldown\_steps}}

    Parameters
    -----------
    optimizer : Optimizer
        A PyTorch optimizer.
    warmup_steps : int
        The number of steps during which to linearly increase the learning rate.
    cooldown_steps : int
        The number of steps during which to exponentially decay the learning rate.
    init_lr : float
        The initial learning rate. Must be nonzero, since every multiplier is computed
        relative to it.
    max_lr : float
        The maximum learning rate (achieved after ``warmup_steps``).
    final_lr : float
        The final learning rate (achieved after ``warmup_steps + cooldown_steps``).

    Returns
    -------
    LambdaLR
        A scheduler wrapping ``optimizer`` whose multiplicative factor at step ``i`` is
        ``lr(i) / init_lr``.

    Notes
    -----
    ``LambdaLR`` scales the optimizer's initial learning rate by the returned factor, so the
    schedule above is only reproduced exactly when ``optimizer`` was created with a learning
    rate of ``init_lr``.

    References
    ----------
    .. [1] Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, Ł. and Polosukhin, I. "Attention is all you need." Advances in neural information processing systems, 2017, 30. https://arxiv.org/abs/1706.03762
    """
    # Both factors are constant for the lifetime of the scheduler, so compute them once here
    # instead of on every call. A phase with no steps is never entered, so its factor is
    # replaced by a neutral placeholder to avoid dividing by zero.
    warmup_factor = (max_lr - init_lr) / warmup_steps if warmup_steps > 0 else 0.0
    cooldown_factor = (final_lr / max_lr) ** (1 / cooldown_steps) if cooldown_steps > 0 else 1.0
    cooldown_end = warmup_steps + cooldown_steps

    def lr_lambda(step: int) -> float:
        """Return the ratio ``lr(step) / init_lr`` expected by ``LambdaLR``."""
        if step < warmup_steps:
            # (init_lr + delta * step) / init_lr, with the constant term already divided out
            return step * warmup_factor / init_lr + 1
        if step < cooldown_end:
            return (max_lr * (cooldown_factor ** (step - warmup_steps))) / init_lr
        return final_lr / init_lr

    return LambdaLR(optimizer, lr_lambda)