# Copyright 2023, MASSACHUSETTS INSTITUTE OF TECHNOLOGY
# Subject to FAR 52.227-11 – Patent Rights – Ownership by the Contractor (May 2014).
# SPDX-License-Identifier: MIT
from typing import Union
import torch
from torch.optim import Optimizer
from rai_toolbox._typing import OptimParams
from rai_toolbox._utils import check_param_group_value, value_check
from .lp_space import L1qNormedGradientOptim, L2NormedGradientOptim, SignedGradientOptim
from .optimizer import ParamTransformingOptimizer
_TINY = torch.finfo(torch.float32).tiny
__all__ = [
    "FrankWolfe",
    "L1FrankWolfe",
    "L2FrankWolfe",
    "L1qFrankWolfe",
    "LinfFrankWolfe",
    "L1qNormedGradientOptim",
]
class FrankWolfe(Optimizer):
r"""Implements the Frank-Wolfe minimization algorithm [1]_.
.. math::
        w_{k+1} = (1 - l_r) w_k + l_r \cdot s_k
where :math:`s_k` is the linear minimization oracle (LMO).
It is critical to note that this optimizer assumes that the `.grad` attribute
of each parameter has been modified so as to store the *negative* of the LMO
for that parameter, and not the gradient itself.
References
----------
.. [1] https://en.wikipedia.org/wiki/Frank%E2%80%93Wolfe_algorithm#Algorithm"""
def __init__(
self,
params: OptimParams,
*,
lr: float = 2.0,
lmo_scaling_factor: float = 1.0,
use_default_lr_schedule: bool = True,
        div_by_zero_eps: float = _TINY,
):
r"""
Parameters
----------
params : Iterable
Iterable of tensor parameters to optimize or dicts defining parameter groups.
lr : float, optional (default=2.0)
Indicates the weight with which the LMO contributes to the parameter
update. See ``use_default_lr_schedule`` for additional details. If
            ``use_default_lr_schedule=False`` then ``lr`` must be in the
domain [0, 1].
lmo_scaling_factor : float, optional (default=1.0)
A scaling factor applied to :math:`s_k` prior to each step.
        use_default_lr_schedule : bool, optional (default=True)
            If ``True``, then the per-parameter "learning rate" is scaled
            by :math:`\hat{l_r} = l_r / (l_r + k)`, where :math:`k` is the update
            index for that parameter (starting at 0). With the default ``lr=2.0``
            this reproduces the standard Frank-Wolfe step size :math:`2 / (k + 2)`.
div_by_zero_eps : float, optional (default=`torch.finfo(torch.float32).tiny`)
Prevents div-by-zero error in learning rate schedule.
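
        Examples
        --------
        A minimal, illustrative example of driving `FrankWolfe` directly; in practice
        it typically serves as the `InnerOpt` of the normed Frank-Wolfe optimizers in
        this module. Because this optimizer expects each parameter's `.grad` to hold
        the *negative* LMO, we set the gradient by hand here rather than via backprop.

        >>> import torch as tr
        >>> from rai_toolbox.optim import FrankWolfe
        >>> x = tr.tensor([1.0, 1.0], requires_grad=True)
        >>> optim = FrankWolfe([x])

        Suppose the LMO for this step is :math:`s_0 = (0, 1)`; we store its negative
        on `x.grad` and take a step. With the default schedule, the first update uses
        :math:`\hat{l_r} = 2 / (2 + 0) = 1`, so the parameter is replaced by the LMO.

        >>> x.grad = tr.tensor([0.0, -1.0])
        >>> optim.step()
        >>> x
        tensor([0., 1.], requires_grad=True)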
"""
self._eps = value_check("div_by_zero_eps", div_by_zero_eps, min_=0.0)
self._use_default_lr_schedule = value_check(
"use_default_lr_schedule", use_default_lr_schedule, type_=bool
)
defaults = dict(lr=lr, lmo_scaling_factor=lmo_scaling_factor)
super().__init__(params, defaults) # type: ignore
check_param_group_value("lmo_scaling_factor", self.param_groups)
check_param_group_value(
"lr",
self.param_groups,
min_=0.0,
max_=1.0 if not self._use_default_lr_schedule else None,
)
@torch.no_grad()
def step(self, closure=None):
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group in self.param_groups:
lr = group["lr"]
lmo_scale = group["lmo_scaling_factor"]
for p in group["params"]:
if p.grad is not None:
state = self.state[p]
if not state:
state["step"] = 0
alpha = (
lr / max(self._eps, lr + state["step"])
if self._use_default_lr_schedule
else lr
)
# alpha weighting included in LMO (more efficient)
weighted_lmo = -(alpha * lmo_scale) * p.grad
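                    # In-place convex update: p <- (1 - alpha) * p + alpha * lmo_scale * s_k,
                    # where `p.grad` holds -s_k (the negative LMO).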
p *= 1 - alpha
p += weighted_lmo
state["step"] += 1
return loss
class L1qFrankWolfe(L1qNormedGradientOptim):
r"""A Frank-Wolfe [1]_ optimizer that, when computing the LMO, sparsifies a
parameter's gradient. Each updated parameter is constrained to fall within an
:math:`\epsilon`-sized ball in :math:`L^1` space, centered on the origin.
The sparsification process retains only the signs (i.e., :math:`\pm 1`) of the
gradient's elements. The transformation is applied to the gradient in accordance
with `param_ndim`.
Notes
-----
This parameter-transforming optimizer is useful for visual concept probing [2]_.
See Also
--------
FrankWolfe
L1FrankWolfe
L2FrankWolfe
LinfFrankWolfe
References
----------
.. [1] https://en.wikipedia.org/wiki/Frank%E2%80%93Wolfe_algorithm#Algorithm
.. [2] Roberts, Jay, and Theodoros Tsiligkaridis. Controllably Sparse Perturbations of Robust Classifiers for Explaining Predictions and Probing Learned Concepts. (2021).
"""
def __init__(
self,
params: OptimParams,
*,
q: float,
epsilon: float,
dq: float = 0.0,
lr: float = 2.0,
use_default_lr_schedule: bool = True,
param_ndim: Union[int, None] = -1,
div_by_zero_eps: float = _TINY,
generator: torch.Generator = torch.default_generator,
):
r"""
Parameters
----------
params : Sequence[Tensor] | Iterable[Mapping[str, Any]]
Iterable of parameters or dicts defining parameter groups.
q : float
Specifies the (fractional) percentile of absolute-largest gradient elements
            to retain when sparsifying the gradient. E.g., `q=0.9` means that only
            the gradient elements at or above the 90th percentile (by absolute value)
            will be retained. Must be within `[0.0, 1.0]`. The sparsification is
            applied to the gradient in accordance with `param_ndim`.
epsilon : float
Specifies the size of the L1-space ball that all parameters will be
projected into, post optimization step.
dq : float, optional (default=0.0)
If specified, the sparsity factor for each gradient transformation will
            be drawn from a uniform distribution over :math:`[q - dq, q + dq] \subseteq [0.0, 1.0]`.
lr : float, optional (default=2.0)
Indicates the weight with which the LMO contributes to the parameter
update. See `use_default_lr_schedule` for additional details. If
            `use_default_lr_schedule=False` then `lr` must be in the
domain `[0, 1]`.
use_default_lr_schedule : bool, optional (default=True)
If `True`, then the per-parameter "learning rate" is scaled
by :math:`\hat{l_r} = l_r / (l_r + k)` where k is the update index
for that parameter, which starts at 0.
param_ndim : Union[int, None], optional (default=-1)
Determines how a parameter and its gradient is temporarily reshaped prior
to being passed to both `_pre_step_transform_` and `_post_step_transform_`.
            By default, the transformation broadcasts over the tensor's first
            dimension in a batch-like style. This can be specified per param-group:
- A positive number determines the dimensionality of the tensor that the transformation will act on.
- A negative number indicates the 'offset' from the dimensionality of the tensor (see "Notes" for examples).
- `None` means that the transformation will be applied directly to the tensor without any broadcasting.
See `ParamTransformingOptimizer` for more details and examples.
div_by_zero_eps : float, optional (default=`torch.finfo(torch.float32).tiny`)
Prevents div-by-zero error in learning rate schedule.
generator : torch.Generator, optional (default=`torch.default_generator`)
Controls the RNG source.
Examples
--------
        Using `L1qFrankWolfe`, we'll sparsify the parameter's gradient so as to retain
        the signs of only the top 70% of its elements, and we'll constrain the updated
        parameter to fall within an :math:`L^1`-ball of radius `1.8`.
>>> import torch as tr
>>> from rai_toolbox.optim import L1qFrankWolfe
Creating a parameter for our optimizer to update, and our optimizer. We
specify `param_ndim=None` so that the sparsification/normalization occurs on the
gradient without any broadcasting.
>>> x = tr.tensor([1.0, 1.0, 1.0], requires_grad=True)
>>> optim = L1qFrankWolfe(
... [x],
... q=0.30,
... epsilon=1.8,
... param_ndim=None,
... )
Performing a simple calculation with `x` and performing backprop to create
a gradient.
>>> (tr.tensor([0.0, 1.0, 2.0]) * x).sum().backward()
Performing a step with our optimizer uses the Frank-Wolfe algorithm to update
        its parameters; the resulting parameter was updated with an LMO based on a
        sparsified, sign-only gradient. Note that the parameter falls within/on the
:math:`L^1`-ball of radius `1.8`.
>>> optim.step()
        >>> x  # the updated parameter; has an L1-norm of 1.8
tensor([ 0.0000, -0.9000, -0.9000], requires_grad=True)
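
        We can confirm that the updated parameter satisfies the :math:`L^1` constraint:

        >>> tr.linalg.norm(x.detach(), ord=1)
        tensor(1.8000)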
"""
super().__init__(
params,
InnerOpt=FrankWolfe,
lmo_scaling_factor=epsilon,
q=q,
dq=dq,
param_ndim=param_ndim,
div_by_zero_eps=div_by_zero_eps,
generator=generator,
lr=lr,
use_default_lr_schedule=use_default_lr_schedule,
)
class L1FrankWolfe(ParamTransformingOptimizer):
r"""A Frank-Wolfe [1]_ optimizer that constrains each updated parameter to fall
within an :math:`\epsilon`-sized ball in :math:`L^1` space, centered on the origin.
Notes
-----
    The method `L1FrankWolfe._pre_step_transform_` is responsible for computing the
    *negative* linear minimization oracle for a parameter and storing it on
    `param.grad`.
See Also
--------
FrankWolfe
L2FrankWolfe
LinfFrankWolfe
L1qFrankWolfe
References
----------
.. [1] https://en.wikipedia.org/wiki/Frank%E2%80%93Wolfe_algorithm#Algorithm
"""
def __init__(
self,
params: OptimParams,
*,
epsilon: float,
lr: float = 2.0,
use_default_lr_schedule: bool = True,
param_ndim: Union[int, None] = -1,
div_by_zero_eps: float = _TINY,
):
r"""
Parameters
----------
params : Sequence[Tensor] | Iterable[Mapping[str, Any]]
Iterable of parameters or dicts defining parameter groups.
epsilon : float
            The radius of the L1 ball to which each updated parameter will be
            constrained. Can be specified per parameter-group.
lr : float, optional (default=2.0)
Indicates the weight with which the LMO contributes to the parameter
update. See `use_default_lr_schedule` for additional details. If
            `use_default_lr_schedule=False` then `lr` must be in the
domain `[0, 1]`.
use_default_lr_schedule : bool, optional (default=True)
If ``True``, then the per-parameter "learning rate" is scaled
by :math:`\hat{l_r} = l_r / (l_r + k)` where k is the update index
for that parameter, which starts at 0.
param_ndim : Union[int, None], optional (default=-1)
Determines how a parameter and its gradient is temporarily reshaped prior
to being passed to both `_pre_step_transform_` and `_post_step_transform_`.
            By default, the transformation broadcasts over the tensor's first
            dimension in a batch-like style. This can be specified per param-group:
- A positive number determines the dimensionality of the tensor that the transformation will act on.
- A negative number indicates the 'offset' from the dimensionality of the tensor (see "Notes" for examples).
- `None` means that the transformation will be applied directly to the tensor without any broadcasting.
See `ParamTransformingOptimizer` for more details and examples.
div_by_zero_eps : float, optional (default=`torch.finfo(torch.float32).tiny`)
Prevents div-by-zero error in learning rate schedule.
Examples
--------
        Using `L1FrankWolfe`, we'll constrain the updated parameter to fall within an
        :math:`L^1`-ball of radius `1.8`.
>>> import torch as tr
>>> from rai_toolbox.optim import L1FrankWolfe
Creating a parameter for our optimizer to update, and our optimizer. We
        specify `param_ndim=None` so that the constraint is applied to the parameter
        without any broadcasting.
>>> x = tr.tensor([1.0, 1.0], requires_grad=True)
>>> optim = L1FrankWolfe([x], epsilon=1.8, param_ndim=None)
Performing a simple calculation with `x` and performing backprop to create
a gradient.
>>> (tr.tensor([1.0, 2.0]) * x).sum().backward()
Performing a step with our optimizer uses the Frank-Wolfe algorithm to update
its parameters. Note that the updated parameter falls within/on the
:math:`L^1`-ball of radius `1.8`.
>>> optim.step()
>>> x
tensor([ 0.0000, -1.8000], requires_grad=True)
"""
super().__init__(
params,
InnerOpt=FrankWolfe,
lmo_scaling_factor=epsilon,
lr=lr,
use_default_lr_schedule=use_default_lr_schedule,
param_ndim=param_ndim,
div_by_zero_eps=div_by_zero_eps,
)
def _pre_step_transform_(self, param: torch.Tensor, **_unused_kwargs) -> None:
if param.grad is None: # pragma: no cover
return
# Computes the negative linear minimization oracle and sets it to
# `param.grad`
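        # The LMO of an L1 ball is a signed one-hot vector aligned with the
        # absolute-largest gradient coordinate; `param` and `param.grad` are
        # presented here as 2D (batch, flattened-features) views, per `param_ndim`,
        # which is why we operate along `dim=1`.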
argmax = torch.argmax(torch.abs(param.grad), dim=1)
signs = torch.sign(param.grad[torch.arange(len(param)), argmax])
param.grad.mul_(0.0)
param.grad[torch.arange(len(param)), argmax] = signs
class L2FrankWolfe(L2NormedGradientOptim):
r"""A Frank-Wolfe [1]_ optimizer that constrains each updated parameter to fall
within an :math:`\epsilon`-sized ball in :math:`L^2` space, centered on the origin.
    This parameter-transforming optimizer is useful for producing error
    counterfactuals and for performing visual concept probing [2]_.
Notes
-----
The method `L2NormedGradientOptim._pre_step_transform_` is responsible for
computing the *negative* linear minimization oracle for a parameter and storing it
on `param.grad`.
See Also
--------
FrankWolfe
L1FrankWolfe
LinfFrankWolfe
L1qFrankWolfe
References
----------
.. [1] https://en.wikipedia.org/wiki/Frank%E2%80%93Wolfe_algorithm#Algorithm
.. [2] Roberts, Jay, and Theodoros Tsiligkaridis. Controllably Sparse Perturbations of Robust Classifiers for Explaining Predictions and Probing Learned Concepts. (2021).
"""
def __init__(
self,
params: OptimParams,
*,
epsilon: float,
lr: float = 2.0,
use_default_lr_schedule: bool = True,
param_ndim: Union[int, None] = -1,
div_by_zero_eps: float = _TINY,
):
r"""
Parameters
----------
params : Sequence[Tensor] | Iterable[Mapping[str, Any]]
Iterable of parameters or dicts defining parameter groups.
epsilon : float
            The radius of the L2 ball to which each updated parameter will be
            constrained. Can be specified per parameter-group.
lr : float, optional (default=2.0)
Indicates the weight with which the LMO contributes to the parameter
update. See `use_default_lr_schedule` for additional details. If
            `use_default_lr_schedule=False` then `lr` must be in the
domain `[0, 1]`.
use_default_lr_schedule : bool, optional (default=True)
If ``True``, then the per-parameter "learning rate" is scaled
by :math:`\hat{l_r} = l_r / (l_r + k)` where k is the update index
for that parameter, which starts at 0.
param_ndim : Union[int, None], optional (default=-1)
Determines how a parameter and its gradient is temporarily reshaped prior
to being passed to both `_pre_step_transform_` and `_post_step_transform_`.
            By default, the transformation broadcasts over the tensor's first
            dimension in a batch-like style. This can be specified per param-group:
- A positive number determines the dimensionality of the tensor that the transformation will act on.
- A negative number indicates the 'offset' from the dimensionality of the tensor (see "Notes" for examples).
- `None` means that the transformation will be applied directly to the tensor without any broadcasting.
See `ParamTransformingOptimizer` for more details and examples.
div_by_zero_eps : float, optional (default=`torch.finfo(torch.float32).tiny`)
Prevents div-by-zero error in learning rate schedule.
Examples
--------
        Using `L2FrankWolfe`, we'll constrain the updated parameter to fall within an
        :math:`L^2`-ball of radius `1.8`.
>>> import torch as tr
>>> from rai_toolbox.optim import L2FrankWolfe
Creating a parameter for our optimizer to update, and our optimizer. We
        specify `param_ndim=None` so that the constraint is applied to the parameter
        without any broadcasting.
>>> x = tr.tensor([1.0, 1.0], requires_grad=True)
>>> optim = L2FrankWolfe([x], epsilon=1.8, param_ndim=None)
Performing a simple calculation with `x` and performing backprop to create
a gradient.
>>> (tr.tensor([1.0, 2.0]) * x).sum().backward()
Performing a step with our optimizer uses the Frank-Wolfe algorithm to update
its parameters. Note that the updated parameter falls within/on the
:math:`L^2`-ball of radius `1.8`.
>>> optim.step()
>>> x
tensor([-0.8050, -1.6100], requires_grad=True)
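
        And we can verify that `x` lies on the :math:`L^2`-ball of radius `1.8`:

        >>> tr.linalg.norm(x.detach())
        tensor(1.8000)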
"""
super().__init__(
params,
InnerOpt=FrankWolfe,
lmo_scaling_factor=epsilon,
lr=lr,
use_default_lr_schedule=use_default_lr_schedule,
param_ndim=param_ndim,
div_by_zero_eps=div_by_zero_eps,
)
class LinfFrankWolfe(SignedGradientOptim):
r"""A Frank-Wolfe [1]_ optimizer that constrains each updated parameter to fall
within an :math:`\epsilon`-sized ball in :math:`L^\infty` space, centered on the origin.
Notes
-----
The method `SignedGradientOptim._pre_step_transform_` is responsible for
computing the *negative* linear minimization oracle for a parameter and storing it
on `param.grad`.
See Also
--------
FrankWolfe
L1FrankWolfe
L2FrankWolfe
L1qFrankWolfe
References
----------
.. [1] https://en.wikipedia.org/wiki/Frank%E2%80%93Wolfe_algorithm#Algorithm
Examples
--------
    Using `LinfFrankWolfe`, we'll constrain the updated parameter to fall within an
    :math:`L^\infty`-ball of radius `1.8`.
>>> import torch as tr
>>> from rai_toolbox.optim import LinfFrankWolfe
Creating a parameter for our optimizer to update, and our optimizer. We
    specify `param_ndim=None` so that the constraint is applied to the parameter
    without any broadcasting.
>>> x = tr.tensor([1.0, 1.0], requires_grad=True)
    >>> optim = LinfFrankWolfe([x], epsilon=1.8, param_ndim=None)
Performing a simple calculation with `x` and performing backprop to create
a gradient.
>>> (tr.tensor([1.0, 2.0]) * x).sum().backward()
Performing a step with our optimizer uses the Frank-Wolfe algorithm to update
its parameters. Note that the updated parameter falls within/on the
:math:`L^\infty`-ball of radius `1.8`.
>>> optim.step()
>>> x
tensor([-1.8000, -1.8000], requires_grad=True)"""
def __init__(
self,
params: OptimParams,
*,
epsilon: float,
lr: float = 2.0,
use_default_lr_schedule: bool = True,
param_ndim: Union[int, None] = -1,
div_by_zero_eps: float = _TINY,
):
r"""
Parameters
----------
params : Sequence[Tensor] | Iterable[Mapping[str, Any]]
Iterable of parameters or dicts defining parameter groups.
epsilon : float
            The radius of the :math:`L^\infty` ball to which each updated parameter
            will be constrained. Can be specified per parameter-group.
lr : float, optional (default=2.0)
Indicates the weight with which the LMO contributes to the parameter
update. See `use_default_lr_schedule` for additional details. If
            `use_default_lr_schedule=False` then `lr` must be in the
domain `[0, 1]`.
use_default_lr_schedule : bool, optional (default=True)
If ``True``, then the per-parameter "learning rate" is scaled
by :math:`\hat{l_r} = l_r / (l_r + k)` where k is the update index
for that parameter, which starts at 0.
param_ndim : Union[int, None], optional (default=-1)
Determines how a parameter and its gradient is temporarily reshaped prior
to being passed to both `_pre_step_transform_` and `_post_step_transform_`.
            By default, the transformation broadcasts over the tensor's first
            dimension in a batch-like style. This can be specified per param-group:
- A positive number determines the dimensionality of the tensor that the transformation will act on.
- A negative number indicates the 'offset' from the dimensionality of the tensor (see "Notes" for examples).
- `None` means that the transformation will be applied directly to the tensor without any broadcasting.
See `ParamTransformingOptimizer` for more details and examples.
div_by_zero_eps : float, optional (default=`torch.finfo(torch.float32).tiny`)
Prevents div-by-zero error in learning rate schedule.
"""
super().__init__(
params,
InnerOpt=FrankWolfe,
lmo_scaling_factor=epsilon,
lr=lr,
use_default_lr_schedule=use_default_lr_schedule,
param_ndim=param_ndim,
div_by_zero_eps=div_by_zero_eps,
)