Utilization

`get_supported_optimizers(filters=None)`

Return list of available optimizer names, sorted alphabetically.

Parameters:

Name	Type	Description	Default
`filters`	`Optional[Union[str, List[str]]]`	Optional[Union[str, List[str]]]. Wildcard filter string (or list of strings) matched with `fnmatch`. If None, the whole list is returned.	`None`

Source code in pytorch_optimizer/optimizer/__init__.py

def get_supported_optimizers(filters: Optional[Union[str, List[str]]] = None) -> List[str]:
    r"""Return the names of the registered optimizers in alphabetical order.

    :param filters: Optional[Union[str, List[str]]]. a wildcard pattern, or a list/tuple of patterns, matched with
        ``fnmatch``. a name is kept when it matches at least one pattern. if None, every registered name is returned.
    """
    names = OPTIMIZERS.keys()
    if filters is None:
        return sorted(names)

    # a single pattern is treated as a one-element collection of patterns
    patterns: Sequence[str] = [filters] if not isinstance(filters, (tuple, list)) else filters

    # union of the matches of every pattern; the set removes names matched more than once
    matched: Set[str] = set().union(*(fnmatch.filter(names, pattern) for pattern in patterns))

    return sorted(matched)

`get_supported_lr_schedulers(filters=None)`

Return list of available lr scheduler names, sorted alphabetically.

Parameters:

Name	Type	Description	Default
`filters`	`Optional[Union[str, List[str]]]`	Optional[Union[str, List[str]]]. Wildcard filter string (or list of strings) matched with `fnmatch`. If None, the whole list is returned.	`None`

Source code in pytorch_optimizer/lr_scheduler/__init__.py

def get_supported_lr_schedulers(filters: Optional[Union[str, List[str]]] = None) -> List[str]:
    r"""Return the names of the registered lr schedulers in alphabetical order.

    :param filters: Optional[Union[str, List[str]]]. a wildcard pattern, or a list/tuple of patterns, matched with
        ``fnmatch``. a name is kept when it matches at least one pattern. if None, every registered name is returned.
    """
    names = LR_SCHEDULERS.keys()
    if filters is None:
        return sorted(names)

    # a single pattern is treated as a one-element collection of patterns
    patterns: Sequence[str] = [filters] if not isinstance(filters, (tuple, list)) else filters

    # union of the matches of every pattern; the set removes names matched more than once
    matched: Set[str] = set().union(*(fnmatch.filter(names, pattern) for pattern in patterns))

    return sorted(matched)

`get_supported_loss_functions(filters=None)`

Return list of available loss function names, sorted alphabetically.

Parameters:

Name	Type	Description	Default
`filters`	`Optional[Union[str, List[str]]]`	Optional[Union[str, List[str]]]. Wildcard filter string (or list of strings) matched with `fnmatch`. If None, the whole list is returned.	`None`

Source code in pytorch_optimizer/loss/__init__.py

def get_supported_loss_functions(filters: Optional[Union[str, List[str]]] = None) -> List[str]:
    r"""Return the names of the registered loss functions in alphabetical order.

    :param filters: Optional[Union[str, List[str]]]. a wildcard pattern, or a list/tuple of patterns, matched with
        ``fnmatch``. a name is kept when it matches at least one pattern. if None, every registered name is returned.
    """
    names = LOSS_FUNCTIONS.keys()
    if filters is None:
        return sorted(names)

    # a single pattern is treated as a one-element collection of patterns
    patterns: Sequence[str] = [filters] if not isinstance(filters, (tuple, list)) else filters

    # union of the matches of every pattern; the set removes names matched more than once
    matched: Set[str] = set().union(*(fnmatch.filter(names, pattern) for pattern in patterns))

    return sorted(matched)

`CPUOffloadOptimizer`

Offload optimizer to CPU for single-GPU training. This will reduce GPU memory by the size of optimizer state.

Reference: https://github.com/pytorch/ao/blob/main/torchao/prototype/low_bit_optim/cpu_offload.py

Parameters:

Name	Type	Description	Default
`params`	`PARAMETERS`	PARAMETERS. a list of parameters or parameter groups.	required
`optimizer_class`	`Type[Optimizer]`	Type[torch.optim.Optimizer]. constructor of the base optimizer. Defaults to :class:`torch.optim.AdamW`.	`AdamW`
`offload_gradients`	`bool`	bool. free GPU gradients once they are moved to CPU. Not compatible with gradient accumulation.	`False`
`kwargs`		other keyword arguments to be passed to the base optimizer e.g. `lr`, `weight_decay`.	`{}`

Source code in pytorch_optimizer/optimizer/utils.py

class CPUOffloadOptimizer:  # pragma: no cover
    """Offload optimizer to CPU for single-GPU training. This will reduce GPU memory by the size of optimizer state.

    Every parameter gets a pinned CPU copy (value plus gradient buffer) and its own instance of the base optimizer,
    which updates that CPU copy. Gradients are copied into the CPU buffers from a post-accumulate-grad hook, and
    :meth:`step` runs the per-parameter optimizers and copies the updated CPU values back into the original
    parameters.

    Reference: https://github.com/pytorch/ao/blob/main/torchao/prototype/low_bit_optim/cpu_offload.py

    :param params: PARAMETERS. a list of parameters or parameter groups.
    :param optimizer_class: Type[torch.optim.Optimizer]. constructor of the base optimizer. Defaults to
        :class:`torch.optim.AdamW`.
    :param offload_gradients: bool. free GPU gradients once they are moved to CPU. Not compatible with gradient
        accumulation.
    :param kwargs: other keyword arguments to be passed to the base optimizer e.g. `lr`, `weight_decay`.
    :raises ValueError: if `params` is empty.
    """

    def __init__(
        self,
        params: PARAMETERS,
        optimizer_class: Type[torch.optim.Optimizer] = torch.optim.AdamW,
        *,
        offload_gradients: bool = False,
        **kwargs,
    ) -> None:
        # default to `fused=True` for AdamW when the TORCH_VERSION_AT_LEAST_2_4 flag is set and the caller did not
        # pass `fused` explicitly
        if optimizer_class is torch.optim.AdamW and TORCH_VERSION_AT_LEAST_2_4 and 'fused' not in kwargs:
            kwargs.update(fused=True)

        # expand generators, then normalise a flat parameter list into a single parameter group
        param_groups = list(params)
        if len(param_groups) == 0:
            raise ValueError('optimizer got an empty parameter list')
        if not isinstance(param_groups[0], dict):
            param_groups = [{'params': param_groups}]

        # original parameter -> pinned CPU copy of that parameter
        self.param_cuda2cpu_map = {}
        # original parameter -> base optimizer instance that owns only the matching CPU copy
        self.optim_dict = {}
        # dedicated CUDA stream on which the gradient and parameter copies below are issued
        self.stream = torch.cuda.Stream()

        # original parameter -> event recorded on `self.stream` after its gradient copy was enqueued
        self.queue = {}

        def backward_hook(p_cuda: torch.Tensor) -> None:
            """Copy the gradient of `p_cuda` into its CPU copy and queue the parameter for the next `step`."""
            if p_cuda.grad is None:
                return

            p_cpu = self.param_cuda2cpu_map[p_cuda]

            # make the copy stream wait for the work already submitted to the current stream before copying
            self.stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(self.stream):
                p_cpu.grad.copy_(p_cuda.grad, non_blocking=True)

            # delete before re-inserting so the key moves to the end of the dict's insertion order, which is the
            # order `step` iterates in
            if p_cuda in self.queue:
                del self.queue[p_cuda]

            self.queue[p_cuda] = self.stream.record_event()

            if offload_gradients:
                # register the gradient tensor with the copy stream, then drop the parameter's reference to it
                p_cuda.grad.record_stream(self.stream)
                p_cuda.grad = None

        for param_group in param_groups:
            # `pop` removes 'params' from the group dict itself (mutating a dict passed in by the caller); the
            # remaining entries are forwarded as options to each per-parameter optimizer
            params = param_group.pop('params')

            for p_cuda in params:
                # pinned CPU buffers for the parameter value and its gradient
                p_cpu = torch.empty_like(p_cuda, device='cpu', pin_memory=True)
                p_cpu.grad = torch.empty_like(p_cpu, pin_memory=True)

                # initialise the CPU copy with the current parameter value
                p_cpu.copy_(p_cuda.detach(), non_blocking=True)
                self.param_cuda2cpu_map[p_cuda] = p_cpu

                p_cuda.register_post_accumulate_grad_hook(backward_hook)
                # one optimizer per parameter, so each can be stepped as soon as its own gradient has arrived
                self.optim_dict[p_cuda] = optimizer_class([{'params': p_cpu, **param_group}], **kwargs)

    @torch.no_grad()
    def step(self, closure: CLOSURE = None) -> LOSS:
        """Step the optimizer of every queued parameter and copy the updated values back.

        :param closure: CLOSURE. optional callable that returns the loss.
        :returns: the value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            # NOTE(review): the closure runs inside `torch.no_grad()` because of the decorator on this method —
            # confirm closures are not expected to run a backward pass.
            loss = closure()

        for p_cuda, grad_d2h_event in self.queue.items():
            # block the host until the gradient copy of this parameter has completed
            grad_d2h_event.synchronize()
            self.optim_dict[p_cuda].step()

            # enqueue the copy of the updated CPU value back into the original parameter on the copy stream.
            # NOTE(review): no wait on `self.stream` follows in this method — confirm the ordering with later work
            # on the current stream is guaranteed elsewhere.
            p_cpu = self.param_cuda2cpu_map[p_cuda]
            with torch.cuda.stream(self.stream):
                p_cuda.copy_(p_cpu, non_blocking=True)

        # only parameters whose hook fires again are processed by the next step
        self.queue.clear()

        return loss

    def zero_grad(self, _: bool = True) -> None:
        """Set the gradient of every tracked parameter to None. The positional argument is accepted and ignored."""
        for p_cuda in self.param_cuda2cpu_map:
            p_cuda.grad = None

    @property
    def param_groups(self):
        """Return the concatenation of the `param_groups` lists of all per-parameter optimizers."""
        return functools.reduce(operator.add, (optim.param_groups for optim in self.optim_dict.values()), [])

    def state_dict(self):
        """Return a list with the state dict of each per-parameter optimizer, in `optim_dict` order."""
        return [optim.state_dict() for optim in self.optim_dict.values()]

    def load_state_dict(self, state_dict):
        """Load a list produced by :meth:`state_dict` into the per-parameter optimizers, pairing them by position.

        `zip` stops at the shorter input, so extra entries on either side are silently ignored.
        """
        for optim, optim_state_dict in zip(self.optim_dict.values(), state_dict):
            optim.load_state_dict(optim_state_dict)

`is_valid_parameters(parameters)`

Check whether the parameters are valid.

Source code in pytorch_optimizer/optimizer/utils.py

def is_valid_parameters(parameters: PARAMETERS) -> bool:
    r"""Check whether the parameters are a non-empty list or tuple whose first element is a dict."""
    if not isinstance(parameters, (list, tuple)):
        return False
    if len(parameters) == 0:
        return False
    # only the first element is inspected
    return isinstance(parameters[0], dict)

`has_overflow(grad_norm)`

Detect inf and NaN in grad_norm.

Source code in pytorch_optimizer/optimizer/utils.py

def has_overflow(grad_norm: torch.Tensor) -> bool:
    r"""Return True when at least one element of grad_norm is NaN or infinite."""
    nan_mask = torch.isnan(grad_norm)
    inf_mask = torch.isinf(grad_norm)
    return bool((nan_mask | inf_mask).any())

`to_real(x)`

Return real value of tensor.

Source code in pytorch_optimizer/optimizer/utils.py

def to_real(x: torch.Tensor) -> torch.Tensor:
    r"""Return the real part of a complex tensor; any other tensor is returned as is."""
    if torch.is_complex(x):
        return x.real
    return x

`normalize_gradient(x, use_channels=False, epsilon=1e-08)`

Normalize gradient with stddev.

Parameters:

Name	Type	Description	Default
`x`	`Tensor`	torch.Tensor. gradient.	required
`use_channels`	`bool`	bool. channel-wise normalization.	`False`
`epsilon`	`float`	float. eps.	`1e-08`

Source code in pytorch_optimizer/optimizer/utils.py

def normalize_gradient(x: torch.Tensor, use_channels: bool = False, epsilon: float = 1e-8) -> None:
    r"""Divide the gradient by its standard deviation, in place.

    With `use_channels` on a tensor of two or more dimensions, the standard deviation is taken over every dimension
    except the first. Otherwise a single standard deviation over all elements is used, and tensors with two or fewer
    elements are left untouched.

    :param x: torch.Tensor. gradient. modified in place.
    :param use_channels: bool. channel-wise normalization.
    :param epsilon: float. eps. added to the standard deviation before dividing.
    """
    n_dims: int = x.dim()

    if use_channels and n_dims > 1:
        reduce_dims = tuple(range(1, n_dims))
        scale = x.std(dim=reduce_dims, keepdim=True)
        scale.add_(epsilon)
        x.div_(scale)
        return

    if x.numel() <= 2:
        return

    scale = x.std()
    scale.add_(epsilon)
    x.div_(scale)

`clip_grad_norm(parameters, max_norm=0.0, sync=False)`

Clip gradient norms.

When used in combination with FSDP, this also ensures that gradient norms are aggregated across all workers,
since each worker only stores its own shard of the gradients.

Parameters:

Name	Type	Description	Default
`parameters`	`PARAMETERS`	PARAMETERS. Parameters whose gradients we wish to clip.	required
`max_norm`	`float`	float. Maximum norm we wish the gradients to have. If non-positive, then we will not perform clipping.	`0.0`
`sync`	`bool`	bool. Boolean indicating whether we should aggregate across the distributed group. Used only in combination with FSDP.	`False`

Returns:

Type	Description
`Union[Tensor, float]`	The gradient norm across all parameters, before clipping.

Source code in pytorch_optimizer/optimizer/utils.py

def clip_grad_norm(
    parameters: PARAMETERS,
    max_norm: float = 0.0,
    sync: bool = False,
) -> Union[torch.Tensor, float]:  # pragma: no cover
    r"""Clip gradient norms.

        When used in combination with FSDP, this also ensures that gradient norms are aggregated across all workers,
        since each worker only stores its own shard of the gradients.

        Without `sync`, clipping is delegated to `clip_grad_norm_`. With `sync`, the squared norms are all-reduced
        and the gradients are rescaled manually. Gradients are only ever scaled down: when the norm is already below
        `max_norm` they are left unchanged, matching `clip_grad_norm_`. Parameters without a gradient are skipped.

    :param parameters: PARAMETERS. Parameters whose gradients we wish to clip. A single tensor or any iterable of
        tensors is accepted.
    :param max_norm: float. Maximum norm we wish the gradients to have. If non-positive, then we will not perform
        clipping.
    :param sync: bool. Boolean indicating whether we should aggregate across the distributed group. Used only in
        combination with FSDP.
    :returns: The gradient norm across all parameters, before clipping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    # make sure any generators are expanded
    parameters = list(parameters)

    # if syncing we need to manually perform the clipping so that we aggregate properly
    if max_norm > 0 and not sync:
        return clip_grad_norm_(parameters, max_norm)

    # collect the gradients once so the norm and the rescaling below see exactly the same tensors
    grads = [p.grad for p in parameters if p.grad is not None]

    norm_sq = sum(grad.norm() ** 2 for grad in grads)
    if sync:
        # also need to get the norms from all the other sharded workers in FSDP
        all_reduce(norm_sq)

    grad_norm = math.sqrt(norm_sq)
    if max_norm > 0:
        # cap at 1 so that gradients whose norm is already within `max_norm` are never scaled up
        clip_coefficient = min(1.0, max_norm / (grad_norm + 1e-6))
        for grad in grads:
            grad.detach().mul_(clip_coefficient)

    return grad_norm

`unit_norm(x, norm=2.0)`

Compute the unit-wise norm of a tensor.

Source code in pytorch_optimizer/optimizer/utils.py

def unit_norm(x: torch.Tensor, norm: float = 2.0) -> torch.Tensor:
    r"""Compute the unit-wise p-norm of a tensor.

    Tensors with at most one dimension are reduced to a single value. Tensors with two or three dimensions are
    reduced over dimension 1, and tensors with four or more dimensions over every dimension except the first. For
    tensors with two or more dimensions the reduced dimensions are kept with size 1.

    :param x: torch.Tensor. input tensor.
    :param norm: float. order of the norm.
    """
    rank: int = x.dim()

    if rank <= 1:
        return x.norm(p=norm, dim=None, keepdim=False)

    reduce_over: Union[int, Tuple[int, ...]] = 1 if rank in (2, 3) else tuple(range(1, rank))

    return x.norm(p=norm, dim=reduce_over, keepdim=True)

`disable_running_stats(model)`

Disable running stats (momentum) of BatchNorm.

Source code in pytorch_optimizer/optimizer/utils.py

def disable_running_stats(model):
    r"""Set the momentum of every BatchNorm module in the model to 0, keeping the old value as a backup.

    The previous momentum is stored on each module as `backup_momentum`.
    """

    def _freeze_momentum(layer):
        if not isinstance(layer, _BatchNorm):
            return
        layer.backup_momentum = layer.momentum
        layer.momentum = 0

    model.apply(_freeze_momentum)

`enable_running_stats(model)`

Enable running stats (momentum) of BatchNorm.

Source code in pytorch_optimizer/optimizer/utils.py

def enable_running_stats(model):
    r"""Restore the momentum of every BatchNorm module in the model from its `backup_momentum` attribute.

    BatchNorm modules that carry no `backup_momentum` attribute are left unchanged.
    """

    def _restore_momentum(layer):
        if not isinstance(layer, _BatchNorm):
            return
        if not hasattr(layer, 'backup_momentum'):
            return
        layer.momentum = layer.backup_momentum

    model.apply(_restore_momentum)

`get_global_gradient_norm(param_groups)`

Get the global gradient norm, returned as the sum of squared per-parameter gradient norms (no square root is applied).

Source code in pytorch_optimizer/optimizer/utils.py

@torch.no_grad()
def get_global_gradient_norm(param_groups: List[Dict]) -> torch.Tensor:
    r"""Accumulate the squared gradient norms of all parameters.

    The result is the sum of squared norms; no square root is applied here.

    :param param_groups: List[Dict]. parameter groups, each holding its parameters under the 'params' key.
    :returns: float32 tensor of shape (1,) on the device of the first parameter of the first group.
    """
    device = param_groups[0]['params'][0].device
    total = torch.zeros(1, dtype=torch.float32, device=device)

    # parameters without a gradient contribute nothing
    grads = (p.grad for group in param_groups for p in group['params'] if p.grad is not None)
    for grad in grads:
        total.add_(grad.norm().pow(2))

    return total

`reg_noise(network1, network2, num_data, lr, eta=0.008, temperature=0.0001)`

Entropy-MCMC: Sampling from flat basins with ease.

usage: https://github.com/lblaoke/EMCMC/blob/master/exp/cifar10_emcmc.py

Parameters:

Name	Type	Description	Default
`network1`	`Module`	nn.Module. network.	required
`network2`	`Module`	nn.Module. network.	required
`num_data`	`int`	int. number of training data.	required
`lr`	`float`	float. learning rate.	required
`eta`	`float`	float. eta.	`0.008`
`temperature`	`float`	float. temperature.	`0.0001`

Source code in pytorch_optimizer/optimizer/utils.py

@torch.no_grad()
def reg_noise(
    network1: nn.Module, network2: nn.Module, num_data: int, lr: float, eta: float = 8e-3, temperature: float = 1e-4
) -> Union[torch.Tensor, float]:
    r"""Entropy-MCMC: Sampling from flat basins with ease.

    For every pair of corresponding parameters, adds a scaled squared-difference term and subtracts a scaled random
    noise term, and returns the total over all pairs.

    usage: https://github.com/lblaoke/EMCMC/blob/master/exp/cifar10_emcmc.py

    :param network1: nn.Module. network.
    :param network2: nn.Module. network.
    :param num_data: int. number of training data.
    :param lr: float. learning rate.
    :param eta: float. eta.
    :param temperature: float. temperature.
    :returns: scalar tensor on the device of the first parameter of `network1`.
    """
    coupling_scale: float = 0.5 / (eta * num_data)
    noise_scale: float = math.sqrt(2.0 / lr / num_data * temperature)

    first_param = next(network1.parameters())
    total = torch.tensor(0.0, device=first_param.device)

    for p1, p2 in zip(network1.parameters(), network2.parameters()):
        # the in-place ops act on the temporary difference tensor, never on the parameters themselves
        squared_gap = (p1 - p2).pow_(2)
        coupling = squared_gap.mul_(coupling_scale).sum()

        # noise is drawn for the first network's parameter first, then for the second one
        perturbation = p1 * torch.randn_like(p1)
        perturbation.add_(p2 * torch.randn_like(p2))
        perturbation.mul_(noise_scale)

        total.add_(coupling - perturbation.sum())

    return total

`copy_stochastic(target, source)`

Copy `source` into `target` with stochastic rounding.

reference: https://github.com/pytorch/pytorch/issues/120376#issuecomment-1974828905

Parameters:

Name	Type	Description	Default
`target`	`Tensor`	torch.Tensor. bfloat16 tensor.	required
`source`	`Tensor`	torch.Tensor. float32 tensor.	required

Source code in pytorch_optimizer/optimizer/utils.py

@torch.no_grad()
def copy_stochastic(target: torch.Tensor, source: torch.Tensor) -> None:
    r"""Copy `source` into `target` with stochastic rounding.

    A random 16-bit integer is added to the bit pattern of each source element, the lower 16 bits are cleared, and
    the result is copied into `target`.

    reference: https://github.com/pytorch/pytorch/issues/120376#issuecomment-1974828905

    :param target: torch.Tensor. bfloat16 tensor.
    :param source: torch.Tensor. float32 tensor.
    """
    # one random integer in [0, 2**16) per source element
    dither = torch.randint_like(source, low=0, high=1 << 16, dtype=torch.int32)

    # reinterpret the float32 bits as int32 and add them to the random offsets
    bits = dither.add_(source.view(dtype=torch.int32))

    # -65536 is 0xFFFF0000 as a signed 32-bit integer: keep the upper 16 bits, clear the lower 16
    bits.bitwise_and_(-65536)

    # reinterpret the masked bits as float32 again and write them into the target
    target.copy_(bits.view(dtype=torch.float32))