RepVGG block

RepVGGBlock

Bases: nn.Module

RepVGG block consists of three branches:

- 3x3: a branch of a 3x3 Convolution + BatchNorm + Activation
- 1x1: a branch of a 1x1 Convolution + BatchNorm + Activation
- no_conv_branch: a branch with only BatchNorm, used only when input channels == output channels and use_residual_connection is True (usually in all but the first block of each stage)
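A minimal usage sketch (the shapes are illustrative and the import path is assumed from the source location shown below):

```python
import torch
from super_gradients.modules.repvgg_block import RepVGGBlock

# Training-mode block: in_channels == out_channels and stride == 1,
# so the BatchNorm-only identity branch is built alongside the 3x3 and 1x1 branches.
block = RepVGGBlock(in_channels=64, out_channels=64, stride=1)

x = torch.randn(2, 64, 32, 32)   # illustrative input: batch of 2, 64 channels, 32x32
y = block(x)                     # forward through 3x3 + 1x1 + identity branches
print(y.shape)                   # torch.Size([2, 64, 32, 32])
```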

Source code in src/super_gradients/modules/repvgg_block.py
class RepVGGBlock(nn.Module):
    """
    Repvgg block consists of three branches
    3x3: a branch of a 3x3 Convolution + BatchNorm + Activation
    1x1: a branch of a 1x1 Convolution + BatchNorm + Activation
    no_conv_branch: a branch with only BatchNorm which will only be used if
        input channel == output channel and use_residual_connection is True
    (usually in all but the first block of each stage)
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        activation_type: Type[nn.Module] = nn.ReLU,
        activation_kwargs: Union[Mapping[str, Any], None] = None,
        se_type: Type[nn.Module] = nn.Identity,
        se_kwargs: Union[Mapping[str, Any], None] = None,
        build_residual_branches: bool = True,
        use_residual_connection: bool = True,
        use_alpha: bool = False,
    ):
        """

        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param activation_type: Type of the nonlinearity
        :param se_type: Type of the se block (Use nn.Identity to disable SE)
        :param stride: Output stride
        :param dilation: Dilation factor for 3x3 conv
        :param groups: Number of groups used in convolutions
        :param activation_kwargs: Additional arguments for instantiating activation module.
        :param se_kwargs: Additional arguments for instantiating SE module.
        :param build_residual_branches: Whether to build the training-time residual branches; if False, the block is initialized with its branches already fused into a single 3x3 conv (for deployment)
        :param use_residual_connection: Whether to add input x to the output (Enabled in RepVGG, disabled in PP-Yolo)
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch (PP-Yolo-E Plus)
        """
        super().__init__()

        if activation_kwargs is None:
            activation_kwargs = {}
        if se_kwargs is None:
            se_kwargs = {}

        self.groups = groups
        self.in_channels = in_channels

        self.nonlinearity = activation_type(**activation_kwargs)
        self.se = se_type(**se_kwargs)

        if use_residual_connection and out_channels == in_channels and stride == 1:
            self.no_conv_branch = nn.BatchNorm2d(num_features=in_channels)
        else:
            self.no_conv_branch = None

        self.branch_3x3 = self._conv_bn(
            in_channels=in_channels,
            out_channels=out_channels,
            dilation=dilation,
            kernel_size=3,
            stride=stride,
            padding=dilation,
            groups=groups,
        )
        self.branch_1x1 = self._conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=0, groups=groups)

        if use_alpha:
            # If we are using alpha, we need to add some noise to the initial value of 1
            # When we are benchmarking the model we usually use random weights,
            # so when ONNX simplifies the model it will remove multiplication of alpha * residual branch and
            # replace it with simple addition (Since 1 * has no effect)
            # To prevent this we add some noise to the initial value of alpha which prevents this from happening
            # but since the noise is very small it should not affect the training process
            noise = torch.randn((1,)) * 0.01
            self.alpha = torch.nn.Parameter(torch.tensor([1.0]) + noise, requires_grad=True)
        else:
            self.alpha = 1

        if not build_residual_branches:
            self.fuse_block_residual_branches()
        else:
            self.build_residual_branches = True

    def forward(self, inputs):
        if not self.build_residual_branches:
            return self.nonlinearity(self.se(self.rbr_reparam(inputs)))

        if self.no_conv_branch is None:
            id_out = 0
        else:
            id_out = self.no_conv_branch(inputs)

        return self.nonlinearity(self.se(self.branch_3x3(inputs) + self.alpha * self.branch_1x1(inputs) + id_out))

    def _get_equivalent_kernel_bias(self):
        """
        Fuses the 3x3, 1x1 and identity branches into a single 3x3 conv layer
        """
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.branch_3x3)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.branch_1x1)
        kernelid, biasid = self._fuse_bn_tensor(self.no_conv_branch)
        return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + self.alpha * bias1x1 + biasid

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """
        Pads the 1x1 convolution weights with zeros so they can be added to the 3x3 kernel during fusion.
        :param kernel1x1: weights of the 1x1 convolution
        :return: padded 1x1 weights
        """
        if kernel1x1 is None:
            return 0
        else:
            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """
        Fuses the BatchNorm into the conv layer of the given branch.
        If the branch is the identity branch (no conv), the kernel is simply an identity kernel.
        :param branch: branch to fuse (a conv+bn Sequential, a BatchNorm2d, or None)
        :return: fused kernel and bias
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.Sequential):
            kernel = branch.conv.weight
            running_mean = branch.bn.running_mean
            running_var = branch.bn.running_var
            gamma = branch.bn.weight
            beta = branch.bn.bias
            eps = branch.bn.eps
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, "id_tensor"):
                input_dim = self.in_channels // self.groups
                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32)
                for i in range(self.in_channels):
                    kernel_value[i, i % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
            kernel = self.id_tensor
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std

    def fuse_block_residual_branches(self):
        """
        Converts a RepVGG block from training mode (with branches) to deployment mode (a single fused 3x3 conv, VGG-like).
        """
        if hasattr(self, "build_residual_branches") and not self.build_residual_branches:
            return
        kernel, bias = self._get_equivalent_kernel_bias()
        self.rbr_reparam = nn.Conv2d(
            in_channels=self.branch_3x3.conv.in_channels,
            out_channels=self.branch_3x3.conv.out_channels,
            kernel_size=self.branch_3x3.conv.kernel_size,
            stride=self.branch_3x3.conv.stride,
            padding=self.branch_3x3.conv.padding,
            dilation=self.branch_3x3.conv.dilation,
            groups=self.branch_3x3.conv.groups,
            bias=True,
        )
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__("branch_3x3")
        self.__delattr__("branch_1x1")
        if hasattr(self, "no_conv_branch"):
            self.__delattr__("no_conv_branch")
        if hasattr(self, "alpha"):
            self.__delattr__("alpha")
        self.build_residual_branches = False

    @staticmethod
    def _conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1, dilation=1):
        result = nn.Sequential()
        result.add_module(
            "conv",
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                groups=groups,
                bias=False,
                dilation=dilation,
            ),
        )
        result.add_module("bn", nn.BatchNorm2d(num_features=out_channels))
        return result

    def prep_model_for_conversion(self, input_size: Optional[Union[tuple, list]] = None, **kwargs):
        self.fuse_block_residual_branches()
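For reference, the fusion performed by `_fuse_bn_tensor` and `_get_equivalent_kernel_bias` can be restated informally. Folding a BatchNorm layer with running mean $\mu$, running variance $\sigma^2$, affine parameters $\gamma, \beta$ and epsilon $\varepsilon$ into the preceding convolution with kernel $W$ gives

$$W' = W \cdot \frac{\gamma}{\sqrt{\sigma^2 + \varepsilon}}, \qquad b' = \beta - \frac{\mu\,\gamma}{\sqrt{\sigma^2 + \varepsilon}}$$

and the three folded branches are then merged into a single 3x3 convolution:

$$K = K_{3\times 3} + \alpha \cdot \operatorname{pad}(K_{1\times 1}) + K_{\mathrm{id}}, \qquad b = b_{3\times 3} + \alpha \cdot b_{1\times 1} + b_{\mathrm{id}}$$

where $\operatorname{pad}$ zero-pads the 1x1 kernel to 3x3 and $K_{\mathrm{id}}$ is the identity kernel built from the BatchNorm-only branch (zero when that branch is absent).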

__init__(in_channels, out_channels, stride=1, dilation=1, groups=1, activation_type=nn.ReLU, activation_kwargs=None, se_type=nn.Identity, se_kwargs=None, build_residual_branches=True, use_residual_connection=True, use_alpha=False)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| in_channels | int | Number of input channels | required |
| out_channels | int | Number of output channels | required |
| activation_type | Type[nn.Module] | Type of the nonlinearity | nn.ReLU |
| se_type | Type[nn.Module] | Type of the SE block (use nn.Identity to disable SE) | nn.Identity |
| stride | int | Output stride | 1 |
| dilation | int | Dilation factor for 3x3 conv | 1 |
| groups | int | Number of groups used in convolutions | 1 |
| activation_kwargs | Union[Mapping[str, Any], None] | Additional arguments for instantiating activation module. | None |
| se_kwargs | Union[Mapping[str, Any], None] | Additional arguments for instantiating SE module. | None |
| build_residual_branches | bool | Whether to build the training-time residual branches; if False, the block is initialized with its branches already fused into a single 3x3 conv (for deployment) | True |
| use_residual_connection | bool | Whether to add input x to the output (enabled in RepVGG, disabled in PP-Yolo) | True |
| use_alpha | bool | If True, enables additional learnable weighting parameter for 1x1 branch (PP-Yolo-E Plus) | False |
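As an illustrative sketch of how these parameters might be combined (the values below are arbitrary, not taken from any particular SuperGradients recipe):

```python
import torch.nn as nn
from super_gradients.modules.repvgg_block import RepVGGBlock

# Downsampling block: stride=2 and in_channels != out_channels,
# so the BatchNorm-only identity branch is skipped automatically.
block = RepVGGBlock(
    in_channels=64,
    out_channels=128,
    stride=2,
    activation_type=nn.LeakyReLU,
    activation_kwargs={"negative_slope": 0.1},
    use_alpha=True,  # learnable weighting of the 1x1 branch (PP-YOLO-E Plus style)
)
```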
Source code in src/super_gradients/modules/repvgg_block.py
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    stride: int = 1,
    dilation: int = 1,
    groups: int = 1,
    activation_type: Type[nn.Module] = nn.ReLU,
    activation_kwargs: Union[Mapping[str, Any], None] = None,
    se_type: Type[nn.Module] = nn.Identity,
    se_kwargs: Union[Mapping[str, Any], None] = None,
    build_residual_branches: bool = True,
    use_residual_connection: bool = True,
    use_alpha: bool = False,
):
    """

    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    :param activation_type: Type of the nonlinearity
    :param se_type: Type of the se block (Use nn.Identity to disable SE)
    :param stride: Output stride
    :param dilation: Dilation factor for 3x3 conv
    :param groups: Number of groups used in convolutions
    :param activation_kwargs: Additional arguments for instantiating activation module.
    :param se_kwargs: Additional arguments for instantiating SE module.
    :param build_residual_branches: Whether to build the training-time residual branches; if False, the block is initialized with its branches already fused into a single 3x3 conv (for deployment)
    :param use_residual_connection: Whether to add input x to the output (Enabled in RepVGG, disabled in PP-Yolo)
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch (PP-Yolo-E Plus)
    """
    super().__init__()

    if activation_kwargs is None:
        activation_kwargs = {}
    if se_kwargs is None:
        se_kwargs = {}

    self.groups = groups
    self.in_channels = in_channels

    self.nonlinearity = activation_type(**activation_kwargs)
    self.se = se_type(**se_kwargs)

    if use_residual_connection and out_channels == in_channels and stride == 1:
        self.no_conv_branch = nn.BatchNorm2d(num_features=in_channels)
    else:
        self.no_conv_branch = None

    self.branch_3x3 = self._conv_bn(
        in_channels=in_channels,
        out_channels=out_channels,
        dilation=dilation,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
    )
    self.branch_1x1 = self._conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=0, groups=groups)

    if use_alpha:
        # If we are using alpha, we need to add some noise to the initial value of 1
        # When we are benchmarking the model we usually use random weights,
        # so when ONNX simplifies the model it will remove multiplication of alpha * residual branch and
        # replace it with simple addition (Since 1 * has no effect)
        # To prevent this we add some noise to the initial value of alpha which prevents this from happening
        # but since the noise is very small it should not affect the training process
        noise = torch.randn((1,)) * 0.01
        self.alpha = torch.nn.Parameter(torch.tensor([1.0]) + noise, requires_grad=True)
    else:
        self.alpha = 1

    if not build_residual_branches:
        self.fuse_block_residual_branches()
    else:
        self.build_residual_branches = True

fuse_block_residual_branches()

Converts a RepVGG block from training mode (with branches) to deployment mode (a VGG-like block with a single fused 3x3 convolution).

Source code in src/super_gradients/modules/repvgg_block.py
def fuse_block_residual_branches(self):
    """
    Converts a RepVGG block from training mode (with branches) to deployment mode (a single fused 3x3 conv, VGG-like).
    """
    if hasattr(self, "build_residual_branches") and not self.build_residual_branches:
        return
    kernel, bias = self._get_equivalent_kernel_bias()
    self.rbr_reparam = nn.Conv2d(
        in_channels=self.branch_3x3.conv.in_channels,
        out_channels=self.branch_3x3.conv.out_channels,
        kernel_size=self.branch_3x3.conv.kernel_size,
        stride=self.branch_3x3.conv.stride,
        padding=self.branch_3x3.conv.padding,
        dilation=self.branch_3x3.conv.dilation,
        groups=self.branch_3x3.conv.groups,
        bias=True,
    )
    self.rbr_reparam.weight.data = kernel
    self.rbr_reparam.bias.data = bias
    for para in self.parameters():
        para.detach_()
    self.__delattr__("branch_3x3")
    self.__delattr__("branch_1x1")
    if hasattr(self, "no_conv_branch"):
        self.__delattr__("no_conv_branch")
    if hasattr(self, "alpha"):
        self.__delattr__("alpha")
    self.build_residual_branches = False
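A minimal sketch of fusing a single block and checking that the fused convolution reproduces the multi-branch output (the block is put in eval mode so BatchNorm uses its running statistics):

```python
import torch
from super_gradients.modules.repvgg_block import RepVGGBlock

block = RepVGGBlock(in_channels=32, out_channels=32).eval()
x = torch.randn(1, 32, 16, 16)

with torch.no_grad():
    y_branches = block(x)                   # multi-branch (training-architecture) output
    block.fuse_block_residual_branches()    # replaces the branches with block.rbr_reparam
    y_fused = block(x)                      # single 3x3 conv output

# Expected to be True up to numerical tolerance
print(torch.allclose(y_branches, y_fused, atol=1e-5))
```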

fuse_repvgg_blocks_residual_branches(model)

Calls fuse_block_residual_branches for all RepVGG blocks in the model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| model | nn.Module | torch.nn.Module containing RepVGG blocks; it does not have to consist entirely of RepVGG blocks. | required |
Source code in src/super_gradients/modules/repvgg_block.py
def fuse_repvgg_blocks_residual_branches(model: nn.Module):
    """
    Call fuse_block_residual_branches for all repvgg blocks in the model
    :param model: torch.nn.Module containing RepVGG blocks. It does not have to consist entirely of RepVGG blocks.
    :type model: torch.nn.Module
    """
    if model.training:
        raise RuntimeError("To fuse RepVGG block residual branches, model must be on eval mode")
    from super_gradients.training.utils.utils import infer_model_device

    device = infer_model_device(model)
    for module in model.modules():
        if hasattr(module, "fuse_block_residual_branches"):
            module.fuse_block_residual_branches()
    model.build_residual_branches = False
    model.to(device)
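A sketch of fusing every RepVGG block in a larger model before export; the toy architecture below is purely illustrative:

```python
import torch.nn as nn
from super_gradients.modules.repvgg_block import RepVGGBlock, fuse_repvgg_blocks_residual_branches

model = nn.Sequential(
    RepVGGBlock(in_channels=3, out_channels=32, stride=2),
    RepVGGBlock(in_channels=32, out_channels=32),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(32, 10),
)

model.eval()  # required: fusion raises a RuntimeError if the model is in training mode
fuse_repvgg_blocks_residual_branches(model)
```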