Models

get_arch_params(config_name, overriding_params=None, recipes_dir_path=None)

Creates an arch-params dictionary, taking defaults from the YAML files in src/super_gradients/recipes/arch_params.

Parameters:

- config_name (str, required): Name of the yaml to load (e.g. "resnet18_cifar_arch_params").
- overriding_params (Dict, default None): Dictionary-like object containing entries to override.
- recipes_dir_path (Optional[str], default None): Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include an "arch_params" folder, which itself should include the config file named after config_name.

Source code in src/super_gradients/training/models/arch_params_factory.py
def get_arch_params(config_name: str, overriding_params: Dict = None, recipes_dir_path: Optional[str] = None) -> DictConfig:
    """
    Class for creating arch parameters dictionary, taking defaults from yaml
     files in src/super_gradients/recipes/arch_params.

    :param config_name:         Name of the yaml to load (e.g. "resnet18_cifar_arch_params")
    :param overriding_params: Dict, dictionary like object containing entries to override.
    :param recipes_dir_path:    Optional. Main directory where every recipe are stored. (e.g. ../super_gradients/recipes)
                                This directory should include a "arch_params" folder,
                                which itself should include the config file named after config_name.
    """
    overriding_params = overriding_params if overriding_params else dict()

    arch_params = load_arch_params(config_name=config_name, recipes_dir_path=recipes_dir_path)
    arch_params = hydra.utils.instantiate(arch_params)

    arch_params.update(**overriding_params)

    return arch_params
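
For example, a minimal call might look like the sketch below. It assumes get_arch_params is re-exported from super_gradients.training.models (otherwise import it from super_gradients.training.models.arch_params_factory), and the "num_classes" override key is shown purely for illustration:

from super_gradients.training.models import get_arch_params

# Load the default ResNet-18 CIFAR architecture parameters shipped with the library
# and override a single entry (use whatever keys the chosen YAML actually defines).
arch_params = get_arch_params("resnet18_cifar_arch_params", overriding_params={"num_classes": 10})
print(arch_params)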

BaseClassifier

Bases: SgModule, HasPredict

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
class BaseClassifier(SgModule, HasPredict):
    def __init__(
        self,
    ):
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        super(BaseClassifier, self).__init__()

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(self, class_names: Optional[List[str]] = None, image_processor: Optional[Processing] = None) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(self, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True) -> ClassificationPipeline:
        """Instantiate the prediction pipeline of this model.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16: If True, use mixed precision for inference.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        if skip_image_resizing:
            raise ValueError("`skip_image_resizing` is not supported for classification models.")

        pipeline = ClassificationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> ImagesClassificationPrediction:
        """Predict an image or a list of images.

        :param images:      Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16: If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True) -> None:
        """Predict using webcam.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16: If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        pipeline.predict_webcam()
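
A typical prediction workflow with a BaseClassifier subclass is sketched below. The model name, weights, and image path are placeholders; models loaded through models.get() with pretrained weights typically already carry their processing params, so the explicit set_dataset_processing_params call is only needed for custom checkpoints:

from super_gradients.training import models
from super_gradients.common.object_names import Models

# Placeholder model / weights: any classification architecture registered in SG works the same way.
model = models.get(Models.RESNET18, pretrained_weights="imagenet")

# For a custom checkpoint, register the preprocessing yourself first, e.g.:
# model.set_dataset_processing_params(class_names=["cat", "dog"], image_processor=my_processor)

predictions = model.predict("path/to/image.jpg", batch_size=32, fp16=True)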

predict(images, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

- images (ImageSource, required): Images to predict.
- batch_size (int, default 32): Maximum number of images to process at the same time.
- fuse_model (bool, default True): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
- skip_image_resizing (bool, default False): If True, the image processor will not resize the images.
- fp16 (bool, default True): If True, use mixed precision for inference.

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
def predict(
    self,
    images: ImageSource,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> ImagesClassificationPrediction:
    """Predict an image or a list of images.

    :param images:      Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16: If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

- fuse_model (bool, default True): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
- skip_image_resizing (bool, default False): If True, the image processor will not resize the images.
- fp16 (bool, default True): If True, use mixed precision for inference.

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
def predict_webcam(self, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True) -> None:
    """Predict using webcam.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16: If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    pipeline.predict_webcam()

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

- class_names (Optional[List[str]], default None): Names of the dataset the model was trained on.
- image_processor (Optional[Processing], default None): Image processing objects to reproduce the dataset preprocessing used for training.

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(self, class_names: Optional[List[str]] = None, image_processor: Optional[Processing] = None) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor

BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)

Model from official source: https://github.com/microsoft/unilm/tree/master/beit

At this point, only the 1k fine-tuned classification weights and model configs have been added; see the original source above for pre-training models and procedure.

Modifications by / Copyright 2021 Ross Wightman, original copyrights below

Beit

Bases: BaseClassifier

Vision Transformer with support for patch or hybrid CNN input stage

Source code in src/super_gradients/training/models/classification_models/beit.py
class Beit(BaseClassifier):
    """Vision Transformer with support for patch or hybrid CNN input stage"""

    def __init__(
        self,
        image_size=(224, 224),
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        global_pool="avg",
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        use_abs_pos_emb=True,
        use_rel_pos_bias=False,
        use_shared_rel_pos_bias=False,
        head_init_scale=0.001,
        **kwargs,
    ):
        super().__init__()
        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.grad_checkpointing = False

        self.image_size = image_size
        self.patch_size = patch_size
        self.patch_embed = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_chans, hidden_dim=self.embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None
        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.grid_size, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    init_values=init_values,
                    window_size=self.patch_embed.grid_size if use_rel_pos_bias else None,
                )
                for i in range(depth)
            ]
        )
        use_fc_norm = self.global_pool == "avg"
        self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else None
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)
        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=0.02)
        trunc_normal_(self.cls_token, std=0.02)
        # trunc_normal_(self.mask_token, std=.02)
        self.fix_init_weight()
        if isinstance(self.head, nn.Linear):
            trunc_normal_(self.head.weight, std=0.02)
            self.head.weight.data.mul_(head_init_scale)
            self.head.bias.data.mul_(head_init_scale)

    def fix_init_weight(self):
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        nwd = {"pos_embed", "cls_token"}
        for n, _ in self.named_parameters():
            if "relative_position_bias_table" in n:
                nwd.add(n)
        return nwd

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        matcher = dict(
            stem=r"^cls_token|pos_embed|patch_embed|rel_pos_bias",  # stem and embed
            blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
        )
        return matcher

    @torch.jit.ignore
    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=None):
        self.num_classes = num_classes
        if global_pool is not None:
            self.global_pool = global_pool
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
            else:
                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
        x = self.norm(x)
        return x

    def forward_head(self, x, pre_logits: bool = False):
        if self.fc_norm is not None:
            x = x[:, 1:].mean(dim=1)
            x = self.fc_norm(x)
        else:
            x = x[:, 0]
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head = nn.Linear(self.head.in_features, new_num_classes)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.patch_embed = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.embed_dim)

    def get_input_channels(self) -> int:
        return self.patch_embed.get_input_channels()

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"head": lr, "default": 0.0}

Mlp

Bases: nn.Module

MLP as used in Vision Transformer, MLP-Mixer and related networks

Source code in src/super_gradients/training/models/classification_models/beit.py
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x

trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0)

Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution $\mathcal{N}(\text{mean}, \text{std}^2)$ with values outside $[a, b]$ redrawn until they are within the bounds. The method used for generating the random values works best when $a \leq \text{mean} \leq b$.

Parameters:

- tensor (required): an n-dimensional torch.Tensor
- mean (default 0.0): the mean of the normal distribution
- std (default 1.0): the standard deviation of the normal distribution
- a (default -2.0): the minimum cutoff value
- b (default 2.0): the maximum cutoff value

Examples:

    >>> w = torch.empty(3, 5)
    >>> nn.init.trunc_normal_(w)

Source code in src/super_gradients/training/models/classification_models/beit.py
def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    # type: (Tensor, float, float, float, float) -> Tensor
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    :param tensor: an n-dimensional `torch.Tensor`
    :param mean: the mean of the normal distribution
    :param std: the standard deviation of the normal distribution
    :param a: the minimum cutoff value
    :param b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)

DenseNet

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/densenet.py
class DenseNet(BaseClassifier):
    def __init__(self, growth_rate: int, structure: list, num_init_features: int, bn_size: int, drop_rate: float, num_classes: int, in_channels: int = 3):
        """
        :param growth_rate:         number of filter to add each layer (noted as 'k' in the paper)
        :param structure:           how many layers in each pooling block - sequentially
        :param num_init_features:   the number of filters to learn in the first convolutional layer
        :param bn_size:             multiplicative factor for the number of bottle neck layers
                                        (i.e. bn_size * k featurs in the bottleneck)
        :param drop_rate:           dropout rate after each dense layer
        :param num_classes:         number of classes in the classification task
        :param in_channels:         number of channels in the input image
        """
        super(DenseNet, self).__init__()

        # First convolution
        self.features = nn.Sequential(
            OrderedDict(
                [
                    ("conv0", nn.Conv2d(in_channels, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                    ("norm0", nn.BatchNorm2d(num_init_features)),
                    ("relu0", nn.ReLU(inplace=True)),
                    ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
                ]
            )
        )

        # Each denseblock
        num_features = num_init_features
        for i, num_layers in enumerate(structure):
            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            self.features.add_module("denseblock%d" % (i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != len(structure) - 1:
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
                self.features.add_module("transition%d" % (i + 1), trans)
                num_features = num_features // 2

        # Final batch norm
        self.features.add_module("norm5", nn.BatchNorm2d(num_features))

        # Linear layer
        self.classifier = nn.Linear(num_features, num_classes)

        # Official init from torch repo.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = torch.flatten(out, 1)
        out = self.classifier(out)
        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.features[0] = replace_conv2d_input_channels(conv=self.features[0], in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.features[0].in_channels
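
For illustration, the structure argument lists the number of dense layers per block; the values below follow the common DenseNet-121 layout (an assumption here), with a placeholder class count:

import torch
from super_gradients.training.models.classification_models.densenet import DenseNet

# DenseNet-121-style configuration: growth rate k=32, blocks of 6/12/24/16 layers.
model = DenseNet(
    growth_rate=32,
    structure=[6, 12, 24, 16],
    num_init_features=64,
    bn_size=4,
    drop_rate=0.0,
    num_classes=10,  # placeholder number of classes
)

logits = model(torch.randn(1, 3, 224, 224))  # shape: (1, 10)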

__init__(growth_rate, structure, num_init_features, bn_size, drop_rate, num_classes, in_channels=3)

Parameters:

- growth_rate (int, required): number of filters to add per layer (noted as 'k' in the paper).
- structure (list, required): how many layers in each pooling block, sequentially.
- num_init_features (int, required): the number of filters to learn in the first convolutional layer.
- bn_size (int, required): multiplicative factor for the number of bottleneck layers (i.e. bn_size * k features in the bottleneck).
- drop_rate (float, required): dropout rate after each dense layer.
- num_classes (int, required): number of classes in the classification task.
- in_channels (int, default 3): number of channels in the input image.

Source code in src/super_gradients/training/models/classification_models/densenet.py
def __init__(self, growth_rate: int, structure: list, num_init_features: int, bn_size: int, drop_rate: float, num_classes: int, in_channels: int = 3):
    """
    :param growth_rate:         number of filter to add each layer (noted as 'k' in the paper)
    :param structure:           how many layers in each pooling block - sequentially
    :param num_init_features:   the number of filters to learn in the first convolutional layer
    :param bn_size:             multiplicative factor for the number of bottle neck layers
                                    (i.e. bn_size * k featurs in the bottleneck)
    :param drop_rate:           dropout rate after each dense layer
    :param num_classes:         number of classes in the classification task
    :param in_channels:         number of channels in the input image
    """
    super(DenseNet, self).__init__()

    # First convolution
    self.features = nn.Sequential(
        OrderedDict(
            [
                ("conv0", nn.Conv2d(in_channels, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                ("norm0", nn.BatchNorm2d(num_init_features)),
                ("relu0", nn.ReLU(inplace=True)),
                ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
            ]
        )
    )

    # Each denseblock
    num_features = num_init_features
    for i, num_layers in enumerate(structure):
        block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
        self.features.add_module("denseblock%d" % (i + 1), block)
        num_features = num_features + num_layers * growth_rate
        if i != len(structure) - 1:
            trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
            self.features.add_module("transition%d" % (i + 1), trans)
            num_features = num_features // 2

    # Final batch norm
    self.features.add_module("norm5", nn.BatchNorm2d(num_features))

    # Linear layer
    self.classifier = nn.Linear(num_features, num_classes)

    # Official init from torch repo.
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.constant_(m.bias, 0)

Dual Path Networks in PyTorch.

Credits: https://github.com/kuangliu/pytorch-cifar/blob/master/models/dpn.py

EfficientNet model class, based on "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" (https://arxiv.org/abs/1905.11946). Code source: https://github.com/lukemelas/EfficientNet-PyTorch. Pre-trained checkpoints, converted to Deci's code base with the reported accuracy, can be found in the S3 repo.

BlockDecoder

Bases: object

Block Decoder for readability, straight from the official TensorFlow repository.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class BlockDecoder(object):
    """Block Decoder for readability, straight from the official TensorFlow repository."""

    @staticmethod
    def _decode_block_string(block_string: str) -> BlockArgs:
        """Get a block through a string notation of arguments.

        :param block_string: A string notation of arguments. Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
        :return:     BlockArgs: The namedtuple defined at the top of this file.
        """
        assert isinstance(block_string, str)

        ops = block_string.split("_")
        options = {}
        for op in ops:
            splits = re.split(r"(\d.*)", op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        # Check stride
        assert ("s" in options and len(options["s"]) == 1) or (len(options["s"]) == 2 and options["s"][0] == options["s"][1])

        return BlockArgs(
            num_repeat=int(options["r"]),
            kernel_size=int(options["k"]),
            stride=[int(options["s"][0])],
            expand_ratio=int(options["e"]),
            input_filters=int(options["i"]),
            output_filters=int(options["o"]),
            se_ratio=float(options["se"]) if "se" in options else None,
            id_skip=("noskip" not in block_string),
        )

    @staticmethod
    def _encode_block_string(block) -> str:
        """Encode a block to a string.

        :param block: A BlockArgs type argument (NamedTuple)
        :return: block_string: A String form of BlockArgs.
        """
        args = [
            "r%d" % block.num_repeat,
            "k%d" % block.kernel_size,
            "s%d%d" % (block.strides[0], block.strides[1]),
            "e%s" % block.expand_ratio,
            "i%d" % block.input_filters,
            "o%d" % block.output_filters,
        ]
        if 0 < block.se_ratio <= 1:
            args.append("se%s" % block.se_ratio)
        if block.id_skip is False:
            args.append("noskip")
        return "_".join(args)

    @staticmethod
    def decode(string_list: List[str]) -> List[BlockArgs]:
        """Decode a list of string notations to specify blocks inside the network.

        :param string_list:     List of strings, each string is a notation of block.
        :return blocks_args:    List of BlockArgs namedtuples of block args.
        """
        assert isinstance(string_list, list)
        blocks_args = []
        for block_string in string_list:
            blocks_args.append(BlockDecoder._decode_block_string(block_string))
        return blocks_args

    @staticmethod
    def encode(blocks_args: List):
        """Encode a list of BlockArgs to a list of strings.

        :param blocks_args: A list of BlockArgs namedtuples of block args. (list[namedtuples])
        :return: block_strings: A list of strings, each string is a notation of block.
        """
        block_strings = []
        for block in blocks_args:
            block_strings.append(BlockDecoder._encode_block_string(block))
        return block_strings
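
A quick decoding example, using the block-string notation from the docstring above:

from super_gradients.training.models.classification_models.efficientnet import BlockDecoder

# Decode a single block string into a BlockArgs namedtuple.
blocks = BlockDecoder.decode(["r1_k3_s11_e1_i32_o16_se0.25_noskip"])
block = blocks[0]
# Per the parsing rules above: num_repeat=1, kernel_size=3, stride=[1], expand_ratio=1,
# input_filters=32, output_filters=16, se_ratio=0.25, id_skip=False ("noskip" present).
print(block)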

decode(string_list) staticmethod

Decode a list of string notations to specify blocks inside the network.

Parameters:

- string_list (List[str], required): List of strings, each string is a notation of block.

Returns:

- List[BlockArgs]: List of BlockArgs namedtuples of block args.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
@staticmethod
def decode(string_list: List[str]) -> List[BlockArgs]:
    """Decode a list of string notations to specify blocks inside the network.

    :param string_list:     List of strings, each string is a notation of block.
    :return blocks_args:    List of BlockArgs namedtuples of block args.
    """
    assert isinstance(string_list, list)
    blocks_args = []
    for block_string in string_list:
        blocks_args.append(BlockDecoder._decode_block_string(block_string))
    return blocks_args

encode(blocks_args) staticmethod

Encode a list of BlockArgs to a list of strings.

Parameters:

- blocks_args (List, required): A list of BlockArgs namedtuples of block args.

Returns:

- block_strings: A list of strings, each string is a notation of block.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
@staticmethod
def encode(blocks_args: List):
    """Encode a list of BlockArgs to a list of strings.

    :param blocks_args: A list of BlockArgs namedtuples of block args. (list[namedtuples])
    :return: block_strings: A list of strings, each string is a notation of block.
    """
    block_strings = []
    for block in blocks_args:
        block_strings.append(BlockDecoder._encode_block_string(block))
    return block_strings

Conv2dDynamicSamePadding

Bases: nn.Conv2d

2D convolutions like TensorFlow's 'SAME' mode, for a dynamic image size. The padding is computed dynamically in the forward function.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class Conv2dDynamicSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow, for a dynamic image size.
    The padding is operated in forward function by calculating dynamically.
    """

    # Tips for 'SAME' mode padding.
    #     Given the following:
    #         i: width or height
    #         s: stride
    #         k: kernel size
    #         d: dilation
    #         p: padding
    #     Output after Conv2d:
    #         o = floor((i+p-((k-1)*d+1))/s+1)
    # If o equals i, i = floor((i+p-((k-1)*d+1))/s+1),
    # => p = (i-1)*s+((k-1)*d+1)-i

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
        super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

    def forward(self, x):
        ih, iw = x.size()[-2:]
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)  # change the output size according to stride ! ! !
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
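
To make the 'SAME' padding rule in the comment above concrete, here is a small arithmetic check of p = (o-1)*s + ((k-1)*d + 1) - i for one spatial dimension (plain Python, independent of the layer itself):

import math

def same_pad(i: int, k: int, s: int, d: int = 1) -> int:
    """Total padding needed along one dimension so that the output size equals ceil(i / s)."""
    o = math.ceil(i / s)
    return max((o - 1) * s + (k - 1) * d + 1 - i, 0)

# A 3x3 conv with stride 2 on a 224-pixel side needs 1 pixel of total padding
# (split as 0 / 1 between the two sides), giving an output of 112 pixels.
print(same_pad(224, k=3, s=2))  # 1
print(same_pad(224, k=3, s=1))  # 2 -> pad 1 on each side, output stays 224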

Conv2dStaticSamePadding

Bases: nn.Conv2d

2D convolutions like TensorFlow's 'SAME' mode, with a given input image size. The padding module is calculated in the constructor, then used in forward.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class Conv2dStaticSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
    The padding mudule is calculated in construction function, then used in forward.
    """

    # With the same calculation as Conv2dDynamicSamePadding

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
        super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

        # Calculate padding based on image size and save it
        assert image_size is not None
        ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            self.static_padding = nn.ZeroPad2d((pad_w - pad_w // 2, pad_w // 2, pad_h - pad_h // 2, pad_h // 2))
        else:
            self.static_padding = Identity()

    def forward(self, x):
        x = self.static_padding(x)
        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
        return x

EfficientNet

Bases: BaseClassifier

EfficientNet model.

References: [1] https://arxiv.org/abs/1905.11946 (EfficientNet)

Parameters:

- width_coefficient (float, required): model's width coefficient. Used as the multiplier.
- depth_coefficient (float, required): model's depth coefficient. Used as the multiplier.
- image_size (int, required): Size of input image.
- dropout_rate (float, required): Dropout probability in the final layer.
- num_classes (int, required): Number of classes.
- batch_norm_momentum (Optional[float], default 0.99): Value used for the running_mean and running_var computation.
- batch_norm_epsilon (Optional[float], default 0.001): Value added to the denominator for numerical stability.
- drop_connect_rate (Optional[float], default 0.2): Connection dropout probability.
- depth_divisor (Optional[int], default 8): Model's depth divisor. Used as the divisor.
- min_depth (Optional[int], default None): Model's minimal depth, if given.
- backbone_mode (Optional[bool], default False): If True, drop the final linear layer.
- blocks_args (Optional[list], default None): List of BlockArgs to construct blocks. (list[namedtuple])

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class EfficientNet(BaseClassifier):
    """
    EfficientNet model.

    References:
        [1] https://arxiv.org/abs/1905.11946 (EfficientNet)


    :param width_coefficient:   model's width coefficient. Used as the multiplier.
    :param depth_coefficient:   model's depth coefficient. Used as the multiplier.
    :param image_size:          Size of input image.
    :param dropout_rate:        Dropout probability in final layer
    :param num_classes:         Number of classes.
    :param batch_norm_momentum: Value used for the running_mean and running_var computation
    :param batch_norm_epsilon:  Value added to the denominator for numerical stability
    :param drop_connect_rate:   Connection dropout probability
    :param depth_divisor:       Model's depth divisor. Used as the divisor.
    :param min_depth:           Model's minimal depth, if given.
    :param backbone_mode:       If true, dropping the final linear layer
    :param blocks_args:         List of BlockArgs to construct blocks. (list[namedtuple])
    """

    def __init__(
        self,
        width_coefficient: float,
        depth_coefficient: float,
        image_size: int,
        dropout_rate: float,
        num_classes: int,
        batch_norm_momentum: Optional[float] = 0.99,
        batch_norm_epsilon: Optional[float] = 1e-3,
        drop_connect_rate: Optional[float] = 0.2,
        depth_divisor: Optional[int] = 8,
        min_depth: Optional[int] = None,
        backbone_mode: Optional[bool] = False,
        blocks_args: Optional[list] = None,
    ):
        super().__init__()
        assert isinstance(blocks_args, list), "blocks_args should be a list"
        assert len(blocks_args) > 0, "block args must be greater than 0"

        self._blocks_args = blocks_args
        self.backbone_mode = backbone_mode
        self.drop_connect_rate = drop_connect_rate

        # Batch norm parameters
        bn_mom = 1 - batch_norm_momentum
        bn_eps = batch_norm_epsilon

        # Get stem static or dynamic convolution depending on image size
        Conv2d = get_same_padding_conv2d(image_size=image_size)

        # Stem
        in_channels = 3  # rgb
        out_channels = round_filters(32, width_coefficient, depth_divisor, min_depth)  # number of output channels
        self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
        image_size = calculate_output_image_size(image_size, 2)

        # Build blocks
        self._blocks = nn.ModuleList([])
        for block_args in self._blocks_args:

            # Update block input and output filters based on depth multiplier.
            block_args = block_args._replace(
                input_filters=round_filters(block_args.input_filters, width_coefficient, depth_divisor, min_depth),
                output_filters=round_filters(block_args.output_filters, width_coefficient, depth_divisor, min_depth),
                num_repeat=round_repeats(block_args.num_repeat, depth_coefficient),
            )

            # The first block needs to take care of stride and filter size increase.
            self._blocks.append(MBConvBlock(block_args, batch_norm_momentum, batch_norm_epsilon, image_size=image_size))
            image_size = calculate_output_image_size(image_size, block_args.stride)
            if block_args.num_repeat > 1:  # modify block_args to keep same output size
                block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
            for _ in range(block_args.num_repeat - 1):
                self._blocks.append(MBConvBlock(block_args, batch_norm_momentum, batch_norm_epsilon, image_size=image_size))
                # image_size = calculate_output_image_size(image_size, block_args.stride)  # stride = 1

        # Head
        in_channels = block_args.output_filters  # output of final block
        out_channels = round_filters(1280, width_coefficient, depth_divisor, min_depth)
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Final linear layer
        if not self.backbone_mode:
            self._avg_pooling = nn.AdaptiveAvgPool2d(1)
            self._dropout = nn.Dropout(dropout_rate)
            self._fc = nn.Linear(out_channels, num_classes)
        self._swish = nn.functional.silu

    def extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Use convolution layer to extract feature.

        :param inputs: Input tensor.
        :return: Output of the final convolution layer in the efficientnet model.
        """

        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x

    def forward(self, inputs):
        """
        EfficientNet's forward function.
        Calls extract_features to extract features, applies final linear layer, and returns logits.

        :param inputs: Input tensor.
        :return: Output of this model after processing.
        """
        bs = inputs.size(0)

        # Convolution layers
        x = self.extract_features(inputs)

        # Pooling and final linear layer, not needed for backbone mode
        if not self.backbone_mode:
            x = self._avg_pooling(x)
            x = x.view(bs, -1)
            x = self._dropout(x)
            x = self._fc(x)

        return x

    def replace_head(self, new_num_classes: Optional[int] = None, new_head: Optional[nn.Module] = None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self._fc = new_head
        else:
            self._fc = nn.Linear(self._fc.in_features, new_num_classes)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self._conv_stem = replace_conv2d_input_channels(conv=self._conv_stem, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self._conv_stem.in_channels

    def load_state_dict(self, state_dict: dict, strict: bool = True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights

            pretrained_model_weights_dict = pretrained_backbone_weights_dict

        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_model_weights_dict, strict)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"_fc": lr, "default": 0.0}

extract_features(inputs)

Uses the convolution layers to extract features.

Parameters:

- inputs (torch.Tensor, required): Input tensor.

Returns:

- torch.Tensor: Output of the final convolution layer in the efficientnet model.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
    """
    Use convolution layer to extract feature.

    :param inputs: Input tensor.
    :return: Output of the final convolution layer in the efficientnet model.
    """

    # Stem
    x = self._swish(self._bn0(self._conv_stem(inputs)))

    # Blocks
    for idx, block in enumerate(self._blocks):
        drop_connect_rate = self.drop_connect_rate
        if drop_connect_rate:
            drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
        x = block(x, drop_connect_rate=drop_connect_rate)

    # Head
    x = self._swish(self._bn1(self._conv_head(x)))

    return x

forward(inputs)

EfficientNet's forward function. Calls extract_features to extract features, applies final linear layer, and returns logits.

Parameters:

- inputs (required): Input tensor.

Returns:

- Output of this model after processing.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def forward(self, inputs):
    """
    EfficientNet's forward function.
    Calls extract_features to extract features, applies final linear layer, and returns logits.

    :param inputs: Input tensor.
    :return: Output of this model after processing.
    """
    bs = inputs.size(0)

    # Convolution layers
    x = self.extract_features(inputs)

    # Pooling and final linear layer, not needed for backbone mode
    if not self.backbone_mode:
        x = self._avg_pooling(x)
        x = x.view(bs, -1)
        x = self._dropout(x)
        x = self._fc(x)

    return x

load_state_dict(state_dict, strict=True)

Overloads the base method and calls it with a state dict that is modified when the model is used as a backbone.

Parameters:

- state_dict (dict, required): The state_dict to load.
- strict (bool, default True): strict loading (see super() docs).

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def load_state_dict(self, state_dict: dict, strict: bool = True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights

        pretrained_model_weights_dict = pretrained_backbone_weights_dict

    # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
    super().load_state_dict(pretrained_model_weights_dict, strict)

Identity

Bases: nn.Module

Identity mapping. Send input to output directly.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class Identity(nn.Module):
    """Identity mapping.
    Send input to output directly.
    """

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input

MBConvBlock

Bases: nn.Module

Mobile Inverted Residual Bottleneck Block.

References:
[1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
[2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
[3] https://arxiv.org/abs/1905.02244 (MobileNet v3)

Parameters:

- block_args (BlockArgs, required): BlockArgs.
- batch_norm_momentum (float, required): Batch norm momentum.
- batch_norm_epsilon (float, required): Batch norm epsilon.
- image_size (Union[Tuple, List], default None): [image_height, image_width].

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class MBConvBlock(nn.Module):
    """Mobile Inverted Residual Bottleneck Block.

    References:
        [1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
        [2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
        [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)

    :param block_args: BlockArgs.
    :param batch_norm_momentum: Batch norm momentum.
    :param batch_norm_epsilon: Batch norm epsilon.
    :param image_size: [image_height, image_width].
    """

    def __init__(self, block_args: BlockArgs, batch_norm_momentum: float, batch_norm_epsilon: float, image_size: Union[Tuple, List] = None):
        super().__init__()
        self._block_args = block_args
        self._bn_mom = 1 - batch_norm_momentum  # pytorch's difference from tensorflow
        self._bn_eps = batch_norm_epsilon
        self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
        self.id_skip = block_args.id_skip  # whether to use skip connection and drop connect

        # Expansion phase (Inverted Bottleneck)
        inp = self._block_args.input_filters  # number of input channels
        oup = self._block_args.input_filters * self._block_args.expand_ratio  # number of output channels
        if self._block_args.expand_ratio != 1:
            Conv2d = get_same_padding_conv2d(image_size=image_size)
            self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
            self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

        # Depthwise convolution phase
        k = self._block_args.kernel_size
        s = self._block_args.stride
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._depthwise_conv = Conv2d(in_channels=oup, out_channels=oup, groups=oup, kernel_size=k, stride=s, bias=False)  # groups makes it depthwise
        self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
        image_size = calculate_output_image_size(image_size, s)

        # Squeeze and Excitation layer, if desired
        if self.has_se:
            Conv2d = get_same_padding_conv2d(image_size=(1, 1))
            num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
            self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
            self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)

        # Pointwise convolution phase
        final_oup = self._block_args.output_filters
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
        self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
        self._swish = nn.functional.silu

    def forward(self, inputs: torch.Tensor, drop_connect_rate: Optional[float] = None) -> torch.Tensor:
        """MBConvBlock's forward function.

        :param inputs:              Input tensor.
        :param drop_connect_rate:   Drop connect rate (float, between 0 and 1).
        :return:                    Output of this block after processing.
        """

        # Expansion and Depthwise Convolution
        x = inputs
        if self._block_args.expand_ratio != 1:
            x = self._expand_conv(inputs)
            x = self._bn0(x)
            x = self._swish(x)

        x = self._depthwise_conv(x)
        x = self._bn1(x)
        x = self._swish(x)

        # Squeeze and Excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_reduce(x_squeezed)
            x_squeezed = self._swish(x_squeezed)
            x_squeezed = self._se_expand(x_squeezed)
            x = torch.sigmoid(x_squeezed) * x

        # Pointwise Convolution
        x = self._project_conv(x)
        x = self._bn2(x)

        # Skip connection and drop connect
        input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
        if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
            # The combination of skip connection and drop connect brings about stochastic depth.
            if drop_connect_rate:
                x = drop_connect(x, p=drop_connect_rate, training=self.training)
            x = x + inputs  # skip connection
        return x

forward(inputs, drop_connect_rate=None)

MBConvBlock's forward function.

Parameters:

Name Type Description Default
inputs torch.Tensor

Input tensor.

required
drop_connect_rate Optional[float]

Drop connect rate (float, between 0 and 1).

None

Returns:

Type Description
torch.Tensor

Output of this block after processing.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def forward(self, inputs: torch.Tensor, drop_connect_rate: Optional[float] = None) -> torch.Tensor:
    """MBConvBlock's forward function.

    :param inputs:              Input tensor.
    :param drop_connect_rate:   Drop connect rate (float, between 0 and 1).
    :return:                    Output of this block after processing.
    """

    # Expansion and Depthwise Convolution
    x = inputs
    if self._block_args.expand_ratio != 1:
        x = self._expand_conv(inputs)
        x = self._bn0(x)
        x = self._swish(x)

    x = self._depthwise_conv(x)
    x = self._bn1(x)
    x = self._swish(x)

    # Squeeze and Excitation
    if self.has_se:
        x_squeezed = F.adaptive_avg_pool2d(x, 1)
        x_squeezed = self._se_reduce(x_squeezed)
        x_squeezed = self._swish(x_squeezed)
        x_squeezed = self._se_expand(x_squeezed)
        x = torch.sigmoid(x_squeezed) * x

    # Pointwise Convolution
    x = self._project_conv(x)
    x = self._bn2(x)

    # Skip connection and drop connect
    input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
    if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
        # The combination of skip connection and drop connect brings about stochastic depth.
        if drop_connect_rate:
            x = drop_connect(x, p=drop_connect_rate, training=self.training)
        x = x + inputs  # skip connection
    return x

calculate_output_image_size(input_image_size, stride)

Calculates the output image size when using Conv2dSamePadding with a stride. Necessary for static padding. Thanks to mannatsingh for pointing this out.

Parameters:

Name Type Description Default
input_image_size Union[int, Tuple, List]

Size of input image.

required
stride Union[int, Tuple, List]

Conv2d operation's stride.

required

Returns:

Type Description
Optional[List[int]]

output_image_size: A list [H,W].

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def calculate_output_image_size(input_image_size: Union[int, Tuple, List], stride: Union[int, Tuple, List]) -> Optional[List[int]]:
    """Calculates the output image size when using Conv2dSamePadding with a stride.
    Necessary for static padding. Thanks to mannatsingh for pointing this out.

    :param input_image_size:    Size of input image.
    :param stride:              Conv2d operation's stride.
    :return: output_image_size: A list [H,W].
    """
    if input_image_size is None:
        return None
    elif isinstance(input_image_size, int):
        input_image_size = (input_image_size, input_image_size)

    image_height, image_width = input_image_size
    stride = stride if isinstance(stride, int) else stride[0]
    image_height = int(math.ceil(image_height / stride))
    image_width = int(math.ceil(image_width / stride))
    return [image_height, image_width]
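
A quick sanity check of the ceil-division behaviour, as a usage sketch (assuming calculate_output_image_size is imported from the efficientnet module shown above):

# A 224x224 input through a stride-2 "SAME"-padded conv yields a 112x112 map.
assert calculate_output_image_size(224, 2) == [112, 112]
# Odd sizes are rounded up: ceil(15 / 2) == 8.
assert calculate_output_image_size((15, 15), 2) == [8, 8]
# A None image size (dynamic padding) is propagated unchanged.
assert calculate_output_image_size(None, 2) is None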

drop_connect(inputs, p, training)

Drop connect.

Parameters:

Name Type Description Default
inputs

Input of this structure. (tensor: BCWH)

required
p float

Probability of drop connection. (float: 0.0~1.0)

required
training bool

Running mode.

required

Returns:

Type Description
torch.Tensor

output: Output after drop connection.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def drop_connect(inputs: torch.Tensor, p: float, training: bool) -> torch.Tensor:
    """Drop connect.

    :param inputs :     Input of this structure. (tensor: BCWH)
    :param p :          Probability of drop connection. (float: 0.0~1.0)
    :param training:    Running mode.
    :return: output: Output after drop connection.
    """
    assert p >= 0 and p <= 1, "p must be in range of [0,1]"

    if not training:
        return inputs

    batch_size = inputs.shape[0]
    keep_prob = 1 - p

    # generate binary_tensor mask according to probability (p for 0, 1-p for 1)
    random_tensor = keep_prob
    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
    binary_tensor = torch.floor(random_tensor)

    output = inputs / keep_prob * binary_tensor
    return output
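
The combination of per-sample dropping and rescaling is what implements stochastic depth in MBConvBlock above. A minimal sketch of the behaviour (assuming drop_connect is imported from the efficientnet module shown above):

import torch

x = torch.ones(8, 3, 4, 4)

# Eval mode: the tensor passes through unchanged.
assert torch.equal(drop_connect(x, p=0.2, training=False), x)

# Train mode: whole samples are zeroed with probability p, and survivors are
# rescaled by 1 / (1 - p) so the expected activation is preserved.
out = drop_connect(x, p=0.2, training=True)
kept = out.flatten(1).sum(dim=1) > 0  # boolean mask of surviving samples
print(f"{int(kept.sum())} of {x.shape[0]} samples kept, survivors scaled to {out.max().item():.2f}")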

get_same_padding_conv2d(image_size=None)

Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models.

Parameters:

Name Type Description Default
image_size Optional[Union[int, Tuple[int, int]]]

Size of the image.

None

Returns:

Type Description

Conv2dDynamicSamePadding or Conv2dStaticSamePadding.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def get_same_padding_conv2d(image_size: Optional[Union[int, Tuple[int, int]]] = None):
    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
       Static padding is necessary for ONNX exporting of models.

    :param image_size: Size of the image.
    :return: Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
    """
    if image_size is None:
        return Conv2dDynamicSamePadding
    else:
        return partial(Conv2dStaticSamePadding, image_size=image_size)
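
A usage sketch of the two padding variants (assuming the factory and both Conv2d classes are importable from the efficientnet module above, and that they accept the usual nn.Conv2d keyword arguments, as the MBConvBlock source suggests):

import torch

StaticConv = get_same_padding_conv2d(image_size=(224, 224))  # partial of Conv2dStaticSamePadding
DynamicConv = get_same_padding_conv2d(image_size=None)       # Conv2dDynamicSamePadding

static_conv = StaticConv(in_channels=3, out_channels=16, kernel_size=3, stride=2, bias=False)
dynamic_conv = DynamicConv(in_channels=3, out_channels=16, kernel_size=3, stride=2, bias=False)

x = torch.randn(1, 3, 224, 224)
# Both variants emulate TensorFlow "SAME" padding: a stride-2 conv halves the spatial size.
assert static_conv(x).shape == dynamic_conv(x).shape == (1, 16, 112, 112)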

round_filters(filters, width_coefficient, depth_divisor, min_depth)

Calculate and round number of filters based on width multiplier. Use width_coefficient, depth_divisor and min_depth.

Parameters:

Name Type Description Default
filters int

Filters number to be calculated. Params from arch_params:

required
width_coefficient int

model's width coefficient. Used as the multiplier.

required
depth_divisor int

model's depth divisor. Used as the divisor.

required
min_depth int

model's minimal depth, if given.

required

Returns:

Type Description

new_filters: New filters number after calculating.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def round_filters(filters: int, width_coefficient: int, depth_divisor: int, min_depth: int):
    """Calculate and round number of filters based on width multiplier.
       Use width_coefficient, depth_divisor and min_depth.

    :param filters: Filters number to be calculated. Params from arch_params:
    :param width_coefficient: model's width coefficient. Used as the multiplier.
    :param depth_divisor: model's depth divisor. Used as the divisor.
    :param min_depth: model's minimal depth, if given.
    :return: new_filters: New filters number after calculating.
    """
    if not width_coefficient:
        return filters
    min_depth = min_depth
    filters *= width_coefficient
    min_depth = min_depth or depth_divisor  # pay attention to this line when using min_depth
    # follow the formula transferred from official TensorFlow implementation
    new_filters = max(min_depth, int(filters + depth_divisor / 2) // depth_divisor * depth_divisor)
    if new_filters < 0.9 * filters:  # prevent rounding by more than 10%
        new_filters += depth_divisor
    return int(new_filters)
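
A worked example of the rounding rule (the width coefficient 1.2 is only an illustrative value, in the style of the larger EfficientNet variants):

# 32 * 1.2 = 38.4, snapped to the nearest multiple of depth_divisor=8 -> 40
print(round_filters(32, width_coefficient=1.2, depth_divisor=8, min_depth=None))  # 40
# A falsy width_coefficient disables scaling entirely.
print(round_filters(32, width_coefficient=0, depth_divisor=8, min_depth=None))    # 32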

round_repeats(repeats, depth_coefficient)

Calculate module's repeat number of a block based on depth multiplier. Use depth_coefficient.

Parameters:

Name Type Description Default
repeats int

num_repeat to be calculated.

required
depth_coefficient int

the depth coefficient of the model. this func uses it as the multiplier.

required

Returns:

Type Description

new repeat: New repeat number after calculating.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def round_repeats(repeats: int, depth_coefficient: int):
    """Calculate module's repeat number of a block based on depth multiplier.
       Use depth_coefficient.

    :param repeats: num_repeat to be calculated.
    :param depth_coefficient: the depth coefficient of the model. this func uses it as the multiplier.
    :return: new repeat: New repeat number after calculating.
    """
    if not depth_coefficient:
        return repeats
    # follow the formula transferred from official TensorFlow implementation
    return int(math.ceil(depth_coefficient * repeats))
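
For example (the depth coefficient 1.8 is an illustrative value):

# A stage of 2 repeats scaled by depth_coefficient=1.8 becomes ceil(1.8 * 2) = 4 repeats.
print(round_repeats(2, depth_coefficient=1.8))  # 4
# A falsy depth_coefficient leaves the repeat count unchanged.
print(round_repeats(2, depth_coefficient=0))    # 2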

GoogLeNet code based on https://pytorch.org/vision/stable/_modules/torchvision/models/googlenet.html

GoogLeNet

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/googlenet.py
class GoogLeNet(BaseClassifier):
    def __init__(self, num_classes=1000, aux_logits=True, init_weights=True, backbone_mode=False, dropout=0.3):
        super(GoogLeNet, self).__init__()

        self.num_classes = num_classes
        self.backbone_mode = backbone_mode

        self.aux_logits = aux_logits
        self.dropout_p = dropout

        self.conv1 = BasicConv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
        self.conv2 = BasicConv2d(64, 64, kernel_size=1)
        self.conv3 = BasicConv2d(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)

        if aux_logits:
            self.aux1 = InceptionAux(512, num_classes)
            self.aux2 = InceptionAux(528, num_classes)
        else:
            self.aux1 = None
            self.aux2 = None

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        if not self.backbone_mode:
            self.dropout = nn.Dropout(self.dropout_p)
            self.fc = nn.Linear(1024, num_classes)

        if init_weights:
            self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                import scipy.stats as stats

                x = stats.truncnorm(-2, 2, scale=0.01)
                values = torch.as_tensor(x.rvs(m.weight.numel()), dtype=m.weight.dtype)
                values = values.view(m.weight.size())
                with torch.no_grad():
                    m.weight.copy_(values)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _forward(self, x):
        # N x 3 x 224 x 224
        x = self.conv1(x)
        # N x 64 x 112 x 112
        x = self.maxpool1(x)
        # N x 64 x 56 x 56
        x = self.conv2(x)
        # N x 64 x 56 x 56
        x = self.conv3(x)
        # N x 192 x 56 x 56
        x = self.maxpool2(x)

        # N x 192 x 28 x 28
        x = self.inception3a(x)
        # N x 256 x 28 x 28
        x = self.inception3b(x)
        # N x 480 x 28 x 28
        x = self.maxpool3(x)
        # N x 480 x 14 x 14
        x = self.inception4a(x)
        # N x 512 x 14 x 14
        aux1 = None
        if self.aux1 is not None and self.training:
            aux1 = self.aux1(x)

        x = self.inception4b(x)
        # N x 512 x 14 x 14
        x = self.inception4c(x)
        # N x 512 x 14 x 14
        x = self.inception4d(x)
        # N x 528 x 14 x 14
        aux2 = None
        if self.aux2 is not None and self.training:
            aux2 = self.aux2(x)

        x = self.inception4e(x)
        # N x 832 x 14 x 14
        x = self.maxpool4(x)
        # N x 832 x 7 x 7
        x = self.inception5a(x)
        # N x 832 x 7 x 7
        x = self.inception5b(x)
        # N x 1024 x 7 x 7

        x = self.avgpool(x)
        # N x 1024 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 1024
        if not self.backbone_mode:
            x = self.dropout(x)
            x = self.fc(x)
        # N x num_classes
        return x, aux2, aux1

    def forward(self, x):
        x, aux1, aux2 = self._forward(x)
        if self.training and self.aux_logits:
            return GoogLeNetOutputs(x, aux2, aux1)
        else:
            return x

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights
            c_temp = torch.nn.Linear(1024, self.num_classes)
            torch.nn.init.xavier_uniform(c_temp.weight)
            pretrained_backbone_weights_dict["fc.weight"] = c_temp.weight
            pretrained_backbone_weights_dict["fc.bias"] = c_temp.bias
            # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
            super().load_state_dict(pretrained_backbone_weights_dict, strict)
        else:
            super().load_state_dict(pretrained_model_weights_dict, strict)

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in src/super_gradients/training/models/classification_models/googlenet.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights
        c_temp = torch.nn.Linear(1024, self.num_classes)
        torch.nn.init.xavier_uniform(c_temp.weight)
        pretrained_backbone_weights_dict["fc.weight"] = c_temp.weight
        pretrained_backbone_weights_dict["fc.bias"] = c_temp.bias
        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_backbone_weights_dict, strict)
    else:
        super().load_state_dict(pretrained_model_weights_dict, strict)
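
A minimal usage sketch of the class above (assuming GoogLeNet is imported from the googlenet module, and that the GoogLeNetOutputs namedtuple it returns in training mode unpacks positionally):

import torch

model = GoogLeNet(num_classes=10, aux_logits=True)
x = torch.randn(2, 3, 224, 224)

# Training mode returns the main logits plus the two auxiliary heads.
model.train()
main_logits, aux2, aux1 = model(x)
print(main_logits.shape)  # torch.Size([2, 10])

# Eval mode returns only the main logits tensor.
model.eval()
print(model(x).shape)     # torch.Size([2, 10])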

LeNet in PyTorch.

https://yann.lecun.com/exdb/lenet/

MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" for more details.

Block

Bases: nn.Module

Depthwise conv + Pointwise conv

Source code in src/super_gradients/training/models/classification_models/mobilenet.py
class Block(nn.Module):
    """Depthwise conv + Pointwise conv"""

    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out
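
A minimal sketch of a single block (assuming Block is imported from the mobilenet module above): a stride-2 depthwise-separable block mapping 32 to 64 channels.

import torch

block = Block(in_planes=32, out_planes=64, stride=2)
x = torch.randn(1, 32, 56, 56)
print(block(x).shape)  # torch.Size([1, 64, 28, 28])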

MobileNet

Bases: BaseClassifier, SupportsReplaceInputChannels

Source code in src/super_gradients/training/models/classification_models/mobilenet.py
class MobileNet(BaseClassifier, SupportsReplaceInputChannels):
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, 128, (128, 2), 256, (256, 2), 512, 512, 512, 512, 512, (512, 2), 1024, (1024, 2)]

    def __init__(self, num_classes=10, backbone_mode=False, up_to_layer=None, in_channels: int = 3):
        super(MobileNet, self).__init__()
        self.backbone_mode = backbone_mode
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32, up_to_layer=up_to_layer if up_to_layer is not None else len(self.cfg))

        if not self.backbone_mode:
            self.linear = nn.Linear(self.cfg[-1], num_classes)

    def _make_layers(self, in_planes, up_to_layer):
        layers = []
        for x in self.cfg[:up_to_layer]:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param up_to_layer: forward through the net layers up to a specific layer. if None, run all layers
        """
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)

        if not self.backbone_mode:
            out = F.avg_pool2d(out, 2)
            out = out.view(out.size(0), -1)
            out = self.linear(out)

        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.conv1 = replace_conv2d_input_channels(conv=self.conv1, in_channels=in_channels)

    def get_input_channels(self) -> int:
        return self.conv1.in_channels

forward(x)

Parameters:

Name Type Description Default
up_to_layer

forward through the net layers up to a specific layer. if None, run all layers

required
Source code in src/super_gradients/training/models/classification_models/mobilenet.py
def forward(self, x):
    """
    :param up_to_layer: forward through the net layers up to a specific layer. if None, run all layers
    """
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.layers(out)

    if not self.backbone_mode:
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)

    return out
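
A minimal sketch of backbone-mode usage (assuming MobileNet is imported from the mobilenet module above); with backbone_mode=True the classifier head is never built and the raw feature map is returned:

import torch

backbone = MobileNet(backbone_mode=True)
x = torch.randn(2, 3, 224, 224)
print(backbone(x).shape)  # torch.Size([2, 1024, 7, 7])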

This is a PyTorch implementation of MobileNetV2 architecture as described in the paper: Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation. https://arxiv.org/pdf/1801.04381

Code taken from https://github.com/tonylins/pytorch-mobilenet-v2 License: Apache Version 2.0, January 2004 http://www.apache.org/licenses/

Pre-trained ImageNet model: 'deci-model-repository/mobilenet_v2/ckpt_best.pth'

CustomMobileNetV2

Bases: MobileNetV2

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.CUSTOM_MOBILENET_V2)
class CustomMobileNetV2(MobileNetV2):
    def __init__(self, arch_params):
        """
        :param arch_params: HpmStruct
            must contain:
                'num_classes': int
                'width_mult': float
                'structure' : list. specify the mobilenetv2 architecture
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=arch_params.width_mult,
            structure=arch_params.structure,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int, 'width_mult': float, 'structure': list - specifies the mobilenetv2 architecture

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    :param arch_params: HpmStruct
        must contain:
            'num_classes': int
            'width_mult': float
            'structure' : list. specify the mobilenetv2 architecture
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=arch_params.width_mult,
        structure=arch_params.structure,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

InvertedResidual

Bases: nn.Module

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, grouped_conv_size=1):
        """
        :param inp: number of input channels
        :param oup: number of output channels
        :param stride: conv stride
        :param expand_ratio: expansion ratio of the hidden layer after pointwise conv
        :grouped_conv_size: number of channels per grouped convolution, for depth-wise-separable convolution, use grouped_conv_size=1
        """
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expand_ratio)
        groups = int(hidden_dim / grouped_conv_size)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)

__init__(inp, oup, stride, expand_ratio, grouped_conv_size=1)

:grouped_conv_size: number of channels per grouped convolution, for depth-wise-separable convolution, use grouped_conv_size=1

Parameters:

Name Type Description Default
inp

number of input channels

required
oup

number of output channels

required
stride

conv stride

required
expand_ratio

expansion ratio of the hidden layer after pointwise conv

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, inp, oup, stride, expand_ratio, grouped_conv_size=1):
    """
    :param inp: number of input channels
    :param oup: number of output channels
    :param stride: conv stride
    :param expand_ratio: expansion ratio of the hidden layer after pointwise conv
    :grouped_conv_size: number of channels per grouped convolution, for depth-wise-separable convolution, use grouped_conv_size=1
    """
    super(InvertedResidual, self).__init__()
    self.stride = stride
    assert stride in [1, 2]

    hidden_dim = int(inp * expand_ratio)
    groups = int(hidden_dim / grouped_conv_size)
    self.use_res_connect = self.stride == 1 and inp == oup

    if expand_ratio == 1:
        self.conv = nn.Sequential(
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        )
    else:
        self.conv = nn.Sequential(
            # pw
            nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        )
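
A minimal sketch of the residual condition (assuming InvertedResidual is imported from the mobilenetv2 module above); the skip connection is only active when stride == 1 and the channel count is preserved:

import torch

block_with_skip = InvertedResidual(inp=32, oup=32, stride=1, expand_ratio=6)
block_no_skip = InvertedResidual(inp=32, oup=64, stride=2, expand_ratio=6)
print(block_with_skip.use_res_connect, block_no_skip.use_res_connect)  # True False

x = torch.randn(1, 32, 28, 28)
print(block_with_skip(x).shape)  # torch.Size([1, 32, 28, 28])
print(block_no_skip(x).shape)    # torch.Size([1, 64, 14, 14])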

MobileNetV2

Bases: MobileNetBase

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
class MobileNetV2(MobileNetBase):
    def __init__(
        self,
        num_classes,
        dropout: float,
        width_mult=1.0,
        structure=None,
        backbone_mode: bool = False,
        grouped_conv_size=1,
        in_channels=3,
    ) -> object:
        super(MobileNetV2, self).__init__()
        self.in_channels = in_channels
        block = InvertedResidual
        last_channel = 1280
        # IF STRUCTURE IS NONE - USE THE DEFAULT STRUCTURE NOTED
        #                                                  t, c,  n, s    stage-0 is the first conv_bn layer
        self.interverted_residual_setting = structure or [
            [1, 16, 1, 1],  # stage-1
            [6, 24, 2, 2],  # stage-2
            [6, 32, 3, 2],  # stage-3
            [6, 64, 4, 2],  # stage-4
            [6, 96, 3, 1],  # stage-5
            [6, 160, 3, 2],  # stage-6
            [6, 320, 1, 1],
        ]  # stage-7
        #                                                                   stage-8  is the last_layer
        self.last_channel = make_divisible(last_channel * width_mult) if width_mult > 1.0 else last_channel

        curr_channels = 32
        self.features = [conv_bn(in_channels, curr_channels, 2)]
        # building inverted residual blocks
        for t, c, n, s in self.interverted_residual_setting:
            output_channel = make_divisible(c * width_mult) if t > 1 else c
            for i in range(n):
                if i == 0:
                    self.features.append(block(curr_channels, output_channel, s, expand_ratio=t, grouped_conv_size=grouped_conv_size))
                else:
                    self.features.append(block(curr_channels, output_channel, 1, expand_ratio=t, grouped_conv_size=grouped_conv_size))
                curr_channels = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(curr_channels, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)
        self.backbone_mode = backbone_mode

        if self.backbone_mode:
            self.classifier = nn.Identity()
            # TODO: remove during migration of YOLOs to the new base
            self.backbone_connection_channels = self._extract_connection_layers_input_channel_size()
        else:
            # building classifier
            self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(self.last_channel, num_classes))
        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        if self.backbone_mode:
            return x
        else:
            x = x.mean(3).mean(2)
            return self.classifier(x)

    def _extract_connection_layers_input_channel_size(self):
        """
        Extracts the number of channels out when using mobilenetV2 as yolo backbone
        """
        curr_layer_input = torch.rand(1, self.in_channels, 320, 320)  # input dims are used to extract number of channels
        layers_num_to_extract = [np.array(self.interverted_residual_setting)[:stage, 2].sum() for stage in [3, 5]]
        connection_layers_input_channel_size = []
        for layer_idx, feature in enumerate(self.features):
            curr_layer_input = feature(curr_layer_input)
            if layer_idx in layers_num_to_extract:
                connection_layers_input_channel_size.append(curr_layer_input.shape[1])
        connection_layers_input_channel_size.append(self.last_channel)
        connection_layers_input_channel_size.reverse()
        return connection_layers_input_channel_size

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.features[0][0] = replace_conv2d_input_channels(conv=self.features[0][0], in_channels=in_channels, fn=compute_new_weights_fn)
        self.in_channels = self.get_input_channels()

    def get_input_channels(self) -> int:
        return self.features[0][0].in_channels
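
A minimal usage sketch of the class above (assuming MobileNetV2 is imported from the mobilenetv2 module; dropout has no default and must be passed explicitly):

import torch

model = MobileNetV2(num_classes=10, dropout=0.2)  # default width_mult=1.0 and ImageNet-style structure
x = torch.randn(2, 3, 224, 224)
print(model(x).shape)  # torch.Size([2, 10])

# Backbone mode replaces the classifier with nn.Identity and returns the 1280-channel feature map.
backbone = MobileNetV2(num_classes=10, dropout=0.0, backbone_mode=True)
print(backbone(x).shape)  # torch.Size([2, 1280, 7, 7])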

MobileNetV2Base

Bases: MobileNetV2

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.MOBILENET_V2)
class MobileNetV2Base(MobileNetV2):
    def __init__(self, arch_params):
        """
        :param arch_params: HpmStruct
            must contain: 'num_classes': int
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=1.0,
            structure=None,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    :param arch_params: HpmStruct
        must contain: 'num_classes': int
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=1.0,
        structure=None,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

MobileNetV2_135

Bases: MobileNetV2

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.MOBILE_NET_V2_135)
class MobileNetV2_135(MobileNetV2):
    def __init__(self, arch_params):
        """
        This model achieves 75.73% on ImageNet - similar to ResNet50
        :param arch_params: HpmStruct
            must contain: 'num_classes': int
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=1.35,
            structure=None,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

This model achieves 75.73% on ImageNet - similar to ResNet50

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    This model achieves 75.73% on ImageNet - similar to ResNet50
    :param arch_params: HpmStruct
        must contain: 'num_classes': int
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=1.35,
        structure=None,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

Creates a MobileNetV3 Model as defined in: Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam. (2019). Searching for MobileNetV3 arXiv preprint arXiv:1905.02244.

mobilenetv3_custom

Bases: MobileNetV3

Constructs a MobileNetV3-Customized model

Source code in src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_CUSTOM)
class mobilenetv3_custom(MobileNetV3):
    """
    Constructs a MobileNetV3-Customized model
    """

    def __init__(self, arch_params):
        super().__init__(
            cfgs=arch_params.structure,
            mode=arch_params.mode,
            num_classes=arch_params.num_classes,
            width_mult=arch_params.width_mult,
            in_channels=get_param(arch_params, "in_channels", 3),
        )

mobilenetv3_large

Bases: MobileNetV3

Constructs a MobileNetV3-Large model

Source code in src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_LARGE)
class mobilenetv3_large(MobileNetV3):
    """
    Constructs a MobileNetV3-Large model
    """

    def __init__(self, arch_params):
        width_mult = arch_params.width_mult if hasattr(arch_params, "width_mult") else 1.0
        cfgs = [
            # k, t, c, SE, HS, s
            [3, 1, 16, 0, 0, 1],
            [3, 4, 24, 0, 0, 2],
            [3, 3, 24, 0, 0, 1],
            [5, 3, 40, 1, 0, 2],
            [5, 3, 40, 1, 0, 1],
            [5, 3, 40, 1, 0, 1],
            [3, 6, 80, 0, 1, 2],
            [3, 2.5, 80, 0, 1, 1],
            [3, 2.3, 80, 0, 1, 1],
            [3, 2.3, 80, 0, 1, 1],
            [3, 6, 112, 1, 1, 1],
            [3, 6, 112, 1, 1, 1],
            [5, 6, 160, 1, 1, 2],
            [5, 6, 160, 1, 1, 1],
            [5, 6, 160, 1, 1, 1],
        ]
        super().__init__(cfgs, mode="large", num_classes=arch_params.num_classes, width_mult=width_mult, in_channels=get_param(arch_params, "in_channels", 3))

mobilenetv3_small

Bases: MobileNetV3

Constructs a MobileNetV3-Small model

Source code in src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_SMALL)
class mobilenetv3_small(MobileNetV3):
    """
    Constructs a MobileNetV3-Small model
    """

    def __init__(self, arch_params):
        width_mult = arch_params.width_mult if hasattr(arch_params, "width_mult") else 1.0
        cfgs = [
            # k, t, c, SE, HS, s
            [3, 1, 16, 1, 0, 2],
            [3, 4.5, 24, 0, 0, 2],
            [3, 3.67, 24, 0, 0, 1],
            [5, 4, 40, 1, 1, 2],
            [5, 6, 40, 1, 1, 1],
            [5, 6, 40, 1, 1, 1],
            [5, 3, 48, 1, 1, 1],
            [5, 3, 48, 1, 1, 1],
            [5, 6, 96, 1, 1, 2],
            [5, 6, 96, 1, 1, 1],
            [5, 6, 96, 1, 1, 1],
        ]
        super().__init__(cfgs, mode="small", num_classes=arch_params.num_classes, width_mult=width_mult, in_channels=get_param(arch_params, "in_channels", 3))
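
A minimal construction sketch for the registered variants above (this assumes HpmStruct, the arch-params container used throughout the library, is importable from super_gradients.training.utils; only num_classes is required, while width_mult and in_channels fall back to 1.0 and 3):

import torch
from super_gradients.training.utils import HpmStruct  # assumption: HpmStruct holds arch params as attributes

model = mobilenetv3_large(HpmStruct(num_classes=10))
x = torch.randn(2, 3, 224, 224)
print(model(x).shape)  # torch.Size([2, 10])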

PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search

https://github.com/kuangliu/pytorch-cifar/blob/master/models/pnasnet.py

SepConv

Bases: nn.Module

Separable Convolution.

Source code in src/super_gradients/training/models/classification_models/pnasnet.py
class SepConv(nn.Module):
    """Separable Convolution."""

    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding=(kernel_size - 1) // 2, bias=False, groups=in_planes)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        return self.bn1(self.conv1(x))
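
A minimal sketch (assuming SepConv is imported from the pnasnet module above): a 7x7 separable convolution with stride 2, as used inside PNASNet cells.

import torch

sep = SepConv(in_planes=44, out_planes=44, kernel_size=7, stride=2)
x = torch.randn(1, 44, 32, 32)
print(sep(x).shape)  # torch.Size([1, 44, 16, 16])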

Pre-activation ResNet in PyTorch.

Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Identity Mappings in Deep Residual Networks. arXiv:1603.05027

Based on https://github.com/kuangliu/pytorch-cifar/blob/master/models/preact_resnet.py

PreActBlock

Bases: nn.Module

Pre-activation version of the BasicBlock.

Source code in src/super_gradients/training/models/classification_models/preact_resnet.py
class PreActBlock(nn.Module):
    """Pre-activation version of the BasicBlock."""

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False))

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, "shortcut") else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out += shortcut
        return out

PreActBottleneck

Bases: nn.Module

Pre-activation version of the original Bottleneck module.

Source code in src/super_gradients/training/models/classification_models/preact_resnet.py
class PreActBottleneck(nn.Module):
    """Pre-activation version of the original Bottleneck module."""

    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False))

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, "shortcut") else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.conv3(F.relu(self.bn3(out)))
        out += shortcut
        return out
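
A minimal sketch of the two blocks above (assuming they are imported from the preact_resnet module); the projection shortcut is only created when the stride or channel count changes:

import torch

plain = PreActBlock(in_planes=64, planes=64, stride=1)
downsample = PreActBlock(in_planes=64, planes=128, stride=2)
print(hasattr(plain, "shortcut"), hasattr(downsample, "shortcut"))  # False True

x = torch.randn(1, 64, 32, 32)
print(plain(x).shape)       # torch.Size([1, 64, 32, 32])
print(downsample(x).shape)  # torch.Size([1, 128, 16, 16])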

RegNet - from the paper: Designing Network Design Spaces - https://arxiv.org/pdf/2003.13678.pdf. Implementation of the paradigm described in the paper published by Facebook AI Research (FAIR). @author: Signatrix GmbH. Code taken from: https://github.com/signatrix/regnet - MIT Licence.

CustomAnyNet

Bases: AnyNetX

Source code in src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.CUSTOM_ANYNET)
class CustomAnyNet(AnyNetX):
    def __init__(self, arch_params):
        """All parameters must be provided in arch_params other than SE"""
        super().__init__(
            ls_num_blocks=arch_params.ls_num_blocks,
            ls_block_width=arch_params.ls_block_width,
            ls_bottleneck_ratio=arch_params.ls_bottleneck_ratio,
            ls_group_width=arch_params.ls_group_width,
            stride=arch_params.stride,
            num_classes=arch_params.num_classes,
            se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
            backbone_mode=get_param(arch_params, "backbone_mode", False),
            dropout_prob=get_param(arch_params, "dropout_prob", 0),
            droppath_prob=get_param(arch_params, "droppath_prob", 0),
            input_channels=get_param(arch_params, "input_channels", 3),
        )

__init__(arch_params)

All parameters must be provided in arch_params other than SE

Source code in src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters must be provided in arch_params other than SE"""
    super().__init__(
        ls_num_blocks=arch_params.ls_num_blocks,
        ls_block_width=arch_params.ls_block_width,
        ls_bottleneck_ratio=arch_params.ls_bottleneck_ratio,
        ls_group_width=arch_params.ls_group_width,
        stride=arch_params.stride,
        num_classes=arch_params.num_classes,
        se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
        backbone_mode=get_param(arch_params, "backbone_mode", False),
        dropout_prob=get_param(arch_params, "dropout_prob", 0),
        droppath_prob=get_param(arch_params, "droppath_prob", 0),
        input_channels=get_param(arch_params, "input_channels", 3),
    )

CustomRegNet

Bases: RegNetX

Source code in src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.CUSTOM_REGNET)
class CustomRegNet(RegNetX):
    def __init__(self, arch_params):
        """All parameters must be provided in arch_params other than SE"""
        super().__init__(
            initial_width=arch_params.initial_width,
            slope=arch_params.slope,
            quantized_param=arch_params.quantized_param,
            network_depth=arch_params.network_depth,
            bottleneck_ratio=arch_params.bottleneck_ratio,
            group_width=arch_params.group_width,
            stride=arch_params.stride,
            arch_params=arch_params,
            se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
            input_channels=get_param(arch_params, "input_channels", 3),
        )

__init__(arch_params)

All parameters must be provided in arch_params other than SE

Source code in src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters must be provided in arch_params other than SE"""
    super().__init__(
        initial_width=arch_params.initial_width,
        slope=arch_params.slope,
        quantized_param=arch_params.quantized_param,
        network_depth=arch_params.network_depth,
        bottleneck_ratio=arch_params.bottleneck_ratio,
        group_width=arch_params.group_width,
        stride=arch_params.stride,
        arch_params=arch_params,
        se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
        input_channels=get_param(arch_params, "input_channels", 3),
    )

NASRegNet

Bases: RegNetX

Source code in src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.NAS_REGNET)
class NASRegNet(RegNetX):
    def __init__(self, arch_params):
        """All parameters are provided as a single structure list: arch_params.structure"""
        structure = arch_params.structure
        super().__init__(
            initial_width=structure[0],
            slope=structure[1],
            quantized_param=structure[2],
            network_depth=structure[3],
            bottleneck_ratio=structure[4],
            group_width=structure[5],
            stride=structure[6],
            se_ratio=structure[7] if structure[7] > 0 else None,
            arch_params=arch_params,
        )

__init__(arch_params)

All parameters are provided as a single structure list: arch_params.structure

Source code in src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters are provided as a single structure list: arch_params.structure"""
    structure = arch_params.structure
    super().__init__(
        initial_width=structure[0],
        slope=structure[1],
        quantized_param=structure[2],
        network_depth=structure[3],
        bottleneck_ratio=structure[4],
        group_width=structure[5],
        stride=structure[6],
        se_ratio=structure[7] if structure[7] > 0 else None,
        arch_params=arch_params,
    )

verify_correctness_of_parameters(ls_num_blocks, ls_block_width, ls_bottleneck_ratio, ls_group_width)

VERIFY THAT THE GIVEN PARAMETERS FIT THE SEARCH SPACE DEFINED IN THE REGNET PAPER

Source code in src/super_gradients/training/models/classification_models/regnet.py
def verify_correctness_of_parameters(ls_num_blocks, ls_block_width, ls_bottleneck_ratio, ls_group_width):
    """VERIFY THAT THE GIVEN PARAMETERS FIT THE SEARCH SPACE DEFINED IN THE REGNET PAPER"""
    err_message = "Parameters don't fit"
    assert len(set(ls_bottleneck_ratio)) == 1, f"{err_message} AnyNetXb"
    assert len(set(ls_group_width)) == 1, f"{err_message} AnyNetXc"
    assert all(i <= j for i, j in zip(ls_block_width, ls_block_width[1:])) is True, f"{err_message} AnyNetXd"
    if len(ls_num_blocks) > 2:
        assert all(i <= j for i, j in zip(ls_num_blocks[:-2], ls_num_blocks[1:-1])) is True, f"{err_message} AnyNetXe"
    # For each stage & each layer, number of channels (block width / bottleneck ratio) must be divisible by group width
    for block_width, bottleneck_ratio, group_width in zip(ls_block_width, ls_bottleneck_ratio, ls_group_width):
        assert int(block_width // bottleneck_ratio) % group_width == 0
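
For example, a parameter set that satisfies all of the constraints (constant bottleneck ratio and group width, non-decreasing block widths, and post-bottleneck widths divisible by the group width) passes silently; violating any rule raises an AssertionError. The values below are illustrative only:

verify_correctness_of_parameters(
    ls_num_blocks=[1, 2, 4, 7],
    ls_block_width=[32, 64, 128, 256],
    ls_bottleneck_ratio=[1, 1, 1, 1],
    ls_group_width=[16, 16, 16, 16],
)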

RepVGG PyTorch implementation. This model is trained as a VGG-style network with residual blocks; for inference (deployment mode) the residual branches are fused so the model becomes a plain VGG. Pretrained models: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq References: [1] https://github.com/DingXiaoH/RepVGG [2] https://arxiv.org/pdf/2101.03697.pdf

Based on https://github.com/DingXiaoH/RepVGG

RepVGG

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/repvgg.py
class RepVGG(BaseClassifier):
    def __init__(
        self,
        struct,
        num_classes=1000,
        width_multiplier=None,
        build_residual_branches=True,
        use_se=False,
        backbone_mode=False,
        in_channels=3,
    ):
        """
        :param struct: list containing number of blocks per repvgg stage
        :param num_classes: number of classes if not in backbone mode
        :param width_multiplier: list of per stage width multiplier or float if using single value for all stages
        :param build_residual_branches: whether to add residual connections or not
        :param use_se: use squeeze and excitation layers
        :param backbone_mode: if true, dropping the final linear layer
        :param in_channels: input channels
        """
        super(RepVGG, self).__init__()

        if isinstance(width_multiplier, float):
            width_multiplier = [width_multiplier] * 4
        else:
            assert len(width_multiplier) == 4

        self.build_residual_branches = build_residual_branches
        self.use_se = use_se
        self.backbone_mode = backbone_mode

        self.in_planes = int(64 * width_multiplier[0])

        self.stem = RepVGGBlock(
            in_channels=in_channels,
            out_channels=self.in_planes,
            stride=2,
            build_residual_branches=build_residual_branches,
            activation_type=nn.ReLU,
            activation_kwargs=dict(inplace=True),
            se_type=SEBlock if self.use_se else nn.Identity,
            se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
        )
        self.cur_layer_idx = 1
        self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
        self.stage2 = self._make_stage(int(128 * width_multiplier[1]), struct[1], stride=2)
        self.stage3 = self._make_stage(int(256 * width_multiplier[2]), struct[2], stride=2)
        self.stage4 = self._make_stage(int(512 * width_multiplier[3]), struct[3], stride=2)
        if not self.backbone_mode:
            self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
            self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)

        if not build_residual_branches:
            self.eval()  # fusing has to be made in eval mode. When called in init, model will be built in eval mode
            fuse_repvgg_blocks_residual_branches(self)

        self.final_width_mult = width_multiplier[3]

    def _make_stage(self, planes, struct, stride):
        strides = [stride] + [1] * (struct - 1)
        blocks = []
        for stride in strides:
            blocks.append(
                RepVGGBlock(
                    in_channels=self.in_planes,
                    out_channels=planes,
                    stride=stride,
                    groups=1,
                    build_residual_branches=self.build_residual_branches,
                    activation_type=nn.ReLU,
                    activation_kwargs=dict(inplace=True),
                    se_type=SEBlock if self.use_se else nn.Identity,
                    se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
                )
            )
            self.in_planes = planes
            self.cur_layer_idx += 1
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = self.stem(x)
        out = self.stage1(out)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        if not self.backbone_mode:
            out = self.avgpool(out)
            out = out.view(out.size(0), -1)
            out = self.linear(out)
        return out

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        if self.build_residual_branches:
            fuse_repvgg_blocks_residual_branches(self)

    def train(self, mode: bool = True):

        assert (
            not mode or self.build_residual_branches
        ), "Trying to train a model without residual branches, set arch_params.build_residual_branches to True and retrain the model"
        super(RepVGG, self).train(mode=mode)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.linear = new_head
        else:
            self.linear = nn.Linear(int(512 * self.final_width_mult), new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"linear": lr, "default": 0}

__init__(struct, num_classes=1000, width_multiplier=None, build_residual_branches=True, use_se=False, backbone_mode=False, in_channels=3)

Parameters:

Name Type Description Default
struct

list containing number of blocks per repvgg stage

required
num_classes

number of classes if not in backbone mode

1000
width_multiplier

list of per stage width multiplier or float if using single value for all stages

None
build_residual_branches

whether to add residual connections or not

True
use_se

use squeeze and excitation layers

False
backbone_mode

if true, dropping the final linear layer

False
in_channels

input channels

3
Source code in src/super_gradients/training/models/classification_models/repvgg.py
def __init__(
    self,
    struct,
    num_classes=1000,
    width_multiplier=None,
    build_residual_branches=True,
    use_se=False,
    backbone_mode=False,
    in_channels=3,
):
    """
    :param struct: list containing number of blocks per repvgg stage
    :param num_classes: number of classes if not in backbone mode
    :param width_multiplier: list of per stage width multiplier or float if using single value for all stages
    :param build_residual_branches: whether to add residual connections or not
    :param use_se: use squeeze and excitation layers
    :param backbone_mode: if True, drop the final linear layer
    :param in_channels: input channels
    """
    super(RepVGG, self).__init__()

    if isinstance(width_multiplier, float):
        width_multiplier = [width_multiplier] * 4
    else:
        assert len(width_multiplier) == 4

    self.build_residual_branches = build_residual_branches
    self.use_se = use_se
    self.backbone_mode = backbone_mode

    self.in_planes = int(64 * width_multiplier[0])

    self.stem = RepVGGBlock(
        in_channels=in_channels,
        out_channels=self.in_planes,
        stride=2,
        build_residual_branches=build_residual_branches,
        activation_type=nn.ReLU,
        activation_kwargs=dict(inplace=True),
        se_type=SEBlock if self.use_se else nn.Identity,
        se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
    )
    self.cur_layer_idx = 1
    self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
    self.stage2 = self._make_stage(int(128 * width_multiplier[1]), struct[1], stride=2)
    self.stage3 = self._make_stage(int(256 * width_multiplier[2]), struct[2], stride=2)
    self.stage4 = self._make_stage(int(512 * width_multiplier[3]), struct[3], stride=2)
    if not self.backbone_mode:
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)

    if not build_residual_branches:
        self.eval()  # fusing has to be made in eval mode. When called in init, model will be built in eval mode
        fuse_repvgg_blocks_residual_branches(self)

    self.final_width_mult = width_multiplier[3]
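
As a usage sketch (the struct and width_multiplier values below follow the common RepVGG-A0 configuration and are an assumption for illustration, not taken from this page): build the model with residual branches for training, then fuse every RepVGGBlock into a single convolution before export.

import torch
from super_gradients.training.models.classification_models.repvgg import RepVGG

# Training-time model: multi-branch RepVGG blocks (identity + 1x1 + 3x3).
model = RepVGG(struct=[2, 4, 14, 1], width_multiplier=[0.75, 0.75, 0.75, 2.5], num_classes=1000)

# Deploy-time: fusing must happen in eval mode; prep_model_for_conversion
# collapses each block into a single conv when residual branches are present.
model.eval()
model.prep_model_for_conversion(input_size=(1, 3, 224, 224))

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))  # -> torch.Size([1, 1000])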

ResNet in PyTorch. For Pre-activation ResNet, see 'preact_resnet.py'. Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Deep Residual Learning for Image Recognition. arXiv:1512.03385

Pre-trained ImageNet models: 'deci-model-repository/resnet?/ckpt_best.pth' => ? = the type of resnet (e.g. 18, 34...).
Pre-trained CIFAR10 models: 'deci-model-repository/CIFAR_NAS_#??????/ckpt_best.pth' => ? = num of model, structure, width_mult.

Code adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

ResNet

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/resnet.py
class ResNet(BaseClassifier):
    def __init__(
        self,
        block,
        num_blocks: list,
        num_classes: int = 10,
        width_mult: float = 1,
        expansion: int = 1,
        droppath_prob=0.0,
        input_batchnorm: bool = False,
        backbone_mode: bool = False,
        in_channels: int = 3,
    ):
        super(ResNet, self).__init__()
        self.expansion = expansion
        self.backbone_mode = backbone_mode
        self.structure = [num_blocks, width_mult]
        self.in_planes = width_multiplier(64, width_mult)
        self.input_batchnorm = input_batchnorm

        if self.input_batchnorm:
            self.bn0 = nn.BatchNorm2d(num_features=in_channels)

        self.conv1 = nn.Conv2d(in_channels, width_multiplier(64, width_mult), kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(width_multiplier(64, width_mult))
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, width_multiplier(64, width_mult), num_blocks[0], stride=1, droppath_prob=droppath_prob)
        self.layer2 = self._make_layer(block, width_multiplier(128, width_mult), num_blocks[1], stride=2, droppath_prob=droppath_prob)
        self.layer3 = self._make_layer(block, width_multiplier(256, width_mult), num_blocks[2], stride=2, droppath_prob=droppath_prob)
        self.layer4 = self._make_layer(block, width_multiplier(512, width_mult), num_blocks[3], stride=2, droppath_prob=droppath_prob)

        if not self.backbone_mode:
            # IF RESNET IS IN BACK_BONE MODE WE DON'T NEED THE FINAL CLASSIFIER LAYERS, BUT ONLY THE NET BLOCK STRUCTURE
            self.linear = nn.Linear(width_multiplier(512, width_mult) * self.expansion, num_classes)
            self.avgpool = nn.AdaptiveAvgPool2d(1)

        self.width_mult = width_mult

    def _make_layer(self, block, planes, num_blocks, stride, droppath_prob):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        if num_blocks == 0:
            # When the number of blocks is zero but spatial dimension and/or number of filters about to change we put 1
            # 3X3 conv layer to make this change to the new dimensions.
            if stride != 1 or self.in_planes != planes:
                layers.append(nn.Sequential(nn.Conv2d(self.in_planes, planes, kernel_size=3, stride=stride, bias=False, padding=1), nn.BatchNorm2d(planes)))
                self.in_planes = planes

        else:
            for stride in strides:
                layers.append(block(self.in_planes, planes, stride, droppath_prob=droppath_prob))
                self.in_planes = planes * self.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        if self.input_batchnorm:
            x = self.bn0(x)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        if not self.backbone_mode:
            # IF RESNET IS *NOT* IN BACK_BONE MODE WE  NEED THE FINAL CLASSIFIER LAYERS OUTPUTS
            out = self.avgpool(out)
            out = out.squeeze(dim=2).squeeze(dim=2)
            out = self.linear(out)

        return out

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights

            # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
            super().load_state_dict(pretrained_backbone_weights_dict, strict)
        else:
            super().load_state_dict(pretrained_model_weights_dict, strict)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.linear = new_head
        else:
            self.linear = nn.Linear(width_multiplier(512, self.width_mult) * self.expansion, new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"linear": lr, "default": 0}

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        if self.input_batchnorm:
            self.bn0 = nn.BatchNorm2d(num_features=in_channels)  # rebuild bn0 for the new number of input channels

        self.conv1 = replace_conv2d_input_channels(conv=self.conv1, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.conv1.in_channels

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in src/super_gradients/training/models/classification_models/resnet.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights

        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_backbone_weights_dict, strict)
    else:
        super().load_state_dict(pretrained_model_weights_dict, strict)
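
As a short fine-tuning sketch using the head-replacement helpers documented above (the registered name "resnet18" passed to models.get is an assumption; use whatever architecture name your installation exposes):

import torch
from super_gradients.training import models

model = models.get("resnet18", num_classes=1000)  # returns a ResNet instance

# Swap the classification head for a new task while keeping the backbone weights.
model.replace_head(new_num_classes=10)

# Head-only fine-tuning: non-zero lr for "linear", zero for everything else.
print(model.get_finetune_lr_dict(lr=1e-3))  # {'linear': 0.001, 'default': 0}

with torch.no_grad():
    print(model(torch.randn(2, 3, 224, 224)).shape)  # torch.Size([2, 10])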

ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.

Code adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

GroupedConvBlock

Bases: nn.Module

Grouped convolution block.

Source code in src/super_gradients/training/models/classification_models/resnext.py
class GroupedConvBlock(nn.Module):
    """Grouped convolution block."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None):
        super(GroupedConvBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self.norm_layer = norm_layer
        width = int(planes * (base_width / 64.0)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

conv1x1(in_planes, out_planes, stride=1)

1x1 convolution

Source code in src/super_gradients/training/models/classification_models/resnext.py
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1)

3x3 convolution with padding

Source code in src/super_gradients/training/models/classification_models/resnext.py
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation)
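
To illustrate how the block composes conv1x1/conv3x3, here is a minimal sketch; the ResNeXt-style settings groups=32, base_width=4 and the explicit downsample branch are assumptions chosen so the residual addition has matching shapes:

import torch
from torch import nn
from super_gradients.training.models.classification_models.resnext import GroupedConvBlock, conv1x1

in_planes, planes = 64, 64
out_planes = planes * GroupedConvBlock.expansion  # 256

# The identity path must match the block output, so a 1x1 projection is used.
downsample = nn.Sequential(conv1x1(in_planes, out_planes), nn.BatchNorm2d(out_planes))
block = GroupedConvBlock(in_planes, planes, stride=1, downsample=downsample, groups=32, base_width=4)

x = torch.randn(1, in_planes, 56, 56)
# The grouped 3x3 runs on width = int(64 * 4 / 64) * 32 = 128 channels.
print(block(x).shape)  # torch.Size([1, 256, 56, 56])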

SENet in PyTorch.

SENet won the ImageNet-2017 (ILSVRC 2017) classification challenge. See "Squeeze-and-Excitation Networks" (arXiv:1709.01507) for details.

Code adapted from https://github.com/fastai/imagenet-fast/blob/master/cifar10/models/cifar10/senet.py

ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.

https://github.com/kuangliu/pytorch-cifar/blob/master/models/shufflenet.py

ShuffleBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/classification_models/shufflenet.py
class ShuffleBlock(nn.Module):
    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
        N, C, H, W = x.size()
        g = self.groups
        return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)

forward(x)

Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]

Source code in src/super_gradients/training/models/classification_models/shufflenet.py
def forward(self, x):
    """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
    N, C, H, W = x.size()
    g = self.groups
    return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)

ShuffleNetV2 in PyTorch.

See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. (https://arxiv.org/abs/1807.11164)

Code taken from torchvision/models/shufflenetv2.py

ChannelShuffleInvertedResidual

Bases: nn.Module

Implement Inverted Residual block as in [https://arxiv.org/abs/1807.11164] in Fig.3 (c) & (d):

  • When stride > 1:
      - the whole input goes through branch1,
      - the whole input goes through branch2,
    and an arbitrary number of output channels is produced.
  • When stride == 1:
      - half of the input channels are passed through as identity,
      - the other half of the input channels goes through branch2,
    and the number of output channels after the block remains the same as in the input.

Channel shuffle is performed on a concatenation in both cases.

Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
class ChannelShuffleInvertedResidual(nn.Module):
    """
    Implement Inverted Residual block as in [https://arxiv.org/abs/1807.11164] in Fig.3 (c) & (d):

    * When stride > 1
      - the whole input goes through branch1,
      - the whole input goes through branch2 ,
      and the arbitrary number of output channels are produced.
    * When stride == 1
      - half of input channels in are passed as identity,
      - another half of input channels goes through branch2,
      and the number of output channels after the block remains the same as in input.

    Channel shuffle is performed on a concatenation in both cases.
    """

    def __init__(self, inp: int, out: int, stride: int) -> None:
        super(ChannelShuffleInvertedResidual, self).__init__()

        assert 1 <= stride <= 3, "Illegal stride value"
        assert (stride != 1) or (inp == out), "When stride == 1 num of input channels should be equal to the requested num of out output channels"

        self.stride = stride
        # half of requested out channels will be produced by each branch
        branch_features = out // 2

        if self.stride > 1:
            self.branch1 = nn.Sequential(
                nn.Conv2d(inp, inp, kernel_size=3, stride=self.stride, padding=1, bias=False, groups=inp),
                nn.BatchNorm2d(inp),
                nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(branch_features),
                nn.ReLU(inplace=True),
            )
        else:
            # won't be called if self.stride == 1
            self.branch1 = nn.Identity()

        self.branch2 = nn.Sequential(
            # branch 2 operates on the whole input when stride > 1 and on half of it otherwise
            nn.Conv2d(inp if (self.stride > 1) else inp // 2, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(branch_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1, bias=False, groups=branch_features),
            nn.BatchNorm2d(branch_features),
            nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(branch_features),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def channel_shuffle(x: Tensor, groups: int) -> Tensor:
        """
        From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164):
            A “channel shuffle” operation is then introduced to enable
            information communication between different groups of channels and improve accuracy.

        The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

        Example:
            If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle),
            then activation maps in x are:
            from_B1, from_B1, ... from_B2, from_B2
            After channel_shuffle activation maps in x will be:
            from_B1, from_B2, ... from_B1, from_B2
        """

        batch_size, num_channels, height, width = x.size()
        channels_per_group = num_channels // groups

        # reshape
        x = x.view(batch_size, groups, channels_per_group, height, width)
        x = torch.transpose(x, 1, 2).contiguous()

        # flatten
        x = x.view(batch_size, -1, height, width)
        return x

    def forward(self, x: Tensor) -> Tensor:
        if self.stride == 1:
            # num channels remains the same due to assert that inp == out in __init__
            x1, x2 = x.chunk(2, dim=1)
            out = torch.cat((x1, self.branch2(x2)), dim=1)
        else:
            # inp num channels can change to a requested arbitrary out num channels
            out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)

        out = self.channel_shuffle(out, 2)
        return out
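
A small sketch of the two modes described above (channel counts are illustrative):

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ChannelShuffleInvertedResidual

x = torch.randn(1, 24, 56, 56)

# stride > 1: both branches see the whole input, output channels are arbitrary, spatial size halves.
down = ChannelShuffleInvertedResidual(inp=24, out=116, stride=2)
print(down(x).shape)        # torch.Size([1, 116, 28, 28])

# stride == 1: half the channels pass through as identity, so inp must equal out.
keep = ChannelShuffleInvertedResidual(inp=116, out=116, stride=1)
print(keep(down(x)).shape)  # torch.Size([1, 116, 28, 28])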

channel_shuffle(x, groups) staticmethod

From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164): A “channel shuffle” operation is then introduced to enable information communication between different groups of channels and improve accuracy.

The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

Example: If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle), then activation maps in x are: from_B1, from_B1, ... from_B2, from_B2. After channel_shuffle, activation maps in x will be: from_B1, from_B2, ... from_B1, from_B2.

Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
@staticmethod
def channel_shuffle(x: Tensor, groups: int) -> Tensor:
    """
    From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164):
        A “channel shuffle” operation is then introduced to enable
        information communication between different groups of channels and improve accuracy.

    The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

    Example:
        If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle),
        then activation maps in x are:
        from_B1, from_B1, ... from_B2, from_B2
        After channel_shuffle activation maps in x will be:
        from_B1, from_B2, ... from_B1, from_B2
    """

    batch_size, num_channels, height, width = x.size()
    channels_per_group = num_channels // groups

    # reshape
    x = x.view(batch_size, groups, channels_per_group, height, width)
    x = torch.transpose(x, 1, 2).contiguous()

    # flatten
    x = x.view(batch_size, -1, height, width)
    return x
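
A quick check of the interleaving described in the docstring example:

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ChannelShuffleInvertedResidual

# 4 channels: the first two come from branch 1, the last two from branch 2.
x = torch.arange(4.0).view(1, 4, 1, 1)                       # channel order: [0, 1, 2, 3]
shuffled = ChannelShuffleInvertedResidual.channel_shuffle(x, groups=2)
print(shuffled.view(-1).tolist())                            # [0.0, 2.0, 1.0, 3.0] -> B1, B2, B1, B2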

ShuffleNetV2Base

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
class ShuffleNetV2Base(BaseClassifier):
    def __init__(
        self,
        structure: List[int],
        stages_out_channels: List[int],
        backbone_mode: bool = False,
        num_classes: int = 1000,
        block: nn.Module = ChannelShuffleInvertedResidual,
        in_channels: int = 3,
    ):
        super(ShuffleNetV2Base, self).__init__()

        self.backbone_mode = backbone_mode

        if len(structure) != 3:
            raise ValueError("expected structure as list of 3 positive ints")
        if len(stages_out_channels) != 5:
            raise ValueError("expected stages_out_channels as list of 5 positive ints")
        self.structure = structure
        self.out_channels = stages_out_channels

        output_channels = self.out_channels[0]
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, output_channels, 3, 2, 1, bias=False),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
        )

        input_channels = output_channels

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Static annotations for mypy
        self.layer2 = self._make_layer(block, input_channels, self.out_channels[1], self.structure[0])
        self.layer3 = self._make_layer(block, self.out_channels[1], self.out_channels[2], self.structure[1])
        self.layer4 = self._make_layer(block, self.out_channels[2], self.out_channels[3], self.structure[2])

        input_channels = self.out_channels[3]
        output_channels = self.out_channels[-1]
        self.conv5 = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
        )

        if not self.backbone_mode:
            self.avgpool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(output_channels, num_classes)

    @staticmethod
    def _make_layer(block, input_channels, output_channels, repeats):
        # add first block with stride 2 to downsize the input
        seq = [block(input_channels, output_channels, 2)]

        for _ in range(repeats - 1):
            seq.append(block(output_channels, output_channels, 1))
        return nn.Sequential(*seq)

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # removing fc weights first not to break strict loading
            fc_weights_keys = [k for k in pretrained_model_weights_dict if "fc" in k]

            for key in fc_weights_keys:
                pretrained_model_weights_dict.pop(key)

        super().load_state_dict(pretrained_model_weights_dict, strict)

    def forward(self, x: Tensor) -> Tensor:
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.conv5(x)

        if not self.backbone_mode:
            x = self.avgpool(x)
            x = x.view(x.size(0), -1)
            x = self.fc(x)
        return x

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.conv1[0] = replace_conv2d_input_channels(conv=self.conv1[0], in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.conv1[0].in_channels

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # removing fc weights first not to break strict loading
        fc_weights_keys = [k for k in pretrained_model_weights_dict if "fc" in k]

        for key in fc_weights_keys:
            pretrained_model_weights_dict.pop(key)

    super().load_state_dict(pretrained_model_weights_dict, strict)
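
A construction sketch; the structure and stages_out_channels values below follow the common ShuffleNetV2-x1.0 setup and are an assumption for illustration:

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ShuffleNetV2Base

model = ShuffleNetV2Base(structure=[4, 8, 4], stages_out_channels=[24, 116, 232, 464, 1024], num_classes=1000)
with torch.no_grad():
    print(model(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])

# backbone_mode=True skips avgpool/fc and returns the conv5 feature map instead.
backbone = ShuffleNetV2Base(structure=[4, 8, 4], stages_out_channels=[24, 116, 232, 464, 1024], backbone_mode=True)
with torch.no_grad():
    print(backbone(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1024, 7, 7])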

VGG11/13/16/19 in PyTorch. Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py

Vision Transformer in PyTorch. Reference: [1] Dosovitskiy, Alexey, et al. "An image is worth 16x16 words: Transformers for image recognition at scale." arXiv preprint arXiv:2010.11929 (2020)

Code adapted from https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py

Attention

Bases: nn.Module

self attention layer with residual connection

Source code in src/super_gradients/training/models/classification_models/vit.py
class Attention(nn.Module):
    """
    self attention layer with residual connection
    """

    def __init__(self, hidden_dim, heads=8):
        super().__init__()
        dim_head = hidden_dim // heads
        inner_dim = dim_head * heads

        self.heads = heads
        self.scale = dim_head**-0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(hidden_dim, inner_dim * 3, bias=True)  # Qx, Kx, Vx are calculated at once
        self.proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):

        B, N, C = x.shape
        # computing query, key and value matrices at once
        qkv = self.to_qkv(x).reshape(B, N, 3, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        out = (attn @ v).transpose(1, 2).reshape(B, N, C)

        out = self.proj(out)

        return out

FeedForward

Bases: nn.Module

feed forward block with residual connection

Source code in src/super_gradients/training/models/classification_models/vit.py
class FeedForward(nn.Module):
    """
    feed forward block with residual connection
    """

    def __init__(self, hidden_dim, mlp_dim, dropout=0.0):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, mlp_dim)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(mlp_dim, hidden_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.act(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        return out

PatchEmbed

Bases: nn.Module

2D Image to Patch Embedding Using Conv layers (Faster than rearranging + Linear)

Source code in src/super_gradients/training/models/classification_models/vit.py
class PatchEmbed(nn.Module):
    """
    2D Image to Patch Embedding Using Conv layers (Faster than rearranging + Linear)
    """

    def __init__(self, img_size: tuple, patch_size: tuple, in_channels=3, hidden_dim=768, norm_layer=None, flatten=True):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_channels, hidden_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(hidden_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x

    def get_input_channels(self) -> int:
        return self.proj.in_channels
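
A quick sanity check of the patch-grid arithmetic (224 / 16 = 14 patches per side):

import torch
from super_gradients.training.models.classification_models.vit import PatchEmbed

embed = PatchEmbed(img_size=(224, 224), patch_size=(16, 16), in_channels=3, hidden_dim=768)
print(embed.num_patches)                     # 196 = 14 * 14

tokens = embed(torch.randn(2, 3, 224, 224))  # BCHW -> BNC
print(tokens.shape)                          # torch.Size([2, 196, 768])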

ViT

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/vit.py
class ViT(BaseClassifier):
    def __init__(
        self,
        image_size: tuple,
        patch_size: tuple,
        num_classes: int,
        hidden_dim: int,
        depth: int,
        heads: int,
        mlp_dim: int,
        in_channels=3,
        dropout_prob=0.0,
        emb_dropout_prob=0.0,
        backbone_mode=False,
    ):
        """
        :param image_size: Image size tuple for data processing into patches done within the model.
        :param patch_size: Patch size tuple for data processing into patches done within the model.
        :param num_classes: Number of classes for the classification head.
        :param hidden_dim: Output dimension of each transformer block.
        :param depth: Number of transformer blocks
        :param heads: Number of attention heads
        :param mlp_dim: Intermediate dimension of the transformer block's feed forward
        :param in_channels: input channels
        :param dropout_prob: Dropout ratio in the feed forward layers.
        :param emb_dropout_prob: Dropout ratio applied after the embedding layer.
        :param backbone_mode: If True output after pooling layer
        """

        super().__init__()
        image_height, image_width = image_size
        patch_height, patch_width = patch_size

        assert image_height % patch_height == 0 and image_width % patch_width == 0, "Image dimensions must be divisible by the patch size."
        assert hidden_dim % heads == 0, "Hidden dimension must be divisible by the number of heads."

        num_patches = (image_height // patch_height) * (image_width // patch_width)

        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_dim = hidden_dim
        self.patch_embedding = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.hidden_dim)

        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
        self.dropout = nn.Dropout(emb_dropout_prob)

        self.transformer = Transformer(hidden_dim, depth, heads, mlp_dim, dropout_prob)

        self.backbone_mode = backbone_mode
        self.pre_head_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
        self.head = nn.Linear(hidden_dim, num_classes)

    def forward(self, img):
        x = self.patch_embedding(img)  # Convert image to patches and embed
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, : (n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = self.pre_head_norm(x)
        x = x[:, 0]
        if self.backbone_mode:
            return x
        else:
            return self.head(x)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head = nn.Linear(self.head.in_features, new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"head": lr, "default": 0}

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.patch_embedding = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.hidden_dim)

    def get_input_channels(self) -> int:
        return self.patch_embedding.get_input_channels()

__init__(image_size, patch_size, num_classes, hidden_dim, depth, heads, mlp_dim, in_channels=3, dropout_prob=0.0, emb_dropout_prob=0.0, backbone_mode=False)

Parameters:

Name Type Description Default
image_size tuple

Image size tuple for data processing into patches done within the model.

required
patch_size tuple

Patch size tuple for data processing into patches done within the model.

required
num_classes int

Number of classes for the classification head.

required
hidden_dim int

Output dimension of each transformer block.

required
depth int

Number of transformer blocks

required
heads int

Number of attention heads

required
mlp_dim int

Intermediate dimension of the transformer block's feed forward

required
in_channels

input channels

3
dropout_prob

Dropout ratio in the feed forward layers.

0.0
emb_dropout_prob

Dropout ratio applied after the embedding layer.

0.0
backbone_mode

If True, return the normalized class-token features instead of the classification head output

False
Source code in src/super_gradients/training/models/classification_models/vit.py
def __init__(
    self,
    image_size: tuple,
    patch_size: tuple,
    num_classes: int,
    hidden_dim: int,
    depth: int,
    heads: int,
    mlp_dim: int,
    in_channels=3,
    dropout_prob=0.0,
    emb_dropout_prob=0.0,
    backbone_mode=False,
):
    """
    :param image_size: Image size tuple for data processing into patches done within the model.
    :param patch_size: Patch size tuple for data processing into patches done within the model.
    :param num_classes: Number of classes for the classification head.
    :param hidden_dim: Output dimension of each transformer block.
    :param depth: Number of transformer blocks
    :param heads: Number of attention heads
    :param mlp_dim: Intermediate dimension of the transformer block's feed forward
    :param in_channels: input channels
    :param dropout_prob: Dropout ratio in the feed forward layers.
    :param emb_dropout_prob: Dropout ratio applied after the embedding layer.
    :param backbone_mode: If True output after pooling layer
    """

    super().__init__()
    image_height, image_width = image_size
    patch_height, patch_width = patch_size

    assert image_height % patch_height == 0 and image_width % patch_width == 0, "Image dimensions must be divisible by the patch size."
    assert hidden_dim % heads == 0, "Hidden dimension must be divisible by the number of heads."

    num_patches = (image_height // patch_height) * (image_width // patch_width)

    self.image_size = image_size
    self.patch_size = patch_size
    self.hidden_dim = hidden_dim
    self.patch_embedding = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.hidden_dim)

    self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
    self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
    self.dropout = nn.Dropout(emb_dropout_prob)

    self.transformer = Transformer(hidden_dim, depth, heads, mlp_dim, dropout_prob)

    self.backbone_mode = backbone_mode
    self.pre_head_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
    self.head = nn.Linear(hidden_dim, num_classes)
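
A minimal construction sketch; the hyperparameters below roughly match ViT-Base/16 and are an assumption for illustration:

import torch
from super_gradients.training.models.classification_models.vit import ViT

model = ViT(image_size=(224, 224), patch_size=(16, 16), num_classes=1000, hidden_dim=768, depth=12, heads=12, mlp_dim=3072)

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))  # prediction from the class token
print(logits.shape)  # torch.Size([1, 1000])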

ConvertableCompletePipelineModel

Bases: torch.nn.Module

Exportable nn.Module that wraps the model, preprocessing and postprocessing.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, the main model. Takes input from pre_process's output; its own output feeds post_process.

required
pre_process torch.nn.Module

torch.nn.Module, preprocessing module, its output will be model's input. When None (default), set to Identity().

None
post_process torch.nn.Module

torch.nn.Module, postprocessing module, its output is the final output. When None (default), set to Identity().

None
**prep_model_for_conversion_kwargs

for SgModules- args to be passed to model.prep_model_for_conversion prior to torch.onnx.export call.

{}
Source code in src/super_gradients/training/models/conversion.py
class ConvertableCompletePipelineModel(torch.nn.Module):
    """
    Exportable nn.Module that wraps the model, preprocessing and postprocessing.

    :param model: torch.nn.Module, the main model. Takes input from pre_process's output; its own output feeds post_process.
    :param pre_process: torch.nn.Module, preprocessing module, its output will be model's input. When None (default), set to Identity().
    :param post_process: torch.nn.Module, postprocessing module, its output is the final output. When None (default), set to Identity().
    :param **prep_model_for_conversion_kwargs: for SgModules- args to be passed to model.prep_model_for_conversion
            prior to torch.onnx.export call.
    """

    def __init__(self, model: torch.nn.Module, pre_process: torch.nn.Module = None, post_process: torch.nn.Module = None, **prep_model_for_conversion_kwargs):
        super(ConvertableCompletePipelineModel, self).__init__()
        model.eval()
        pre_process = pre_process or Identity()
        post_process = post_process or Identity()
        if hasattr(model, "prep_model_for_conversion"):
            model.prep_model_for_conversion(**prep_model_for_conversion_kwargs)
        self.model = model
        self.pre_process = pre_process
        self.post_process = post_process

    def forward(self, x):
        return self.post_process(self.model(self.pre_process(x)))
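
A wrapping sketch using plain torch modules for pre/post processing (the tiny model below is an arbitrary stand-in); prep_model_for_conversion kwargs are only forwarded when the wrapped model actually defines prep_model_for_conversion:

import torch
from torch import nn
from super_gradients.training.models.conversion import ConvertableCompletePipelineModel

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))

pipeline = ConvertableCompletePipelineModel(
    model=model,
    pre_process=nn.Identity(),       # e.g. a normalization / resizing module
    post_process=nn.Softmax(dim=1),  # e.g. turn logits into probabilities
)

with torch.no_grad():
    probs = pipeline(torch.randn(1, 3, 64, 64))  # pre_process -> model -> post_process
print(probs.shape)  # torch.Size([1, 10])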

convert_from_config(cfg)

Exports model according to cfg.

See: super_gradients/recipes/conversion_params/default_conversion_params.yaml for the full cfg content documentation, and super_gradients/examples/convert_recipe_example/convert_recipe_example.py for usage.

Parameters:

Name Type Description Default
cfg DictConfig required

Returns:

Type Description
str

out_path, the path of the saved .onnx file.

Source code in src/super_gradients/training/models/conversion.py
def convert_from_config(cfg: DictConfig) -> str:
    """
    Exports model according to cfg.

    See:
     super_gradients/recipes/conversion_params/default_conversion_params.yaml for the full cfg content documentation,
     and super_gradients/examples/convert_recipe_example/convert_recipe_example.py for usage.
    :param cfg:
    :return: out_path, the path of the saved .onnx file.
    """
    cfg, experiment_cfg = prepare_conversion_cfgs(cfg)
    model = models.get(
        model_name=experiment_cfg.architecture,
        num_classes=experiment_cfg.arch_params.num_classes,
        arch_params=experiment_cfg.arch_params,
        strict_load=cfg.strict_load,
        checkpoint_path=cfg.checkpoint_path,
    )
    cfg = parse_args(cfg, models.convert_to_onnx)
    out_path = models.convert_to_onnx(model=model, **cfg)
    logger.info(f"Successfully exported model at {out_path}")
    return out_path

convert_to_coreml(model, out_path, input_size=None, pre_process=None, post_process=None, prep_model_for_conversion_kwargs=None, export_as_ml_program=False, torch_trace_kwargs=None)

Exports a given SG model to CoreML mlprogram or package.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, model to export to CoreML.

required
out_path str

str, destination path for the .mlmodel file.

required
input_size tuple

Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.

None
pre_process torch.nn.Module

torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()

None
post_process torch.nn.Module

torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()

None
prep_model_for_conversion_kwargs

dict, for SgModules- args to be passed to model.prep_model_for_conversion prior to ct.convert call. Supported keys are: - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.

None
export_as_ml_program

Whether to convert to the new program format (better) or legacy coreml proto file (Supports more iOS versions and devices, but this format will be deprecated at some point).

False
torch_trace_kwargs

kwargs for torch.jit.trace

None

Returns:

Type Description

Path

Source code in src/super_gradients/training/models/conversion.py
@resolve_param("pre_process", TransformsFactory())
@resolve_param("post_process", TransformsFactory())
def convert_to_coreml(
    model: torch.nn.Module,
    out_path: str,
    input_size: tuple = None,
    pre_process: torch.nn.Module = None,
    post_process: torch.nn.Module = None,
    prep_model_for_conversion_kwargs=None,
    export_as_ml_program=False,
    torch_trace_kwargs=None,
):
    """
        Exports a given SG model to CoreML mlprogram or package.

        :param model: torch.nn.Module, model to export to CoreML.
        :param out_path: str, destination path for the .mlmodel file.
        :param input_size: Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.
        :param pre_process: torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()
        :param post_process: torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()
        :param prep_model_for_conversion_kwargs: dict, for SgModules- args to be passed to model.prep_model_for_conversion
         prior to ct.convert call. Supported keys are:
        - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.
        :param export_as_ml_program: Whether to convert to the new program format (better) or legacy coreml proto file
                            (Supports more iOS versions and devices, but this format will be deprecated at some point).
        :param torch_trace_kwargs: kwargs for torch.jit.trace
    :return: Path
    """
    if ct is None:
        raise ImportError(
            '"coremltools" is required for CoreML export, but is not installed. Please install CoreML Tools using:\n'
            '   "python3 -m pip install coremltools" and try again (Tested with version 6.3.0);'
        )

    logger.debug("Building model...")
    logger.debug(model)
    logger.debug("Model child nodes:")
    logger.debug(next(model.named_children()))

    if not os.path.isdir(pathlib.Path(out_path).parent.resolve()):
        raise FileNotFoundError(f"Could not find destination directory {out_path} for the CoreML file.")
    torch_trace_kwargs = torch_trace_kwargs or dict()
    prep_model_for_conversion_kwargs = prep_model_for_conversion_kwargs or dict()

    if input_size is not None:
        input_size = (1, *input_size)
        logger.warning(
            f"input_shape is deprecated and will be removed in the next major release."
            f"Use the convert_to_coreml(..., prep_model_for_conversion_kwargs(input_size={input_size})) instead"
        )
        prep_model_for_conversion_kwargs["input_size"] = input_size

    if "input_size" not in prep_model_for_conversion_kwargs:
        raise KeyError("input_size must be provided in prep_model_for_conversion_kwargs")

    input_size = prep_model_for_conversion_kwargs["input_size"]

    # TODO: support more than 1 input when prep_for_conversoin will support it.
    example_inputs = [torch.Tensor(np.zeros(input_size))]

    if not out_path.endswith(".mlpackage") and not out_path.endswith(".mlmodel"):
        out_path += ".mlpackage" if export_as_ml_program else ".mlmodel"

    complete_model = ConvertableCompletePipelineModel(model, pre_process, post_process, **prep_model_for_conversion_kwargs)

    # Set the model in evaluation mode.
    complete_model.eval()

    logger.info("Creating torch jit trace...")
    traced_model = torch.jit.trace(complete_model, example_inputs, **torch_trace_kwargs)
    logger.info("Tracing the model with the provided inputs...")
    out = traced_model(*example_inputs)  # using * because example_inputs is a list
    logger.info(f"Inferred output shapes: {[o.shape for o in out]}")
    if export_as_ml_program:
        coreml_model = ct.convert(
            traced_model, convert_to="mlprogram", inputs=[ct.ImageType(name=f"x_{i + 1}", shape=_.shape) for i, _ in enumerate(example_inputs)]
        )
    else:
        coreml_model = ct.convert(traced_model, inputs=[ct.ImageType(name=f"x_{i + 1}", shape=_.shape) for i, _ in enumerate(example_inputs)])

    spec = coreml_model.get_spec()
    logger.debug(spec.description)

    # Changing the input names:
    #   In CoreML, the input name is compiled into classes (named keyword argument in predict).
    #   We want to re-use the same names among different models to make research easier.
    #   We normalize the inputs names to be x_1, x_2, etc.
    for i, _input in enumerate(spec.description.input):
        new_input_name = "x_" + str(i + 1)
        logger.info(f"Renaming input {_input.name} to {new_input_name}")
        ct.utils.rename_feature(spec, _input.name, new_input_name)

    # Re-Initializing the model with the new spec
    coreml_model = ct.models.MLModel(spec, weights_dir=coreml_model.weights_dir)

    # Saving the model
    coreml_model.save(out_path)
    logger.info(f"CoreML model successfully save to {os.path.abspath(out_path)}")
    return out_path
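
A usage sketch (requires coremltools to be installed; the registered name "resnet18" passed to models.get is an assumption):

from super_gradients.training import models
from super_gradients.training.models.conversion import convert_to_coreml

model = models.get("resnet18", num_classes=1000)

coreml_path = convert_to_coreml(
    model=model,
    out_path="resnet18",  # ".mlpackage" / ".mlmodel" is appended automatically
    prep_model_for_conversion_kwargs={"input_size": (1, 3, 224, 224)},
    export_as_ml_program=True,
)
print(coreml_path)  # resnet18.mlpackage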

convert_to_onnx(model, out_path, input_shape=None, pre_process=None, post_process=None, prep_model_for_conversion_kwargs=None, torch_onnx_export_kwargs=None, simplify=True)

Exports model to ONNX.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, model to export to ONNX.

required
out_path str

str, destination path for the .onnx file.

required
input_shape tuple

Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1. DEPRECATED USE input_size KWARG IN prep_model_for_conversion_kwargs INSTEAD.

None
pre_process torch.nn.Module

torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()

None
post_process torch.nn.Module

torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()

None
prep_model_for_conversion_kwargs

dict, for SgModules- args to be passed to model.prep_model_for_conversion prior to torch.onnx.export call. Supported keys are: - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.

None
torch_onnx_export_kwargs

kwargs to be unpacked in the torch.onnx.export call (excluding the first 3 kwargs: model, args, f)

None
simplify bool

bool, whether to apply the ONNX simplifier, same as `python -m onnxsim onnx_path onnx_sim_path`. When True, the simplified model will be saved in out_path (default=True).

True

Returns:

Type Description

out_path

Source code in src/super_gradients/training/models/conversion.py
@resolve_param("pre_process", TransformsFactory())
@resolve_param("post_process", TransformsFactory())
def convert_to_onnx(
    model: torch.nn.Module,
    out_path: str,
    input_shape: tuple = None,
    pre_process: torch.nn.Module = None,
    post_process: torch.nn.Module = None,
    prep_model_for_conversion_kwargs=None,
    torch_onnx_export_kwargs=None,
    simplify: bool = True,
):
    """
    Exports model to ONNX.

    :param model: torch.nn.Module, model to export to ONNX.
    :param out_path: str, destination path for the .onnx file.
    :param input_shape: Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.
    DEPRECATED USE input_size KWARG IN prep_model_for_conversion_kwargs INSTEAD.
    :param pre_process: torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()
    :param post_process: torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()
    :param prep_model_for_conversion_kwargs: dict, for SgModules- args to be passed to model.prep_model_for_conversion
     prior to torch.onnx.export call. Supported keys are:
    - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.
    :param torch_onnx_export_kwargs: kwargs (EXCLUDING: FIRST 3 KWARGS- MODEL, F, ARGS). to be unpacked in torch.onnx.export call
    :param simplify: bool,whether to apply onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path.
     When true, the simplified model will be saved in out_path (default=True).

    :return: out_path
    """
    if not os.path.isdir(pathlib.Path(out_path).parent.resolve()):
        raise FileNotFoundError(f"Could not find destination directory {out_path} for the ONNX file.")
    torch_onnx_export_kwargs = torch_onnx_export_kwargs or dict()
    prep_model_for_conversion_kwargs = prep_model_for_conversion_kwargs or dict()

    if input_shape is not None:
        input_size = (1, *input_shape)
        logger.warning(
            f"input_shape is deprecated and will be removed in the next major release."
            f"Use the convert_to_onnx(..., prep_model_for_conversion_kwargs(input_size={input_size})) instead"
        )
        prep_model_for_conversion_kwargs["input_size"] = input_size

    if "input_size" not in prep_model_for_conversion_kwargs:
        raise KeyError("input_size must be provided in prep_model_for_conversion_kwargs")

    input_size = prep_model_for_conversion_kwargs["input_size"]

    onnx_input = torch.Tensor(np.zeros(input_size))
    if not out_path.endswith(".onnx"):
        out_path = out_path + ".onnx"
    complete_model = ConvertableCompletePipelineModel(model, pre_process, post_process, **prep_model_for_conversion_kwargs)

    torch.onnx.export(model=complete_model, args=onnx_input, f=out_path, **torch_onnx_export_kwargs)
    if simplify:
        onnx_simplify(out_path, out_path)
    return out_path
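
A usage sketch (the registered name "resnet18" passed to models.get is an assumption; any nn.Module with a matching prep_model_for_conversion can be exported):

from super_gradients.training import models

model = models.get("resnet18", num_classes=1000)

onnx_path = models.convert_to_onnx(
    model=model,
    out_path="resnet18.onnx",
    prep_model_for_conversion_kwargs={"input_size": (1, 3, 224, 224)},
    torch_onnx_export_kwargs={"opset_version": 11},
    simplify=False,  # keep True (the default) to also run the ONNX simplifier on the result
)
print(onnx_path)  # resnet18.onnx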

onnx_simplify(onnx_path, onnx_sim_path)

ONNX simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path`

Parameters:

Name Type Description Default
onnx_path str

path to onnx model

required
onnx_sim_path str

path for output onnx simplified model

required
Source code in src/super_gradients/training/models/conversion.py
def onnx_simplify(onnx_path: str, onnx_sim_path: str):
    """
    onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path
    :param onnx_path: path to onnx model
    :param onnx_sim_path: path for output onnx simplified model
    """
    model_sim, check = simplify(model=onnx_path)
    if not check:
        raise RuntimeError("Simplified ONNX model could not be validated")
    onnx.save_model(model_sim, onnx_sim_path)

prepare_conversion_cfgs(cfg)

Builds the cfg (i.e. conversion_params) and experiment_cfg (i.e. the recipe config according to cfg.experiment_name) to be used by convert_recipe_example

Parameters:

Name Type Description Default
cfg DictConfig

DictConfig, conversion_params config

required

Returns:

Type Description

cfg, experiment_cfg

Source code in src/super_gradients/training/models/conversion.py
def prepare_conversion_cfgs(cfg: DictConfig):
    """
    Builds the cfg (i.e conversion_params) and experiment_cfg (i.e recipe config according to cfg.experiment_name)
     to be used by convert_recipe_example

    :param cfg: DictConfig, converion_params config
    :return: cfg, experiment_cfg
    """
    cfg = hydra.utils.instantiate(cfg)
    # CREATE THE EXPERIMENT CFG

    # Load the latest experiment config
    run_id = get_param(cfg, "run_id")
    if run_id is None:
        run_id = get_latest_run_id(experiment_name=cfg.experiment_name, checkpoints_root_dir=cfg.ckpt_root_dir)
    experiment_cfg = load_experiment_cfg(ckpt_root_dir=cfg.ckpt_root_dir, experiment_name=cfg.experiment_name, run_id=run_id)

    hydra.utils.instantiate(experiment_cfg)
    if cfg.checkpoint_path is None:
        logger.info(
            "checkpoint_params.checkpoint_path was not provided, so the model will be converted using weights from "
            "checkpoints_dir/training_hyperparams.ckpt_name "
        )
        checkpoints_dir = get_checkpoints_dir_path(experiment_name=cfg.experiment_name, ckpt_root_dir=cfg.ckpt_root_dir, run_id=run_id)
        cfg.checkpoint_path = os.path.join(checkpoints_dir, cfg.ckpt_name)

    cfg.out_path = cfg.out_path or cfg.checkpoint_path.replace(".pth", ".onnx")
    logger.info(f"Exporting checkpoint: {cfg.checkpoint_path} to ONNX.")
    return cfg, experiment_cfg

CSP Darknet

CSPLayer

Bases: nn.Module

CSP Bottleneck with 3 convolutions

Parameters:

Name Type Description Default
in_channels int

int, input channels.

required
out_channels int

int, output channels.

required
num_bottlenecks int

int, number of bottleneck conv layers.

required
act Type[nn.Module]

Type[nn.module], activation type.

required
shortcut bool

bool, whether to apply shortcut (i.e add input to result) in bottlenecks (default=True).

True
depthwise bool

bool, whether to use GroupedConvBlock in last conv in bottlenecks (default=False).

False
expansion float

float, determines the number of hidden channels (default=0.5).

0.5
Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
class CSPLayer(nn.Module):
    """
    CSP Bottleneck with 3 convolutions

    :param in_channels: int, input channels.
    :param out_channels: int, output channels.
    :param num_bottlenecks: int, number of bottleneck conv layers.
    :param act: Type[nn.module], activation type.
    :param shortcut: bool, whether to apply shortcut (i.e add input to result) in bottlenecks (default=True).
    :param depthwise: bool, whether to use GroupedConvBlock in last conv in bottlenecks (default=False).
    :param expansion: float, determines the number of hidden channels (default=0.5).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int,
        act: Type[nn.Module],
        shortcut: bool = True,
        depthwise: bool = False,
        expansion: float = 0.5,
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=act)
        self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=act)
        self.conv3 = Conv(2 * hidden_channels, out_channels, 1, stride=1, activation_type=act)
        module_list = [Bottleneck(hidden_channels, hidden_channels, shortcut, act, depthwise) for _ in range(num_bottlenecks)]
        self.bottlenecks = nn.Sequential(*module_list)

    def forward(self, x):
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = torch.cat((x_1, x_2), dim=1)
        return self.conv3(x)
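
For illustration only, a small sketch of constructing a CSPLayer and checking its output shape (channel counts and input size are arbitrary):

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import CSPLayer

layer = CSPLayer(in_channels=64, out_channels=128, num_bottlenecks=2, act=nn.SiLU)
x = torch.randn(2, 64, 32, 32)
y = layer(x)
print(y.shape)  # torch.Size([2, 128, 32, 32]) -- spatial size preserved, channels become out_channels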

GroupedConvBlock

Bases: nn.Module

Grouped Conv KxK -> usual Conv 1x1

Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
class GroupedConvBlock(nn.Module):
    """
    Grouped Conv KxK -> usual Conv 1x1
    """

    def __init__(self, input_channels, output_channels, kernel, stride, activation_type: Type[nn.Module], padding: int = None, groups: int = None):
        """
        :param groups:  num of groups in the first conv; if None depthwise separable conv will be used
                        (groups = input channels)
        """
        super().__init__()

        self.dconv = Conv(input_channels, input_channels, kernel, stride, activation_type, padding, groups=groups or input_channels)
        self.conv = Conv(input_channels, output_channels, 1, 1, activation_type)

    def forward(self, x):
        return self.conv(self.dconv(x))

__init__(input_channels, output_channels, kernel, stride, activation_type, padding=None, groups=None)

Parameters:

Name Type Description Default
groups int

num of groups in the first conv; if None depthwise separable conv will be used (groups = input channels)

None
Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
def __init__(self, input_channels, output_channels, kernel, stride, activation_type: Type[nn.Module], padding: int = None, groups: int = None):
    """
    :param groups:  num of groups in the first conv; if None depthwise separable conv will be used
                    (groups = input channels)
    """
    super().__init__()

    self.dconv = Conv(input_channels, input_channels, kernel, stride, activation_type, padding, groups=groups or input_channels)
    self.conv = Conv(input_channels, output_channels, 1, 1, activation_type)
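
A brief sketch with illustrative values: leaving groups=None makes the first KxK convolution depthwise (groups equals the input channels), so the block acts as a depthwise-separable convolution followed by a 1x1 projection.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import GroupedConvBlock

block = GroupedConvBlock(input_channels=64, output_channels=128, kernel=3, stride=1, activation_type=nn.SiLU)
y = block(torch.randn(1, 64, 40, 40))
print(y.shape)  # channels: 64 -> 128; spatial size preserved for stride=1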

SPP

Bases: BaseDetectionModule

Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
@register_detection_module()
class SPP(BaseDetectionModule):
    # SPATIAL PYRAMID POOLING LAYER
    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(self, in_channels, output_channels, k: Tuple, activation_type: Type[nn.Module]):
        super().__init__(in_channels)
        self._output_channels = output_channels

        hidden_channels = in_channels // 2
        self.cv1 = Conv(in_channels, hidden_channels, 1, 1, activation_type)
        self.cv2 = Conv(hidden_channels * (len(k) + 1), output_channels, 1, 1, activation_type)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))

    @property
    def out_channels(self):
        """
        :return: channels of tensor(s) that will be returned by a module  in forward
        """
        return self._output_channels

out_channels property

Returns:

Type Description

channels of tensor(s) that will be returned by a module in forward
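
A minimal sketch of the SPP block (kernel sizes and channel counts are illustrative): the stride-1 max-pooling branches keep the input resolution, are concatenated with the 1x1-projected input, and are then projected to output_channels.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import SPP

spp = SPP(in_channels=256, output_channels=256, k=(5, 9, 13), activation_type=nn.SiLU)
y = spp(torch.randn(1, 256, 20, 20))
print(y.shape)           # torch.Size([1, 256, 20, 20])
print(spp.out_channels)  # 256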

ViewModule

Bases: nn.Module

Returns a reshaped version of the input, to be used in None-Backbone Mode

Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
class ViewModule(nn.Module):
    """
    Returns a reshaped version of the input, to be used in None-Backbone Mode
    """

    def __init__(self, features=1024):
        super(ViewModule, self).__init__()
        self.features = features

    def forward(self, x):
        return x.view(-1, self.features)

CSPResNetBackbone

Bases: nn.Module, SupportsReplaceInputChannels

CSPResNet backbone

Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
@register_detection_module()
class CSPResNetBackbone(nn.Module, SupportsReplaceInputChannels):
    """
    CSPResNet backbone
    """

    @resolve_param("activation", ActivationsTypeFactory())
    def __init__(
        self,
        layers: Tuple[int, ...],
        channels: Tuple[int, ...],
        activation: Type[nn.Module],
        return_idx: Tuple[int, int, int],
        use_large_stem: bool,
        width_mult: float,
        depth_mult: float,
        use_alpha: bool,
        pretrained_weights: Optional[str] = None,
        in_channels: int = 3,
    ):
        """

        :param layers: Number of blocks in each stage
        :param channels: Number of channels [stem, stage 0, stage 1, stage 2, ...]
        :param activation: Used activation type for all child modules.
        :param return_idx: Indexes of returned feature maps
        :param use_large_stem: If True, uses 3 conv+bn+act instead of 2 in stem blocks
        :param width_mult: Scaling factor for a number of channels
        :param depth_mult: Scaling factor for a number of blocks in each stage
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
        :param pretrained_weights:
        :param in_channels: Number of input channels. Default: 3
        """
        super().__init__()
        channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
        layers = [max(round(num_layers * depth_mult), 1) for num_layers in layers]

        if use_large_stem:
            self.stem = nn.Sequential(
                collections.OrderedDict(
                    [
                        (
                            "conv1",
                            ConvBNAct(in_channels, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                        ),
                        (
                            "conv2",
                            ConvBNAct(
                                channels[0] // 2,
                                channels[0] // 2,
                                3,
                                stride=1,
                                padding=1,
                                activation_type=activation,
                                bias=False,
                            ),
                        ),
                        (
                            "conv3",
                            ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                        ),
                    ]
                )
            )
        else:
            self.stem = nn.Sequential(
                collections.OrderedDict(
                    [
                        (
                            "conv1",
                            ConvBNAct(3, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                        ),
                        (
                            "conv2",
                            ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                        ),
                    ]
                )
            )

        n = len(channels) - 1
        self.stages = nn.ModuleList(
            [
                CSPResStage(
                    channels[i],
                    channels[i + 1],
                    layers[i],
                    stride=2,
                    activation_type=activation,
                    use_alpha=use_alpha,
                )
                for i in range(n)
            ]
        )

        self._out_channels = channels[1:]
        self._out_strides = [4 * 2**i for i in range(n)]
        self.return_idx = tuple(return_idx)

        if pretrained_weights:
            if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
                state_dict = torch.load(str(pretrained_weights), map_location="cpu")
            elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
                with wait_for_the_master(get_local_rank()):
                    state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
            else:
                raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
            self.load_state_dict(state_dict)

    def forward(self, x: Tensor) -> List[Tensor]:
        x = self.stem(x)
        outs = []
        for idx, stage in enumerate(self.stages):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)

        return outs

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """
        for module in self.modules():
            if isinstance(module, RepVGGBlock):
                module.fuse_block_residual_branches()

    @property
    def out_channels(self) -> Tuple[int]:
        return tuple(self._out_channels)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        first_layer: ConvBNAct = self.stem[0]
        first_layer.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        first_layer: ConvBNAct = self.stem[0]
        return first_layer.get_input_channels()
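
The following sketch builds the backbone with illustrative PP-YOLOE-style settings (these layer/channel values are assumptions, not defaults pulled from a recipe) and shows that forward returns one feature map per index in return_idx:

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBackbone

backbone = CSPResNetBackbone(
    layers=(3, 6, 6, 3),
    channels=(64, 128, 256, 512, 1024),
    activation=nn.SiLU,
    return_idx=(1, 2, 3),
    use_large_stem=True,
    width_mult=1.0,
    depth_mult=1.0,
    use_alpha=False,
)
features = backbone(torch.randn(1, 3, 640, 640))
print([f.shape[1] for f in features])  # [256, 512, 1024] -- feature maps at strides 8, 16, 32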

__init__(layers, channels, activation, return_idx, use_large_stem, width_mult, depth_mult, use_alpha, pretrained_weights=None, in_channels=3)

Parameters:

Name Type Description Default
layers Tuple[int, ...]

Number of blocks in each stage

required
channels Tuple[int, ...]

Number of channels [stem, stage 0, stage 1, stage 2, ...]

required
activation Type[nn.Module]

Used activation type for all child modules.

required
return_idx Tuple[int, int, int]

Indexes of returned feature maps

required
use_large_stem bool

If True, uses 3 conv+bn+act instead of 2 in stem blocks

required
width_mult float

Scaling factor for a number of channels

required
depth_mult float

Scaling factor for a number of blocks in each stage

required
use_alpha bool

If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock

required
pretrained_weights Optional[str] None
in_channels int

Number of input channels. Default: 3

3
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
@resolve_param("activation", ActivationsTypeFactory())
def __init__(
    self,
    layers: Tuple[int, ...],
    channels: Tuple[int, ...],
    activation: Type[nn.Module],
    return_idx: Tuple[int, int, int],
    use_large_stem: bool,
    width_mult: float,
    depth_mult: float,
    use_alpha: bool,
    pretrained_weights: Optional[str] = None,
    in_channels: int = 3,
):
    """

    :param layers: Number of blocks in each stage
    :param channels: Number of channels [stem, stage 0, stage 1, stage 2, ...]
    :param activation: Used activation type for all child modules.
    :param return_idx: Indexes of returned feature maps
    :param use_large_stem: If True, uses 3 conv+bn+act instead of 2 in stem blocks
    :param width_mult: Scaling factor for a number of channels
    :param depth_mult: Scaling factor for a number of blocks in each stage
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
    :param pretrained_weights:
    :param in_channels: Number of input channels. Default: 3
    """
    super().__init__()
    channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
    layers = [max(round(num_layers * depth_mult), 1) for num_layers in layers]

    if use_large_stem:
        self.stem = nn.Sequential(
            collections.OrderedDict(
                [
                    (
                        "conv1",
                        ConvBNAct(in_channels, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                    ),
                    (
                        "conv2",
                        ConvBNAct(
                            channels[0] // 2,
                            channels[0] // 2,
                            3,
                            stride=1,
                            padding=1,
                            activation_type=activation,
                            bias=False,
                        ),
                    ),
                    (
                        "conv3",
                        ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                    ),
                ]
            )
        )
    else:
        self.stem = nn.Sequential(
            collections.OrderedDict(
                [
                    (
                        "conv1",
                        ConvBNAct(3, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                    ),
                    (
                        "conv2",
                        ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                    ),
                ]
            )
        )

    n = len(channels) - 1
    self.stages = nn.ModuleList(
        [
            CSPResStage(
                channels[i],
                channels[i + 1],
                layers[i],
                stride=2,
                activation_type=activation,
                use_alpha=use_alpha,
            )
            for i in range(n)
        ]
    )

    self._out_channels = channels[1:]
    self._out_strides = [4 * 2**i for i in range(n)]
    self.return_idx = tuple(return_idx)

    if pretrained_weights:
        if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
            state_dict = torch.load(str(pretrained_weights), map_location="cpu")
        elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
            with wait_for_the_master(get_local_rank()):
                state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
        else:
            raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
        self.load_state_dict(state_dict)

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules with convertible substitutes and remove all auxiliary or training related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """
    for module in self.modules():
        if isinstance(module, RepVGGBlock):
            module.fuse_block_residual_branches()
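
As a usage note, re-parameterizing the RepVGG blocks is typically done right before export; the snippet below continues the hypothetical backbone sketch from earlier in this section.

import torch

# "backbone" is the CSPResNetBackbone instance from the construction sketch above.
backbone.eval()
backbone.prep_model_for_conversion(input_size=(640, 640))
torch.onnx.export(backbone, torch.randn(1, 3, 640, 640), "csp_resnet_backbone.onnx")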

CSPResNetBasicBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
class CSPResNetBasicBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, activation_type: Type[nn.Module], use_residual_connection: bool = True, use_alpha=False):
        """

        :param in_channels:
        :param out_channels:
        :param activation_type:
        :param use_residual_connection: Whether to add input x to the output
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
        """
        super().__init__()
        if use_residual_connection and in_channels != out_channels:
            raise RuntimeError(
                f"Number of input channels (got {in_channels}) must be equal to the "
                f"number of output channels (got {out_channels}) when use_residual_connection=True"
            )
        self.conv1 = ConvBNAct(in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation_type=activation_type, bias=False)
        self.conv2 = RepVGGBlock(
            out_channels, out_channels, activation_type=activation_type, se_type=nn.Identity, use_residual_connection=False, use_alpha=use_alpha
        )
        self.use_residual_connection = use_residual_connection

    def forward(self, x):
        y = self.conv1(x)
        y = self.conv2(y)
        if self.use_residual_connection:
            return x + y
        else:
            return y
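
A small sketch with an arbitrary channel count: with use_residual_connection=True the block requires in_channels == out_channels, since the input is added to the output.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBasicBlock

block = CSPResNetBasicBlock(in_channels=64, out_channels=64, activation_type=nn.SiLU)
y = block(torch.randn(1, 64, 32, 32))
print(y.shape)  # torch.Size([1, 64, 32, 32])

# CSPResNetBasicBlock(in_channels=64, out_channels=128, activation_type=nn.SiLU) would raise a RuntimeError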

__init__(in_channels, out_channels, activation_type, use_residual_connection=True, use_alpha=False)

Parameters:

Name Type Description Default
in_channels int required
out_channels int required
activation_type Type[nn.Module] required
use_residual_connection bool

Whether to add input x to the output

True
use_alpha

If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock

False
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
def __init__(self, in_channels: int, out_channels: int, activation_type: Type[nn.Module], use_residual_connection: bool = True, use_alpha=False):
    """

    :param in_channels:
    :param out_channels:
    :param activation_type:
    :param use_residual_connection: Whether to add input x to the output
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
    """
    super().__init__()
    if use_residual_connection and in_channels != out_channels:
        raise RuntimeError(
            f"Number of input channels (got {in_channels}) must be equal to the "
            f"number of output channels (got {out_channels}) when use_residual_connection=True"
        )
    self.conv1 = ConvBNAct(in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation_type=activation_type, bias=False)
    self.conv2 = RepVGGBlock(
        out_channels, out_channels, activation_type=activation_type, se_type=nn.Identity, use_residual_connection=False, use_alpha=use_alpha
    )
    self.use_residual_connection = use_residual_connection

CSPResStage

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
class CSPResStage(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks,
        stride: int,
        activation_type: Type[nn.Module],
        use_attention: bool = True,
        use_alpha: bool = False,
    ):
        """

        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param num_blocks: Number of blocks in stage
        :param stride: Desired down-sampling for the stage (Usually 2)
        :param activation_type: Non-linearity type used in child modules.
        :param use_attention: If True, adds EffectiveSEBlock at the end of each stage
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)
        """
        super().__init__()

        mid_channels = (in_channels + out_channels) // 2
        half_mid_channels = mid_channels // 2
        mid_channels = 2 * half_mid_channels

        if stride != 1:
            self.conv_down = ConvBNAct(in_channels, mid_channels, 3, stride=stride, padding=1, activation_type=activation_type, bias=False)
        else:
            self.conv_down = None
        self.conv1 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
        self.conv2 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
        self.blocks = nn.Sequential(
            *[
                CSPResNetBasicBlock(
                    in_channels=half_mid_channels,
                    out_channels=half_mid_channels,
                    activation_type=activation_type,
                    use_alpha=use_alpha,
                )
                for _ in range(num_blocks)
            ]
        )
        if use_attention:
            self.attn = EffectiveSEBlock(mid_channels)
        else:
            self.attn = nn.Identity()

        self.conv3 = ConvBNAct(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)

    def forward(self, x):
        if self.conv_down is not None:
            x = self.conv_down(x)
        y1 = self.conv1(x)
        y2 = self.blocks(self.conv2(x))
        y = torch.cat([y1, y2], dim=1)
        y = self.attn(y)
        y = self.conv3(y)
        return y
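
For illustration, a stage that halves the spatial resolution (stride=2) while going from 128 to 256 channels; the two 1x1 branches each carry half of the intermediate channels before being concatenated and projected:

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_resnet import CSPResStage

stage = CSPResStage(in_channels=128, out_channels=256, num_blocks=3, stride=2, activation_type=nn.SiLU)
y = stage(torch.randn(1, 128, 64, 64))
print(y.shape)  # torch.Size([1, 256, 32, 32])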

__init__(in_channels, out_channels, num_blocks, stride, activation_type, use_attention=True, use_alpha=False)

Parameters:

Name Type Description Default
in_channels int

Number of input channels

required
out_channels int

Number of output channels

required
num_blocks

Number of blocks in stage

required
stride int

Desired down-sampling for the stage (Usually 2)

required
activation_type Type[nn.Module]

Non-linearity type used in child modules.

required
use_attention bool

If True, adds EffectiveSEBlock at the end of each stage

True
use_alpha bool

If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)

False
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    num_blocks,
    stride: int,
    activation_type: Type[nn.Module],
    use_attention: bool = True,
    use_alpha: bool = False,
):
    """

    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    :param num_blocks: Number of blocks in stage
    :param stride: Desired down-sampling for the stage (Usually 2)
    :param activation_type: Non-linearity type used in child modules.
    :param use_attention: If True, adds EffectiveSEBlock at the end of each stage
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)
    """
    super().__init__()

    mid_channels = (in_channels + out_channels) // 2
    half_mid_channels = mid_channels // 2
    mid_channels = 2 * half_mid_channels

    if stride != 1:
        self.conv_down = ConvBNAct(in_channels, mid_channels, 3, stride=stride, padding=1, activation_type=activation_type, bias=False)
    else:
        self.conv_down = None
    self.conv1 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
    self.conv2 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
    self.blocks = nn.Sequential(
        *[
            CSPResNetBasicBlock(
                in_channels=half_mid_channels,
                out_channels=half_mid_channels,
                activation_type=activation_type,
                use_alpha=use_alpha,
            )
            for _ in range(num_blocks)
        ]
    )
    if use_attention:
        self.attn = EffectiveSEBlock(mid_channels)
    else:
        self.attn = nn.Identity()

    self.conv3 = ConvBNAct(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)

A base for a detection network built according to the following scheme:

* constructed from nested arch_params;
* inside arch_params, each nested level (module) has an explicit type and its required parameters;
* each module accepts in_channels and other parameters;
* each module defines an out_channels property on construction.

CustomizableDetector

Bases: HasPredict, SgModule

A customizable detector with backbone -> neck -> heads. Each submodule with its parameters must be defined explicitly. Modules should follow the interface of BaseDetectionModule.

Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
class CustomizableDetector(HasPredict, SgModule):
    """
    A customizable detector with backbone -> neck -> heads
    Each submodule with its parameters must be defined explicitly.
    Modules should follow the interface of BaseDetectionModule
    """

    @arch_params_deprecated
    def __init__(
        self,
        backbone: Union[str, dict, HpmStruct, DictConfig],
        heads: Union[str, dict, HpmStruct, DictConfig],
        neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
        num_classes: int = None,
        bn_eps: Optional[float] = None,
        bn_momentum: Optional[float] = None,
        inplace_act: Optional[bool] = True,
        in_channels: int = 3,
    ):
        """
        :param backbone:    Backbone configuration.
        :param heads:       Head configuration.
        :param neck:        Neck configuration.
        :param num_classes: num classes to predict.
        :param bn_eps:      Epsilon for batch norm.
        :param bn_momentum: Momentum for batch norm.
        :param inplace_act: If True, do the operations in-place when possible.
        :param in_channels: number of input channels
        """
        super().__init__()

        self.heads_params = heads
        self.bn_eps = bn_eps
        self.bn_momentum = bn_momentum
        self.inplace_act = inplace_act
        self.in_channels = in_channels
        factory = det_factory.DetectionModulesFactory()

        # move num_classes into heads params
        if num_classes is not None:
            self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", num_classes)

        self.backbone = factory.get(factory.insert_module_param(backbone, "in_channels", in_channels))
        if neck is not None:
            self.neck = factory.get(factory.insert_module_param(neck, "in_channels", self.backbone.out_channels))
            self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.neck.out_channels))
        else:
            self.neck = nn.Identity()
            self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.backbone.out_channels))

        self._initialize_weights(bn_eps, bn_momentum, inplace_act)

        # Processing params
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        self._default_nms_iou: float = 0.7
        self._default_nms_conf: float = 0.5
        self._default_nms_top_k: int = 1024
        self._default_max_predictions = 300
        self._default_multi_label_per_box = True
        self._default_class_agnostic_nms = False

    def forward(self, x):
        x = self.backbone(x)
        x = self.neck(x)
        return self.heads(x)

    def _initialize_weights(self, bn_eps: Optional[float] = None, bn_momentum: Optional[float] = None, inplace_act: Optional[bool] = True):
        for m in self.modules():
            t = type(m)
            if t is nn.BatchNorm2d:
                m.eps = bn_eps if bn_eps else m.eps
                m.momentum = bn_momentum if bn_momentum else m.momentum
            elif inplace_act and t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, nn.Mish]:
                m.inplace = True

    def prep_model_for_conversion(self, input_size: Optional[Union[tuple, list]] = None, **kwargs):
        for module in self.modules():
            if module != self and hasattr(module, "prep_model_for_conversion"):
                module.prep_model_for_conversion(input_size, **kwargs)

    def replace_head(self, new_num_classes: Optional[int] = None, new_head: Optional[nn.Module] = None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.heads = new_head
        elif isinstance(self.heads, SupportsReplaceNumClasses):
            self.heads.replace_num_classes(new_num_classes, replace_num_classes_with_random_weights)
        else:
            factory = det_factory.DetectionModulesFactory()
            self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", new_num_classes)
            self.heads = factory.get(factory.insert_module_param(self.heads_params, "in_channels", self.neck.out_channels))
            self._initialize_weights(self.bn_eps, self.bn_momentum, self.inplace_act)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        if isinstance(self.backbone, SupportsReplaceInputChannels):
            self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
            self.in_channels = self.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.backbone.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        if isinstance(self.backbone, SupportsReplaceInputChannels):
            return self.backbone.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.backbone.__class__.__name__}` does not support `replace_input_channels`")

    def get_post_prediction_callback(
        self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
    ) -> DetectionPostPredictionCallback:
        """
        Get a post prediction callback for this model.

        :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
        :param iou:                 An IoU threshold for boxes non-maximum suppression.
        :param nms_top_k:           The maximum number of detections to consider for NMS.
        :param max_predictions:     The maximum number of detections to return.
        :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :return:
        """
        raise NotImplementedError

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:         (Optional) Names of the dataset the model was trained on.
        :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param iou:                 (Optional) IoU threshold for the nms algorithm
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        """
        if class_names is not None:
            self._class_names = tuple(class_names)
        if image_processor is not None:
            self._image_processor = image_processor
        if iou is not None:
            self._default_nms_iou = float(iou)
        if conf is not None:
            self._default_nms_conf = float(conf)
        if nms_top_k is not None:
            self._default_nms_top_k = int(nms_top_k)
        if max_predictions is not None:
            self._default_max_predictions = int(max_predictions)
        if multi_label_per_box is not None:
            self._default_multi_label_per_box = bool(multi_label_per_box)
        if class_agnostic_nms is not None:
            self._default_class_agnostic_nms = bool(class_agnostic_nms)

    def get_processing_params(self) -> Optional[Processing]:
        return self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self,
        *,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> DetectionPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, use mixed precision for inference.
        """
        if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = self._default_nms_iou if iou is None else iou
        conf = self._default_nms_conf if conf is None else conf
        nms_top_k = self._default_nms_top_k if nms_top_k is None else nms_top_k
        max_predictions = self._default_max_predictions if max_predictions is None else max_predictions
        multi_label_per_box = self._default_multi_label_per_box if multi_label_per_box is None else multi_label_per_box
        class_agnostic_nms = self._default_class_agnostic_nms if class_agnostic_nms is None else class_agnostic_nms

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = DetectionPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(
                iou=iou,
                conf=conf,
                nms_top_k=nms_top_k,
                max_predictions=max_predictions,
                multi_label_per_box=multi_label_per_box,
                class_agnostic_nms=class_agnostic_nms,
            ),
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> ImagesDetectionPrediction:
        """Predict an image or a list of images.

        :param images:              Images to predict.
        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded.
                                    If None, the default value associated to the training is used.
        :param batch_size:          Maximum number of images to process at the same time.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                        If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ):
        """Predict using webcam.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)

    def get_finetune_lr_dict(self, lr: float):
        return {"heads": lr, "default": 0}

__init__(backbone, heads, neck=None, num_classes=None, bn_eps=None, bn_momentum=None, inplace_act=True, in_channels=3)

Parameters:

Name Type Description Default
backbone Union[str, dict, HpmStruct, DictConfig]

Backbone configuration.

required
heads Union[str, dict, HpmStruct, DictConfig]

Head configuration.

required
neck Optional[Union[str, dict, HpmStruct, DictConfig]]

Neck configuration.

None
num_classes int

num classes to predict.

None
bn_eps Optional[float]

Epsilon for batch norm.

None
bn_momentum Optional[float]

Momentum for batch norm.

None
inplace_act Optional[bool]

If True, do the operations in-place when possible.

True
in_channels int

number of input channels

3
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
@arch_params_deprecated
def __init__(
    self,
    backbone: Union[str, dict, HpmStruct, DictConfig],
    heads: Union[str, dict, HpmStruct, DictConfig],
    neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
    num_classes: int = None,
    bn_eps: Optional[float] = None,
    bn_momentum: Optional[float] = None,
    inplace_act: Optional[bool] = True,
    in_channels: int = 3,
):
    """
    :param backbone:    Backbone configuration.
    :param heads:       Head configuration.
    :param neck:        Neck configuration.
    :param num_classes: num classes to predict.
    :param bn_eps:      Epsilon for batch norm.
    :param bn_momentum: Momentum for batch norm.
    :param inplace_act: If True, do the operations in-place when possible.
    :param in_channels: number of input channels
    """
    super().__init__()

    self.heads_params = heads
    self.bn_eps = bn_eps
    self.bn_momentum = bn_momentum
    self.inplace_act = inplace_act
    self.in_channels = in_channels
    factory = det_factory.DetectionModulesFactory()

    # move num_classes into heads params
    if num_classes is not None:
        self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", num_classes)

    self.backbone = factory.get(factory.insert_module_param(backbone, "in_channels", in_channels))
    if neck is not None:
        self.neck = factory.get(factory.insert_module_param(neck, "in_channels", self.backbone.out_channels))
        self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.neck.out_channels))
    else:
        self.neck = nn.Identity()
        self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.backbone.out_channels))

    self._initialize_weights(bn_eps, bn_momentum, inplace_act)

    # Processing params
    self._class_names: Optional[List[str]] = None
    self._image_processor: Optional[Processing] = None
    self._default_nms_iou: float = 0.7
    self._default_nms_conf: float = 0.5
    self._default_nms_top_k: int = 1024
    self._default_max_predictions = 300
    self._default_multi_label_per_box = True
    self._default_class_agnostic_nms = False

get_post_prediction_callback(*, conf, iou, nms_top_k, max_predictions, multi_label_per_box, class_agnostic_nms)

Get a post prediction callback for this model.

Parameters:

Name Type Description Default
conf float

A minimum confidence threshold for predictions to be used in post-processing.

required
iou float

An IoU threshold for boxes non-maximum suppression.

required
nms_top_k int

The maximum number of detections to consider for NMS.

required
max_predictions int

The maximum number of detections to return.

required
multi_label_per_box bool

If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

required
class_agnostic_nms bool

If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked). If False NMS is performed separately for each class.

required

Returns:

Type Description
DetectionPostPredictionCallback
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
def get_post_prediction_callback(
    self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
) -> DetectionPostPredictionCallback:
    """
    Get a post prediction callback for this model.

    :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
    :param iou:                 An IoU threshold for boxes non-maximum suppression.
    :param nms_top_k:           The maximum number of detections to consider for NMS.
    :param max_predictions:     The maximum number of detections to return.
    :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :return:
    """
    raise NotImplementedError

predict(images, iou=None, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked). If False NMS is performed separately for each class.

None
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
) -> ImagesDetectionPrediction:
    """Predict an image or a list of images.

    :param images:              Images to predict.
    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                 (Optional) Below the confidence threshold, predictions are discarded.
                                If None, the default value associated to the training is used.
    :param batch_size:          Maximum number of images to process at the same time.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                        If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    return pipeline(images, batch_size=batch_size)  # type: ignore
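
A short, hedged example of calling predict on a pretrained detector (the model name and image path are placeholders); per-call thresholds override the stored defaults, and the returned prediction object can be visualized:

from super_gradients.training import models

model = models.get("yolo_nas_s", pretrained_weights="coco")
predictions = model.predict("path/to/image.jpg", conf=0.4, iou=0.65)
predictions.show()  # draw the predicted boxes on the image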

predict_webcam(iou=None, conf=None, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
def predict_webcam(
    self,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
):
    """Predict using webcam.

    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                If None, the default value associated to the training is used.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    pipeline.predict_webcam()
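
As a usage note, predict_webcam() is blocking and streams annotated frames from the default webcam until the window is closed. A sketch, assuming `model` already has its dataset processing parameters set (e.g. a pretrained checkpoint):

model.predict_webcam(conf=0.5, fuse_model=False)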

set_dataset_processing_params(class_names=None, image_processor=None, iou=None, conf=None, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
iou Optional[float]

(Optional) IoU threshold for the nms algorithm

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded

None
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:         (Optional) Names of the dataset the model was trained on.
    :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param iou:                 (Optional) IoU threshold for the nms algorithm
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    """
    if class_names is not None:
        self._class_names = tuple(class_names)
    if image_processor is not None:
        self._image_processor = image_processor
    if iou is not None:
        self._default_nms_iou = float(iou)
    if conf is not None:
        self._default_nms_conf = float(conf)
    if nms_top_k is not None:
        self._default_nms_top_k = int(nms_top_k)
    if max_predictions is not None:
        self._default_max_predictions = int(max_predictions)
    if multi_label_per_box is not None:
        self._default_multi_label_per_box = bool(multi_label_per_box)
    if class_agnostic_nms is not None:
        self._default_class_agnostic_nms = bool(class_agnostic_nms)
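
A sketch of overriding the stored processing defaults before calling predict(); the class names and thresholds are illustrative assumptions for a custom two-class dataset:

# Assumption: `model` already carries an image processor (e.g. it was loaded from a trained checkpoint).
model.set_dataset_processing_params(
    class_names=["person", "vehicle"],  # hypothetical class names
    iou=0.65,              # default NMS IoU used when predict() is called with iou=None
    conf=0.35,             # default confidence threshold used when predict() is called with conf=None
    max_predictions=100,   # cap on the number of boxes returned per image
)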

DarkResidualBlock

Bases: nn.Module

DarkResidualBlock - The Darknet Residual Block

Source code in src/super_gradients/training/models/detection_models/darknet53.py
class DarkResidualBlock(nn.Module):
    """
    DarkResidualBlock - The Darknet Residual Block
    """

    def __init__(self, in_channels, shortcut=True):
        super(DarkResidualBlock, self).__init__()
        self.shortcut = shortcut
        reduced_channels = int(in_channels / 2)

        self.layer1 = create_conv_module(in_channels, reduced_channels, kernel_size=1)
        self.layer2 = create_conv_module(reduced_channels, in_channels)

    def forward(self, x):
        residual = x

        out = self.layer1(x)
        out = self.layer2(out)
        out += residual if self.shortcut else out
        return out
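
A small sketch showing that the block preserves the input shape, which is what makes the residual sum valid (the import path is assumed from the source location shown above):

import torch
from super_gradients.training.models.detection_models.darknet53 import DarkResidualBlock

block = DarkResidualBlock(in_channels=64)
x = torch.randn(2, 64, 32, 32)
y = block(x)
print(y.shape)  # torch.Size([2, 64, 32, 32]) - same channels and spatial size as the input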

Darknet53

Bases: Darknet53Base

Source code in src/super_gradients/training/models/detection_models/darknet53.py
@register_model(Models.DARKNET53)
class Darknet53(Darknet53Base):
    def __init__(self, arch_params=None, backbone_mode=True, num_classes=None):
        super(Darknet53, self).__init__()

        # IN ORDER TO ALLOW PASSING PARAMETERS WITH ARCH_PARAMS BUT NOT BREAK YOLOV3 INTEGRATION
        self.backbone_mode = get_param(arch_params, "backbone_mode", backbone_mode)
        self.num_classes = get_param(arch_params, "num_classes", num_classes)

        if not self.backbone_mode:
            # IF NOT USED AS A BACKEND BUT AS A CLASSIFIER WE ADD THE CLASSIFICATION LAYERS
            if self.num_classes is not None:
                nn_sequential_block = nn.Sequential()
                nn_sequential_block.add_module("global_avg_pool", nn.AdaptiveAvgPool2d((1, 1)))
                nn_sequential_block.add_module("view", ViewModule(1024))
                nn_sequential_block.add_module("fc", nn.Linear(1024, self.num_classes))
                self.modules_list.append(nn_sequential_block)
            else:
                raise ValueError("num_classes must be specified to use Darknet53 as a classifier")

    def get_modules_list(self):
        return self.modules_list

    def forward(self, x):
        """
        forward - Forward pass on the modules list
            :param x: The input data
            :return: forward pass for backbone pass or classification pass
        """
        return super().forward(x)

forward(x)

forward - Forward pass on the modules list
:param x: The input data
:return: forward pass for backbone pass or classification pass

Source code in src/super_gradients/training/models/detection_models/darknet53.py
def forward(self, x):
    """
    forward - Forward pass on the modules list
        :param x: The input data
        :return: forward pass for backbone pass or classification pass
    """
    return super().forward(x)

ViewModule

Bases: nn.Module

Returns a reshaped version of the input, to be used in non-backbone mode

Source code in src/super_gradients/training/models/detection_models/darknet53.py
class ViewModule(nn.Module):
    """
    Returns a reshaped version of the input, to be used in None-Backbone Mode
    """

    def __init__(self, features=1024):
        super(ViewModule, self).__init__()
        self.features = features

    def forward(self, x):
        return x.view(-1, self.features)
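
A sketch of where ViewModule fits: it flattens the pooled feature map so a linear classification head can consume it (the shapes and the 10-class head are illustrative):

import torch
from torch import nn

pooled = torch.randn(4, 1024, 1, 1)   # output of nn.AdaptiveAvgPool2d((1, 1))
flat = pooled.view(-1, 1024)          # what ViewModule(features=1024) computes
logits = nn.Linear(1024, 10)(flat)    # hypothetical 10-class classifier
print(flat.shape, logits.shape)       # torch.Size([4, 1024]) torch.Size([4, 10])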

PPYoloEPostPredictionCallback

Bases: DetectionPostPredictionCallback

Non-Maximum Suppression (NMS) module

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
class PPYoloEPostPredictionCallback(DetectionPostPredictionCallback):
    """Non-Maximum Suppression (NMS) module"""

    def __init__(
        self,
        *,
        score_threshold: float,
        nms_threshold: float,
        nms_top_k: int,
        max_predictions: int,
        multi_label_per_box: bool = True,
        class_agnostic_nms: bool = False,
    ):
        """
        :param score_threshold:     Predictions confidence threshold.
                                    Predictions with score lower than score_threshold will not participate in Top-K & NMS
        :param nms_threshold:       IoU threshold for NMS step.
        :param nms_top_k:           Number of predictions participating in NMS step
        :param max_predictions:     Maximum number of boxes to return after NMS step
        :param multi_label_per_box: Controls whether to decode multiple labels per box.
                                    True - each anchor can produce multiple labels of different classes
                                           that pass confidence threshold check (default).
                                    False - each anchor can produce only one label of the class with the highest score.
        """
        super(PPYoloEPostPredictionCallback, self).__init__()
        self.score_threshold = score_threshold
        self.nms_threshold = nms_threshold
        self.nms_top_k = nms_top_k
        self.max_predictions = max_predictions
        self.multi_label_per_box = multi_label_per_box
        self.class_agnostic_nms = class_agnostic_nms

    @torch.no_grad()
    def forward(self, outputs: Any, device: str = None) -> List[List[Tensor]]:
        """

        :param outputs: Outputs of model's forward() method
        :param device:  (Deprecated) Not used anymore, exists only for sake of keeping the same interface as in the parent class.
                        Will be removed in the SG 3.7.0.
                        A device parameter in case we want to move tensors to a specific device.
        :return:        List of lists of tensors of shape [Ni, 6] where Ni is the number of detections in i-th image.
                        Format of each row is [x1, y1, x2, y2, confidence, class]
        """
        nms_result = []
        predictions = self._get_decoded_predictions_from_model_output(outputs)

        for pred_bboxes, pred_scores in zip(*predictions):
            # Cast to float to avoid lack of fp16 support in torchvision.ops.boxes.batched_nms when doing CPU inference
            pred_bboxes = pred_bboxes.float()  # [Anchors, 4]
            pred_scores = pred_scores.float()  # [Anchors, C]

            # Filter all predictions by self.score_threshold
            if self.multi_label_per_box:
                i, j = (pred_scores > self.score_threshold).nonzero(as_tuple=False).T
                pred_bboxes = pred_bboxes[i]
                pred_cls_conf = pred_scores[i, j]
                pred_cls_label = j[:]

            else:
                pred_cls_conf, pred_cls_label = torch.max(pred_scores, dim=1)
                conf_mask = pred_cls_conf >= self.score_threshold

                pred_cls_conf = pred_cls_conf[conf_mask]
                pred_cls_label = pred_cls_label[conf_mask]
                pred_bboxes = pred_bboxes[conf_mask, :]

            # Filter all predictions by self.nms_top_k
            if pred_cls_conf.size(0) > self.nms_top_k:
                topk_candidates = torch.topk(pred_cls_conf, k=self.nms_top_k, largest=True)
                pred_cls_conf = pred_cls_conf[topk_candidates.indices]
                pred_cls_label = pred_cls_label[topk_candidates.indices]
                pred_bboxes = pred_bboxes[topk_candidates.indices, :]

            # NMS
            if self.class_agnostic_nms:
                idx_to_keep = torchvision.ops.boxes.nms(pred_bboxes, pred_cls_conf, iou_threshold=self.nms_threshold)
            else:
                idx_to_keep = torchvision.ops.boxes.batched_nms(boxes=pred_bboxes, scores=pred_cls_conf, idxs=pred_cls_label, iou_threshold=self.nms_threshold)

            pred_cls_conf = pred_cls_conf[idx_to_keep].unsqueeze(-1)
            pred_cls_label = pred_cls_label[idx_to_keep].unsqueeze(-1)
            pred_bboxes = pred_bboxes[idx_to_keep, :]

            #  nx6 (x1, y1, x2, y2, confidence, class) in pixel units
            final_boxes = torch.cat([pred_bboxes, pred_cls_conf, pred_cls_label], dim=1)  # [N,6]

            nms_result.append(final_boxes)

        return self._filter_max_predictions(nms_result)

    def _get_decoded_predictions_from_model_output(self, outputs: Any) -> Tuple[Tensor, Tensor]:
        """
        Get the decoded predictions from the PPYoloE/YoloNAS output.
        Depending on the model regime (train/eval) the output format may differ so this method picks the right output.

        :param outputs: Model's forward() return value
        :return:        Tuple of (bboxes, scores) of shape [B, Anchors, 4], [B, Anchors, C]
        """
        if isinstance(outputs, tuple) and len(outputs) == 2:
            if torch.is_tensor(outputs[0]) and torch.is_tensor(outputs[1]) and outputs[0].shape[1] == outputs[1].shape[1] and outputs[0].shape[2] == 4:
                # This path happens when we are using traced model or ONNX model without postprocessing for inference.
                predictions = outputs
            else:
                # First is model predictions, second element of tuple is logits for loss computation
                predictions = outputs[0]
        else:
            raise ValueError(f"Unsupported output format: {outputs}")

        return predictions

    def _filter_max_predictions(self, res: List) -> List:
        res[:] = [im[: self.max_predictions] if (im is not None and im.shape[0] > self.max_predictions) else im for im in res]

        return res
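
A sketch of applying the callback directly to already-decoded predictions. The random tensors stand in for real model outputs and follow the [B, Anchors, 4] / [B, Anchors, C] convention described above; the import path is assumed from the source location shown:

import torch
from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback

callback = PPYoloEPostPredictionCallback(score_threshold=0.25, nms_threshold=0.7, nms_top_k=1000, max_predictions=300)

# Build valid (x1, y1, x2, y2) boxes in pixel units and per-class scores for 2 images, 8400 anchors, 80 classes.
xy1 = torch.rand(2, 8400, 2) * 320
wh = torch.rand(2, 8400, 2) * 320
pred_bboxes = torch.cat([xy1, xy1 + wh], dim=2)
pred_scores = torch.rand(2, 8400, 80)

results = callback((pred_bboxes, pred_scores))  # the (boxes, scores) tuple path handled in forward()
print(len(results), results[0].shape)           # 2 images, each a [N, 6] tensor: x1, y1, x2, y2, conf, class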

__init__(*, score_threshold, nms_threshold, nms_top_k, max_predictions, multi_label_per_box=True, class_agnostic_nms=False)

Parameters:

Name Type Description Default
score_threshold float

Predictions confidence threshold. Predictions with score lower than score_threshold will not participate in Top-K & NMS

required
nms_threshold float

IoU threshold for NMS step.

required
nms_top_k int

Number of predictions participating in NMS step

required
max_predictions int

Maximum number of boxes to return after NMS step

required
multi_label_per_box bool

Controls whether to decode multiple labels per box. True - each anchor can produce multiple labels of different classes that pass confidence threshold check (default). False - each anchor can produce only one label of the class with the highest score.

True
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
def __init__(
    self,
    *,
    score_threshold: float,
    nms_threshold: float,
    nms_top_k: int,
    max_predictions: int,
    multi_label_per_box: bool = True,
    class_agnostic_nms: bool = False,
):
    """
    :param score_threshold:     Predictions confidence threshold.
                                Predictions with score lower than score_threshold will not participate in Top-K & NMS
    :param nms_threshold:       IoU threshold for NMS step.
    :param nms_top_k:           Number of predictions participating in NMS step
    :param max_predictions:     Maximum number of boxes to return after NMS step
    :param multi_label_per_box: Controls whether to decode multiple labels per box.
                                True - each anchor can produce multiple labels of different classes
                                       that pass confidence threshold check (default).
                                False - each anchor can produce only one label of the class with the highest score.
    """
    super(PPYoloEPostPredictionCallback, self).__init__()
    self.score_threshold = score_threshold
    self.nms_threshold = nms_threshold
    self.nms_top_k = nms_top_k
    self.max_predictions = max_predictions
    self.multi_label_per_box = multi_label_per_box
    self.class_agnostic_nms = class_agnostic_nms

forward(outputs, device=None)

Parameters:

Name Type Description Default
outputs Any

Outputs of model's forward() method

required
device str

(Deprecated) Not used anymore; exists only for the sake of keeping the same interface as the parent class. Will be removed in SG 3.7.0. A device parameter in case we want to move tensors to a specific device.

None

Returns:

Type Description
List[List[Tensor]]

List of lists of tensors of shape [Ni, 6] where Ni is the number of detections in i-th image. Format of each row is [x1, y1, x2, y2, confidence, class]

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
@torch.no_grad()
def forward(self, outputs: Any, device: str = None) -> List[List[Tensor]]:
    """

    :param outputs: Outputs of model's forward() method
    :param device:  (Deprecated) Not used anymore, exists only for sake of keeping the same interface as in the parent class.
                    Will be removed in the SG 3.7.0.
                    A device parameter in case we want to move tensors to a specific device.
    :return:        List of lists of tensors of shape [Ni, 6] where Ni is the number of detections in i-th image.
                    Format of each row is [x1, y1, x2, y2, confidence, class]
    """
    nms_result = []
    predictions = self._get_decoded_predictions_from_model_output(outputs)

    for pred_bboxes, pred_scores in zip(*predictions):
        # Cast to float to avoid lack of fp16 support in torchvision.ops.boxes.batched_nms when doing CPU inference
        pred_bboxes = pred_bboxes.float()  # [Anchors, 4]
        pred_scores = pred_scores.float()  # [Anchors, C]

        # Filter all predictions by self.score_threshold
        if self.multi_label_per_box:
            i, j = (pred_scores > self.score_threshold).nonzero(as_tuple=False).T
            pred_bboxes = pred_bboxes[i]
            pred_cls_conf = pred_scores[i, j]
            pred_cls_label = j[:]

        else:
            pred_cls_conf, pred_cls_label = torch.max(pred_scores, dim=1)
            conf_mask = pred_cls_conf >= self.score_threshold

            pred_cls_conf = pred_cls_conf[conf_mask]
            pred_cls_label = pred_cls_label[conf_mask]
            pred_bboxes = pred_bboxes[conf_mask, :]

        # Filter all predictions by self.nms_top_k
        if pred_cls_conf.size(0) > self.nms_top_k:
            topk_candidates = torch.topk(pred_cls_conf, k=self.nms_top_k, largest=True)
            pred_cls_conf = pred_cls_conf[topk_candidates.indices]
            pred_cls_label = pred_cls_label[topk_candidates.indices]
            pred_bboxes = pred_bboxes[topk_candidates.indices, :]

        # NMS
        if self.class_agnostic_nms:
            idx_to_keep = torchvision.ops.boxes.nms(pred_bboxes, pred_cls_conf, iou_threshold=self.nms_threshold)
        else:
            idx_to_keep = torchvision.ops.boxes.batched_nms(boxes=pred_bboxes, scores=pred_cls_conf, idxs=pred_cls_label, iou_threshold=self.nms_threshold)

        pred_cls_conf = pred_cls_conf[idx_to_keep].unsqueeze(-1)
        pred_cls_label = pred_cls_label[idx_to_keep].unsqueeze(-1)
        pred_bboxes = pred_bboxes[idx_to_keep, :]

        #  nx6 (x1, y1, x2, y2, confidence, class) in pixel units
        final_boxes = torch.cat([pred_bboxes, pred_cls_conf, pred_cls_label], dim=1)  # [N,6]

        nms_result.append(final_boxes)

    return self._filter_max_predictions(nms_result)

PPYoloE

Bases: SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
class PPYoloE(SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck):
    def __init__(self, arch_params):
        super().__init__()
        if isinstance(arch_params, HpmStruct):
            arch_params = arch_params.to_dict()

        self.backbone = CSPResNetBackbone(**arch_params["backbone"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
        self.neck = PPYoloECSPPAN(**arch_params["neck"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
        self.head = PPYOLOEHead(**arch_params["head"], width_mult=arch_params["width_mult"], num_classes=arch_params["num_classes"])
        self.in_channels = 3

        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        self._default_nms_iou: float = 0.7
        self._default_nms_conf: float = 0.5
        self._default_nms_top_k: int = 1024
        self._default_max_predictions = 300
        self._default_multi_label_per_box = True
        self._default_class_agnostic_nms = False

    def get_post_prediction_callback(
        self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
    ) -> PPYoloEPostPredictionCallback:
        """
        Get a post prediction callback for this model.

        :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
        :param iou:                 A IoU threshold for boxes non-maximum suppression.
        :param nms_top_k:           The maximum number of detections to consider for NMS.
        :param max_predictions:     The maximum number of detections to return.
        :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :return:
        """
        return PPYoloEPostPredictionCallback(
            score_threshold=conf,
            nms_threshold=iou,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
        )

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        if processing is None:
            raise ModelHasNoPreprocessingParamsException()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    def get_processing_params(self) -> Optional[Processing]:
        return self._image_processor

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:         (Optional) Names of the dataset the model was trained on.
        :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param iou:                 (Optional) IoU threshold for the nms algorithm
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        """
        if class_names is not None:
            self._class_names = tuple(class_names)
        if image_processor is not None:
            self._image_processor = image_processor
        if iou is not None:
            self._default_nms_iou = float(iou)
        if conf is not None:
            self._default_nms_conf = float(conf)
        if nms_top_k is not None:
            self._default_nms_top_k = int(nms_top_k)
        if max_predictions is not None:
            self._default_max_predictions = int(max_predictions)
        if multi_label_per_box is not None:
            self._default_multi_label_per_box = bool(multi_label_per_box)
        if class_agnostic_nms is not None:
            self._default_class_agnostic_nms = bool(class_agnostic_nms)

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self,
        *,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> DetectionPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        """
        if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = self._default_nms_iou if iou is None else iou
        conf = self._default_nms_conf if conf is None else conf
        nms_top_k = self._default_nms_top_k if nms_top_k is None else nms_top_k
        max_predictions = self._default_max_predictions if max_predictions is None else max_predictions
        multi_label_per_box = self._default_multi_label_per_box if multi_label_per_box is None else multi_label_per_box
        class_agnostic_nms = self._default_class_agnostic_nms if class_agnostic_nms is None else class_agnostic_nms

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = DetectionPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(
                iou=iou,
                conf=conf,
                nms_top_k=nms_top_k,
                max_predictions=max_predictions,
                multi_label_per_box=multi_label_per_box,
                class_agnostic_nms=class_agnostic_nms,
            ),
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> ImagesDetectionPrediction:
        """Predict an image or a list of images.

        :param images:              Images to predict.
        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                    If None, the default value associated to the training is used.
        :param batch_size:          Maximum number of images to process at the same time.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, the model will use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ):
        """Predict using webcam.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)

    def forward(self, x: Tensor):
        features = self.backbone(x)
        features = self.neck(features)
        return self.head(features)

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """

        # There is some discrepancy of what input_size is.
        # When exporting to ONNX it is passed as 4-element tuple (B,C,H,W)
        # When called from predict() it is just (H,W)
        # So we take two last elements of the tuple which handles both cases but ultimately we should fix this
        h, w = input_size[-2:]

        self.head.cache_anchors((h, w))

        for module in self.modules():
            if isinstance(module, RepVGGBlock):
                module.fuse_block_residual_branches()

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head.replace_num_classes(new_num_classes)

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractObjectDetectionDecodingModule:
        return PPYoloEDecodingModule(num_pre_nms_predictions=num_pre_nms_predictions)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        self.in_channels = self.get_input_channels()

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"head": lr, "default": 0}

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32
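
To tie the pieces together, a sketch that builds the architecture through the models factory, runs an eval-mode forward pass, and post-processes the raw outputs with the callback returned by get_post_prediction_callback(). The enum member name and the 640x640 input are assumptions:

import torch
from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.PP_YOLOE_S, num_classes=80)  # randomly initialized weights
model.eval()

post_prediction = model.get_post_prediction_callback(
    conf=0.25, iou=0.7, nms_top_k=1000, max_predictions=300,
    multi_label_per_box=True, class_agnostic_nms=False,
)

with torch.no_grad():
    outputs = model(torch.randn(1, 3, 640, 640))  # input size must be a multiple of 32
    detections = post_prediction(outputs)         # list with one [N, 6] tensor per image

print(detections[0].shape)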

get_post_prediction_callback(*, conf, iou, nms_top_k, max_predictions, multi_label_per_box, class_agnostic_nms)

Get a post prediction callback for this model.

Parameters:

Name Type Description Default
conf float

A minimum confidence threshold for predictions to be used in post-processing.

required
iou float

An IoU threshold for boxes non-maximum suppression.

required
nms_top_k int

The maximum number of detections to consider for NMS.

required
max_predictions int

The maximum number of detections to return.

required
multi_label_per_box bool

If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

required
class_agnostic_nms bool

If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

required

Returns:

Type Description
PPYoloEPostPredictionCallback
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def get_post_prediction_callback(
    self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
) -> PPYoloEPostPredictionCallback:
    """
    Get a post prediction callback for this model.

    :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
    :param iou:                 A IoU threshold for boxes non-maximum suppression.
    :param nms_top_k:           The maximum number of detections to consider for NMS.
    :param max_predictions:     The maximum number of detections to return.
    :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :return:
    """
    return PPYoloEPostPredictionCallback(
        score_threshold=conf,
        nms_threshold=iou,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
    )

predict(images, iou=None, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
fp16 bool

If True, the model will use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
) -> ImagesDetectionPrediction:
    """Predict an image or a list of images.

    :param images:              Images to predict.
    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                If None, the default value associated to the training is used.
    :param batch_size:          Maximum number of images to process at the same time.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                If True, the model will use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(iou=None, conf=None, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded. If None, the default value associated with the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def predict_webcam(
    self,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
):
    """Predict using webcam.

    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                If None, the default value associated to the training is used.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    pipeline.predict_webcam()

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers that is otherwise flexible, replace some modules with convertible substitutes, and remove all auxiliary or training-related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """

    # There is some discrepancy of what input_size is.
    # When exporting to ONNX it is passed as 4-element tuple (B,C,H,W)
    # When called from predict() it is just (H,W)
    # So we take two last elements of the tuple which handles both cases but ultimately we should fix this
    h, w = input_size[-2:]

    self.head.cache_anchors((h, w))

    for module in self.modules():
        if isinstance(module, RepVGGBlock):
            module.fuse_block_residual_branches()
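
A sketch of where this method fits in a manual ONNX export flow; the file name, opset and input size are assumptions, and the library's higher-level export API can be used instead:

import torch

# Assumption: `model` is a PPYoloE instance (e.g. from models.get) in eval mode.
model.eval()
model.prep_model_for_conversion(input_size=(640, 640))  # caches anchors and fuses RepVGG residual branches

dummy_input = torch.randn(1, 3, 640, 640)
torch.onnx.export(model, dummy_input, "ppyoloe.onnx", opset_version=14)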

set_dataset_processing_params(class_names=None, image_processor=None, iou=None, conf=None, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
iou Optional[float]

(Optional) IoU threshold for the nms algorithm

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded

None
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:         (Optional) Names of the dataset the model was trained on.
    :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param iou:                 (Optional) IoU threshold for the nms algorithm
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    """
    if class_names is not None:
        self._class_names = tuple(class_names)
    if image_processor is not None:
        self._image_processor = image_processor
    if iou is not None:
        self._default_nms_iou = float(iou)
    if conf is not None:
        self._default_nms_conf = float(conf)
    if nms_top_k is not None:
        self._default_nms_top_k = int(nms_top_k)
    if max_predictions is not None:
        self._default_max_predictions = int(max_predictions)
    if multi_label_per_box is not None:
        self._default_multi_label_per_box = bool(multi_label_per_box)
    if class_agnostic_nms is not None:
        self._default_class_agnostic_nms = bool(class_agnostic_nms)

PPYoloEDecodingModule

Bases: AbstractObjectDetectionDecodingModule

Decoding module for the PPYoloE model. This module is used only to export the model to ONNX/TensorRT and is not used during training.

Takes in the output of the model and returns the decoded boxes in the format Tuple[Tensor, Tensor]:
* boxes [batch_size, number_boxes, 4], boxes are in format (x1, y1, x2, y2)
* scores [batch_size, number_boxes, number_classes]

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
class PPYoloEDecodingModule(AbstractObjectDetectionDecodingModule):
    """
    Decoding module for PPYoloE model. This module used only to export model to ONNX/TensorRT and is not used during training.

    Takes in the output of the model and returns the decoded boxes in the format Tuple[Tensor, Tensor]
    * boxes [batch_size, number_boxes, 4], boxes are in format (x1, y1, x2, y2)
    * scores [batch_size, number_boxes, number_classes]
    """

    __constants__ = ["num_pre_nms_predictions"]

    def __init__(
        self,
        num_pre_nms_predictions: int = 1000,
    ):
        """
        :param num_pre_nms_predictions: Number of predictions to keep before NMS. This is mainly to reject
        low-confidence predictions and thus reduce the number of boxes to process in NMS.

        """
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]) -> Tuple[Tensor, Tensor]:
        """

        :param inputs: Tuple [Tensor, Tensor]
            * boxes [B, N, 4], boxes are in format (x1, y1, x2, y2)
            * scores [B, N, C]
        :return:
            * boxes [B, Nout, 4], boxes are in format (x1, y1, x2, y2)
            * scores [B, Nout, C]
        """
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = inputs
        else:
            pred_bboxes, pred_scores = inputs[0]

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_scores.size()

        pred_cls_conf, _ = torch.max(pred_scores, dim=2)
        topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
        flat_indices = torch.flatten(indices_with_offset)

        output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
        output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

        return output_pred_bboxes, output_pred_scores

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, predictions: Any) -> int:
        """

        :param inputs:
        :return:
        """
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = predictions
        else:
            pred_bboxes, pred_scores = predictions[0]

        return pred_bboxes.size(1)
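
The batched top-k selection in forward flattens per-image indices by adding an offset of num_anchors per batch element before indexing. A small self-contained sketch of that gather, with illustrative shapes only:

import torch

# 2 images, 8 anchors, 3 classes; keep the 4 highest-scoring anchors per image.
pred_bboxes = torch.rand(2, 8, 4)
pred_scores = torch.rand(2, 8, 3)
k = 4

best_cls_conf, _ = pred_scores.max(dim=2)             # [2, 8] best class score per anchor
topk = torch.topk(best_cls_conf, k=k, dim=1)          # per-image indices of the top anchors
offsets = 8 * torch.arange(2).reshape(2, 1)           # shift indices of image i by i * num_anchors
flat = (topk.indices + offsets).flatten()

kept_boxes = pred_bboxes.reshape(-1, 4)[flat].reshape(2, k, 4)
kept_scores = pred_scores.reshape(-1, 3)[flat].reshape(2, k, 3)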

__init__(num_pre_nms_predictions=1000)

Parameters:

Name Type Description Default
num_pre_nms_predictions int

Number of predictions to keep before NMS. This is mainly to reject low-confidence predictions and thus reduce the number of boxes to process in NMS.

1000
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def __init__(
    self,
    num_pre_nms_predictions: int = 1000,
):
    """
    :param num_pre_nms_predictions: Number of predictions to keep before NMS. This is mainly to reject
    low-confidence predictions and thus reduce the number of boxes to process in NMS.

    """
    super().__init__()
    self.num_pre_nms_predictions = num_pre_nms_predictions

forward(inputs)

Parameters:

Name Type Description Default
inputs Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]

Tuple [Tensor, Tensor]

* boxes [B, N, 4], boxes are in format (x1, y1, x2, y2)
* scores [B, N, C]

required

Returns:

Type Description
Tuple[Tensor, Tensor]
  • boxes [B, Nout, 4], boxes are in format (x1, y1, x2, y2)
  • scores [B, Nout, C]
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]) -> Tuple[Tensor, Tensor]:
    """

    :param inputs: Tuple [Tensor, Tensor]
        * boxes [B, N, 4], boxes are in format (x1, y1, x2, y2)
        * scores [B, N, C]
    :return:
        * boxes [B, Nout, 4], boxes are in format (x1, y1, x2, y2)
        * scores [B, Nout, C]
    """
    if torch.jit.is_tracing():
        pred_bboxes, pred_scores = inputs
    else:
        pred_bboxes, pred_scores = inputs[0]

    nms_top_k = self.num_pre_nms_predictions
    batch_size, num_anchors, _ = pred_scores.size()

    pred_cls_conf, _ = torch.max(pred_scores, dim=2)
    topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

    offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
    indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
    flat_indices = torch.flatten(indices_with_offset)

    output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
    output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

    return output_pred_bboxes, output_pred_scores

infer_total_number_of_predictions(predictions)

Parameters:

Name Type Description Default
predictions Any

Model output (decoded boxes and scores), in the same format accepted by forward.

required

Returns:

Type Description
int
Number of predictions per image.
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
@torch.jit.ignore
def infer_total_number_of_predictions(self, predictions: Any) -> int:
    """

    :param inputs:
    :return:
    """
    if torch.jit.is_tracing():
        pred_bboxes, pred_scores = predictions
    else:
        pred_bboxes, pred_scores = predictions[0]

    return pred_bboxes.size(1)

PPYOLOEHead

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
class PPYOLOEHead(nn.Module):
    @resolve_param("activation", ActivationsTypeFactory())
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        activation: Type[nn.Module] = nn.SiLU,
        fpn_strides: Tuple[int, int, int] = (32, 16, 8),
        grid_cell_scale=5.0,
        grid_cell_offset=0.5,
        reg_max=16,
        eval_size: Tuple[int, int] = None,
        width_mult: float = 1.0,
    ):
        """

        :param num_classes: Number of classes to predict.
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param activation: Type of the activation used in module
        :param fpn_strides: Output strides of the feature maps from the neck
        :param grid_cell_scale: Anchor (grid cell) size, expressed as a multiple of the stride.
        :param grid_cell_offset: Offset of the anchor centers within a grid cell; the range is between 0 and 1.
        :param reg_max: Upper bound of the per-side regression range; the head predicts a distribution over reg_max + 1 bins per box side.
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to each entry of in_channels.
        """
        super(PPYOLOEHead, self).__init__()
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.fpn_strides = tuple(fpn_strides)
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size

        # stem
        self.stem_cls = nn.ModuleList()
        self.stem_reg = nn.ModuleList()

        for in_c in self.in_channels:
            self.stem_cls.append(ESEAttn(in_c, activation_type=activation))
            self.stem_reg.append(ESEAttn(in_c, activation_type=activation))
        # pred head
        self.pred_cls = nn.ModuleList()
        self.pred_reg = nn.ModuleList()
        for in_c in self.in_channels:
            self.pred_cls.append(nn.Conv2d(in_c, self.num_classes, 3, padding=1))
            self.pred_reg.append(nn.Conv2d(in_c, 4 * (self.reg_max + 1), 3, padding=1))

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

    @torch.jit.ignore
    def cache_anchors(self, input_size: Tuple[int, int]):
        self.eval_size = list(input_size)[-2:]
        device = infer_model_device(self.pred_cls)
        dtype = infer_model_dtype(self.pred_cls)
        anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
        self.register_buffer("anchor_points", anchor_points, persistent=False)
        self.register_buffer("stride_tensor", stride_tensor, persistent=False)

    @torch.jit.ignore
    def _init_weights(self):
        bias_cls = bias_init_with_prob(0.01)
        for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
            torch.nn.init.constant_(cls_.weight, 0.0)
            torch.nn.init.constant_(cls_.bias, bias_cls)
            torch.nn.init.constant_(reg_.weight, 0.0)
            torch.nn.init.constant_(reg_.bias, 1.0)

        if self.eval_size:
            device = infer_model_device(self.pred_cls)
            dtype = infer_model_dtype(self.pred_cls)

            anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    @torch.jit.ignore
    def replace_num_classes(self, num_classes: int):
        bias_cls = bias_init_with_prob(0.01)
        device = self.pred_cls[0].weight.device
        self.pred_cls = nn.ModuleList()
        self.num_classes = num_classes

        for in_c in self.in_channels:
            predict_layer = nn.Conv2d(in_c, num_classes, 3, padding=1, device=device)
            torch.nn.init.constant_(predict_layer.weight, 0.0)
            torch.nn.init.constant_(predict_layer.bias, bias_cls)
            self.pred_cls.append(predict_layer)

    @torch.jit.ignore
    def forward_train(self, feats: Tuple[Tensor, ...]):
        anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
            feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
        )

        cls_score_list, reg_distri_list = [], []
        for i, feat in enumerate(feats):
            avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
            # cls and reg
            # Note we don't apply sigmoid on class predictions to ensure good numerical stability at loss computation
            cls_score_list.append(torch.permute(cls_logit.flatten(2), [0, 2, 1]))
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))
        cls_score_list = torch.cat(cls_score_list, dim=1)
        reg_distri_list = torch.cat(reg_distri_list, dim=1)

        return cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor

    def forward_eval(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:

        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        decoded_predictions = pred_bboxes, pred_scores

        if torch.jit.is_tracing():
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    def _generate_anchors(self, feats=None, dtype=None, device=None):
        # just use in eval time
        anchor_points = []
        stride_tensor = []

        dtype = dtype or feats[0].dtype
        device = device or feats[0].device

        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)

            # ONNX export does not support arange with float16, so the tensor is created as fp32 and then cast to fp16.
            # This produces correct fp16 weights in the ONNX model when exported.
            shift_x = torch.arange(end=w, dtype=torch.float32, device=device) + self.grid_cell_offset
            shift_y = torch.arange(end=h, dtype=torch.float32, device=device) + self.grid_cell_offset

            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype, device=device))

        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)
        return anchor_points, stride_tensor

    def forward(self, feats: Tuple[Tensor]):
        if self.training:
            return self.forward_train(feats)
        else:
            return self.forward_eval(feats)
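
The proj_conv buffer above implements the distribution-to-distance step used in forward_eval: the 4 * (reg_max + 1) regression channels are softmaxed over the reg_max + 1 bins of each box side and convolved with the values 0..reg_max, which amounts to taking the expected bin index. A toy restatement for a single box side (illustrative values only):

import torch

reg_max = 16
logits = torch.randn(reg_max + 1)                  # raw bin logits for one box side
probs = torch.softmax(logits, dim=0)               # distribution over bins 0..reg_max
bins = torch.linspace(0, reg_max, reg_max + 1)     # same values as the proj_conv buffer
expected_distance = (probs * bins).sum()           # distance in stride units
# forward_eval then converts distances to boxes around the anchor points and multiplies by stride_tensor.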

__init__(num_classes, in_channels, activation=nn.SiLU, fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, eval_size=None, width_mult=1.0)

Parameters:

Name Type Description Default
num_classes int required
in_channels Tuple[int, int, int]

Number of channels for each feature map (See width_mult)

required
activation Type[nn.Module]

Type of the activation used in module

nn.SiLU
fpn_strides Tuple[int, int, int]

Output strides of the feature maps from the neck

(32, 16, 8)
grid_cell_scale 5.0
grid_cell_offset 0.5
reg_max 16
eval_size Tuple[int, int]

(rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed, since anchors will not be regenerated for each forward call.

None
width_mult float

A scaling factor applied to each entry of in_channels.

1.0
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
@resolve_param("activation", ActivationsTypeFactory())
def __init__(
    self,
    num_classes: int,
    in_channels: Tuple[int, int, int],
    activation: Type[nn.Module] = nn.SiLU,
    fpn_strides: Tuple[int, int, int] = (32, 16, 8),
    grid_cell_scale=5.0,
    grid_cell_offset=0.5,
    reg_max=16,
    eval_size: Tuple[int, int] = None,
    width_mult: float = 1.0,
):
    """

    :param num_classes: Number of classes to predict.
    :param in_channels: Number of channels for each feature map (See width_mult)
    :param activation: Type of the activation used in module
    :param fpn_strides: Output strides of the feature maps from the neck
    :param grid_cell_scale: Anchor (grid cell) size, expressed as a multiple of the stride.
    :param grid_cell_offset: Offset of the anchor centers within a grid cell; the range is between 0 and 1.
    :param reg_max: Upper bound of the per-side regression range; the head predicts a distribution over reg_max + 1 bins per box side.
    :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
           since anchors will not be regenerated for each forward call.
    :param width_mult: A scaling factor applied to each entry of in_channels.
    """
    super(PPYOLOEHead, self).__init__()
    in_channels = [max(round(c * width_mult), 1) for c in in_channels]

    self.in_channels = tuple(in_channels)
    self.num_classes = num_classes
    self.fpn_strides = tuple(fpn_strides)
    self.grid_cell_scale = grid_cell_scale
    self.grid_cell_offset = grid_cell_offset
    self.reg_max = reg_max
    self.eval_size = eval_size

    # stem
    self.stem_cls = nn.ModuleList()
    self.stem_reg = nn.ModuleList()

    for in_c in self.in_channels:
        self.stem_cls.append(ESEAttn(in_c, activation_type=activation))
        self.stem_reg.append(ESEAttn(in_c, activation_type=activation))
    # pred head
    self.pred_cls = nn.ModuleList()
    self.pred_reg = nn.ModuleList()
    for in_c in self.in_channels:
        self.pred_cls.append(nn.Conv2d(in_c, self.num_classes, 3, padding=1))
        self.pred_reg.append(nn.Conv2d(in_c, 4 * (self.reg_max + 1), 3, padding=1))

    # Do not apply quantization to this tensor
    proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
    self.register_buffer("proj_conv", proj, persistent=False)

    self._init_weights()

bias_init_with_prob(prior_prob=0.01)

initialize conv/fc bias value according to a given probability value.

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
def bias_init_with_prob(prior_prob=0.01):
    """initialize conv/fc bias value according to a given probability value."""
    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
    return bias_init
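
In other words, the returned value is the logit of prior_prob, so a sigmoid over the freshly initialized logits starts out near the desired prior. A quick numerical check (illustrative only):

import numpy as np

prior_prob = 0.01
bias = float(-np.log((1 - prior_prob) / prior_prob))
print(1.0 / (1.0 + np.exp(-bias)))  # ~0.01, i.e. sigmoid(bias) recovers prior_prob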

generate_anchors_for_grid_cell(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0.5, dtype=torch.float)

Like ATSS, generate anchors based on grid size.

Parameters:

Name Type Description Default
feats Tuple[Tensor, ...]

shape[s, (b, c, h, w)]

required
fpn_strides Tuple[int, ...]

shape[s], stride for each scale feature

required
grid_cell_size float

anchor size

5.0
grid_cell_offset float

The range is between 0 and 1.

0.5
dtype torch.dtype

Type of the anchors.

torch.float

Returns:

Type Description
Tuple[Tensor, Tensor, List[int], Tensor]
  • anchors: shape[l, 4], "xmin, ymin, xmax, ymax" format.
  • anchor_points: shape[l, 2], "x, y" format.
  • num_anchors_list: shape[s], contains [s_1, s_2, ...].
  • stride_tensor: shape[l, 1], contains the stride for each scale.
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
@torch.no_grad()
def generate_anchors_for_grid_cell(
    feats: Tuple[Tensor, ...],
    fpn_strides: Tuple[int, ...],
    grid_cell_size: float = 5.0,
    grid_cell_offset: float = 0.5,
    dtype: torch.dtype = torch.float,
) -> Tuple[Tensor, Tensor, List[int], Tensor]:
    """
    Like ATSS, generate anchors based on grid size.

    :param feats: shape[s, (b, c, h, w)]
    :param fpn_strides: shape[s], stride for each scale feature
    :param grid_cell_size: anchor size
    :param grid_cell_offset: The range is between 0 and 1.
    :param dtype: Type of the anchors.

    :return:
        - anchors: shape[l, 4], "xmin, ymin, xmax, ymax" format.
        - anchor_points: shape[l, 2], "x, y" format.
        - num_anchors_list: shape[s], contains [s_1, s_2, ...].
        - stride_tensor: shape[l, 1], contains the stride for each scale.
    """
    assert len(feats) == len(fpn_strides)
    device = feats[0].device
    anchors = []
    anchor_points = []
    num_anchors_list = []
    stride_tensor = []
    for feat, stride in zip(feats, fpn_strides):
        _, _, h, w = feat.shape
        cell_half_size = grid_cell_size * stride * 0.5
        shift_x = (torch.arange(end=w) + grid_cell_offset) * stride
        shift_y = (torch.arange(end=h) + grid_cell_offset) * stride

        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
        else:
            shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

        anchor = torch.stack(
            [shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size],
            dim=-1,
        ).to(dtype=dtype)
        anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)

        anchors.append(anchor.reshape([-1, 4]))
        anchor_points.append(anchor_point.reshape([-1, 2]))
        num_anchors_list.append(len(anchors[-1]))
        stride_tensor.append(torch.full([num_anchors_list[-1], 1], stride, dtype=dtype))

    anchors = torch.cat(anchors).to(device)
    anchor_points = torch.cat(anchor_points).to(device)
    stride_tensor = torch.cat(stride_tensor).to(device)
    return anchors, anchor_points, num_anchors_list, stride_tensor
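
A minimal usage sketch with dummy feature maps (shapes and strides below are illustrative; in practice the inputs come from the neck, and the function is imported from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head):

import torch

# Three dummy FPN levels for a 256x256 input with strides (8, 16, 32).
feats = (torch.rand(1, 64, 32, 32), torch.rand(1, 128, 16, 16), torch.rand(1, 256, 8, 8))
anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(feats, fpn_strides=(8, 16, 32))
print(num_anchors_list)   # [1024, 256, 64]
print(anchors.shape)      # torch.Size([1344, 4])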

AbstractYoloBackbone

Bases: SupportsReplaceInputChannels

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class AbstractYoloBackbone(SupportsReplaceInputChannels):
    def __init__(self, arch_params):
        # CREATE A LIST CONTAINING THE LAYERS TO EXTRACT FROM THE BACKBONE AND ADD THE FINAL LAYER
        self._layer_idx_to_extract = [idx for sub_l in arch_params.skip_connections_dict.values() for idx in sub_l]
        self._layer_idx_to_extract.append(len(self._modules_list) - 1)

    def forward(self, x):
        """:return A list, the length of self._modules_list containing the output of the layer if specified in
        self._layers_to_extract and None otherwise"""
        extracted_intermediate_layers = []
        for layer_idx, layer_module in enumerate(self._modules_list):
            # PREDICT THE NEXT LAYER'S OUTPUT
            x = layer_module(x)
            # IF INDICATED APPEND THE OUTPUT TO extracted_intermediate_layers O.W. APPEND None
            if layer_idx in self._layer_idx_to_extract:
                extracted_intermediate_layers.append(x)
            else:
                extracted_intermediate_layers.append(None)

        return extracted_intermediate_layers

forward(x)

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def forward(self, x):
    """:return A list, the length of self._modules_list containing the output of the layer if specified in
    self._layers_to_extract and None otherwise"""
    extracted_intermediate_layers = []
    for layer_idx, layer_module in enumerate(self._modules_list):
        # PREDICT THE NEXT LAYER'S OUTPUT
        x = layer_module(x)
        # IF INDICATED APPEND THE OUTPUT TO extracted_intermediate_layers O.W. APPEND None
        if layer_idx in self._layer_idx_to_extract:
            extracted_intermediate_layers.append(x)
        else:
            extracted_intermediate_layers.append(None)

    return extracted_intermediate_layers

Concat

Bases: nn.Module

CONCATENATE A LIST OF TENSORS ALONG DIMENSION

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class Concat(nn.Module):
    """CONCATENATE A LIST OF TENSORS ALONG DIMENSION"""

    def __init__(self, dimension=1):
        super().__init__()
        self.dimension = dimension

    def forward(self, x):
        return torch.cat(x, self.dimension)
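
A trivial usage sketch, concatenating two feature maps along the channel dimension:

import torch

cat = Concat(dimension=1)
a, b = torch.rand(2, 8, 16, 16), torch.rand(2, 24, 16, 16)
print(cat([a, b]).shape)  # torch.Size([2, 32, 16, 16])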

DetectX

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class DetectX(nn.Module):
    def __init__(
        self,
        num_classes: int,
        stride: np.ndarray,
        activation_func_type: type,
        channels: list,
        depthwise=False,
        groups: int = None,
        inter_channels: Union[int, List] = None,
    ):
        """
        :param stride:          strides of each predicting level
        :param channels:        input channels into all detecting layers
                                (from all neck layers that will be used for predicting)
        :param depthwise:       defines conv type in classification and regression branches (Conv or GroupedConvBlock)
                                depthwise is False by default in favor of a usual Conv
        :param groups:          num groups in convs in classification and regression branches;
                                if None default groups will be used according to conv type
                                (1 for Conv and depthwise for GroupedConvBlock)
        :param inter_channels:  channels in classification and regression branches;
                                if None channels[0] will be used by default
        """
        super().__init__()

        self.num_classes = num_classes
        self.detection_layers_num = len(channels)
        self.n_anchors = 1
        self.grid = [torch.zeros(1)] * self.detection_layers_num  # init grid

        if torch.is_tensor(stride):
            stride = stride.clone().detach()
        else:
            stride = torch.tensor(stride)

        self.register_buffer("stride", stride, persistent=False)

        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.obj_preds = nn.ModuleList()
        self.stems = nn.ModuleList()

        ConvBlock = GroupedConvBlock if depthwise else Conv

        inter_channels = inter_channels or channels[0]
        inter_channels = inter_channels if isinstance(inter_channels, list) else [inter_channels] * self.detection_layers_num
        for i in range(self.detection_layers_num):
            self.stems.append(Conv(channels[i], inter_channels[i], 1, 1, activation_func_type))

            self.cls_convs.append(
                nn.Sequential(
                    *[
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ]
                )
            )
            self.reg_convs.append(
                nn.Sequential(
                    *[
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ]
                )
            )

            self.cls_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * self.num_classes, 1, 1, 0))
            self.reg_preds.append(nn.Conv2d(inter_channels[i], 4, 1, 1, 0))
            self.obj_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * 1, 1, 1, 0))

    def forward(self, inputs):
        outputs = []
        outputs_logits = []
        for i in range(self.detection_layers_num):
            x = self.stems[i](inputs[i])

            cls_feat = self.cls_convs[i](x)
            cls_output = self.cls_preds[i](cls_feat)

            reg_feat = self.reg_convs[i](x)
            reg_output = self.reg_preds[i](reg_feat)
            obj_output = self.obj_preds[i](reg_feat)

            bs, _, ny, nx = reg_feat.shape
            output = torch.cat([reg_output, obj_output, cls_output], 1)
            output = output.view(bs, self.n_anchors, -1, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            if not self.training:
                outputs_logits.append(output.clone())
                if self.grid[i].shape[2:4] != output.shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny, dtype=reg_output.dtype, device=output.device)

                xy = (output[..., :2] + self.grid[i].to(output.device)) * self.stride[i]
                wh = torch.exp(output[..., 2:4]) * self.stride[i]
                output = torch.cat([xy, wh, output[..., 4:].sigmoid()], dim=4)
                output = output.view(bs, -1, output.shape[-1])

            outputs.append(output)

        return outputs if self.training else (torch.cat(outputs, 1), outputs_logits)

    @staticmethod
    def _make_grid(nx: int, ny: int, dtype: torch.dtype, device: torch.device):
        y, x = torch.arange(ny, dtype=torch.float32, device=device), torch.arange(nx, dtype=torch.float32, device=device)

        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            yv, xv = torch.meshgrid([y, x], indexing="ij")
        else:
            yv, xv = torch.meshgrid([y, x])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).to(dtype)
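
At inference time the raw regression output of each cell is decoded on the fly: the x, y offsets are added to the cell grid and scaled by the level stride, while width and height go through an exponential before the same scaling. A toy restatement for a single cell (illustrative numbers only):

import torch

stride = 8.0
grid_xy = torch.tensor([3.0, 5.0])          # cell coordinates on the feature map
raw = torch.tensor([0.4, -0.2, 0.1, 0.3])   # raw (x, y, w, h) regression output for that cell

xy = (raw[:2] + grid_xy) * stride           # box center in input-image pixels
wh = torch.exp(raw[2:]) * stride            # box width/height in input-image pixels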

__init__(num_classes, stride, activation_func_type, channels, depthwise=False, groups=None, inter_channels=None)

Parameters:

Name Type Description Default
stride np.ndarray

strides of each predicting level

required
channels list

input channels into all detecting layers (from all neck layers that will be used for predicting)

required
depthwise

defines conv type in classification and regression branches (Conv or GroupedConvBlock) depthwise is False by default in favor of a usual Conv

False
groups int

num groups in convs in classification and regression branches; if None default groups will be used according to conv type (1 for Conv and depthwise for GroupedConvBlock)

None
inter_channels Union[int, List]

channels in classification and regression branches; if None channels[0] will be used by default

None
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def __init__(
    self,
    num_classes: int,
    stride: np.ndarray,
    activation_func_type: type,
    channels: list,
    depthwise=False,
    groups: int = None,
    inter_channels: Union[int, List] = None,
):
    """
    :param stride:          strides of each predicting level
    :param channels:        input channels into all detecting layers
                            (from all neck layers that will be used for predicting)
    :param depthwise:       defines conv type in classification and regression branches (Conv or GroupedConvBlock)
                            depthwise is False by default in favor of a usual Conv
    :param groups:          num groups in convs in classification and regression branches;
                            if None default groups will be used according to conv type
                            (1 for Conv and depthwise for GroupedConvBlock)
    :param inter_channels:  channels in classification and regression branches;
                            if None channels[0] will be used by default
    """
    super().__init__()

    self.num_classes = num_classes
    self.detection_layers_num = len(channels)
    self.n_anchors = 1
    self.grid = [torch.zeros(1)] * self.detection_layers_num  # init grid

    if torch.is_tensor(stride):
        stride = stride.clone().detach()
    else:
        stride = torch.tensor(stride)

    self.register_buffer("stride", stride, persistent=False)

    self.cls_convs = nn.ModuleList()
    self.reg_convs = nn.ModuleList()
    self.cls_preds = nn.ModuleList()
    self.reg_preds = nn.ModuleList()
    self.obj_preds = nn.ModuleList()
    self.stems = nn.ModuleList()

    ConvBlock = GroupedConvBlock if depthwise else Conv

    inter_channels = inter_channels or channels[0]
    inter_channels = inter_channels if isinstance(inter_channels, list) else [inter_channels] * self.detection_layers_num
    for i in range(self.detection_layers_num):
        self.stems.append(Conv(channels[i], inter_channels[i], 1, 1, activation_func_type))

        self.cls_convs.append(
            nn.Sequential(
                *[
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                ]
            )
        )
        self.reg_convs.append(
            nn.Sequential(
                *[
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                ]
            )
        )

        self.cls_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * self.num_classes, 1, 1, 0))
        self.reg_preds.append(nn.Conv2d(inter_channels[i], 4, 1, 1, 0))
        self.obj_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * 1, 1, 1, 0))

YoloBase

Bases: SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloBase(SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck):
    def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True):
        super().__init__()
        # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params
        self.arch_params = HpmStruct(**DEFAULT_YOLO_ARCH_PARAMS)
        # FIXME: REMOVE anchors ATTRIBUTE, WHICH HAS NO MEANING OTHER THAN COMPATIBILITY.
        self.arch_params.anchors = COCO_DETECTION_80_CLASSES_BBOX_ANCHORS
        self.arch_params.override(**arch_params.to_dict())
        self.arch_params.skip_connections_dict = {k: v for k, v in self.arch_params.skip_connections_list}
        self.in_channels = 3

        self.num_classes = self.arch_params.num_classes

        # THE MODEL'S MODULES
        self._backbone = backbone(arch_params=self.arch_params)
        if hasattr(self._backbone, "backbone_connection_channels"):
            self.arch_params.scaled_backbone_width = False
            self.arch_params.backbone_connection_channels = self._backbone.backbone_connection_channels

        self._nms = nn.Identity()

        # A FLAG TO DEFINE augment_forward IN INFERENCE
        self.augmented_inference = False

        if initialize_module:
            self._head = YoloHead(self.arch_params)
            self._initialize_module()

        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        self._default_nms_iou: Optional[float] = None
        self._default_nms_conf: Optional[float] = None
        self.register_buffer("strides", torch.tensor(self.arch_params.anchors.stride), persistent=False)

    @staticmethod
    def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback:
        return YoloXPostPredictionCallback(conf=conf, iou=iou)

    def get_processing_params(self) -> Optional[Processing]:
        return self._image_processor

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        if processing is None:
            raise ModelHasNoPreprocessingParamsException()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param iou:             (Optional) IoU threshold for the nms algorithm
        :param conf:            (Optional) Below the confidence threshold, predictions are discarded
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor
        self._default_nms_iou = iou or self._default_nms_iou
        self._default_nms_conf = conf or self._default_nms_conf

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True
    ) -> DetectionPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:     (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:    If True, use mixed precision for inference.
        """
        if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = iou or self._default_nms_iou
        conf = conf or self._default_nms_conf

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = DetectionPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf),
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> ImagesDetectionPrediction:
        """Predict an image or a list of images.

        :param images:      Images to predict.
        :param iou:         (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:        (Optional) Below the confidence threshold, predictions are discarded.
                            If None, the default value associated to the training is used.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:        If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16=True):
        """Predict using webcam.

        :param iou:     (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:    If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)

    def forward(self, x):
        out = self._backbone(x)
        out = self._head(out)
        # THIS HAS NO EFFECT IF add_nms() WAS NOT DONE
        out = self._nms(out)
        return out

    def load_state_dict(self, state_dict, strict=True):
        try:
            keys_dropped_in_sg_320 = {
                "stride",
                "_head.anchors._stride",
                "_head.anchors._anchors",
                "_head.anchors._anchor_grid",
                "_head._modules_list.14.stride",
            }
            state_dict = collections.OrderedDict([(k, v) for k, v in state_dict.items() if k not in keys_dropped_in_sg_320])

            super().load_state_dict(state_dict, strict)
        except RuntimeError as e:
            raise RuntimeError(
                f"Got exception {e}, if a mismatch between expected and given state_dict keys exist, "
                f"checkpoint may have been saved after fusing conv and bn. use fuse_conv_bn before loading."
            )

    def _initialize_module(self):
        self._check_strides()
        self._initialize_biases()
        self._initialize_weights()
        if self.arch_params.add_nms:
            self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou)

    def _check_strides(self):
        m = self._head._modules_list[-1]  # DetectX()
        # Do inference in train mode on a dummy image to get output stride of each head output layer
        s = 128  # twice the minimum acceptable image size
        device = infer_model_device(m)
        dtype = infer_model_dtype(m)

        dummy_input = torch.zeros((1, self.arch_params.channels_in, s, s), device=device, dtype=dtype)
        dummy_input = dummy_input.to(next(self._backbone.parameters()).device)

        stride = torch.tensor([s / x.shape[-2] for x in self.forward(dummy_input)])
        stride = stride.to(m.stride.device)
        if not torch.equal(m.stride, stride):
            raise RuntimeError("Provided anchor strides do not match the model strides")

    def _initialize_biases(self):
        """initialize biases into DetectX()"""
        detect_module = self._head._modules_list[-1]  # DetectX() module
        prior_prob = 1e-2
        for conv in detect_module.cls_preds:
            bias = conv.bias.view(detect_module.n_anchors, -1)
            bias.data.fill_(-math.log((1 - prior_prob) / prior_prob))
            conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)

        for conv in detect_module.obj_preds:
            bias = conv.bias.view(detect_module.n_anchors, -1)
            bias.data.fill_(-math.log((1 - prior_prob) / prior_prob))
            conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eps = 1e-3
                m.momentum = 0.03
            elif isinstance(m, (nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.Hardswish, nn.SiLU)):
                m.inplace = True

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        A method for preparing the Yolo model for conversion to other frameworks (ONNX, CoreML etc)
        :param input_size: expected input size
        :return:
        """
        assert not self.training, "model has to be in eval mode to be converted"

        # Verify dummy_input from converter is of multiple of the grid size
        max_stride = int(max(self.strides))

        # Validate the image size
        image_dims = input_size[-2:]  # assume torch uses channels first layout
        for dim in image_dims:
            res_flag, suggestion = check_img_size_divisibility(dim, max_stride)
            if not res_flag:
                raise ValueError(
                    f"Invalid input size: {input_size}. The input size must be multiple of max stride: "
                    f"{max_stride}. The closest suggestions are: {suggestion[0]}x{suggestion[0]} or "
                    f"{suggestion[1]}x{suggestion[1]}"
                )

    def get_include_attributes(self) -> list:
        return ["grid", "anchors", "anchors_grid"]

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self._head = new_head
        else:
            self.arch_params.num_classes = new_num_classes
            self.num_classes = new_num_classes
            old_detectx = self._head._modules_list[-1]
            _, block, activation_type, width_mult, depth_mult = get_yolo_type_params(
                self.arch_params.yolo_type, self.arch_params.width_mult_factor, self.arch_params.depth_mult_factor
            )

            new_last_layer = DetectX(
                num_classes=new_num_classes,
                stride=self.strides,
                activation_func_type=activation_type,
                channels=[width_mult(v) for v in (256, 512, 1024)],
                depthwise=isinstance(old_detectx.cls_convs[0][0], GroupedConvBlock),
                groups=self.arch_params.xhead_groups,
                inter_channels=self.arch_params.xhead_inter_channels,
            )
            new_last_layer = new_last_layer.to(next(self.parameters()).device)
            self._head._modules_list[-1] = new_last_layer
            self._check_strides()
            self._initialize_biases()
            self._initialize_weights()

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"_head": lr, "default": 0}

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractObjectDetectionDecodingModule:
        return YoloXDecodingModule(num_pre_nms_predictions=num_pre_nms_predictions, **kwargs)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        if isinstance(self._backbone, SupportsReplaceInputChannels):
            self._backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
            self.in_channels = self.get_input_channels()
        else:
            raise NotImplementedError(f"`{self._backbone.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        if isinstance(self._backbone, SupportsReplaceInputChannels):
            return self._backbone.get_input_channels()
        else:
            raise NotImplementedError(f"`{self._backbone.__class__.__name__}` does not support `get_input_channels`")

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32
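
A hedged end-to-end sketch of the predict flow. It assumes a pretrained checkpoint whose processing parameters were stored at training time (otherwise call set_dataset_processing_params first); the image path is hypothetical:

from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.YOLOX_N, pretrained_weights="coco")
model.eval()

# iou/conf override the stored defaults for this call only.
predictions = model.predict("path/to/image.jpg", iou=0.65, conf=0.4, fuse_model=True)
predictions.show()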

predict(images, iou=None, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> ImagesDetectionPrediction:
    """Predict an image or a list of images.

    :param images:      Images to predict.
    :param iou:         (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:        (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:        If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(iou=None, conf=None, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16=True):
    """Predict using webcam.

    :param iou:     (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                    If None, the default value associated to the training is used.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:    If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    pipeline.predict_webcam()

prep_model_for_conversion(input_size=None, **kwargs)

A method for preparing the Yolo model for conversion to other frameworks (ONNX, CoreML etc)

Parameters:

Name Type Description Default
input_size Union[tuple, list]

expected input size

None

Returns:

Type Description
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    A method for preparing the Yolo model for conversion to other frameworks (ONNX, CoreML etc)
    :param input_size: expected input size
    :return:
    """
    assert not self.training, "model has to be in eval mode to be converted"

    # Verify dummy_input from converter is of multiple of the grid size
    max_stride = int(max(self.strides))

    # Validate the image size
    image_dims = input_size[-2:]  # assume torch uses channels first layout
    for dim in image_dims:
        res_flag, suggestion = check_img_size_divisibility(dim, max_stride)
        if not res_flag:
            raise ValueError(
                f"Invalid input size: {input_size}. The input size must be multiple of max stride: "
                f"{max_stride}. The closest suggestions are: {suggestion[0]}x{suggestion[0]} or "
                f"{suggestion[1]}x{suggestion[1]}"
            )
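
For example, with a maximum stride of 32 (typical for these YOLO variants), 640x640 passes the divisibility check while 650x650 raises a ValueError listing the nearest valid sizes. A hedged sketch, assuming `model` is an already-built Yolo detection model:

model.eval()  # the method asserts the model is not in training mode
model.prep_model_for_conversion(input_size=(1, 3, 640, 640))  # H and W are taken from the last two dims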

set_dataset_processing_params(class_names=None, image_processor=None, iou=None, conf=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
iou Optional[float]

(Optional) IoU threshold for the nms algorithm

None
conf Optional[float]

(Optional) Predictions below this confidence threshold are discarded

None
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param iou:             (Optional) IoU threshold for the nms algorithm
    :param conf:            (Optional) Predictions below this confidence threshold are discarded
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor
    self._default_nms_iou = iou or self._default_nms_iou
    self._default_nms_conf = conf or self._default_nms_conf
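
A hedged example of overriding the defaults used later by `predict`; the class names and thresholds below are purely illustrative:

model.set_dataset_processing_params(
    class_names=["person", "car"],  # hypothetical two-class dataset
    iou=0.5,                        # default NMS IoU threshold used by predict()
    conf=0.25,                      # default confidence threshold used by predict()
)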

YoloDarknetBackbone

Bases: AbstractYoloBackbone, CSPDarknet53

Implements the CSP_Darknet53 module and inherits the forward pass to extract the layers indicated in arch_params

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloDarknetBackbone(AbstractYoloBackbone, CSPDarknet53):
    """Implements the CSP_Darknet53 module and inherits the forward pass to extract the layers indicated in arch_params"""

    def __init__(self, arch_params):
        arch_params.backbone_mode = True
        CSPDarknet53.__init__(self, arch_params)
        AbstractYoloBackbone.__init__(self, arch_params)

    def forward(self, x):
        return AbstractYoloBackbone.forward(self, x)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        CSPDarknet53.replace_input_channels(self, in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return CSPDarknet53.get_input_channels(self)
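
A hedged sketch of adapting the backbone to a different number of input channels (e.g. single-channel grayscale); `backbone` stands for an already-constructed YoloDarknetBackbone instance:

# compute_new_weights_fn is optional; when omitted, the library's default weight-adaptation strategy is assumed to apply.
backbone.replace_input_channels(in_channels=1)
assert backbone.get_input_channels() == 1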

YoloHead

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloHead(nn.Module):
    def __init__(self, arch_params):
        super().__init__()
        # PARSE arch_params
        num_classes = arch_params.num_classes
        anchors = arch_params.anchors
        depthwise = arch_params.depthwise
        xhead_groups = arch_params.xhead_groups
        xhead_inter_channels = arch_params.xhead_inter_channels

        self._skip_connections_dict = arch_params.skip_connections_dict
        # FLATTEN THE SOURCE LIST INTO A LIST OF INDICES
        self._layer_idx_to_extract = [idx for sub_l in self._skip_connections_dict.values() for idx in sub_l]

        _, block, activation_type, width_mult, depth_mult = get_yolo_type_params(
            arch_params.yolo_type, arch_params.width_mult_factor, arch_params.depth_mult_factor
        )

        backbone_connector = [width_mult(c) if arch_params.scaled_backbone_width else c for c in arch_params.backbone_connection_channels]

        DownConv = GroupedConvBlock if depthwise else Conv

        self._modules_list = nn.ModuleList()
        self._modules_list.append(Conv(backbone_connector[0], width_mult(512), 1, 1, activation_type))  # 10
        self._modules_list.append(nn.Upsample(None, 2, "nearest"))  # 11
        self._modules_list.append(Concat(1))  # 12
        self._modules_list.append(block(backbone_connector[1] + width_mult(512), width_mult(512), depth_mult(3), activation_type, False, depthwise))  # 13

        self._modules_list.append(Conv(width_mult(512), width_mult(256), 1, 1, activation_type))  # 14
        self._modules_list.append(nn.Upsample(None, 2, "nearest"))  # 15
        self._modules_list.append(Concat(1))  # 16
        self._modules_list.append(block(backbone_connector[2] + width_mult(256), width_mult(256), depth_mult(3), activation_type, False, depthwise))  # 17

        self._modules_list.append(DownConv(width_mult(256), width_mult(256), 3, 2, activation_type))  # 18
        self._modules_list.append(Concat(1))  # 19
        self._modules_list.append(block(2 * width_mult(256), width_mult(512), depth_mult(3), activation_type, False, depthwise))  # 20

        self._modules_list.append(DownConv(width_mult(512), width_mult(512), 3, 2, activation_type))  # 21
        self._modules_list.append(Concat(1))  # 22
        self._modules_list.append(block(2 * width_mult(512), width_mult(1024), depth_mult(3), activation_type, False, depthwise))  # 23

        detect_input_channels = [width_mult(v) for v in (256, 512, 1024)]
        strides = anchors.stride
        self._modules_list.append(
            DetectX(
                num_classes,
                strides,
                activation_type,
                channels=detect_input_channels,
                depthwise=depthwise,
                groups=xhead_groups,
                inter_channels=xhead_inter_channels,
            )
        )  # 24

        self._shortcuts = nn.ModuleList([CrossModelSkipConnection() for _ in range(len(self._skip_connections_dict.keys()) - 1)])

        self.width_mult = width_mult

    def forward(self, intermediate_output):
        """
        :param intermediate_output: A list of the intermediate predictions of the backbone layers specified in
        self._layer_idx_to_extract
        """
        # COUNT THE NUMBER OF LAYERS IN THE BACKBONE TO CONTINUE THE COUNTER
        num_layers_in_backbone = len(intermediate_output)
        # INPUT TO HEAD IS THE LAST ELEMENT OF THE BACKBONE'S OUTPUT
        out = intermediate_output[-1]
        # RUN OVER THE MODULE LIST WITHOUT THE FINAL LAYER & START COUNTER FROM THE END OF THE BACKBONE
        i = 0
        for layer_idx, layer_module in enumerate(self._modules_list[:-1], start=num_layers_in_backbone):
            # IF THE LAYER INDEX APPEARS IN THE KEYS, INSERT THE PREVIOUS OUTPUT TOGETHER WITH THE INDICATED SKIP CONNECTION

            if layer_idx in self._skip_connections_dict.keys():
                out = layer_module([out, self._shortcuts[i](intermediate_output[self._skip_connections_dict[layer_idx][0]])])
                i += 1
            else:
                out = layer_module(out)

            # IF THE LAYER INDEX IS LISTED IN self._layer_idx_to_extract APPEND ITS OUTPUT, OTHERWISE APPEND None
            if layer_idx in self._layer_idx_to_extract:
                intermediate_output.append(out)
            else:
                intermediate_output.append(None)

        # INSERT THE REMAINING LAYERS INTO THE Detect LAYER
        last_idx = len(self._modules_list) + num_layers_in_backbone - 1

        return self._modules_list[-1](
            [
                intermediate_output[self._skip_connections_dict[last_idx][0]],
                intermediate_output[self._skip_connections_dict[last_idx][1]],
                out,
            ]
        )

forward(intermediate_output)

Parameters:

Name Type Description Default
intermediate_output

A list of the intermediate predictions of the backbone layers specified in self._layer_idx_to_extract

required
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def forward(self, intermediate_output):
    """
    :param intermediate_output: A list of the intermediate predictions of the backbone layers specified in
    self._layer_idx_to_extract
    """
    # COUNT THE NUMBER OF LAYERS IN THE BACKBONE TO CONTINUE THE COUNTER
    num_layers_in_backbone = len(intermediate_output)
    # INPUT TO HEAD IS THE LAST ELEMENT OF THE BACKBONE'S OUTPUT
    out = intermediate_output[-1]
    # RUN OVER THE MODULE LIST WITHOUT THE FINAL LAYER & START COUNTER FROM THE END OF THE BACKBONE
    i = 0
    for layer_idx, layer_module in enumerate(self._modules_list[:-1], start=num_layers_in_backbone):
        # IF THE LAYER INDEX APPEARS IN THE KEYS, INSERT THE PREVIOUS OUTPUT TOGETHER WITH THE INDICATED SKIP CONNECTION

        if layer_idx in self._skip_connections_dict.keys():
            out = layer_module([out, self._shortcuts[i](intermediate_output[self._skip_connections_dict[layer_idx][0]])])
            i += 1
        else:
            out = layer_module(out)

        # IF THE LAYER INDEX IS LISTED IN self._layer_idx_to_extract APPEND ITS OUTPUT, OTHERWISE APPEND None
        if layer_idx in self._layer_idx_to_extract:
            intermediate_output.append(out)
        else:
            intermediate_output.append(None)

    # INSERT THE REMAINING LAYERS INTO THE Detect LAYER
    last_idx = len(self._modules_list) + num_layers_in_backbone - 1

    return self._modules_list[-1](
        [
            intermediate_output[self._skip_connections_dict[last_idx][0]],
            intermediate_output[self._skip_connections_dict[last_idx][1]],
            out,
        ]
    )

YoloRegnetBackbone

Bases: AbstractYoloBackbone, AnyNetX

Implements the Regnet module and inherits the forward pass to extract layers indicated in arch_params

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloRegnetBackbone(AbstractYoloBackbone, AnyNetX):
    """Implements the Regnet module and inherits the forward pass to extract layers indicated in arch_params"""

    def __init__(self, arch_params):
        backbone_params = {**arch_params.backbone_params, "backbone_mode": True, "num_classes": None}
        backbone_params.pop("spp_kernels", None)
        AnyNetX.__init__(self, **backbone_params)

        # LAST ANYNETX STAGE -> STAGE + SPP IF SPP_KERNELS IS GIVEN
        spp_kernels = get_param(arch_params.backbone_params, "spp_kernels", None)
        if spp_kernels:
            activation_type = nn.SiLU if arch_params.yolo_type == "yoloX" else nn.Hardswish
            self.net.stage_3 = self.add_spp_to_stage(self.net.stage_3, spp_kernels, activation_type=activation_type)
            self.initialize_weight()

        # CREATE A LIST CONTAINING THE LAYERS TO EXTRACT FROM THE BACKBONE AND ADD THE FINAL LAYER
        self._modules_list = nn.ModuleList()
        for layer in self.net:
            self._modules_list.append(layer)

        AbstractYoloBackbone.__init__(self, arch_params)

        # WE KEEP A LIST OF THE OUTPUTS WIDTHS (NUM FEATURES) TO BE CONNECTED TO THE HEAD
        self.backbone_connection_channels = arch_params.backbone_params["ls_block_width"][1:][::-1]

    @staticmethod
    def add_spp_to_stage(anynetx_stage: Stage, spp_kernels: Tuple[int], activation_type):
        """
        Add SPP at the end of an AnyNetX Stage
        """
        # Last block in a Stage -> conv_block_3 -> Conv2d -> out_channels
        out_channels = anynetx_stage.blocks[-1].conv_block_3[0].out_channels
        anynetx_stage.blocks.add_module("spp_block", SPP(out_channels, out_channels, spp_kernels, activation_type=activation_type))
        return anynetx_stage

    def forward(self, x):
        return AbstractYoloBackbone.forward(self, x)

add_spp_to_stage(anynetx_stage, spp_kernels, activation_type) staticmethod

Add SPP at the end of an AnyNetX Stage

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
@staticmethod
def add_spp_to_stage(anynetx_stage: Stage, spp_kernels: Tuple[int], activation_type):
    """
    Add SPP at the end of an AnyNetX Stage
    """
    # Last block in a Stage -> conv_block_3 -> Conv2d -> out_channels
    out_channels = anynetx_stage.blocks[-1].conv_block_3[0].out_channels
    anynetx_stage.blocks.add_module("spp_block", SPP(out_channels, out_channels, spp_kernels, activation_type=activation_type))
    return anynetx_stage

YoloXDecodingModule

Bases: AbstractObjectDetectionDecodingModule

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloXDecodingModule(AbstractObjectDetectionDecodingModule):
    __constants__ = ["num_pre_nms_predictions", "with_confidence"]

    def __init__(self, num_pre_nms_predictions: int, with_confidence: bool = True):
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions
        self.with_confidence = with_confidence

    def forward(self, predictions):
        if isinstance(predictions, (tuple, list)):
            predictions = predictions[0]

        cxcywh = predictions[:, :, :4]
        conf = predictions[:, :, 4:5]
        pred_scores = predictions[:, :, 5:]
        pred_bboxes = convert_cxcywh_bbox_to_xyxy(cxcywh)

        if self.with_confidence:
            pred_scores = pred_scores * conf

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_scores.size()

        pred_cls_conf, _ = torch.max(pred_scores, dim=2)
        topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
        flat_indices = torch.flatten(indices_with_offset)

        output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
        output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

        return output_pred_bboxes, output_pred_scores

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, predictions: Any) -> int:
        """
        Infer the total number of predictions (anchors) produced per image.

        :param predictions: Raw model output (possibly wrapped in a tuple/list); the prediction count is taken from its second dimension.
        :return: Number of predictions per image.
        """
        if isinstance(predictions, (tuple, list)):
            predictions = predictions[0]

        return predictions.size(1)
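
The `forward` above selects the top candidates of each image with a single gather by flattening (batch, anchor) indices into one dimension. A self-contained sketch of that indexing trick, with illustrative shapes:

import torch

batch_size, num_anchors, num_classes, k = 2, 100, 3, 10
pred_scores = torch.rand(batch_size, num_anchors, num_classes)

best_per_anchor, _ = pred_scores.max(dim=2)                          # [B, A] best class confidence per anchor
topk = torch.topk(best_per_anchor, k=k, dim=1, largest=True, sorted=True)

offsets = num_anchors * torch.arange(batch_size)                     # row offset of each image in the flattened view
flat_indices = (topk.indices + offsets.view(-1, 1)).flatten()        # indices into the [B * A, C] view
selected = pred_scores.reshape(-1, num_classes)[flat_indices].reshape(batch_size, k, num_classes)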

infer_total_number_of_predictions(predictions)

Parameters:

Name Type Description Default
predictions Any

Raw model output (possibly wrapped in a tuple/list); the prediction count is taken from its second dimension.

required

Returns:

Type Description
int
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
@torch.jit.ignore
def infer_total_number_of_predictions(self, predictions: Any) -> int:
    """
    Infer the total number of predictions (anchors) produced per image.

    :param predictions: Raw model output (possibly wrapped in a tuple/list); the prediction count is taken from its second dimension.
    :return: Number of predictions per image.
    """
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    return predictions.size(1)

YoloXPostPredictionCallback

Bases: DetectionPostPredictionCallback

Post-prediction callback to decode YoloX model's output and apply Non-Maximum Suppression (NMS) to get the final predictions.

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloXPostPredictionCallback(DetectionPostPredictionCallback):
    """Post-prediction callback to decode YoloX model's output and apply Non-Maximum Suppression (NMS) to get
    the final predictions.
    """

    def __init__(
        self,
        conf: float = 0.001,
        iou: float = 0.6,
        classes: List[int] = None,
        nms_type: NMS_Type = NMS_Type.ITERATIVE,
        max_predictions: int = 300,
        with_confidence: bool = True,
        class_agnostic_nms: bool = False,
        multi_label_per_box: bool = True,
    ):
        """
        :param conf: confidence threshold
        :param iou: IoU threshold                                       (used in NMS_Type.ITERATIVE)
        :param classes: (optional list) filter by class                 (used in NMS_Type.ITERATIVE)
        :param nms_type: the type of nms to use (iterative or matrix)
        :param max_predictions: maximum number of boxes to output       (used in NMS_Type.MATRIX)
        :param with_confidence: in NMS, whether to multiply objectness  (used in NMS_Type.ITERATIVE)
                                score with class score
        :param class_agnostic_nms: indicates how boxes of different classes will be treated during
                                   NMS step (used in NMS_Type.ITERATIVE and NMS_Type.MATRIX)
                                   True - NMS will be performed on all classes together.
                                   False - NMS will be performed on each class separately (default).
        :param multi_label_per_box: controls whether to decode multiple labels per box (used in NMS_Type.ITERATIVE)
                                    True - each anchor can produce multiple labels of different classes
                                           that pass confidence threshold check (default).
                                    False - each anchor can produce only one label of the class with the highest score.
        """
        super(YoloXPostPredictionCallback, self).__init__()
        self.conf = conf
        self.iou = iou
        self.classes = classes
        self.nms_type = nms_type
        self.max_pred = max_predictions
        self.with_confidence = with_confidence
        self.class_agnostic_nms = class_agnostic_nms
        self.multi_label_per_box = multi_label_per_box

    def forward(self, x: Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]], device: str = None):
        """Apply NMS to the raw output of the model and keep only top `max_predictions` results.

        :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...)
        :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls)
        """
        # Use the main output features in case of multiple outputs.
        if isinstance(x, (tuple, list)):
            x = x[0]

        if self.nms_type == NMS_Type.ITERATIVE:
            nms_result = non_max_suppression(
                x,
                conf_thres=self.conf,
                iou_thres=self.iou,
                with_confidence=self.with_confidence,
                multi_label_per_box=self.multi_label_per_box,
                class_agnostic_nms=self.class_agnostic_nms,
            )
        else:
            nms_result = matrix_non_max_suppression(x, conf_thres=self.conf, max_num_of_detections=self.max_pred, class_agnostic_nms=self.class_agnostic_nms)

        return self._filter_max_predictions(nms_result)

    def _filter_max_predictions(self, res: List) -> List:
        res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res]
        return res
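
A hedged construction example (threshold values are illustrative; the import path follows the source location shown above):

from super_gradients.training.models.detection_models.yolo_base import YoloXPostPredictionCallback

callback = YoloXPostPredictionCallback(conf=0.25, iou=0.6, max_predictions=300)
# decoded = callback(model(images))  # list of per-image tensors with columns (x1, y1, x2, y2, conf, cls)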

__init__(conf=0.001, iou=0.6, classes=None, nms_type=NMS_Type.ITERATIVE, max_predictions=300, with_confidence=True, class_agnostic_nms=False, multi_label_per_box=True)

Parameters:

Name Type Description Default
conf float

confidence threshold

0.001
iou float

IoU threshold (used in NMS_Type.ITERATIVE)

0.6
classes List[int]

(optional list) filter by class (used in NMS_Type.ITERATIVE)

None
nms_type NMS_Type

the type of nms to use (iterative or matrix)

NMS_Type.ITERATIVE
max_predictions int

maximum number of boxes to output (used in NMS_Type.MATRIX)

300
with_confidence bool

in NMS, whether to multiply objectness (used in NMS_Type.ITERATIVE) score with class score

True
class_agnostic_nms bool

indicates how boxes of different classes will be treated during NMS step (used in NMS_Type.ITERATIVE and NMS_Type.MATRIX) True - NMS will be performed on all classes together. False - NMS will be performed on each class separately (default).

False
multi_label_per_box bool

controls whether to decode multiple labels per box (used in NMS_Type.ITERATIVE) True - each anchor can produce multiple labels of different classes that pass confidence threshold check (default). False - each anchor can produce only one label of the class with the highest score.

True
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def __init__(
    self,
    conf: float = 0.001,
    iou: float = 0.6,
    classes: List[int] = None,
    nms_type: NMS_Type = NMS_Type.ITERATIVE,
    max_predictions: int = 300,
    with_confidence: bool = True,
    class_agnostic_nms: bool = False,
    multi_label_per_box: bool = True,
):
    """
    :param conf: confidence threshold
    :param iou: IoU threshold                                       (used in NMS_Type.ITERATIVE)
    :param classes: (optional list) filter by class                 (used in NMS_Type.ITERATIVE)
    :param nms_type: the type of nms to use (iterative or matrix)
    :param max_predictions: maximum number of boxes to output       (used in NMS_Type.MATRIX)
    :param with_confidence: in NMS, whether to multiply objectness  (used in NMS_Type.ITERATIVE)
                            score with class score
    :param class_agnostic_nms: indicates how boxes of different classes will be treated during
                               NMS step (used in NMS_Type.ITERATIVE and NMS_Type.MATRIX)
                               True - NMS will be performed on all classes together.
                               False - NMS will be performed on each class separately (default).
    :param multi_label_per_box: controls whether to decode multiple labels per box (used in NMS_Type.ITERATIVE)
                                True - each anchor can produce multiple labels of different classes
                                       that pass confidence threshold check (default).
                                False - each anchor can produce only one label of the class with the highest score.
    """
    super(YoloXPostPredictionCallback, self).__init__()
    self.conf = conf
    self.iou = iou
    self.classes = classes
    self.nms_type = nms_type
    self.max_pred = max_predictions
    self.with_confidence = with_confidence
    self.class_agnostic_nms = class_agnostic_nms
    self.multi_label_per_box = multi_label_per_box

forward(x, device=None)

Apply NMS to the raw output of the model and keep only top max_predictions results.

Parameters:

Name Type Description Default
x Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]

Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...)

required

Returns:

Type Description

List of Tensors of shape (x1, y1, x2, y2, conf, cls)

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def forward(self, x: Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]], device: str = None):
    """Apply NMS to the raw output of the model and keep only top `max_predictions` results.

    :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...)
    :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls)
    """
    # Use the main output features in case of multiple outputs.
    if isinstance(x, (tuple, list)):
        x = x[0]

    if self.nms_type == NMS_Type.ITERATIVE:
        nms_result = non_max_suppression(
            x,
            conf_thres=self.conf,
            iou_thres=self.iou,
            with_confidence=self.with_confidence,
            multi_label_per_box=self.multi_label_per_box,
            class_agnostic_nms=self.class_agnostic_nms,
        )
    else:
        nms_result = matrix_non_max_suppression(x, conf_thres=self.conf, max_num_of_detections=self.max_pred, class_agnostic_nms=self.class_agnostic_nms)

    return self._filter_max_predictions(nms_result)

NDFLHeads

Bases: BaseDetectionModule, SupportsReplaceNumClasses

Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
@register_detection_module()
class NDFLHeads(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        heads_list: Union[str, HpmStruct, DictConfig],
        grid_cell_scale: float = 5.0,
        grid_cell_offset: float = 0.5,
        reg_max: int = 16,
        eval_size: Optional[Tuple[int, int]] = None,
        width_mult: float = 1.0,
    ):
        """
        Initializes the NDFLHeads module.

        :param num_classes: Number of detection classes
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param grid_cell_scale:
        :param grid_cell_offset:
        :param reg_max: Number of bins in the regression head
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to in_channels.
        """
        super(NDFLHeads, self).__init__(in_channels)
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

        factory = det_factory.DetectionModulesFactory()
        heads_list = self._pass_args(heads_list, factory, num_classes, reg_max)

        self.num_heads = len(heads_list)
        fpn_strides: List[int] = []
        for i in range(self.num_heads):
            new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
            fpn_strides.append(new_head.stride)
            setattr(self, f"head{i + 1}", new_head)

        self.fpn_strides = tuple(fpn_strides)

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        for i in range(self.num_heads):
            head = getattr(self, f"head{i + 1}")
            head.replace_num_classes(num_classes, compute_new_weights_fn)

        self.num_classes = num_classes

    @staticmethod
    def _pass_args(heads_list, factory, num_classes, reg_max):
        for i in range(len(heads_list)):
            heads_list[i] = factory.insert_module_param(heads_list[i], "num_classes", num_classes)
            heads_list[i] = factory.insert_module_param(heads_list[i], "reg_max", reg_max)
        return heads_list

    @torch.jit.ignore
    def cache_anchors(self, input_size: Tuple[int, int]):
        self.eval_size = input_size
        device = infer_model_device(self)
        dtype = infer_model_dtype(self)

        anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
        self.register_buffer("anchor_points", anchor_points, persistent=False)
        self.register_buffer("stride_tensor", stride_tensor, persistent=False)

    @torch.jit.ignore
    def _init_weights(self):
        if self.eval_size:
            device = infer_model_device(self)
            dtype = infer_model_dtype(self)

            anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    def forward(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:
        feats = feats[: self.num_heads]
        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            reg_distri, cls_logit = getattr(self, f"head{i + 1}")(feat)
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        decoded_predictions = pred_bboxes, pred_scores

        if torch.jit.is_tracing():
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    @property
    def out_channels(self):
        return None

    def _generate_anchors(self, feats=None, dtype=None, device=None):
        # Only used at eval / inference time
        anchor_points = []
        stride_tensor = []

        dtype = dtype or feats[0].dtype
        device = device or feats[0].device

        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)

            # ONNX export does not support arange with float16, so it is created as fp32 and then cast to fp16
            # This produces correct fp16 weights in the exported ONNX model
            shift_x = torch.arange(end=w, dtype=torch.float32, device=device) + self.grid_cell_offset
            shift_y = torch.arange(end=h, dtype=torch.float32, device=device) + self.grid_cell_offset

            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype, device=device))

        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)
        return anchor_points, stride_tensor
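
A small standalone sketch (plain PyTorch, illustrative shapes) of the distribution-to-distance reduction performed in `forward` above: the per-side distribution over `reg_max + 1` bins is collapsed to a single distance by taking its expectation, which is what the conv2d with the fixed `proj_conv` buffer computes:

import torch

reg_max, num_anchors = 16, 8                                    # illustrative values
logits = torch.randn(1, 4, reg_max + 1, num_anchors)            # [B, 4 box sides, bins, anchors]

proj = torch.linspace(0, reg_max, reg_max + 1)                  # bin values 0..reg_max (the proj_conv weights)
prob = torch.softmax(logits, dim=2)                             # distribution over bins for each side
distances = (prob * proj.view(1, 1, -1, 1)).sum(dim=2)          # expectation -> [B, 4, anchors]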

__init__(num_classes, in_channels, heads_list, grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, eval_size=None, width_mult=1.0)

Initializes the NDFLHeads module.

Parameters:

Name Type Description Default
num_classes int

Number of detection classes

required
in_channels Tuple[int, int, int]

Number of channels for each feature map (See width_mult)

required
grid_cell_scale float 5.0
grid_cell_offset float 0.5
reg_max int

Number of bins in the regression head

16
eval_size Optional[Tuple[int, int]]

(rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed, since anchors will not be regenerated for each forward call.

None
width_mult float

A scaling factor applied to in_channels.

1.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
def __init__(
    self,
    num_classes: int,
    in_channels: Tuple[int, int, int],
    heads_list: Union[str, HpmStruct, DictConfig],
    grid_cell_scale: float = 5.0,
    grid_cell_offset: float = 0.5,
    reg_max: int = 16,
    eval_size: Optional[Tuple[int, int]] = None,
    width_mult: float = 1.0,
):
    """
    Initializes the NDFLHeads module.

    :param num_classes: Number of detection classes
    :param in_channels: Number of channels for each feature map (See width_mult)
    :param grid_cell_scale:
    :param grid_cell_offset:
    :param reg_max: Number of bins in the regression head
    :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
           since anchors will not be regenerated for each forward call.
    :param width_mult: A scaling factor applied to in_channels.
    """
    super(NDFLHeads, self).__init__(in_channels)
    in_channels = [max(round(c * width_mult), 1) for c in in_channels]

    self.in_channels = tuple(in_channels)
    self.num_classes = num_classes
    self.grid_cell_scale = grid_cell_scale
    self.grid_cell_offset = grid_cell_offset
    self.reg_max = reg_max
    self.eval_size = eval_size

    # Do not apply quantization to this tensor
    proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
    self.register_buffer("proj_conv", proj, persistent=False)

    self._init_weights()

    factory = det_factory.DetectionModulesFactory()
    heads_list = self._pass_args(heads_list, factory, num_classes, reg_max)

    self.num_heads = len(heads_list)
    fpn_strides: List[int] = []
    for i in range(self.num_heads):
        new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
        fpn_strides.append(new_head.stride)
        setattr(self, f"head{i + 1}", new_head)

    self.fpn_strides = tuple(fpn_strides)

YoloNASDFLHead

Bases: BaseDetectionModule, SupportsReplaceNumClasses

Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
@register_detection_module()
class YoloNASDFLHead(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        in_channels: int,
        inter_channels: int,
        width_mult: float,
        first_conv_group_size: int,
        num_classes: int,
        stride: int,
        reg_max: int,
        cls_dropout_rate: float = 0.0,
        reg_dropout_rate: float = 0.0,
    ):
        """
        Initialize the YoloNASDFLHead
        :param in_channels: Input channels
        :param inter_channels: Intermediate number of channels
        :param width_mult: Width multiplier
        :param first_conv_group_size: Group size
        :param num_classes: Number of detection classes
        :param stride: Output stride for this head
        :param reg_max: Number of bins in the regression head
        :param cls_dropout_rate: Dropout rate for the classification head
        :param reg_dropout_rate: Dropout rate for the regression head
        """
        super().__init__(in_channels)

        inter_channels = width_multiplier(inter_channels, width_mult, 8)
        if first_conv_group_size == 0:
            groups = 0
        elif first_conv_group_size == -1:
            groups = 1
        else:
            groups = inter_channels // first_conv_group_size

        self.num_classes = num_classes
        self.stem = ConvBNReLU(in_channels, inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

        first_cls_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        first_reg_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        self.cls_pred = nn.Conv2d(inter_channels, self.num_classes, 1, 1, 0)
        self.reg_pred = nn.Conv2d(inter_channels, 4 * (reg_max + 1), 1, 1, 0)

        self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
        self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

        self.grid = torch.zeros(1)
        self.stride = stride

        self.prior_prob = 1e-2
        self._initialize_biases()

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        self.cls_pred = compute_new_weights_fn(self.cls_pred, num_classes)
        self.num_classes = num_classes

    @property
    def out_channels(self):
        return None

    def forward(self, x):
        x = self.stem(x)

        cls_feat = self.cls_convs(x)
        cls_feat = self.cls_dropout_rate(cls_feat)
        cls_output = self.cls_pred(cls_feat)

        reg_feat = self.reg_convs(x)
        reg_feat = self.reg_dropout_rate(reg_feat)
        reg_output = self.reg_pred(reg_feat)

        return reg_output, cls_output

    def _initialize_biases(self):
        prior_bias = -math.log((1 - self.prior_prob) / self.prior_prob)
        torch.nn.init.constant_(self.cls_pred.bias, prior_bias)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)], indexing="ij")
        else:
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
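
The classification bias above is initialized from a prior probability so that, at the start of training, a sigmoid over the untrained logits yields roughly 1% positives. A quick standalone check of the arithmetic:

import math
import torch

prior_prob = 1e-2
prior_bias = -math.log((1 - prior_prob) / prior_prob)           # about -4.595

# A sigmoid over a logit equal to the bias recovers the prior probability.
assert abs(torch.sigmoid(torch.tensor(prior_bias)).item() - prior_prob) < 1e-6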

__init__(in_channels, inter_channels, width_mult, first_conv_group_size, num_classes, stride, reg_max, cls_dropout_rate=0.0, reg_dropout_rate=0.0)

Initialize the YoloNASDFLHead

Parameters:

Name Type Description Default
in_channels int

Input channels

required
inter_channels int

Intermediate number of channels

required
width_mult float

Width multiplier

required
first_conv_group_size int

Group size

required
num_classes int

Number of detection classes

required
stride int

Output stride for this head

required
reg_max int

Number of bins in the regression head

required
cls_dropout_rate float

Dropout rate for the classification head

0.0
reg_dropout_rate float

Dropout rate for the regression head

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
def __init__(
    self,
    in_channels: int,
    inter_channels: int,
    width_mult: float,
    first_conv_group_size: int,
    num_classes: int,
    stride: int,
    reg_max: int,
    cls_dropout_rate: float = 0.0,
    reg_dropout_rate: float = 0.0,
):
    """
    Initialize the YoloNASDFLHead
    :param in_channels: Input channels
    :param inter_channels: Intermediate number of channels
    :param width_mult: Width multiplier
    :param first_conv_group_size: Group size
    :param num_classes: Number of detection classes
    :param stride: Output stride for this head
    :param reg_max: Number of bins in the regression head
    :param cls_dropout_rate: Dropout rate for the classification head
    :param reg_dropout_rate: Dropout rate for the regression head
    """
    super().__init__(in_channels)

    inter_channels = width_multiplier(inter_channels, width_mult, 8)
    if first_conv_group_size == 0:
        groups = 0
    elif first_conv_group_size == -1:
        groups = 1
    else:
        groups = inter_channels // first_conv_group_size

    self.num_classes = num_classes
    self.stem = ConvBNReLU(in_channels, inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

    first_cls_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    first_reg_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    self.cls_pred = nn.Conv2d(inter_channels, self.num_classes, 1, 1, 0)
    self.reg_pred = nn.Conv2d(inter_channels, 4 * (reg_max + 1), 1, 1, 0)

    self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
    self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

    self.grid = torch.zeros(1)
    self.stride = stride

    self.prior_prob = 1e-2
    self._initialize_biases()

YoloNASPANNeckWithC2

Bases: BaseDetectionModule

A PAN (path aggregation network) neck with 4 stages (2 up-sampling and 2 down-sampling stages) where the up-sampling stages include a higher-resolution skip. Returns the outputs of neck stages 2, 3 and 4.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/panneck.py
@register_detection_module("YoloNASPANNeckWithC2")
class YoloNASPANNeckWithC2(BaseDetectionModule):
    """
    A PAN (path aggregation network) neck with 4 stages (2 up-sampling and 2 down-sampling stages)
    where the up-sampling stages include a higher resolution skip
    Returns outputs of neck stage 2, stage 3, stage 4
    """

    def __init__(
        self,
        in_channels: List[int],
        neck1: Union[str, HpmStruct, DictConfig],
        neck2: Union[str, HpmStruct, DictConfig],
        neck3: Union[str, HpmStruct, DictConfig],
        neck4: Union[str, HpmStruct, DictConfig],
    ):
        """
        Initialize the PAN neck

        :param in_channels: Input channels of the 4 feature maps from the backbone
        :param neck1: First neck stage config
        :param neck2: Second neck stage config
        :param neck3: Third neck stage config
        :param neck4: Fourth neck stage config
        """
        super().__init__(in_channels)
        c2_out_channels, c3_out_channels, c4_out_channels, c5_out_channels = in_channels

        factory = det_factory.DetectionModulesFactory()
        self.neck1 = factory.get(factory.insert_module_param(neck1, "in_channels", [c5_out_channels, c4_out_channels, c3_out_channels]))
        self.neck2 = factory.get(factory.insert_module_param(neck2, "in_channels", [self.neck1.out_channels[1], c3_out_channels, c2_out_channels]))
        self.neck3 = factory.get(factory.insert_module_param(neck3, "in_channels", [self.neck2.out_channels[1], self.neck2.out_channels[0]]))
        self.neck4 = factory.get(factory.insert_module_param(neck4, "in_channels", [self.neck3.out_channels, self.neck1.out_channels[0]]))

        self._out_channels = [
            self.neck2.out_channels[1],
            self.neck3.out_channels,
            self.neck4.out_channels,
        ]

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor, Tensor]:
        c2, c3, c4, c5 = inputs

        x_n1_inter, x = self.neck1([c5, c4, c3])
        x_n2_inter, p3 = self.neck2([x, c3, c2])
        p4 = self.neck3([p3, x_n2_inter])
        p5 = self.neck4([p4, x_n1_inter])

        return p3, p4, p5

__init__(in_channels, neck1, neck2, neck3, neck4)

Initialize the PAN neck

Parameters:

Name Type Description Default
in_channels List[int]

Input channels of the 4 feature maps from the backbone

required
neck1 Union[str, HpmStruct, DictConfig]

First neck stage config

required
neck2 Union[str, HpmStruct, DictConfig]

Second neck stage config

required
neck3 Union[str, HpmStruct, DictConfig]

Third neck stage config

required
neck4 Union[str, HpmStruct, DictConfig]

Fourth neck stage config

required
Source code in src/super_gradients/training/models/detection_models/yolo_nas/panneck.py
def __init__(
    self,
    in_channels: List[int],
    neck1: Union[str, HpmStruct, DictConfig],
    neck2: Union[str, HpmStruct, DictConfig],
    neck3: Union[str, HpmStruct, DictConfig],
    neck4: Union[str, HpmStruct, DictConfig],
):
    """
    Initialize the PAN neck

    :param in_channels: Input channels of the 4 feature maps from the backbone
    :param neck1: First neck stage config
    :param neck2: Second neck stage config
    :param neck3: Third neck stage config
    :param neck4: Fourth neck stage config
    """
    super().__init__(in_channels)
    c2_out_channels, c3_out_channels, c4_out_channels, c5_out_channels = in_channels

    factory = det_factory.DetectionModulesFactory()
    self.neck1 = factory.get(factory.insert_module_param(neck1, "in_channels", [c5_out_channels, c4_out_channels, c3_out_channels]))
    self.neck2 = factory.get(factory.insert_module_param(neck2, "in_channels", [self.neck1.out_channels[1], c3_out_channels, c2_out_channels]))
    self.neck3 = factory.get(factory.insert_module_param(neck3, "in_channels", [self.neck2.out_channels[1], self.neck2.out_channels[0]]))
    self.neck4 = factory.get(factory.insert_module_param(neck4, "in_channels", [self.neck3.out_channels, self.neck1.out_channels[0]]))

    self._out_channels = [
        self.neck2.out_channels[1],
        self.neck3.out_channels,
        self.neck4.out_channels,
    ]

YoloNAS

Bases: ExportableObjectDetectionModel, SupportsInputShapeCheck, CustomizableDetector

Export to ONNX/TRT support matrix (ONNX files generated with PyTorch 2.0.1, ONNX opset_version=14):

| Batch Size | Export Engine | Format | OnnxRuntime 1.13.1 | TensorRT 8.4.2 | TensorRT 8.5.3 | TensorRT 8.6.1 |
|------------|---------------|--------|--------------------|----------------|----------------|----------------|
| 1          | ONNX          | Flat   | Yes                | Yes            | Yes            | Yes            |
| >1         | ONNX          | Flat   | Yes                | No             | No             | No             |
| 1          | ONNX          | Batch  | Yes                | No             | Yes            | Yes            |
| >1         | ONNX          | Batch  | Yes                | No             | No             | Yes            |
| 1          | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
| >1         | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
| 1          | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |
| >1         | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py
class YoloNAS(ExportableObjectDetectionModel, SupportsInputShapeCheck, CustomizableDetector):
    """

    Export to ONNX/TRT Support matrix
    ONNX files generated with PyTorch 2.0.1 for ONNX opset_version=14

    | Batch Size | Export Engine | Format | OnnxRuntime 1.13.1 | TensorRT 8.4.2 | TensorRT 8.5.3 | TensorRT 8.6.1 |
    |------------|---------------|--------|--------------------|----------------|----------------|----------------|
    | 1          | ONNX          | Flat   | Yes                | Yes            | Yes            | Yes            |
    | >1         | ONNX          | Flat   | Yes                | No             | No             | No             |
    | 1          | ONNX          | Batch  | Yes                | No             | Yes            | Yes            |
    | >1         | ONNX          | Batch  | Yes                | No             | No             | Yes            |
    | 1          | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
    | >1         | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
    | 1          | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |
    | >1         | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |

    """

    def __init__(
        self,
        backbone: Union[str, dict, HpmStruct, DictConfig],
        heads: Union[str, dict, HpmStruct, DictConfig],
        neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
        num_classes: int = None,
        bn_eps: Optional[float] = None,
        bn_momentum: Optional[float] = None,
        inplace_act: Optional[bool] = True,
        in_channels: int = 3,
    ):
        super().__init__(backbone, heads, neck, num_classes, bn_eps, bn_momentum, inplace_act, in_channels)

    def get_post_prediction_callback(
        self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
    ) -> PPYoloEPostPredictionCallback:
        """
        Get a post prediction callback for this model.

        :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
        :param iou:                 A IoU threshold for boxes non-maximum suppression.
        :param nms_top_k:           The maximum number of detections to consider for NMS.
        :param max_predictions:     The maximum number of detections to return.
        :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :return:
        """
        return PPYoloEPostPredictionCallback(
            score_threshold=conf,
            nms_threshold=iou,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
        )

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractObjectDetectionDecodingModule:
        return YoloNASDecodingModule(num_pre_nms_predictions)

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        if processing is None:
            raise ModelHasNoPreprocessingParamsException()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32
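
A hedged usage sketch of the helpers above; the model name and threshold values are illustrative:

from super_gradients.training import models

model = models.get("yolo_nas_s", pretrained_weights="coco")     # hypothetical checkpoint choice
post_prediction = model.get_post_prediction_callback(
    conf=0.25,
    iou=0.65,
    nms_top_k=1000,
    max_predictions=300,
    multi_label_per_box=True,
    class_agnostic_nms=False,
)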

get_post_prediction_callback(*, conf, iou, nms_top_k, max_predictions, multi_label_per_box, class_agnostic_nms)

Get a post prediction callback for this model.

Parameters:

Name Type Description Default
conf float

A minimum confidence threshold for predictions to be used in post-processing.

required
iou float

An IoU threshold for box non-maximum suppression.

required
nms_top_k int

The maximum number of detections to consider for NMS.

required
max_predictions int

The maximum number of detections to return.

required
multi_label_per_box bool

If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

required
class_agnostic_nms bool

If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

required

Returns:

Type Description
PPYoloEPostPredictionCallback
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py
def get_post_prediction_callback(
    self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
) -> PPYoloEPostPredictionCallback:
    """
    Get a post prediction callback for this model.

    :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
    :param iou:                 A IoU threshold for boxes non-maximum suppression.
    :param nms_top_k:           The maximum number of detections to consider for NMS.
    :param max_predictions:     The maximum number of detections to return.
    :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :return:
    """
    return PPYoloEPostPredictionCallback(
        score_threshold=conf,
        nms_threshold=iou,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
    )

YoloNASDecodingModule

Bases: AbstractObjectDetectionDecodingModule

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py
class YoloNASDecodingModule(AbstractObjectDetectionDecodingModule):
    __constants__ = ["num_pre_nms_predictions"]

    def __init__(
        self,
        num_pre_nms_predictions: int = 1000,
    ):
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, predictions: Any) -> int:
        """

        :param inputs:
        :return:
        """
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = predictions
        else:
            pred_bboxes, pred_scores = predictions[0]

        return pred_bboxes.size(1)

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]):
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = inputs
        else:
            pred_bboxes, pred_scores = inputs[0]

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_scores.size()

        pred_cls_conf, _ = torch.max(pred_scores, dim=2)  # [B, Anchors]
        topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
        flat_indices = torch.flatten(indices_with_offset)

        output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
        output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

        return output_pred_bboxes, output_pred_scores
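A minimal standalone sketch of the decoding module, assuming the import path mirrors the source file location shown above; random tensors stand in for real model outputs:

import torch
from super_gradients.training.models.detection_models.yolo_nas.yolo_nas_variants import YoloNASDecodingModule

decoder = YoloNASDecodingModule(num_pre_nms_predictions=100)

batch, anchors, num_classes = 2, 8400, 80
pred_bboxes = torch.rand(batch, anchors, 4)            # decoded boxes per anchor
pred_scores = torch.rand(batch, anchors, num_classes)  # class scores per anchor

# Outside of tracing, forward expects ((pred_bboxes, pred_scores), <raw outputs>)
boxes, scores = decoder(((pred_bboxes, pred_scores), ()))
print(boxes.shape, scores.shape)  # torch.Size([2, 100, 4]) torch.Size([2, 100, 80])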

infer_total_number_of_predictions(predictions)

Parameters:

Name Type Description Default
predictions Any

Model output: a (pred_bboxes, pred_scores) pair, possibly wrapped together with the raw head outputs.

required

Returns:

Type Description
int
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py, lines 36-48
@torch.jit.ignore
def infer_total_number_of_predictions(self, predictions: Any) -> int:
    """

    :param inputs:
    :return:
    """
    if torch.jit.is_tracing():
        pred_bboxes, pred_scores = predictions
    else:
        pred_bboxes, pred_scores = predictions[0]

    return pred_bboxes.size(1)

SequentialWithIntermediates

Bases: nn.Sequential

A Sequential module that can return all intermediate values as a list of Tensors

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 66-82
class SequentialWithIntermediates(nn.Sequential):
    """
    A Sequential module that can return all intermediate values as a list of Tensors
    """

    def __init__(self, output_intermediates: bool, *args):
        super(SequentialWithIntermediates, self).__init__(*args)
        self.output_intermediates = output_intermediates

    def forward(self, input: Tensor) -> List[Tensor]:
        if self.output_intermediates:
            output = [input]
            for module in self:
                output.append(module(output[-1]))
            return output
        #  For uniformity, we return a list even if we don't output intermediates
        return [super(SequentialWithIntermediates, self).forward(input)]
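Illustrative usage (assuming the import path mirrors the source file above): with output_intermediates=True the returned list contains the input followed by each submodule's output; with False it contains a single tensor.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import SequentialWithIntermediates

seq = SequentialWithIntermediates(True, nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())
outputs = seq(torch.rand(1, 3, 32, 32))
print(len(outputs))  # 3: the input plus the output of each of the two submodules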

YoloNASBottleneck

Bases: nn.Module

A bottleneck block for YoloNAS. Consists of two consecutive blocks and optional residual connection.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 23-63
class YoloNASBottleneck(nn.Module):
    """
    A bottleneck block for YoloNAS. Consists of two consecutive blocks and optional residual connection.
    """

    def __init__(
        self,
        input_channels: int,
        output_channels: int,
        block_type: Type[nn.Module],
        activation_type: Type[nn.Module],
        shortcut: bool,
        use_alpha: bool,
        drop_path_rate: float = 0.0,
    ):
        """
        Initialize the YoloNASBottleneck block

        :param input_channels: Number of input channels
        :param output_channels: Number of output channels
        :param block_type: Type of the convolutional block
        :param activation_type: Activation type for the convolutional block
        :param shortcut: If True, adds the residual connection from input to output.
        :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
        :param drop_path_rate: Drop path rate for the residual path of the block
        """
        super().__init__()

        self.cv1 = block_type(input_channels, output_channels, activation_type=activation_type)
        self.cv2 = block_type(output_channels, output_channels, activation_type=activation_type)
        self.add = shortcut and input_channels == output_channels
        self.shortcut = Residual() if self.add else None
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        if use_alpha:
            self.alpha = torch.nn.Parameter(torch.tensor([1.0]), requires_grad=True)
        else:
            self.alpha = 1.0

    def forward(self, x):
        y = self.drop_path(self.cv2(self.cv1(x)))
        return self.alpha * self.shortcut(x) + y if self.add else y

__init__(input_channels, output_channels, block_type, activation_type, shortcut, use_alpha, drop_path_rate=0.0)

Initialize the YoloNASBottleneck block

Parameters:

Name Type Description Default
input_channels int

Number of input channels

required
output_channels int

Number of output channels

required
block_type Type[nn.Module]

Type of the convolutional block

required
activation_type Type[nn.Module]

Activation type for the convolutional block

required
shortcut bool

If True, adds the residual connection from input to output.

required
use_alpha bool

If True, adds the learnable alpha parameter (multiplier for the residual connection).

required
drop_path_rate float

Drop path rate for the residual path of the block

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 28-59
def __init__(
    self,
    input_channels: int,
    output_channels: int,
    block_type: Type[nn.Module],
    activation_type: Type[nn.Module],
    shortcut: bool,
    use_alpha: bool,
    drop_path_rate: float = 0.0,
):
    """
    Initialize the YoloNASBottleneck block

    :param input_channels: Number of input channels
    :param output_channels: Number of output channels
    :param block_type: Type of the convolutional block
    :param activation_type: Activation type for the convolutional block
    :param shortcut: If True, adds the residual connection from input to output.
    :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
    :param drop_path_rate: Drop path rate for the residual path of the block
    """
    super().__init__()

    self.cv1 = block_type(input_channels, output_channels, activation_type=activation_type)
    self.cv2 = block_type(output_channels, output_channels, activation_type=activation_type)
    self.add = shortcut and input_channels == output_channels
    self.shortcut = Residual() if self.add else None
    self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
    if use_alpha:
        self.alpha = torch.nn.Parameter(torch.tensor([1.0]), requires_grad=True)
    else:
        self.alpha = 1.0
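Illustrative sketch: block_type may be any callable accepting (in_channels, out_channels, activation_type=...); the conv_block helper below is a hypothetical stand-in, not a SuperGradients class.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASBottleneck

def conv_block(in_channels, out_channels, activation_type):
    # Hypothetical stand-in block used only for this example.
    return nn.Sequential(nn.Conv2d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm2d(out_channels), activation_type())

bottleneck = YoloNASBottleneck(64, 64, block_type=conv_block, activation_type=nn.ReLU, shortcut=True, use_alpha=True)
print(bottleneck(torch.rand(1, 64, 16, 16)).shape)  # torch.Size([1, 64, 16, 16]); the residual is active since in == out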

YoloNASCSPLayer

Bases: nn.Module

Cross-stage layer module for YoloNAS.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 85-150
class YoloNASCSPLayer(nn.Module):
    """
    Cross-stage layer module for YoloNAS.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int,
        block_type: Type[nn.Module],
        activation_type: Type[nn.Module],
        shortcut: bool = True,
        use_alpha: bool = True,
        expansion: float = 0.5,
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
    ):
        """

        :param in_channels: Number of input channels.
        :param out_channels:  Number of output channels.
        :param num_bottlenecks: Number of bottleneck blocks.
        :param block_type: Bottleneck block type.
        :param activation_type: Activation type for all blocks.
        :param shortcut: If True, adds the residual connection from input to output.
        :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
        :param expansion: If hidden_channels is None, hidden_channels is set to in_channels * expansion.
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
        :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                                Must have the length equal to the num_bottlenecks or None.
        :param dropout_rate: Dropout probability before the last convolution in this layer.
        """
        if drop_path_rates is None:
            drop_path_rates = [0.0] * num_bottlenecks
        else:
            drop_path_rates = tuple(drop_path_rates)
        if len(drop_path_rates) != num_bottlenecks:
            raise ValueError(
                f"Argument drop_path_rates ({drop_path_rates}, len {len(drop_path_rates)} "
                f"must have the length equal to the num_bottlenecks ({num_bottlenecks})."
            )

        super(YoloNASCSPLayer, self).__init__()
        if hidden_channels is None:
            hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
        self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
        self.conv3 = Conv(hidden_channels * (2 + concat_intermediates * num_bottlenecks), out_channels, 1, stride=1, activation_type=activation_type)
        module_list = [
            YoloNASBottleneck(hidden_channels, hidden_channels, block_type, activation_type, shortcut, use_alpha, drop_path_rate=drop_path_rates[i])
            for i in range(num_bottlenecks)
        ]
        self.bottlenecks = SequentialWithIntermediates(concat_intermediates, *module_list)
        self.dropout = nn.Dropout2d(dropout_rate, inplace=True) if dropout_rate > 0.0 else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = torch.cat((*x_1, x_2), dim=1)
        x = self.dropout(x)
        return self.conv3(x)

__init__(in_channels, out_channels, num_bottlenecks, block_type, activation_type, shortcut=True, use_alpha=True, expansion=0.5, hidden_channels=None, concat_intermediates=False, drop_path_rates=None, dropout_rate=0.0)

Parameters:

Name Type Description Default
in_channels int

Number of input channels.

required
out_channels int

Number of output channels.

required
num_bottlenecks int

Number of bottleneck blocks.

required
block_type Type[nn.Module]

Bottleneck block type.

required
activation_type Type[nn.Module]

Activation type for all blocks.

required
shortcut bool

If True, adds the residual connection from input to output.

True
use_alpha bool

If True, adds the learnable alpha parameter (multiplier for the residual connection).

True
expansion float

If hidden_channels is None, hidden_channels is set to in_channels * expansion.

0.5
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks.

None
concat_intermediates bool

If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.

False
drop_path_rates Union[Iterable[float], None]

List of drop path probabilities for each bottleneck block. Must have the length equal to the num_bottlenecks or None.

None
dropout_rate float

Dropout probability before the last convolution in this layer.

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 90-142
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    num_bottlenecks: int,
    block_type: Type[nn.Module],
    activation_type: Type[nn.Module],
    shortcut: bool = True,
    use_alpha: bool = True,
    expansion: float = 0.5,
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
):
    """

    :param in_channels: Number of input channels.
    :param out_channels:  Number of output channels.
    :param num_bottlenecks: Number of bottleneck blocks.
    :param block_type: Bottleneck block type.
    :param activation_type: Activation type for all blocks.
    :param shortcut: If True, adds the residual connection from input to output.
    :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
    :param expansion: If hidden_channels is None, hidden_channels is set to in_channels * expansion.
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
    :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
    :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                            Must have the length equal to the num_bottlenecks or None.
    :param dropout_rate: Dropout probability before the last convolution in this layer.
    """
    if drop_path_rates is None:
        drop_path_rates = [0.0] * num_bottlenecks
    else:
        drop_path_rates = tuple(drop_path_rates)
    if len(drop_path_rates) != num_bottlenecks:
        raise ValueError(
            f"Argument drop_path_rates ({drop_path_rates}, len {len(drop_path_rates)} "
            f"must have the length equal to the num_bottlenecks ({num_bottlenecks})."
        )

    super(YoloNASCSPLayer, self).__init__()
    if hidden_channels is None:
        hidden_channels = int(out_channels * expansion)
    self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
    self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
    self.conv3 = Conv(hidden_channels * (2 + concat_intermediates * num_bottlenecks), out_channels, 1, stride=1, activation_type=activation_type)
    module_list = [
        YoloNASBottleneck(hidden_channels, hidden_channels, block_type, activation_type, shortcut, use_alpha, drop_path_rate=drop_path_rates[i])
        for i in range(num_bottlenecks)
    ]
    self.bottlenecks = SequentialWithIntermediates(concat_intermediates, *module_list)
    self.dropout = nn.Dropout2d(dropout_rate, inplace=True) if dropout_rate > 0.0 else nn.Identity()
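Illustrative channel-count sketch (conv_block is again a hypothetical stand-in): with out_channels=128 and the default expansion=0.5, hidden_channels is 64, so with concat_intermediates=True and two bottlenecks the final 1x1 convolution receives 64 * (2 + 2) = 256 channels.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASCSPLayer

def conv_block(in_channels, out_channels, activation_type):
    # Hypothetical stand-in for the bottleneck block type.
    return nn.Sequential(nn.Conv2d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm2d(out_channels), activation_type())

layer = YoloNASCSPLayer(64, 128, num_bottlenecks=2, block_type=conv_block, activation_type=nn.ReLU, concat_intermediates=True)
print(layer(torch.rand(1, 64, 32, 32)).shape)  # torch.Size([1, 128, 32, 32])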

YoloNASDownStage

Bases: BaseDetectionModule

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 335-395
@register_detection_module()
class YoloNASDownStage(BaseDetectionModule):
    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        width_mult: float,
        num_blocks: int,
        depth_mult: float,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
    ):
        """
        Initializes a YoloNASDownStage.

        :param in_channels: Pair of input channel counts: (channels of the downsampled path, channels of the skip connection).
        :param out_channels: Number of output channels.
        :param width_mult: Multiplier for the number of channels in the stage.
        :param num_blocks: Number of blocks in the stage.
        :param depth_mult: Multiplier for the number of blocks in the stage.
        :param activation_type: Type of activation to use inside the blocks.
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
        :param drop_path_rates: List of drop path probabilities for each bottleneck block. Must have a length equal to num_blocks or be None.
        :param dropout_rate: Dropout probability before the last convolution in this layer.
        """

        super().__init__(in_channels)

        in_channels, skip_in_channels = in_channels
        out_channels = width_multiplier(out_channels, width_mult, 8)
        num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

        self.conv = Conv(in_channels, out_channels // 2, 3, 2, activation_type)
        after_concat_channels = out_channels // 2 + skip_in_channels
        self.blocks = YoloNASCSPLayer(
            in_channels=after_concat_channels,
            out_channels=out_channels,
            num_bottlenecks=num_blocks,
            block_type=partial(Conv, kernel=3, stride=1),
            activation_type=activation_type,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
            drop_path_rates=drop_path_rates,
            dropout_rate=dropout_rate,
        )

        self._out_channels = out_channels

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs):
        x, skip_x = inputs
        x = self.conv(x)
        x = torch.cat([x, skip_x], 1)
        x = self.blocks(x)
        return x

__init__(in_channels, out_channels, width_mult, num_blocks, depth_mult, activation_type, hidden_channels=None, concat_intermediates=False, drop_path_rates=None, dropout_rate=0.0)

Initializes a YoloNASDownStage.

Parameters:

Name Type Description Default
in_channels List[int]

Pair of input channel counts: (channels of the downsampled path, channels of the skip connection).

required
out_channels int

Number of output channels.

required
width_mult float

Multiplier for the number of channels in the stage.

required
num_blocks int

Number of blocks in the stage.

required
depth_mult float

Multiplier for the number of blocks in the stage.

required
activation_type Type[nn.Module]

Type of activation to use inside the blocks.

required
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks.

None
concat_intermediates bool

If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.

False
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 337-384
@resolve_param("activation_type", ActivationsTypeFactory())
def __init__(
    self,
    in_channels: List[int],
    out_channels: int,
    width_mult: float,
    num_blocks: int,
    depth_mult: float,
    activation_type: Type[nn.Module],
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
):
    """
    Initializes a YoloNASDownStage.

    :param in_channels: Pair of input channel counts: (channels of the downsampled path, channels of the skip connection).
    :param out_channels: Number of output channels.
    :param width_mult: Multiplier for the number of channels in the stage.
    :param num_blocks: Number of blocks in the stage.
    :param depth_mult: Multiplier for the number of blocks in the stage.
    :param activation_type: Type of activation to use inside the blocks.
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
    :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
    :param drop_path_rates: List of drop path probabilities for each bottleneck block. Must have a length equal to num_blocks or be None.
    :param dropout_rate: Dropout probability before the last convolution in this layer.
    """

    super().__init__(in_channels)

    in_channels, skip_in_channels = in_channels
    out_channels = width_multiplier(out_channels, width_mult, 8)
    num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

    self.conv = Conv(in_channels, out_channels // 2, 3, 2, activation_type)
    after_concat_channels = out_channels // 2 + skip_in_channels
    self.blocks = YoloNASCSPLayer(
        in_channels=after_concat_channels,
        out_channels=out_channels,
        num_bottlenecks=num_blocks,
        block_type=partial(Conv, kernel=3, stride=1),
        activation_type=activation_type,
        hidden_channels=hidden_channels,
        concat_intermediates=concat_intermediates,
        drop_path_rates=drop_path_rates,
        dropout_rate=dropout_rate,
    )

    self._out_channels = out_channels
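Illustrative sketch of the forward contract, which expects a pair of feature maps; the activation class is passed directly here (string names resolvable by the activations factory should also be accepted):

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASDownStage

stage = YoloNASDownStage(in_channels=[96, 64], out_channels=128, width_mult=1.0, num_blocks=2, depth_mult=1.0, activation_type=nn.ReLU)

x = torch.rand(1, 96, 40, 40)     # path to be downsampled
skip = torch.rand(1, 64, 20, 20)  # skip connection already at the target resolution
print(stage((x, skip)).shape)     # torch.Size([1, 128, 20, 20])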

YoloNASStage

Bases: BaseDetectionModule

A single stage module for YoloNAS. It consists of a downsample block (QARepVGGBlock) followed by YoloNASCSPLayer.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 183-235
@register_detection_module()
class YoloNASStage(BaseDetectionModule):
    """
    A single stage module for YoloNAS. It consists of a downsample block (QARepVGGBlock) followed by YoloNASCSPLayer.
    """

    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks: int,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
        stride: int = 2,
    ):
        """
        Initialize the YoloNASStage module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param num_blocks: Number of bottleneck blocks in the YoloNASCSPLayer
        :param activation_type: Activation type for all blocks
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate values from the YoloNASCSPLayer.
        :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                                Must have the length equal to the num_blocks or None.
        :param dropout_rate: Dropout probability before the last convolution in this layer.
        """
        super().__init__(in_channels)
        self._out_channels = out_channels
        self.downsample = QARepVGGBlock(in_channels, out_channels, stride=stride, activation_type=activation_type, use_residual_connection=False)
        self.blocks = YoloNASCSPLayer(
            out_channels,
            out_channels,
            num_blocks,
            QARepVGGBlock,
            activation_type,
            True,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
            drop_path_rates=drop_path_rates,
            dropout_rate=dropout_rate,
        )

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, x):
        return self.blocks(self.downsample(x))

__init__(in_channels, out_channels, num_blocks, activation_type, hidden_channels=None, concat_intermediates=False, drop_path_rates=None, dropout_rate=0.0, stride=2)

Initialize the YoloNASStage module

Parameters:

Name Type Description Default
in_channels int

Number of input channels

required
out_channels int

Number of output channels

required
num_blocks int

Number of bottleneck blocks in the YoloNASCSPLayer

required
activation_type Type[nn.Module]

Activation type for all blocks

required
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks.

None
concat_intermediates bool

If True, concatenates the intermediate values from the YoloNASCSPLayer.

False
drop_path_rates Union[Iterable[float], None]

List of drop path probabilities for each bottleneck block. Must have the length equal to the num_blocks or None.

None
dropout_rate float

Dropout probability before the last convolution in this layer.

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 189-228
@resolve_param("activation_type", ActivationsTypeFactory())
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    num_blocks: int,
    activation_type: Type[nn.Module],
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
    stride: int = 2,
):
    """
    Initialize the YoloNASStage module
    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    :param num_blocks: Number of bottleneck blocks in the YoloNASCSPLayer
    :param activation_type: Activation type for all blocks
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
    :param concat_intermediates: If True, concatenates the intermediate values from the YoloNASCSPLayer.
    :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                            Must have the length equal to the num_blocks or None.
    :param dropout_rate: Dropout probability before the last convolution in this layer.
    """
    super().__init__(in_channels)
    self._out_channels = out_channels
    self.downsample = QARepVGGBlock(in_channels, out_channels, stride=stride, activation_type=activation_type, use_residual_connection=False)
    self.blocks = YoloNASCSPLayer(
        out_channels,
        out_channels,
        num_blocks,
        QARepVGGBlock,
        activation_type,
        True,
        hidden_channels=hidden_channels,
        concat_intermediates=concat_intermediates,
        drop_path_rates=drop_path_rates,
        dropout_rate=dropout_rate,
    )
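A minimal sketch of a single stage, i.e. a stride-2 downsample followed by the CSP layer (channel counts are arbitrary example choices):

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASStage

stage = YoloNASStage(in_channels=96, out_channels=192, num_blocks=2, activation_type=nn.ReLU)
print(stage(torch.rand(1, 96, 80, 80)).shape)  # torch.Size([1, 192, 40, 40])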

YoloNASStem

Bases: BaseDetectionModule, SupportsReplaceInputChannels

Stem module for YoloNAS. Consists of a single QARepVGGBlock with stride of two.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 153-180
@register_detection_module()
class YoloNASStem(BaseDetectionModule, SupportsReplaceInputChannels):
    """
    Stem module for YoloNAS. Consists of a single QARepVGGBlock with stride of two.
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
        """
        Initialize the YoloNASStem module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        """
        super().__init__(in_channels)
        self._out_channels = out_channels
        self.conv = QARepVGGBlock(in_channels, out_channels, stride=stride, use_residual_connection=False)

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, x: Tensor) -> Tensor:
        return self.conv(x)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.conv = QARepVGGBlock(in_channels, self._out_channels, stride=2, use_residual_connection=False)

    def get_input_channels(self) -> int:
        return self.conv.in_channels

__init__(in_channels, out_channels, stride=2)

Initialize the YoloNASStem module

Parameters:

Name Type Description Default
in_channels int

Number of input channels

required
out_channels int

Number of output channels

required
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 159-167
def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
    """
    Initialize the YoloNASStem module
    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    """
    super().__init__(in_channels)
    self._out_channels = out_channels
    self.conv = QARepVGGBlock(in_channels, out_channels, stride=stride, use_residual_connection=False)
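A minimal sketch of the stem; the single stride-2 block halves the spatial resolution:

import torch
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASStem

stem = YoloNASStem(in_channels=3, out_channels=48)
print(stem(torch.rand(1, 3, 640, 640)).shape)  # torch.Size([1, 48, 320, 320])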

YoloNASUpStage

Bases: BaseDetectionModule

Upsampling stage for YoloNAS.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 238-332
@register_detection_module()
class YoloNASUpStage(BaseDetectionModule):
    """
    Upsampling stage for YoloNAS.
    """

    @resolve_param("activation_type", ActivationsTypeFactory())
    @resolve_param("upsample_mode", TypeFactory.from_enum_cls(UpsampleMode))
    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        width_mult: float,
        num_blocks: int,
        depth_mult: float,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        reduce_channels: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
        upsample_mode: UpsampleMode = UpsampleMode.CONV_TRANSPOSE,
    ):
        """
        Initialize the YoloNASUpStage module
        :param in_channels: List of input channel counts: (x, skip) for two inputs or (x, skip1, skip2) for three inputs.
        :param out_channels: Number of output channels
        :param width_mult: Multiplier for the number of channels in the stage.
        :param num_blocks: Number of bottleneck blocks
        :param depth_mult: Multiplier for the number of blocks in the stage.
        :param activation_type: Activation type for all blocks
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks
        :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
        :param reduce_channels: If True, uses 1x1 convolutions to reduce the skip connections and the concatenated features to out_channels.
        """
        super().__init__(in_channels)

        num_inputs = len(in_channels)
        if num_inputs == 2:
            in_channels, skip_in_channels = in_channels
        else:
            in_channels, skip_in_channels1, skip_in_channels2 = in_channels
            skip_in_channels = skip_in_channels1 + out_channels  # skip2 downsample results in out_channels channels
        out_channels = width_multiplier(out_channels, width_mult, 8)
        num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

        if num_inputs == 2:
            self.reduce_skip = Conv(skip_in_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
        else:
            self.reduce_skip1 = Conv(skip_in_channels1, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
            self.reduce_skip2 = Conv(skip_in_channels2, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

        self.conv = Conv(in_channels, out_channels, 1, 1, activation_type)

        self.upsample = make_upsample_module_with_explicit_channels(
            in_channels=out_channels, out_channels=out_channels, scale_factor=2, upsample_mode=upsample_mode, align_corners=True
        )
        if num_inputs == 3:
            self.downsample = Conv(out_channels if reduce_channels else skip_in_channels2, out_channels, kernel=3, stride=2, activation_type=activation_type)

        self.reduce_after_concat = Conv(num_inputs * out_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

        after_concat_channels = out_channels if reduce_channels else out_channels + skip_in_channels
        self.blocks = YoloNASCSPLayer(
            after_concat_channels,
            out_channels,
            num_blocks,
            QARepVGGBlock,
            activation_type,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
            drop_path_rates=drop_path_rates,
            dropout_rate=dropout_rate,
        )

        self._out_channels = [out_channels, out_channels]

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs):
        if len(inputs) == 2:
            x, skip_x = inputs
            skip_x = [self.reduce_skip(skip_x)]
        else:
            x, skip_x1, skip_x2 = inputs
            skip_x1, skip_x2 = self.reduce_skip1(skip_x1), self.reduce_skip2(skip_x2)
            skip_x = [skip_x1, self.downsample(skip_x2)]
        x_inter = self.conv(x)
        x = self.upsample(x_inter)
        x = torch.cat([x, *skip_x], 1)
        x = self.reduce_after_concat(x)
        x = self.blocks(x)
        return x_inter, x

__init__(in_channels, out_channels, width_mult, num_blocks, depth_mult, activation_type, hidden_channels=None, concat_intermediates=False, reduce_channels=False, drop_path_rates=None, dropout_rate=0.0, upsample_mode=UpsampleMode.CONV_TRANSPOSE)

Initialize the YoloNASUpStage module

Parameters:

Name Type Description Default
in_channels List[int]

List of input channel counts: (x, skip) for two inputs or (x, skip1, skip2) for three inputs.

required
out_channels int

Number of output channels

required
width_mult float

Multiplier for the number of channels in the stage.

required
num_blocks int

Number of bottleneck blocks

required
depth_mult float

Multiplier for the number of blocks in the stage.

required
activation_type Type[nn.Module]

Activation type for all blocks

required
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks

None
concat_intermediates bool

If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.

False
reduce_channels bool

If True, uses 1x1 convolutions to reduce the skip connections and the concatenated features to out_channels.

False
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 244-313
@resolve_param("activation_type", ActivationsTypeFactory())
@resolve_param("upsample_mode", TypeFactory.from_enum_cls(UpsampleMode))
def __init__(
    self,
    in_channels: List[int],
    out_channels: int,
    width_mult: float,
    num_blocks: int,
    depth_mult: float,
    activation_type: Type[nn.Module],
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    reduce_channels: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
    upsample_mode: UpsampleMode = UpsampleMode.CONV_TRANSPOSE,
):
    """
    Initialize the YoloNASUpStage module
    :param in_channels: List of input channel counts: (x, skip) for two inputs or (x, skip1, skip2) for three inputs.
    :param out_channels: Number of output channels
    :param width_mult: Multiplier for the number of channels in the stage.
    :param num_blocks: Number of bottleneck blocks
    :param depth_mult: Multiplier for the number of blocks in the stage.
    :param activation_type: Activation type for all blocks
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks
    :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
    :param reduce_channels: If True, uses 1x1 convolutions to reduce the skip connections and the concatenated features to out_channels.
    """
    super().__init__(in_channels)

    num_inputs = len(in_channels)
    if num_inputs == 2:
        in_channels, skip_in_channels = in_channels
    else:
        in_channels, skip_in_channels1, skip_in_channels2 = in_channels
        skip_in_channels = skip_in_channels1 + out_channels  # skip2 downsample results in out_channels channels
    out_channels = width_multiplier(out_channels, width_mult, 8)
    num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

    if num_inputs == 2:
        self.reduce_skip = Conv(skip_in_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
    else:
        self.reduce_skip1 = Conv(skip_in_channels1, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
        self.reduce_skip2 = Conv(skip_in_channels2, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

    self.conv = Conv(in_channels, out_channels, 1, 1, activation_type)

    self.upsample = make_upsample_module_with_explicit_channels(
        in_channels=out_channels, out_channels=out_channels, scale_factor=2, upsample_mode=upsample_mode, align_corners=True
    )
    if num_inputs == 3:
        self.downsample = Conv(out_channels if reduce_channels else skip_in_channels2, out_channels, kernel=3, stride=2, activation_type=activation_type)

    self.reduce_after_concat = Conv(num_inputs * out_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

    after_concat_channels = out_channels if reduce_channels else out_channels + skip_in_channels
    self.blocks = YoloNASCSPLayer(
        after_concat_channels,
        out_channels,
        num_blocks,
        QARepVGGBlock,
        activation_type,
        hidden_channels=hidden_channels,
        concat_intermediates=concat_intermediates,
        drop_path_rates=drop_path_rates,
        dropout_rate=dropout_rate,
    )

    self._out_channels = [out_channels, out_channels]
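Illustrative sketch of the two-input case: the deeper map is projected with a 1x1 convolution, upsampled by 2, concatenated with the skip connection, and passed through the CSP layer; forward returns both the pre-upsample projection and the fused output.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASUpStage

stage = YoloNASUpStage(in_channels=[192, 96], out_channels=96, width_mult=1.0, num_blocks=2, depth_mult=1.0, activation_type=nn.ReLU)

x = torch.rand(1, 192, 20, 20)    # deeper feature map to be upsampled
skip = torch.rand(1, 96, 40, 40)  # lateral feature map at the target resolution
x_inter, out = stage((x, skip))
print(x_inter.shape, out.shape)   # torch.Size([1, 96, 20, 20]) torch.Size([1, 96, 40, 40])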

KDModule

Bases: SgModule

KDModule

class implementing Knowledge Distillation logic as an SgModule

Attributes:

    student: SgModule - the student model
    teacher: torch.nn.Module - the teacher model
    run_teacher_on_eval: bool - whether to run self.teacher in eval mode regardless of self.train(mode)
    arch_params: HpmStruct - architecture hyperparameters

    Additionally, by passing teacher_input_adapter (torch.nn.Module) one can modify the teacher net to act as if
    teacher = torch.nn.Sequential(teacher_input_adapter, teacher). This is useful when the teacher net expects a
    different input format from the student (for example, different normalization).
    An equivalent argument for the student model can be passed through student_input_adapter.
Source code in src/super_gradients/training/models/kd_modules/kd_module.py, lines 16-95
@register_model(Models.KD_MODULE)
@register_kd_model(Models.KD_MODULE)
class KDModule(SgModule):
    """
    KDModule

    class implementing Knowledge Distillation logic as an SgModule

    attributes:
        student: SgModule - the student model
        teacher: torch.nn.Module - the teacher model
        run_teacher_on_eval: bool - whether to run self.teacher in eval mode regardless of self.train(mode)
        arch_params: HpmStruct - Architecture H.P.

            Additionally, by passing teacher_input_adapter (torch.nn.Module) one can modify the teacher net to act as if
            teacher = torch.nn.Sequential(teacher_input_adapter, teacher). This is useful when teacher net expects a
            different input format from the student (for example different normalization).
            Equivalent arg for the student model, can be passed through student_input_adapter.

    """

    def __init__(self, arch_params: HpmStruct, student: SgModule, teacher: torch.nn.Module, run_teacher_on_eval=False):
        super(KDModule, self).__init__()
        self.arch_params = arch_params
        self.student = student
        self.teacher = teacher
        self.teacher_input_adapter = get_param(self.arch_params, "teacher_input_adapter")
        self.student_input_adapter = get_param(self.arch_params, "student_input_adapter")
        self.run_teacher_on_eval = run_teacher_on_eval
        self._freeze_teacher()

        # WHEN CREATING A MODULE SELF.TRAIN() ISN'T CALLED AND SO THE TEACHER MUST BE MOVED TO EVAL MODE EXPLICITLY
        if self.run_teacher_on_eval:
            self.teacher.eval()

    def _freeze_teacher(self):
        for p in self.teacher.parameters():
            p.requires_grad = False

        if self.teacher_input_adapter is not None:
            for p in self.teacher_input_adapter.parameters():
                p.requires_grad = False
            self.teacher_input_adapter.eval()

    def train(self, mode=True):
        self.student.train(mode)
        if not self.run_teacher_on_eval:
            self.teacher.train(mode)

    def eval(self):
        self.student.eval()
        self.teacher.eval()

    def forward(self, x):
        if self.student_input_adapter is not None:
            student_output = self.student(self.student_input_adapter(x))
        else:
            student_output = self.student(x)

        if self.teacher_input_adapter is not None:
            teacher_output = self.teacher(self.teacher_input_adapter(x))
        else:
            teacher_output = self.teacher(x)

        return KDOutput(student_output=student_output, teacher_output=teacher_output)

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        return self.student.initialize_param_groups(lr, training_params)

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        return self.student.update_param_groups(param_groups, lr, epoch, iter, training_params, total_batch)

    def replace_head(self, **kwargs):
        self.student.replace_head(**kwargs)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.student.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.student.get_input_channels()
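A minimal wiring sketch (architectures and class count are arbitrary; in a typical workflow this module is constructed by the knowledge-distillation training pipeline rather than by hand):

import torch
from super_gradients.training import models
from super_gradients.training.models.kd_modules.kd_module import KDModule
from super_gradients.training.utils import HpmStruct

student = models.get("resnet18", num_classes=10)
teacher = models.get("resnet50", num_classes=10)

kd_net = KDModule(arch_params=HpmStruct(), student=student, teacher=teacher, run_teacher_on_eval=True)
out = kd_net(torch.rand(2, 3, 224, 224))
print(out.student_output.shape, out.teacher_output.shape)  # expected: torch.Size([2, 10]) torch.Size([2, 10])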

get(model_name, arch_params=None, num_classes=None, strict_load=StrictLoad.NO_KEY_MATCHING, checkpoint_path=None, pretrained_weights=None, load_backbone=False, download_required_code=True, checkpoint_num_classes=None, num_input_channels=None)

Parameters:

Name Type Description Default
model_name str

Defines the model's architecture from models/ALL_ARCHITECTURES

required
arch_params Optional[dict]

Architecture hyper parameters. e.g.: block, num_blocks, etc.

None
num_classes Optional[int]

Number of classes (defines the net's structure). If None is given, will try to derive from pretrained_weight's corresponding dataset.

None
strict_load Union[str, StrictLoad]

See super_gradients.common.data_types.enum.strict_load.StrictLoad class documentation for details (default=NO_KEY_MATCHING to support SG-trained checkpoints)

StrictLoad.NO_KEY_MATCHING
checkpoint_path Optional[str]

The path to the external checkpoint to be loaded. Can be absolute or relative (ie: path/to/checkpoint.pth) path or URL. If provided, will automatically attempt to load the checkpoint.

None
pretrained_weights Optional[str]

Describe the dataset of the pretrained weights (for example "imagenet").

None
load_backbone bool

Load the provided checkpoint to model.backbone instead of model.

False
download_required_code bool

If model is not found in SG and is downloaded from a remote client, overriding this parameter with False will prevent additional code from being downloaded. This affects only models from remote client.

True
checkpoint_num_classes Optional[int]

num_classes of checkpoint_path / pretrained_weights, when checkpoint_path is not None. Used when num_classes != checkpoint_num_classes. In this case, the module will be initialized with checkpoint_num_classes, then the weights will be loaded. Finally, replace_head(new_num_classes=num_classes) is called (useful when performing transfer learning from a checkpoint outside of the ones offered in the SG model zoo).

None
num_input_channels Optional[int]

Number of input channels. If None, use the default model's input channels (most likely 3).

None

NOTE: Passing both pretrained_weights and checkpoint_path is ill-defined and will raise an error.
Source code in src/super_gradients/training/models/model_factory.py, lines 191-256
@resolve_param("strict_load", TypeFactory.from_enum_cls(StrictLoad))
def get(
    model_name: str,
    arch_params: Optional[dict] = None,
    num_classes: Optional[int] = None,
    strict_load: Union[str, StrictLoad] = StrictLoad.NO_KEY_MATCHING,
    checkpoint_path: Optional[str] = None,
    pretrained_weights: Optional[str] = None,
    load_backbone: bool = False,
    download_required_code: bool = True,
    checkpoint_num_classes: Optional[int] = None,
    num_input_channels: Optional[int] = None,
) -> Union[SgModule, torch.nn.Module]:
    """
    :param model_name:              Defines the model's architecture from models/ALL_ARCHITECTURES
    :param arch_params:             Architecture hyper parameters. e.g.: block, num_blocks, etc.
    :param num_classes:             Number of classes (defines the net's structure).
                                        If None is given, will try to derive from pretrained_weight's corresponding dataset.
    :param strict_load:             See super_gradients.common.data_types.enum.strict_load.StrictLoad class documentation for details
                                        (default=NO_KEY_MATCHING to support SG-trained checkpoints)
    :param checkpoint_path:         The path to the external checkpoint to be loaded. Can be absolute or relative (ie: path/to/checkpoint.pth) path or URL.
                                        If provided, will automatically attempt to load the checkpoint.
    :param pretrained_weights:      Describe the dataset of the pretrained weights (for example "imagenet").
    :param load_backbone:           Load the provided checkpoint to model.backbone instead of model.
    :param download_required_code:  If model is not found in SG and is downloaded from a remote client, overriding this parameter with False
                                        will prevent additional code from being downloaded. This affects only models from remote client.
    :param checkpoint_num_classes:  num_classes of checkpoint_path/ pretrained_weights, when checkpoint_path is not None.
                                        Used when num_classes != checkpoint_num_class. In this case, the module will be initialized with checkpoint_num_class,
                                        then weights will be loaded.
                                        Finally, replace_head(new_num_classes=num_classes) is called (useful when performing transfer learning
                                        from a checkpoint outside of the ones offered in the SG model zoo).
    :param num_input_channels:      Number of input channels.
                                        If None, use the default model's input channels (most likely 3).

    NOTE: Passing pretrained_weights and checkpoint_path is ill-defined and will raise an error.
    """
    checkpoint_num_classes = checkpoint_num_classes or num_classes

    if checkpoint_num_classes:
        net = instantiate_model(model_name, arch_params, checkpoint_num_classes, pretrained_weights, download_required_code)
    else:
        net = instantiate_model(model_name, arch_params, num_classes, pretrained_weights, download_required_code)

    if load_backbone and not checkpoint_path:
        raise ValueError("Please set checkpoint_path when load_backbone=True")

    if checkpoint_path:
        ckpt_entries = read_ckpt_state_dict(ckpt_path=checkpoint_path).keys()
        load_processing = "processing_params" in ckpt_entries
        load_ema_as_net = "ema_net" in ckpt_entries
        _ = load_checkpoint_to_model(
            ckpt_local_path=checkpoint_path,
            load_backbone=load_backbone,
            net=net,
            strict=strict_load,
            load_weights_only=True,
            load_ema_as_net=load_ema_as_net,
            load_processing_params=load_processing,
        )
    if checkpoint_num_classes != num_classes:
        net.replace_head(new_num_classes=num_classes)

    if num_input_channels is not None and num_input_channels != net.get_input_channels():
        net.replace_input_channels(in_channels=num_input_channels)

    return net
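Typical usage sketches (model name, class count, and checkpoint path are illustrative):

from super_gradients.training import models

# ImageNet-pretrained backbone with a new 10-class head.
model = models.get("resnet18", num_classes=10, pretrained_weights="imagenet")

# Alternatively, resume from a locally trained checkpoint (hypothetical path):
# model = models.get("resnet18", num_classes=10, checkpoint_path="/path/to/ckpt_best.pth")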

get_architecture(model_name, arch_params, download_required_code=True, download_platform_weights=True)

Get the corresponding architecture class.

Parameters:

Name Type Description Default
model_name str

Define the model's architecture from models/ALL_ARCHITECTURES

required
arch_params HpmStruct

Architecture hyper parameters. e.g.: block, num_blocks, etc.

required
download_required_code bool

if model is not found in SG and is downloaded from a remote client, overriding this parameter with False will prevent additional code from being downloaded. This affects only models from remote client.

True
download_platform_weights bool

bool; when getting a model from the platform, whether to download the pretrained weights as well. In any other case this parameter is ignored (default=True).

True

Returns:

Type Description
Tuple[Type[torch.nn.Module], HpmStruct, str, bool]
- architecture_cls: Class of the model
- arch_params: Might be updated if loading from remote deci lab
- pretrained_weights_path: Path to the pretrained weights from deci lab (None for local models or when the deci client is not enabled)
- is_remote: True if loading from remote deci lab
Source code in src/super_gradients/training/models/model_factory.py, lines 31-94
def get_architecture(
    model_name: str, arch_params: HpmStruct, download_required_code: bool = True, download_platform_weights: bool = True
) -> Tuple[Type[torch.nn.Module], HpmStruct, str, bool]:
    """
    Get the corresponding architecture class.

    :param model_name:          Define the model's architecture from models/ALL_ARCHITECTURES
    :param arch_params:         Architecture hyper parameters. e.g.: block, num_blocks, etc.
    :param download_required_code: if model is not found in SG and is downloaded from a remote client, overriding this parameter with False
                                        will prevent additional code from being downloaded. This affects only models from remote client.

    :param download_platform_weights:  bool, when getting a model from the platform, whether to download the pretrained weights as well.
        In any other case this parameter will be ignored. (default=True).

    :return:
        - architecture_cls:     Class of the model
        - arch_params:          Might be updated if loading from remote deci lab
        - pretrained_weights_path:   path to the pretrained weights from deci lab (None for local models or when deci
            client is not enabled).

        - is_remote:            True if loading from remote deci lab
    """
    pretrained_weights_path = None
    is_remote = False
    if not isinstance(model_name, str):
        raise ValueError(f"Input parameter `model_name` should be a string. Got {model_name} of type {type(model_name)}.")

    architecture = get_param(ARCHITECTURES, model_name)
    if model_name not in ARCHITECTURES.keys() and architecture is None:
        if client_enabled:
            logger.info(f'The requested model "{model_name}" was not found in SuperGradients. Trying to load a model from the Platform...')
            deci_client = DeciClient()

            _arch_params = deci_client.get_model_arch_params(model_name)
            if _arch_params is None:
                raise ValueError(
                    f'The requested model "{model_name}" was not found in the Platform. See docs or all_architectures.py for supported model names.'
                )
            else:
                logger.info(f'The requested model "{model_name}" is available in the platform and will now be downloaded...')

            if download_required_code:  # Some extra code might be required to instantiate the arch params.
                deci_client.download_and_load_model_additional_code(model_name, target_path=str(Path.cwd()))
                logger.debug(f'Additional code for model "{model_name}" has been downloaded from the platform.')

            _arch_params = hydra.utils.instantiate(_arch_params)
            if download_platform_weights:
                pretrained_weights_path = deci_client.get_model_weights(model_name)
                logger.info("The model weights were downloaded from the platform.")
            else:
                pretrained_weights_path = None
            model_name = _arch_params["model_name"]
            del _arch_params["model_name"]
            _arch_params = HpmStruct(**_arch_params)
            _arch_params.override(**arch_params.to_dict())
            arch_params, is_remote = _arch_params, True
        else:
            raise UnknownTypeException(
                message=f'The requested model "{model_name}" was not found in SuperGradients. See docs or all_architectures.py for supported model names.',
                unknown_type=model_name,
                choices=list(ARCHITECTURES.keys()),
            )

    return get_param(ARCHITECTURES, model_name), arch_params, pretrained_weights_path, is_remote

get_model_name(model)

Get the name of a model loaded by SuperGradients' models.get(). If the model was not loaded using models.get(), return None.

Source code in src/super_gradients/training/models/model_factory.py, lines 186-188
def get_model_name(model: torch.nn.Module) -> Optional[str]:
    """Get the name of a model loaded by SuperGradients' `models.get()`. If the model was not loaded using `models.get()`, return None."""
    return getattr(model, "_sg_model_name", None)
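A short sketch of the behaviour described above (assuming the import path matches the source file location):

import torch
from super_gradients.training import models
from super_gradients.training.models.model_factory import get_model_name

model = models.get("resnet18", num_classes=10)
print(get_model_name(model))                  # expected: "resnet18"
print(get_model_name(torch.nn.Linear(8, 2)))  # None - the module was not loaded via models.get()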

instantiate_model(model_name, arch_params, num_classes, pretrained_weights=None, download_required_code=True)

Instantiates nn.Module according to architecture and arch_params, and handles pretrained weights and the required module manipulation (i.e head replacement).

Parameters:

Name Type Description Default
model_name str

Define the model's architecture from models/ALL_ARCHITECTURES

required
arch_params dict

Architecture hyper parameters. e.g.: block, num_blocks, etc.

required
num_classes int

Number of classes (defines the net's structure). If None is given, will try to derive from the pretrained_weights' corresponding dataset.

required
pretrained_weights str

Describe the dataset of the pretrained weights (for example "imagenet"). Add platform/ prefix if the weights are stored in the platform. Please note that in this case, num_classes is expected to be the checkpoint's number of classes, and not the number of classes that you want to use; you will need to replace the head afterward if you want to work with a different number of classes.

None
download_required_code bool

if model is not found in SG and is downloaded from a remote client, overriding this parameter with False will prevent additional code from being downloaded. This affects only models from remote client.

True

Returns:

Type Description
Union[SgModule, torch.nn.Module]

Instantiated model, i.e. torch.nn.Module, and architecture_class (None when architecture is not a str)

Source code in src/super_gradients/training/models/model_factory.py, lines 97-176
def instantiate_model(
    model_name: str, arch_params: dict, num_classes: int, pretrained_weights: str = None, download_required_code: bool = True
) -> Union[SgModule, torch.nn.Module]:
    """
    Instantiates nn.Module according to architecture and arch_params, and handles pretrained weights and the required
        module manipulation (i.e head replacement).

    :param model_name:          Define the model's architecture from models/ALL_ARCHITECTURES
    :param arch_params:         Architecture hyper parameters. e.g.: block, num_blocks, etc.
    :param num_classes:         Number of classes (defines the net's structure).
                                    If None is given, it will be derived from the dataset associated with pretrained_weights.
    :param pretrained_weights:  Name of the dataset the pretrained weights were trained on (for example "imagenet").
                                Add `platform/` prefix if the weights are stored in the platform.
                                Please note that in this case, `num_classes` is expected to be the checkpoint's number of classes, and not the number of classes
                                that you want to use - you will need to replace the head afterwards if you want to work with a different number of classes.
    :param download_required_code: If the model is not found in SG and is downloaded from a remote client, setting this parameter to False
                                will prevent additional code from being downloaded. This only affects models from the remote client.

    :return:                    Instantiated model, i.e. torch.nn.Module; architecture_class (will be None when the architecture is not a str)
    """
    if arch_params is None:
        arch_params = {}
    arch_params = core_utils.HpmStruct(**arch_params)
    download_platform_weights = isinstance(pretrained_weights, str) and pretrained_weights.startswith("platform/")
    architecture_cls, arch_params, pretrained_weights_path, is_remote = get_architecture(
        model_name, arch_params, download_required_code, download_platform_weights
    )

    if not issubclass(architecture_cls, SgModule):
        net = architecture_cls(**arch_params.to_dict(include_schema=False))
    else:
        if core_utils.get_param(arch_params, "num_classes"):
            logger.warning(
                "Passing num_classes through arch_params is deprecated and will be removed in the next version. " "Pass num_classes explicitly to models.get"
            )
            num_classes = num_classes or arch_params.num_classes

        if num_classes is not None:
            arch_params.override(num_classes=num_classes)

        if pretrained_weights is None and num_classes is None:
            raise ValueError("num_classes or pretrained_weights must be passed to determine net's structure.")

        if pretrained_weights:
            if pretrained_weights in PRETRAINED_NUM_CLASSES.keys():
                num_classes_new_head = core_utils.get_param(arch_params, "num_classes", PRETRAINED_NUM_CLASSES[pretrained_weights])
                arch_params.num_classes = PRETRAINED_NUM_CLASSES[pretrained_weights]
            elif not download_platform_weights:
                raise ValueError(
                    f'`pretrained_weights="{pretrained_weights}"` is not valid and was not found on the platform. '
                    f'Valid pretrained weights are: "{PRETRAINED_NUM_CLASSES.keys()}"'
                )

        # Most of the SG models take a single parameter named "arch_params" of type HpmStruct, but a few take
        # **kwargs instead
        if "arch_params" not in get_callable_param_names(architecture_cls):
            net = architecture_cls(**arch_params.to_dict(include_schema=False))
        else:
            net = architecture_cls(arch_params=arch_params)

        if pretrained_weights:
            # The logic is as follows - first we initialize the preprocessing params using default hard-coded params
            # If pretrained checkpoint contains preprocessing params, new params will be loaded and override the ones from
            # this step in load_pretrained_weights_local/load_pretrained_weights
            if isinstance(net, HasPredict):
                processing_params = get_pretrained_processing_params(model_name, pretrained_weights)
                net.set_dataset_processing_params(**processing_params)

            if is_remote and pretrained_weights_path:
                load_pretrained_weights_local(net, model_name, pretrained_weights_path)
            else:
                load_pretrained_weights(net, model_name, pretrained_weights)

            if pretrained_weights in PRETRAINED_NUM_CLASSES.keys() and num_classes_new_head != arch_params.num_classes:
                net.replace_head(new_num_classes=num_classes_new_head)
                arch_params.num_classes = num_classes_new_head

    _add_model_name_attribute(net, model_name)

    return net
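
instantiate_model() is normally reached through the public models.get() entry point. The hedged sketch below assumes the "resnet18" architecture and its "imagenet" pretrained weights are available:

from super_gradients.training import models

# num_classes differs from the checkpoint's ImageNet classes, so after the
# pretrained weights are loaded the head is replaced for 10 classes.
net = models.get("resnet18", num_classes=10, pretrained_weights="imagenet")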

AdaptBlock

Bases: nn.Module

Residual block with deformable convolution

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
class AdaptBlock(nn.Module):
    """
    Residual block with deformable convolution
    """

    expansion = 1

    def __init__(self, inplanes, outplanes, stride=1, downsample=None, dilation=1, deformable_groups=1):
        super(AdaptBlock, self).__init__()
        regular_matrix = torch.tensor([[-1, -1, -1, 0, 0, 0, 1, 1, 1], [-1, 0, 1, -1, 0, 1, -1, 0, 1]])
        self.register_buffer("regular_matrix", regular_matrix.float())
        self.downsample = downsample
        self.transform_matrix_conv = nn.Conv2d(inplanes, 4, 3, 1, 1, bias=True)
        self.translation_conv = nn.Conv2d(inplanes, 2, 3, 1, 1, bias=True)

        self.adapt_conv = torchvision.ops.DeformConv2d(
            inplanes, outplanes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False, groups=deformable_groups
        )

        self.bn = nn.BatchNorm2d(outplanes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = x

        N, _, H, W = x.shape
        transform_matrix = self.transform_matrix_conv(x)
        transform_matrix = transform_matrix.permute(0, 2, 3, 1).reshape((N * H * W, 2, 2))
        offset = torch.matmul(transform_matrix, self.regular_matrix)
        offset = offset - self.regular_matrix
        offset = offset.transpose(1, 2).reshape((N, H, W, 18)).permute(0, 3, 1, 2)

        translation = self.translation_conv(x)
        offset[:, 0::2, :, :] += translation[:, 0:1, :, :]
        offset[:, 1::2, :, :] += translation[:, 1:2, :, :]

        out = self.adapt_conv(x, offset)
        out = self.bn(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
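
A hedged shape check for AdaptBlock (the import path is assumed from the source path above, and torchvision's DeformConv2d must be available). With inplanes == outplanes and stride=1, no downsample module is needed, so the residual addition works directly:

import torch
from super_gradients.training.models.pose_estimation_models.dekr_hrnet import AdaptBlock

block = AdaptBlock(inplanes=32, outplanes=32)
x = torch.randn(2, 32, 16, 16)
print(block(x).shape)  # torch.Size([2, 32, 16, 16]) - spatial size is preserved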

BasicBlock

Bases: nn.Module

ResNet basic block

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
class BasicBlock(nn.Module):
    """
    ResNet basic block
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

Bottleneck

Bases: nn.Module

ResNet bottleneck block

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
class Bottleneck(nn.Module):
    """
    ResNet bottleneck block
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

DEKRPoseEstimationModel

Bases: SgModule, HasPredict

Implementation of HRNet model from DEKR paper (https://arxiv.org/abs/2104.02300).

The model takes an image of (B,C,H,W) shape and outputs two tensors (heatmap, offset) as predictions: - heatmap (B, NumJoints+1,H * upsample_factor, W * upsample_factor) - offset (B, NumJoints*2, H * upsample_factor, W * upsample_factor)

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
@register_model(Models.DEKR_CUSTOM)
class DEKRPoseEstimationModel(SgModule, HasPredict):
    """
    Implementation of HRNet model from DEKR paper (https://arxiv.org/abs/2104.02300).

    The model takes an image of (B,C,H,W) shape and outputs two tensors (heatmap, offset) as predictions:
      - heatmap (B, NumJoints+1,H * upsample_factor, W * upsample_factor)
      - offset (B, NumJoints*2, H * upsample_factor, W * upsample_factor)
    """

    def __init__(self, arch_params):
        super(DEKRPoseEstimationModel, self).__init__()

        # stem net
        in_channels = get_param(arch_params, "in_channels", 3)
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(Bottleneck, 64, 64, 4)

        # build stage
        self.spec = arch_params.SPEC
        self.stages_spec = self.spec.STAGES
        self.num_stages = self.spec.STAGES.NUM_STAGES
        num_channels_last = [256]
        for i in range(self.num_stages):
            num_channels = self.stages_spec.NUM_CHANNELS[i]
            transition_layer = self._make_transition_layer(num_channels_last, num_channels)
            setattr(self, "transition{}".format(i + 1), transition_layer)

            stage, num_channels_last = self._make_stage(self.stages_spec, i, num_channels, True)
            setattr(self, "stage{}".format(i + 2), stage)

        # build head net
        self.head_inp_channels = int(sum(self.stages_spec.NUM_CHANNELS[-1]))
        self.config_heatmap = self.spec.HEAD_HEATMAP
        self.config_offset = self.spec.HEAD_OFFSET
        self.num_joints = arch_params.num_classes
        self.num_offset = self.num_joints * 2
        self.num_joints_with_center = self.num_joints + 1
        self.offset_prekpt = self.config_offset["NUM_CHANNELS_PERKPT"]

        offset_channels = self.num_joints * self.offset_prekpt
        self.transition_heatmap = self._make_transition_for_head(self.head_inp_channels, self.config_heatmap["NUM_CHANNELS"])
        self.transition_offset = self._make_transition_for_head(self.head_inp_channels, offset_channels)
        self.head_heatmap = self._make_heatmap_head(self.config_heatmap)
        self.offset_feature_layers, self.offset_final_layer = self._make_separete_regression_head(self.config_offset)
        self.heatmap_activation = nn.Sigmoid() if self.config_heatmap["HEATMAP_APPLY_SIGMOID"] else nn.Identity()
        self.init_weights()

    def replace_head(self, new_num_classes: int):
        self.num_joints = new_num_classes
        self.num_offset = new_num_classes * 2
        self.num_joints_with_center = new_num_classes + 1

        offset_channels = self.num_joints * self.offset_prekpt
        self.head_heatmap = self._make_heatmap_head(self.config_heatmap)
        self.transition_offset = self._make_transition_for_head(self.head_inp_channels, offset_channels)
        self.offset_feature_layers, self.offset_final_layer = self._make_separete_regression_head(self.config_offset)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.conv1 = replace_conv2d_input_channels(conv=self.conv1, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.conv1.in_channels

    def _make_transition_for_head(self, inplanes: int, outplanes: int) -> nn.Module:
        transition_layer = [nn.Conv2d(inplanes, outplanes, 1, 1, 0, bias=False), nn.BatchNorm2d(outplanes), nn.ReLU(True)]
        return nn.Sequential(*transition_layer)

    def _make_heatmap_head(self, layer_config: Mapping[str, Any]) -> nn.ModuleList:
        heatmap_head_layers = []

        feature_conv = self._make_layer(
            blocks_dict[layer_config["BLOCK"]],
            layer_config["NUM_CHANNELS"],
            layer_config["NUM_CHANNELS"],
            layer_config["NUM_BLOCKS"],
            dilation=layer_config["DILATION_RATE"],
        )
        heatmap_head_layers.append(feature_conv)

        heatmap_conv = nn.Conv2d(
            in_channels=layer_config["NUM_CHANNELS"],
            out_channels=self.num_joints_with_center,
            kernel_size=self.spec.FINAL_CONV_KERNEL,
            stride=1,
            padding=1 if self.spec.FINAL_CONV_KERNEL == 3 else 0,
        )
        heatmap_head_layers.append(heatmap_conv)

        return nn.ModuleList(heatmap_head_layers)

    def _make_separete_regression_head(self, layer_config) -> Tuple[nn.ModuleList, nn.ModuleList]:
        """
        Build offset regression head for each joint
        :param layer_config:
        :return:
        """
        offset_feature_layers = []
        offset_final_layer = []

        for _ in range(self.num_joints):
            feature_conv = self._make_layer(
                blocks_dict[layer_config["BLOCK"]],
                layer_config["NUM_CHANNELS_PERKPT"],
                layer_config["NUM_CHANNELS_PERKPT"],
                layer_config["NUM_BLOCKS"],
                dilation=layer_config["DILATION_RATE"],
            )
            offset_feature_layers.append(feature_conv)

            offset_conv = nn.Conv2d(
                in_channels=layer_config["NUM_CHANNELS_PERKPT"],
                out_channels=2,
                kernel_size=self.spec.FINAL_CONV_KERNEL,
                stride=1,
                padding=1 if self.spec.FINAL_CONV_KERNEL == 3 else 0,
            )
            offset_final_layer.append(offset_conv)

        return nn.ModuleList(offset_feature_layers), nn.ModuleList(offset_final_layer)

    def _make_layer(self, block, inplanes, planes, blocks, stride=1, dilation=1):
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(inplanes, planes, stride, downsample, dilation=dilation))
        inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(inplanes, planes, dilation=dilation))

        return nn.Sequential(*layers)

    def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True),
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i + 1 - num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
                    conv3x3s.append(nn.Sequential(nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False), nn.BatchNorm2d(outchannels), nn.ReLU(inplace=True)))
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_stage(self, stages_spec, stage_index, num_inchannels, multi_scale_output=True):
        num_modules = stages_spec.NUM_MODULES[stage_index]
        num_branches = stages_spec.NUM_BRANCHES[stage_index]
        num_blocks = stages_spec.NUM_BLOCKS[stage_index]
        num_channels = stages_spec.NUM_CHANNELS[stage_index]
        block = blocks_dict[stages_spec["BLOCK"][stage_index]]
        fuse_method = stages_spec.FUSE_METHOD[stage_index]

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used by the last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(HighResolutionModule(num_branches, block, num_blocks, num_inchannels, num_channels, fuse_method, reset_multi_scale_output))
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        y_list = [x]
        for i in range(self.num_stages):
            x_list = []
            transition = getattr(self, "transition{}".format(i + 1))
            for j in range(self.stages_spec["NUM_BRANCHES"][i]):
                if transition[j]:
                    x_list.append(transition[j](y_list[-1]))
                else:
                    x_list.append(y_list[j])
            y_list = getattr(self, "stage{}".format(i + 2))(x_list)

        x0_h, x0_w = y_list[0].size(2), y_list[0].size(3)
        x = torch.cat(
            [
                y_list[0],
                F.upsample(y_list[1], size=(x0_h, x0_w), mode="bilinear"),
                F.upsample(y_list[2], size=(x0_h, x0_w), mode="bilinear"),
                F.upsample(y_list[3], size=(x0_h, x0_w), mode="bilinear"),
            ],
            1,
        )

        heatmap = self.head_heatmap[1](self.head_heatmap[0](self.transition_heatmap(x)))

        final_offset = []
        offset_feature = self.transition_offset(x)

        for j in range(self.num_joints):
            final_offset.append(
                self.offset_final_layer[j](self.offset_feature_layers[j](offset_feature[:, j * self.offset_prekpt : (j + 1) * self.offset_prekpt]))
            )

        offset = torch.cat(final_offset, dim=1)
        return self.heatmap_activation(heatmap), offset

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ["bias"]:
                        nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        for m in self.modules():
            if hasattr(m, "transform_matrix_conv"):
                nn.init.constant_(m.transform_matrix_conv.weight, 0)
                if hasattr(m, "bias"):
                    nn.init.constant_(m.transform_matrix_conv.bias, 0)
            if hasattr(m, "translation_conv"):
                nn.init.constant_(m.translation_conv.weight, 0)
                if hasattr(m, "bias"):
                    nn.init.constant_(m.translation_conv.bias, 0)

    @staticmethod
    def get_post_prediction_callback(conf: float = 0.05):
        return DEKRPoseEstimationDecodeCallback(
            min_confidence=conf,
            keypoint_threshold=0.05,
            nms_threshold=0.05,
            apply_sigmoid=True,
            max_num_people=30,
            nms_num_threshold=8,
            output_stride=4,
        )

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        edge_links: Union[np.ndarray, List[Tuple[int, int]]],
        edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        image_processor: Optional[Processing] = None,
        conf: Optional[float] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param conf:            (Optional) Below the confidence threshold, predictions are discarded
        """
        self._edge_links = edge_links or self._edge_links
        self._edge_colors = edge_colors or self._edge_colors
        self._keypoint_colors = keypoint_colors or self._keypoint_colors
        self._image_processor = image_processor or self._image_processor
        self._default_nms_conf = conf or self._default_nms_conf

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True
    ) -> PoseEstimationPipeline:
        """Instantiate the prediction pipeline of this model.

        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        """
        if None in (self._edge_links, self._image_processor, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        conf = conf or self._default_nms_conf

        if len(self._keypoint_colors) != self.num_joints:
            raise RuntimeError(
                "The number of colors for the keypoints ({}) does not match the number of joints ({})".format(len(self._keypoint_colors), self.num_joints)
            )
        if len(self._edge_colors) != len(self._edge_links):
            raise RuntimeError(
                "The number of colors for the joints ({}) does not match the number of joint links ({})".format(len(self._edge_colors), len(self._edge_links))
            )

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(KeypointsAutoPadding(shape_multiple=(32, 32), pad_value=0))
        else:
            image_processor = self._image_processor

        pipeline = PoseEstimationPipeline(
            model=self,
            image_processor=image_processor,
            edge_links=self._edge_links,
            edge_colors=self._edge_colors,
            keypoint_colors=self._keypoint_colors,
            post_prediction_callback=self.get_post_prediction_callback(conf=conf),
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> ImagesPoseEstimationPrediction:
        """Predict an image or a list of images.

        :param images:  Images to predict.
        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True):
        """Predict using webcam.

        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        """
        pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)
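
A hedged shape check mirroring the heatmap/offset description above; it assumes the "dekr_w32_no_dc" model name and its "coco_pose" pretrained weights are available:

import torch
from super_gradients.training import models

model = models.get("dekr_w32_no_dc", pretrained_weights="coco_pose").eval()
with torch.no_grad():
    heatmap, offset = model(torch.randn(1, 3, 256, 256))
print(heatmap.shape)  # NumJoints+1 channels (17 COCO joints + center), at 1/4 of the input resolution
print(offset.shape)   # NumJoints*2 channels, same spatial size as the heatmap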

predict(images, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
def predict(
    self,
    images: ImageSource,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> ImagesPoseEstimationPrediction:
    """Predict an image or a list of images.

    :param images:  Images to predict.
    :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                    If None, the default value associated to the training is used.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore
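
A hedged end-to-end usage sketch for predict() (the model name, pretrained weights and image path below are placeholders):

from super_gradients.training import models

model = models.get("dekr_w32_no_dc", pretrained_weights="coco_pose")
results = model.predict("path/to/image.jpg", conf=0.3, fuse_model=False)
results.show()  # visualize the predicted poses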

predict_webcam(conf=None, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
def predict_webcam(self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True):
    """Predict using webcam.

    :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                    If None, the default value associated to the training is used.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    """
    pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    pipeline.predict_webcam()

set_dataset_processing_params(edge_links, edge_colors, keypoint_colors, image_processor=None, conf=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded

None
Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    edge_links: Union[np.ndarray, List[Tuple[int, int]]],
    edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    image_processor: Optional[Processing] = None,
    conf: Optional[float] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param conf:            (Optional) Below the confidence threshold, predictions are discarded
    """
    self._edge_links = edge_links or self._edge_links
    self._edge_colors = edge_colors or self._edge_colors
    self._keypoint_colors = keypoint_colors or self._keypoint_colors
    self._image_processor = image_processor or self._image_processor
    self._default_nms_conf = conf or self._default_nms_conf

DEKRW32NODC

Bases: DEKRPoseEstimationModel

DEKR-W32 model for pose estimation without deformable convolutions.

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
669
670
671
672
673
674
675
676
677
678
679
680
@register_model(Models.DEKR_W32_NO_DC)
class DEKRW32NODC(DEKRPoseEstimationModel):
    """
    DEKR-W32 model for pose estimation without deformable convolutions.
    """

    def __init__(self, arch_params):
        POSE_DEKR_W32_NO_DC_ARCH_PARAMS = get_arch_params("pose_dekr_w32_no_dc_arch_params")

        merged_arch_params = HpmStruct(**copy.deepcopy(POSE_DEKR_W32_NO_DC_ARCH_PARAMS))
        merged_arch_params.override(**arch_params.to_dict())
        super().__init__(merged_arch_params)

PoseRescoringNet

Bases: SgModule

Rescoring network for pose estimation. It takes input features and predicts a single scalar score that acts as a multiplicative factor for the original score prediction. The model learns which joint configurations are reasonable/possible, so it can downweight the confidence of impossible joint configurations.

The model is a simple 3-layer MLP with ReLU activation. The input is the concatenation of the predicted poses and prior information in the form of the joint links. See RescoringNet.get_feature() for details. The output is a single scalar value.

Source code in src/super_gradients/training/models/pose_estimation_models/rescoring_net.py
@register_model(Models.POSE_RESCORING)
class PoseRescoringNet(SgModule):
    """
    Rescoring network for pose estimation. It takes input features and predicts the single scalar score
    which is the multiplication factor for original score prediction. This model learns what are the reasonable/possible
    joint configurations. So it may downweight confidence of impossible joint configurations.

    The model is a simple 3-layer MLP with ReLU activation. The input is the concatenation of the predicted poses and prior
    information in the form of the joint links. See RescoringNet.get_feature() for details.
    The output is a single scalar value.
    """

    def __init__(self, num_classes: int, hidden_channels: int, num_layers: int, edge_links: List[Tuple[int, int]]):
        super(PoseRescoringNet, self).__init__()
        in_channels = len(edge_links) * 2 + len(edge_links) + num_classes  # [joint_relate, joint_length, visibility]
        layers = []
        for _ in range(num_layers):
            layers.append(nn.Linear(in_channels, hidden_channels, bias=True))
            layers.append(nn.ReLU())
            in_channels = hidden_channels
        self.layers = nn.Sequential(*layers)
        self.final = nn.Linear(hidden_channels, 1, bias=True)
        self.edge_links = torch.tensor(edge_links).long()

    def forward(self, poses: Tensor) -> Tuple[Tensor, Tensor]:
        """

        :param poses: Predicted poses of shape [N, J, 3] or [B, N, J, 3]
        :return: Tuple of input poses and corresponding scores
        """

        x = self.get_feature(poses, self.edge_links)
        x = self.layers(x)
        y_pred = self.final(x)
        return poses, y_pred

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

    @classmethod
    def get_feature(cls, poses: Tensor, edge_links: Tensor) -> Tensor:
        """
        Compute the feature vector input to the rescoring network.

        :param poses: [N, J, 3] Predicted poses
        :param edge_links: [L,2] List of joint indices
        :return: [N, L*2+L+J] Feature vector
        """
        joint_xy = poses[..., :2]
        visibility = poses[..., 2]

        joint_1 = edge_links[:, 0]
        joint_2 = edge_links[:, 1]

        # To get the Delta x Delta y
        joint_relate = joint_xy[..., joint_1, :] - joint_xy[..., joint_2, :]  # [N, L, 2]
        joint_length = ((joint_relate**2)[..., 0] + (joint_relate**2)[..., 1]) ** (0.5)  # [N, L]

        # To use the torso distance to normalize
        normalize = (joint_length[..., 9] + joint_length[..., 11]) / 2  # [N] # NOTE: THIS IS COCO-SPECIFIC
        normalize_tiled = torch.tile(normalize, (len(joint_1), 2, 1)).permute(2, 0, 1)
        normalize_tiled = normalize_tiled.clamp_min(1)

        joint_length = joint_length / normalize_tiled[..., 0]
        joint_relate = joint_relate / normalize_tiled
        joint_relate = torch.flatten(joint_relate, start_dim=-2)  # .reshape((-1, len(joint_1) * 2))

        feature = [joint_relate, joint_length, visibility]
        feature = torch.cat(feature, dim=-1)
        return feature
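
A hedged sketch that scores a batch of 17-joint poses; the import path follows the source path above, and the skeleton below is illustrative rather than the official COCO edge_links:

import torch
from super_gradients.training.models.pose_estimation_models.rescoring_net import PoseRescoringNet

edge_links = [(i, i + 1) for i in range(16)]  # 16 illustrative links between 17 joints
net = PoseRescoringNet(num_classes=17, hidden_channels=256, num_layers=2, edge_links=edge_links)

poses = torch.rand(8, 17, 3)   # [N, J, 3] -> x, y, visibility
poses_out, scores = net(poses)
print(scores.shape)            # torch.Size([8, 1]) - one rescoring factor per pose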

forward(poses)

Parameters:

Name Type Description Default
poses Tensor

Predicted poses of shape [N, J, 3] or [B, N, J, 3]

required

Returns:

Type Description
Tuple[Tensor, Tensor]

Tuple of input poses and corresponding scores

Source code in src/super_gradients/training/models/pose_estimation_models/rescoring_net.py
def forward(self, poses: Tensor) -> Tuple[Tensor, Tensor]:
    """

    :param poses: Predicted poses of shape [N, J, 3] or [B, N, J, 3]
    :return: Tuple of input poses and corresponding scores
    """

    x = self.get_feature(poses, self.edge_links)
    x = self.layers(x)
    y_pred = self.final(x)
    return poses, y_pred

get_feature(poses, edge_links) classmethod

Compute the feature vector input to the rescoring network.

Parameters:

Name Type Description Default
poses Tensor

[N, J, 3] Predicted poses

required
edge_links Tensor

[L,2] List of joint indices

required

Returns:

Type Description
Tensor

[N, L*2+L+J] Feature vector

Source code in src/super_gradients/training/models/pose_estimation_models/rescoring_net.py
@classmethod
def get_feature(cls, poses: Tensor, edge_links: Tensor) -> Tensor:
    """
    Compute the feature vector input to the rescoring network.

    :param poses: [N, J, 3] Predicted poses
    :param edge_links: [L,2] List of joint indices
    :return: [N, L*2+L+J] Feature vector
    """
    joint_xy = poses[..., :2]
    visibility = poses[..., 2]

    joint_1 = edge_links[:, 0]
    joint_2 = edge_links[:, 1]

    # To get the Delta x Delta y
    joint_relate = joint_xy[..., joint_1, :] - joint_xy[..., joint_2, :]  # [N, L, 2]
    joint_length = ((joint_relate**2)[..., 0] + (joint_relate**2)[..., 1]) ** (0.5)  # [N, L]

    # To use the torso distance to normalize
    normalize = (joint_length[..., 9] + joint_length[..., 11]) / 2  # [N] # NOTE: THIS IS COCO-SPECIFIC
    normalize_tiled = torch.tile(normalize, (len(joint_1), 2, 1)).permute(2, 0, 1)
    normalize_tiled = normalize_tiled.clamp_min(1)

    joint_length = joint_length / normalize_tiled[..., 0]
    joint_relate = joint_relate / normalize_tiled
    joint_relate = torch.flatten(joint_relate, start_dim=-2)  # .reshape((-1, len(joint_1) * 2))

    feature = [joint_relate, joint_length, visibility]
    feature = torch.cat(feature, dim=-1)
    return feature
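
A hedged sketch of the resulting feature layout, L*2 relative offsets + L link lengths + J visibilities (illustrative skeleton, import path assumed from the source path above):

import torch
from super_gradients.training.models.pose_estimation_models.rescoring_net import PoseRescoringNet

edge_links = torch.tensor([(i, i + 1) for i in range(16)], dtype=torch.long)  # L = 16 links
poses = torch.rand(4, 17, 3)                                                  # N = 4, J = 17
feature = PoseRescoringNet.get_feature(poses, edge_links)
print(feature.shape)  # torch.Size([4, 65]) -> 16*2 + 16 + 17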

YoloNASPoseDFLHead

Bases: BaseDetectionModule, SupportsReplaceNumClasses

YoloNASPoseDFLHead is the head used in the YoloNASPose model. This class implements single-class object detection and keypoint regression on a single-scale feature map.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_dfl_head.py
@register_detection_module()
class YoloNASPoseDFLHead(BaseDetectionModule, SupportsReplaceNumClasses):
    """
    YoloNASPoseDFLHead is the head used in YoloNASPose model.
    This class implements single-class object detection and keypoints regression on a single scale feature map
    """

    def __init__(
        self,
        in_channels: int,
        bbox_inter_channels: int,
        pose_inter_channels: int,
        pose_regression_blocks: int,
        shared_stem: bool,
        pose_conf_in_class_head: bool,
        pose_block_use_repvgg: bool,
        width_mult: float,
        first_conv_group_size: int,
        num_classes: int,
        stride: int,
        reg_max: int,
        cls_dropout_rate: float = 0.0,
        reg_dropout_rate: float = 0.0,
    ):
        """
        Initialize the YoloNASDFLHead
        :param in_channels: Input channels
        :param bbox_inter_channels: Intermediate number of channels for box detection & regression
        :param pose_inter_channels: Intermediate number of channels for pose regression
        :param shared_stem: Whether to share the stem between the pose and bbox heads
        :param pose_conf_in_class_head: Whether to include the pose confidence in the classification head
        :param width_mult: Width multiplier
        :param first_conv_group_size: Group size
        :param num_classes: Number of keypoints classes for pose regression. Number of detection classes is always 1.
        :param stride: Output stride for this head
        :param reg_max: Number of bins in the regression head
        :param cls_dropout_rate: Dropout rate for the classification head
        :param reg_dropout_rate: Dropout rate for the regression head
        """
        super().__init__(in_channels)

        bbox_inter_channels = width_multiplier(bbox_inter_channels, width_mult, 8)
        pose_inter_channels = width_multiplier(pose_inter_channels, width_mult, 8)

        if first_conv_group_size == 0:
            groups = 0
        elif first_conv_group_size == -1:
            groups = 1
        else:
            groups = bbox_inter_channels // first_conv_group_size

        self.num_classes = num_classes
        self.shared_stem = shared_stem
        self.pose_conf_in_class_head = pose_conf_in_class_head

        if self.shared_stem:
            max_input = max(bbox_inter_channels, pose_inter_channels)
            self.stem = ConvBNReLU(in_channels, max_input, kernel_size=1, stride=1, padding=0, bias=False)

            if max_input != pose_inter_channels:
                self.pose_stem = nn.Conv2d(max_input, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
            else:
                self.pose_stem = nn.Identity()

            if max_input != bbox_inter_channels:
                self.bbox_stem = nn.Conv2d(max_input, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
            else:
                self.bbox_stem = nn.Identity()

        else:
            self.stem = nn.Identity()
            self.pose_stem = ConvBNReLU(in_channels, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
            self.bbox_stem = ConvBNReLU(in_channels, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

        first_cls_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        first_reg_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        if pose_block_use_repvgg:
            pose_block = partial(QARepVGGBlock, use_alpha=True)
        else:
            pose_block = partial(ConvBNReLU, kernel_size=3, stride=1, padding=1, bias=False)

        pose_convs = [pose_block(pose_inter_channels, pose_inter_channels) for _ in range(pose_regression_blocks)]
        self.pose_convs = nn.Sequential(*pose_convs)

        self.reg_pred = nn.Conv2d(bbox_inter_channels, 4 * (reg_max + 1), 1, 1, 0)

        if self.pose_conf_in_class_head:
            self.cls_pred = nn.Conv2d(bbox_inter_channels, 1 + self.num_classes, 1, 1, 0)
            self.pose_pred = nn.Conv2d(pose_inter_channels, 2 * self.num_classes, 1, 1, 0)  # each keypoint is x,y
        else:
            self.cls_pred = nn.Conv2d(bbox_inter_channels, 1, 1, 1, 0)
            self.pose_pred = nn.Conv2d(pose_inter_channels, 3 * self.num_classes, 1, 1, 0)  # each keypoint is x,y,confidence

        self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
        self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

        self.stride = stride

        self.prior_prob = 1e-2
        self._initialize_biases()

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        if self.pose_conf_in_class_head:
            self.cls_pred = compute_new_weights_fn(self.cls_pred, 1 + num_classes)
            self.pose_pred = compute_new_weights_fn(self.pose_pred, 2 * num_classes)
        else:
            self.pose_pred = compute_new_weights_fn(self.pose_pred, 3 * num_classes)
        self.num_classes = num_classes

    @property
    def out_channels(self):
        return None

    def forward(self, x) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """

        :param x: Input feature map of shape [B, Cin, H, W]
        :return: Tuple of [reg_output, cls_output, pose_regression, pose_logits]
            - reg_output:      Tensor of [B, 4 * (reg_max + 1), H, W]
            - cls_output:      Tensor of [B, 1, H, W]
            - pose_regression: Tensor of [B, num_classes, 2, H, W]
            - pose_logits:     Tensor of [B, num_classes, H, W]
        """
        x = self.stem(x)
        pose_features = self.pose_stem(x)
        bbox_features = self.bbox_stem(x)

        cls_feat = self.cls_convs(bbox_features)
        cls_feat = self.cls_dropout_rate(cls_feat)
        cls_output = self.cls_pred(cls_feat)

        reg_feat = self.reg_convs(bbox_features)
        reg_feat = self.reg_dropout_rate(reg_feat)
        reg_output = self.reg_pred(reg_feat)

        pose_feat = self.pose_convs(pose_features)
        pose_feat = self.reg_dropout_rate(pose_feat)

        pose_output = self.pose_pred(pose_feat)

        if self.pose_conf_in_class_head:
            pose_logits = cls_output[:, 1:, :, :]
            cls_output = cls_output[:, 0:1, :, :]
            pose_regression = pose_output.reshape((pose_output.size(0), self.num_classes, 2, pose_output.size(2), pose_output.size(3)))
        else:
            pose_output = pose_output.reshape((pose_output.size(0), self.num_classes, 3, pose_output.size(2), pose_output.size(3)))
            pose_logits = pose_output[:, :, 2, :, :]
            pose_regression = pose_output[:, :, 0:2, :, :]

        return reg_output, cls_output, pose_regression, pose_logits

    def _initialize_biases(self):
        prior_bias = -math.log((1 - self.prior_prob) / self.prior_prob)
        torch.nn.init.constant_(self.cls_pred.bias, prior_bias)
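
A hedged shape check for a single head; the constructor arguments below are illustrative rather than recipe defaults, and the import path is assumed from the source path above:

import torch
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose.yolo_nas_pose_dfl_head import YoloNASPoseDFLHead

head = YoloNASPoseDFLHead(
    in_channels=96,
    bbox_inter_channels=128,
    pose_inter_channels=128,
    pose_regression_blocks=2,
    shared_stem=False,
    pose_conf_in_class_head=True,
    pose_block_use_repvgg=False,
    width_mult=1.0,
    first_conv_group_size=0,
    num_classes=17,
    stride=8,
    reg_max=16,
)
reg, cls, pose_reg, pose_logits = head(torch.randn(2, 96, 40, 40))
print(reg.shape, cls.shape, pose_reg.shape, pose_logits.shape)
# -> (2, 68, 40, 40), (2, 1, 40, 40), (2, 17, 2, 40, 40), (2, 17, 40, 40)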

__init__(in_channels, bbox_inter_channels, pose_inter_channels, pose_regression_blocks, shared_stem, pose_conf_in_class_head, pose_block_use_repvgg, width_mult, first_conv_group_size, num_classes, stride, reg_max, cls_dropout_rate=0.0, reg_dropout_rate=0.0)

Initialize the YoloNASDFLHead

Parameters:

Name Type Description Default
in_channels int

Input channels

required
bbox_inter_channels int

Intermediate number of channels for box detection & regression

required
pose_inter_channels int

Intermediate number of channels for pose regression

required
shared_stem bool

Whether to share the stem between the pose and bbox heads

required
pose_conf_in_class_head bool

Whether to include the pose confidence in the classification head

required
width_mult float

Width multiplier

required
first_conv_group_size int

Group size

required
num_classes int

Number of keypoints classes for pose regression. Number of detection classes is always 1.

required
stride int

Output stride for this head

required
reg_max int

Number of bins in the regression head

required
cls_dropout_rate float

Dropout rate for the classification head

0.0
reg_dropout_rate float

Dropout rate for the regression head

0.0
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_dfl_head.py
def __init__(
    self,
    in_channels: int,
    bbox_inter_channels: int,
    pose_inter_channels: int,
    pose_regression_blocks: int,
    shared_stem: bool,
    pose_conf_in_class_head: bool,
    pose_block_use_repvgg: bool,
    width_mult: float,
    first_conv_group_size: int,
    num_classes: int,
    stride: int,
    reg_max: int,
    cls_dropout_rate: float = 0.0,
    reg_dropout_rate: float = 0.0,
):
    """
    Initialize the YoloNASDFLHead
    :param in_channels: Input channels
    :param bbox_inter_channels: Intermediate number of channels for box detection & regression
    :param pose_inter_channels: Intermediate number of channels for pose regression
    :param shared_stem: Whether to share the stem between the pose and bbox heads
    :param pose_conf_in_class_head: Whether to include the pose confidence in the classification head
    :param width_mult: Width multiplier
    :param first_conv_group_size: Group size
    :param num_classes: Number of keypoints classes for pose regression. Number of detection classes is always 1.
    :param stride: Output stride for this head
    :param reg_max: Number of bins in the regression head
    :param cls_dropout_rate: Dropout rate for the classification head
    :param reg_dropout_rate: Dropout rate for the regression head
    """
    super().__init__(in_channels)

    bbox_inter_channels = width_multiplier(bbox_inter_channels, width_mult, 8)
    pose_inter_channels = width_multiplier(pose_inter_channels, width_mult, 8)

    if first_conv_group_size == 0:
        groups = 0
    elif first_conv_group_size == -1:
        groups = 1
    else:
        groups = bbox_inter_channels // first_conv_group_size

    self.num_classes = num_classes
    self.shared_stem = shared_stem
    self.pose_conf_in_class_head = pose_conf_in_class_head

    if self.shared_stem:
        max_input = max(bbox_inter_channels, pose_inter_channels)
        self.stem = ConvBNReLU(in_channels, max_input, kernel_size=1, stride=1, padding=0, bias=False)

        if max_input != pose_inter_channels:
            self.pose_stem = nn.Conv2d(max_input, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
        else:
            self.pose_stem = nn.Identity()

        if max_input != bbox_inter_channels:
            self.bbox_stem = nn.Conv2d(max_input, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
        else:
            self.bbox_stem = nn.Identity()

    else:
        self.stem = nn.Identity()
        self.pose_stem = ConvBNReLU(in_channels, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bbox_stem = ConvBNReLU(in_channels, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

    first_cls_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    first_reg_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    if pose_block_use_repvgg:
        pose_block = partial(QARepVGGBlock, use_alpha=True)
    else:
        pose_block = partial(ConvBNReLU, kernel_size=3, stride=1, padding=1, bias=False)

    pose_convs = [pose_block(pose_inter_channels, pose_inter_channels) for _ in range(pose_regression_blocks)]
    self.pose_convs = nn.Sequential(*pose_convs)

    self.reg_pred = nn.Conv2d(bbox_inter_channels, 4 * (reg_max + 1), 1, 1, 0)

    if self.pose_conf_in_class_head:
        self.cls_pred = nn.Conv2d(bbox_inter_channels, 1 + self.num_classes, 1, 1, 0)
        self.pose_pred = nn.Conv2d(pose_inter_channels, 2 * self.num_classes, 1, 1, 0)  # each keypoint is x,y
    else:
        self.cls_pred = nn.Conv2d(bbox_inter_channels, 1, 1, 1, 0)
        self.pose_pred = nn.Conv2d(pose_inter_channels, 3 * self.num_classes, 1, 1, 0)  # each keypoint is x,y,confidence

    self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
    self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

    self.stride = stride

    self.prior_prob = 1e-2
    self._initialize_biases()

forward(x)

Parameters:

Name Type Description Default
x

Input feature map of shape [B, Cin, H, W]

required

Returns:

Type Description
Tuple[Tensor, Tensor, Tensor, Tensor]

Tuple of [reg_output, cls_output, pose_regression, pose_logits]:

- reg_output: Tensor of [B, 4 * (reg_max + 1), H, W]
- cls_output: Tensor of [B, 1, H, W]
- pose_regression: Tensor of [B, num_classes, 2, H, W]
- pose_logits: Tensor of [B, num_classes, H, W]

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_dfl_head.py
def forward(self, x) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """

    :param x: Input feature map of shape [B, Cin, H, W]
    :return: Tuple of [reg_output, cls_output, pose_regression, pose_logits]
        - reg_output:      Tensor of [B, 4 * (reg_max + 1), H, W]
        - cls_output:      Tensor of [B, 1, H, W]
        - pose_regression: Tensor of [B, num_classes, 2, H, W]
        - pose_logits:     Tensor of [B, num_classes, H, W]
    """
    x = self.stem(x)
    pose_features = self.pose_stem(x)
    bbox_features = self.bbox_stem(x)

    cls_feat = self.cls_convs(bbox_features)
    cls_feat = self.cls_dropout_rate(cls_feat)
    cls_output = self.cls_pred(cls_feat)

    reg_feat = self.reg_convs(bbox_features)
    reg_feat = self.reg_dropout_rate(reg_feat)
    reg_output = self.reg_pred(reg_feat)

    pose_feat = self.pose_convs(pose_features)
    pose_feat = self.reg_dropout_rate(pose_feat)

    pose_output = self.pose_pred(pose_feat)

    if self.pose_conf_in_class_head:
        pose_logits = cls_output[:, 1:, :, :]
        cls_output = cls_output[:, 0:1, :, :]
        pose_regression = pose_output.reshape((pose_output.size(0), self.num_classes, 2, pose_output.size(2), pose_output.size(3)))
    else:
        pose_output = pose_output.reshape((pose_output.size(0), self.num_classes, 3, pose_output.size(2), pose_output.size(3)))
        pose_logits = pose_output[:, :, 2, :, :]
        pose_regression = pose_output[:, :, 0:2, :, :]

    return reg_output, cls_output, pose_regression, pose_logits
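
The branching on pose_conf_in_class_head above decides where the per-keypoint confidence comes from. Below is a minimal, hedged sketch (plain tensors with made-up shapes, not the library code path) of the second branch, in which every keypoint carries its own confidence channel:

import torch

B, num_keypoints, H, W = 2, 17, 20, 20
# pose_pred produces 3 channels per keypoint: x offset, y offset, confidence logit
pose_output = torch.randn(B, 3 * num_keypoints, H, W)

# Reshape to [B, J, 3, H, W] and split into coordinates and logits,
# mirroring the else-branch of forward() above
pose_output = pose_output.reshape(B, num_keypoints, 3, H, W)
pose_regression = pose_output[:, :, 0:2, :, :]  # [B, J, 2, H, W]
pose_logits = pose_output[:, :, 2, :, :]        # [B, J, H, W]

assert pose_regression.shape == (B, num_keypoints, 2, H, W)
assert pose_logits.shape == (B, num_keypoints, H, W)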

YoloNASPoseNDFLHeads

Bases: BaseDetectionModule, SupportsReplaceNumClasses

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_ndfl_heads.py
@register_detection_module()
class YoloNASPoseNDFLHeads(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        heads_list: List[Union[HpmStruct, DictConfig]],
        grid_cell_scale: float = 5.0,
        grid_cell_offset: float = 0.5,
        reg_max: int = 16,
        inference_mode: bool = False,
        eval_size: Optional[Tuple[int, int]] = None,
        width_mult: float = 1.0,
        pose_offset_multiplier: float = 1.0,
        compensate_grid_cell_offset: bool = True,
    ):
        """
        Initializes the NDFLHeads module.

        :param num_classes: Number of detection classes
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param grid_cell_scale: A scaling factor applied to the grid cell coordinates.
               This scaling factor is used to define anchor boxes (see generate_anchors_for_grid_cell).
        :param grid_cell_offset: A fixed offset that is added to the grid cell coordinates.
               This offset represents a 'center' of the cell and is 0.5 by default.
        :param reg_max: Number of bins in the regression head
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to in_channels.
        :param pose_offset_multiplier: A scaling factor applied to the pose regression offset. This multiplier is
               meant to reduce absolute magnitude of weights in pose regression layers.
               Default value is 1.0.
        :param compensate_grid_cell_offset: (bool) Controls whether to subtract anchor cell offset from the pose regression.
               If True, predicted pose coordinates decoded as (offsets + anchors - grid_cell_offset) * stride.
               If False, predicted pose coordinates decoded as (offsets + anchors) * stride.
               Default value is True.

        """
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]
        super().__init__(in_channels)

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size
        self.pose_offset_multiplier = pose_offset_multiplier
        self.compensate_grid_cell_offset = compensate_grid_cell_offset
        self.inference_mode = inference_mode

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

        factory = det_factory.DetectionModulesFactory()
        heads_list = self._insert_heads_list_params(heads_list, factory, num_classes, reg_max)

        self.num_heads = len(heads_list)
        fpn_strides: List[int] = []
        for i in range(self.num_heads):
            new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
            fpn_strides.append(new_head.stride)
            setattr(self, f"head{i + 1}", new_head)

        self.fpn_strides = tuple(fpn_strides)

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        for i in range(self.num_heads):
            head = getattr(self, f"head{i + 1}")
            head.replace_num_classes(num_classes, compute_new_weights_fn)

        self.num_classes = num_classes

    @staticmethod
    def _insert_heads_list_params(
        heads_list: List[Union[HpmStruct, DictConfig]], factory: det_factory.DetectionModulesFactory, num_classes: int, reg_max: int
    ) -> List[Union[HpmStruct, DictConfig]]:
        """
        Injects num_classes and reg_max parameters into the heads_list.

        :param heads_list:  Input heads list
        :param factory:     DetectionModulesFactory
        :param num_classes: Number of classes
        :param reg_max:     Number of bins in the regression head
        :return:            Heads list with injected parameters
        """
        for i in range(len(heads_list)):
            heads_list[i] = factory.insert_module_param(heads_list[i], "num_classes", num_classes)
            heads_list[i] = factory.insert_module_param(heads_list[i], "reg_max", reg_max)
        return heads_list

    @torch.jit.ignore
    def _init_weights(self):
        if self.eval_size:
            device = infer_model_device(self)
            dtype = infer_model_dtype(self)

            anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    def forward(self, feats: Tuple[Tensor, ...]) -> Union[YoloNasPoseDecodedPredictions, Tuple[YoloNasPoseDecodedPredictions, YoloNasPoseRawOutputs]]:
        """
        Runs the forward pass for all the underlying heads and concatenates the predictions into a single result.
        :param feats: List of feature maps from the neck of different strides
        :return: Return value depends on the mode:
        If tracing, a tuple of 4 tensors (decoded predictions) is returned:
        - pred_bboxes [B, Num Anchors, 4] - Predicted boxes in XYXY format
        - pred_scores [B, Num Anchors, 1] - Predicted scores for each box
        - pred_pose_coords [B, Num Anchors, Num Keypoints, 2] - Predicted poses in XY format
        - pred_pose_scores [B, Num Anchors, Num Keypoints] - Predicted scores for each keypoint

        In training/eval mode, a tuple of 2 elements is returned:
        - decoded predictions - they are the same as in tracing mode
        - raw outputs - a tuple of 8 elements in total, which is needed for training the model.
        """

        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []
        pose_regression_list = []
        pose_logits_list = []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            reg_distri, cls_logit, pose_regression, pose_logits = getattr(self, f"head{i + 1}")(feat)
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, -1, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

            pose_regression_list.append(torch.permute(pose_regression.flatten(3), [0, 3, 1, 2]))  # [B, J, 2, H, W] -> [B, H * W, J, 2]
            pose_logits_list.append(torch.permute(pose_logits.flatten(2), [0, 2, 1]))  # [B, J, H, W] -> [B, H * W, J]

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
            cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        pose_regression_list = torch.cat(pose_regression_list, dim=1)  # [B, Anchors, J, 2]
        pose_logits_list = torch.cat(pose_logits_list, dim=1)  # [B, Anchors, J]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        # Decode keypoints
        if self.pose_offset_multiplier != 1.0:
            pose_regression_list *= self.pose_offset_multiplier

        if self.compensate_grid_cell_offset:
            pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2) - self.grid_cell_offset
        else:
            pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2)

        pose_regression_list *= stride_tensor.unsqueeze(0).unsqueeze(2)

        pred_pose_coords = pose_regression_list.detach().clone()  # [B, Anchors, C, 2]
        pred_pose_scores = pose_logits_list.detach().clone().sigmoid()  # [B, Anchors, C]

        decoded_predictions = pred_bboxes, pred_scores, pred_pose_coords, pred_pose_scores

        if torch.jit.is_tracing() or self.inference_mode:
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, pose_regression_list, pose_logits_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    @property
    def out_channels(self):
        return None

    def _generate_anchors(self, feats=None, dtype=None, device=None):
        # only used at eval / inference time
        anchor_points = []
        stride_tensor = []

        dtype = dtype or feats[0].dtype
        device = device or feats[0].device

        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)
            shift_x = torch.arange(end=w) + self.grid_cell_offset
            shift_y = torch.arange(end=h) + self.grid_cell_offset
            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype))
        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)

        if device is not None:
            anchor_points = anchor_points.to(device)
            stride_tensor = stride_tensor.to(device)
        return anchor_points, stride_tensor
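
To make the anchor layout concrete, here is a small standalone sketch of what _generate_anchors computes for a single stride level (the feature-map size is a toy value chosen for illustration):

import torch

stride, grid_cell_offset = 8, 0.5
h, w = 4, 4  # feature-map size at this stride (toy value)

shift_x = torch.arange(end=w) + grid_cell_offset
shift_y = torch.arange(end=h) + grid_cell_offset
shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")

anchor_points = torch.stack([shift_x, shift_y], dim=-1).reshape(-1, 2)  # [h * w, 2] cell centers
stride_tensor = torch.full([h * w, 1], stride, dtype=anchor_points.dtype)

# Each anchor point is the center of a grid cell in feature-map coordinates;
# multiplying by the stride maps it back to input-image coordinates.
print(anchor_points[:3] * stride_tensor[:3])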

__init__(num_classes, in_channels, heads_list, grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, inference_mode=False, eval_size=None, width_mult=1.0, pose_offset_multiplier=1.0, compensate_grid_cell_offset=True)

Initializes the NDFLHeads module.

Parameters:

Name Type Description Default
num_classes int

Number of detection classes

required
in_channels Tuple[int, int, int]

Number of channels for each feature map (See width_mult)

required
grid_cell_scale float

A scaling factor applied to the grid cell coordinates. This scaling factor is used to define anchor boxes (see generate_anchors_for_grid_cell).

5.0
grid_cell_offset float

A fixed offset that is added to the grid cell coordinates. This offset represents a 'center' of the cell and is 0.5 by default.

0.5
reg_max int

Number of bins in the regression head

16
eval_size Optional[Tuple[int, int]]

(rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed, since anchors will not be regenerated for each forward call.

None
width_mult float

A scaling factor applied to in_channels.

1.0
pose_offset_multiplier float

A scaling factor applied to the pose regression offset. This multiplier is meant to reduce absolute magnitude of weights in pose regression layers. Default value is 1.0.

1.0
compensate_grid_cell_offset bool

(bool) Controls whether to subtract anchor cell offset from the pose regression. If True, predicted pose coordinates decoded as (offsets + anchors - grid_cell_offset) * stride. If False, predicted pose coordinates decoded as (offsets + anchors) * stride. Default value is True.

True
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_ndfl_heads.py
def __init__(
    self,
    num_classes: int,
    in_channels: Tuple[int, int, int],
    heads_list: List[Union[HpmStruct, DictConfig]],
    grid_cell_scale: float = 5.0,
    grid_cell_offset: float = 0.5,
    reg_max: int = 16,
    inference_mode: bool = False,
    eval_size: Optional[Tuple[int, int]] = None,
    width_mult: float = 1.0,
    pose_offset_multiplier: float = 1.0,
    compensate_grid_cell_offset: bool = True,
):
    """
    Initializes the NDFLHeads module.

    :param num_classes: Number of detection classes
    :param in_channels: Number of channels for each feature map (See width_mult)
    :param grid_cell_scale: A scaling factor applied to the grid cell coordinates.
           This scaling factor is used to define anchor boxes (see generate_anchors_for_grid_cell).
    :param grid_cell_offset: A fixed offset that is added to the grid cell coordinates.
           This offset represents a 'center' of the cell and is 0.5 by default.
    :param reg_max: Number of bins in the regression head
    :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
           since anchors will not be regenerated for each forward call.
    :param width_mult: A scaling factor applied to in_channels.
    :param pose_offset_multiplier: A scaling factor applied to the pose regression offset. This multiplier is
           meant to reduce absolute magnitude of weights in pose regression layers.
           Default value is 1.0.
    :param compensate_grid_cell_offset: (bool) Controls whether to subtract anchor cell offset from the pose regression.
           If True, predicted pose coordinates decoded as (offsets + anchors - grid_cell_offset) * stride.
           If False, predicted pose coordinates decoded as (offsets + anchors) * stride.
           Default value is True.

    """
    in_channels = [max(round(c * width_mult), 1) for c in in_channels]
    super().__init__(in_channels)

    self.in_channels = tuple(in_channels)
    self.num_classes = num_classes
    self.grid_cell_scale = grid_cell_scale
    self.grid_cell_offset = grid_cell_offset
    self.reg_max = reg_max
    self.eval_size = eval_size
    self.pose_offset_multiplier = pose_offset_multiplier
    self.compensate_grid_cell_offset = compensate_grid_cell_offset
    self.inference_mode = inference_mode

    # Do not apply quantization to this tensor
    proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
    self.register_buffer("proj_conv", proj, persistent=False)

    self._init_weights()

    factory = det_factory.DetectionModulesFactory()
    heads_list = self._insert_heads_list_params(heads_list, factory, num_classes, reg_max)

    self.num_heads = len(heads_list)
    fpn_strides: List[int] = []
    for i in range(self.num_heads):
        new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
        fpn_strides.append(new_head.stride)
        setattr(self, f"head{i + 1}", new_head)

    self.fpn_strides = tuple(fpn_strides)
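
The pose_offset_multiplier and compensate_grid_cell_offset parameters only change how raw keypoint offsets are decoded into image coordinates. A minimal sketch of that decoding step with toy tensors (the real code path is in forward() below):

import torch

grid_cell_offset, pose_offset_multiplier = 0.5, 1.0
anchors = torch.tensor([[0.5, 0.5], [1.5, 0.5]])  # [Anchors, 2] grid-cell centers
strides = torch.tensor([[8.0], [8.0]])            # [Anchors, 1]
offsets = torch.randn(1, 2, 17, 2)                # [B, Anchors, J, 2] raw head output

offsets = offsets * pose_offset_multiplier
# compensate_grid_cell_offset=True: (offsets + anchors - grid_cell_offset) * stride
decoded = (offsets + anchors.unsqueeze(0).unsqueeze(2) - grid_cell_offset) * strides.unsqueeze(0).unsqueeze(2)
print(decoded.shape)  # [1, 2, 17, 2] keypoint coordinates in input-image pixels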

forward(feats)

Runs the forward pass for all the underlying heads and concatenates the predictions into a single result.

Parameters:

Name Type Description Default
feats Tuple[Tensor, ...]

List of feature maps from the neck of different strides

required

Returns:

Type Description
Union[YoloNasPoseDecodedPredictions, Tuple[YoloNasPoseDecodedPredictions, YoloNasPoseRawOutputs]]

Return value depends on the mode.

If tracing, a tuple of 4 tensors (decoded predictions) is returned:
- pred_bboxes [B, Num Anchors, 4] - Predicted boxes in XYXY format
- pred_scores [B, Num Anchors, 1] - Predicted scores for each box
- pred_pose_coords [B, Num Anchors, Num Keypoints, 2] - Predicted poses in XY format
- pred_pose_scores [B, Num Anchors, Num Keypoints] - Predicted scores for each keypoint

In training/eval mode, a tuple of 2 elements is returned:
- decoded predictions - they are the same as in tracing mode
- raw outputs - a tuple of 8 elements in total, which is needed for training the model

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_ndfl_heads.py
def forward(self, feats: Tuple[Tensor, ...]) -> Union[YoloNasPoseDecodedPredictions, Tuple[YoloNasPoseDecodedPredictions, YoloNasPoseRawOutputs]]:
    """
    Runs the forward pass for all the underlying heads and concatenates the predictions into a single result.
    :param feats: List of feature maps from the neck of different strides
    :return: Return value depends on the mode:
    If tracing, a tuple of 4 tensors (decoded predictions) is returned:
    - pred_bboxes [B, Num Anchors, 4] - Predicted boxes in XYXY format
    - pred_scores [B, Num Anchors, 1] - Predicted scores for each box
    - pred_pose_coords [B, Num Anchors, Num Keypoints, 2] - Predicted poses in XY format
    - pred_pose_scores [B, Num Anchors, Num Keypoints] - Predicted scores for each keypoint

    In training/eval mode, a tuple of 2 elements is returned:
    - decoded predictions - they are the same as in tracing mode
    - raw outputs - a tuple of 8 elements in total, which is needed for training the model.
    """

    cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []
    pose_regression_list = []
    pose_logits_list = []

    for i, feat in enumerate(feats):
        b, _, h, w = feat.shape
        height_mul_width = h * w
        reg_distri, cls_logit, pose_regression, pose_logits = getattr(self, f"head{i + 1}")(feat)
        reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

        reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
        reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

        # cls and reg
        cls_score_list.append(cls_logit.reshape([b, -1, height_mul_width]))
        reg_dist_reduced_list.append(reg_dist_reduced)

        pose_regression_list.append(torch.permute(pose_regression.flatten(3), [0, 3, 1, 2]))  # [B, J, 2, H, W] -> [B, H * W, J, 2]
        pose_logits_list.append(torch.permute(pose_logits.flatten(2), [0, 2, 1]))  # [B, J, H, W] -> [B, H * W, J]

    cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

    reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
    reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

    pose_regression_list = torch.cat(pose_regression_list, dim=1)  # [B, Anchors, J, 2]
    pose_logits_list = torch.cat(pose_logits_list, dim=1)  # [B, Anchors, J]

    # Decode bboxes
    # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
    if self.eval_size:
        anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
    else:
        anchor_points_inference, stride_tensor = self._generate_anchors(feats)

    pred_scores = cls_score_list.sigmoid()
    pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

    # Decode keypoints
    if self.pose_offset_multiplier != 1.0:
        pose_regression_list *= self.pose_offset_multiplier

    if self.compensate_grid_cell_offset:
        pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2) - self.grid_cell_offset
    else:
        pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2)

    pose_regression_list *= stride_tensor.unsqueeze(0).unsqueeze(2)

    pred_pose_coords = pose_regression_list.detach().clone()  # [B, Anchors, C, 2]
    pred_pose_scores = pose_logits_list.detach().clone().sigmoid()  # [B, Anchors, C]

    decoded_predictions = pred_bboxes, pred_scores, pred_pose_coords, pred_pose_scores

    if torch.jit.is_tracing() or self.inference_mode:
        return decoded_predictions

    anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

    raw_predictions = cls_score_list, reg_distri_list, pose_regression_list, pose_logits_list, anchors, anchor_points, num_anchors_list, stride_tensor
    return decoded_predictions, raw_predictions
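
The reg_distri branch above uses an integral (DFL-style) box representation: for each box side the head predicts a distribution over reg_max + 1 bins, and the expected bin index is used as the distance. A hedged, standalone sketch of that reduction, equivalent in spirit to the softmax + proj_conv step in forward() but written with plain tensor ops:

import torch

reg_max, num_cells = 16, 5
# One distribution of logits per box side (left, top, right, bottom)
reg_distri = torch.randn(1, 4, reg_max + 1, num_cells)  # [B, 4, reg_max + 1, H*W]

probs = torch.softmax(reg_distri, dim=2)                 # probability per bin
bins = torch.arange(reg_max + 1, dtype=probs.dtype)      # 0, 1, ..., reg_max
distances = (probs * bins.view(1, 1, -1, 1)).sum(dim=2)  # expected distance per side, [B, 4, H*W]

print(distances.shape)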

YoloNASPosePostPredictionCallback

Bases: AbstractPoseEstimationPostPredictionCallback

A post-prediction callback for YoloNASPose model. Performs confidence thresholding, Top-K and NMS steps.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_post_prediction_callback.py
class YoloNASPosePostPredictionCallback(AbstractPoseEstimationPostPredictionCallback):
    """
    A post-prediction callback for YoloNASPose model.
    Performs confidence thresholding, Top-K and NMS steps.
    """

    def __init__(
        self,
        pose_confidence_threshold: float,
        nms_iou_threshold: float,
        pre_nms_max_predictions: int,
        post_nms_max_predictions: int,
    ):
        """
        :param pose_confidence_threshold: Pose detection confidence threshold
        :param nms_iou_threshold:         IoU threshold for NMS step.
        :param pre_nms_max_predictions:   Number of predictions participating in NMS step
        :param post_nms_max_predictions:  Maximum number of boxes to return after NMS step
        """
        if post_nms_max_predictions > pre_nms_max_predictions:
            raise ValueError("post_nms_max_predictions must be less than pre_nms_max_predictions")

        super().__init__()
        self.pose_confidence_threshold = pose_confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.pre_nms_max_predictions = pre_nms_max_predictions
        self.post_nms_max_predictions = post_nms_max_predictions

    @torch.no_grad()
    def __call__(self, outputs: Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], ...]) -> List[PoseEstimationPredictions]:
        """
        Take YoloNASPose's predictions and decode them into usable pose predictions.

        :param outputs: Output of the model's forward() method
        :return:        List of decoded predictions for each image in the batch.
        """
        # First is model predictions, second element of tuple is logits for loss computation
        predictions = outputs[0]

        decoded_predictions: List[PoseEstimationPredictions] = []
        for pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores in zip(*predictions):
            # pred_bboxes [Anchors, 4] in XYXY format
            # pred_scores [Anchors, 1] confidence scores [0..1]
            # pred_pose_coords [Anchors, Num Keypoints, 2] in (x,y) format
            # pred_pose_scores [Anchors, Num Keypoints] confidence scores [0..1]

            pred_bboxes_conf = pred_bboxes_conf.squeeze(-1)  # [Anchors]
            conf_mask = pred_bboxes_conf >= self.pose_confidence_threshold  # [Anchors]

            pred_bboxes_conf = pred_bboxes_conf[conf_mask].float()
            pred_bboxes_xyxy = pred_bboxes_xyxy[conf_mask].float()
            pred_pose_coords = pred_pose_coords[conf_mask].float()
            pred_pose_scores = pred_pose_scores[conf_mask].float()

            # Filter all predictions by self.nms_top_k
            if pred_bboxes_conf.size(0) > self.pre_nms_max_predictions:
                topk_candidates = torch.topk(pred_bboxes_conf, k=self.pre_nms_max_predictions, largest=True, sorted=True)
                pred_bboxes_conf = pred_bboxes_conf[topk_candidates.indices]
                pred_bboxes_xyxy = pred_bboxes_xyxy[topk_candidates.indices]
                pred_pose_coords = pred_pose_coords[topk_candidates.indices]
                pred_pose_scores = pred_pose_scores[topk_candidates.indices]

            # NMS
            idx_to_keep = torchvision.ops.boxes.nms(boxes=pred_bboxes_xyxy, scores=pred_bboxes_conf, iou_threshold=self.nms_iou_threshold)

            final_bboxes = pred_bboxes_xyxy[idx_to_keep]  # [Instances, 4]
            final_scores = pred_bboxes_conf[idx_to_keep]  # [Instances]

            final_poses = torch.cat(
                [
                    pred_pose_coords[idx_to_keep],
                    pred_pose_scores[idx_to_keep].unsqueeze(-1),
                ],
                dim=-1,
            )  # [Instances, Num Keypoints, 3]

            decoded_predictions.append(
                PoseEstimationPredictions(
                    poses=final_poses[: self.post_nms_max_predictions],
                    scores=final_scores[: self.post_nms_max_predictions],
                    bboxes_xyxy=final_bboxes[: self.post_nms_max_predictions],
                )
            )

        return decoded_predictions
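
A hedged usage sketch: constructing the callback and applying it to the (decoded, raw) output tuple produced by a YoloNASPose forward pass. The tensors below are random placeholders with illustrative shapes; in practice the outputs come from the model itself:

import torch
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose.yolo_nas_pose_post_prediction_callback import (
    YoloNASPosePostPredictionCallback,
)

callback = YoloNASPosePostPredictionCallback(
    pose_confidence_threshold=0.5,
    nms_iou_threshold=0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=100,
)

# Fake "decoded predictions" for one image with 10 candidate poses and 17 keypoints
pred_bboxes = torch.rand(1, 10, 4) * 100
pred_bboxes[..., 2:] += pred_bboxes[..., :2]  # ensure x2 >= x1 and y2 >= y1
pred_scores = torch.rand(1, 10, 1)
pred_pose_coords = torch.rand(1, 10, 17, 2) * 100
pred_pose_scores = torch.rand(1, 10, 17)

outputs = ((pred_bboxes, pred_scores, pred_pose_coords, pred_pose_scores), None)
predictions = callback(outputs)        # List[PoseEstimationPredictions], one entry per image
print(predictions[0].poses.shape)      # [num_instances, 17, 3]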

__call__(outputs)

Take YoloNASPose's predictions and decode them into usable pose predictions.

Parameters:

Name Type Description Default
outputs Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], ...]

Output of the model's forward() method

required

Returns:

Type Description
List[PoseEstimationPredictions]

List of decoded predictions for each image in the batch.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_post_prediction_callback.py
@torch.no_grad()
def __call__(self, outputs: Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], ...]) -> List[PoseEstimationPredictions]:
    """
    Take YoloNASPose's predictions and decode them into usable pose predictions.

    :param outputs: Output of the model's forward() method
    :return:        List of decoded predictions for each image in the batch.
    """
    # First is model predictions, second element of tuple is logits for loss computation
    predictions = outputs[0]

    decoded_predictions: List[PoseEstimationPredictions] = []
    for pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores in zip(*predictions):
        # pred_bboxes [Anchors, 4] in XYXY format
        # pred_scores [Anchors, 1] confidence scores [0..1]
        # pred_pose_coords [Anchors, Num Keypoints, 2] in (x,y) format
        # pred_pose_scores [Anchors, Num Keypoints] confidence scores [0..1]

        pred_bboxes_conf = pred_bboxes_conf.squeeze(-1)  # [Anchors]
        conf_mask = pred_bboxes_conf >= self.pose_confidence_threshold  # [Anchors]

        pred_bboxes_conf = pred_bboxes_conf[conf_mask].float()
        pred_bboxes_xyxy = pred_bboxes_xyxy[conf_mask].float()
        pred_pose_coords = pred_pose_coords[conf_mask].float()
        pred_pose_scores = pred_pose_scores[conf_mask].float()

        # Filter all predictions by self.nms_top_k
        if pred_bboxes_conf.size(0) > self.pre_nms_max_predictions:
            topk_candidates = torch.topk(pred_bboxes_conf, k=self.pre_nms_max_predictions, largest=True, sorted=True)
            pred_bboxes_conf = pred_bboxes_conf[topk_candidates.indices]
            pred_bboxes_xyxy = pred_bboxes_xyxy[topk_candidates.indices]
            pred_pose_coords = pred_pose_coords[topk_candidates.indices]
            pred_pose_scores = pred_pose_scores[topk_candidates.indices]

        # NMS
        idx_to_keep = torchvision.ops.boxes.nms(boxes=pred_bboxes_xyxy, scores=pred_bboxes_conf, iou_threshold=self.nms_iou_threshold)

        final_bboxes = pred_bboxes_xyxy[idx_to_keep]  # [Instances, 4]
        final_scores = pred_bboxes_conf[idx_to_keep]  # [Instances]

        final_poses = torch.cat(
            [
                pred_pose_coords[idx_to_keep],
                pred_pose_scores[idx_to_keep].unsqueeze(-1),
            ],
            dim=-1,
        )  # [Instances, Num Keypoints, 3]

        decoded_predictions.append(
            PoseEstimationPredictions(
                poses=final_poses[: self.post_nms_max_predictions],
                scores=final_scores[: self.post_nms_max_predictions],
                bboxes_xyxy=final_bboxes[: self.post_nms_max_predictions],
            )
        )

    return decoded_predictions

__init__(pose_confidence_threshold, nms_iou_threshold, pre_nms_max_predictions, post_nms_max_predictions)

Parameters:

Name Type Description Default
pose_confidence_threshold float

Pose detection confidence threshold

required
nms_iou_threshold float

IoU threshold for NMS step.

required
pre_nms_max_predictions int

Number of predictions participating in NMS step

required
post_nms_max_predictions int

Maximum number of boxes to return after NMS step

required
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_post_prediction_callback.py
def __init__(
    self,
    pose_confidence_threshold: float,
    nms_iou_threshold: float,
    pre_nms_max_predictions: int,
    post_nms_max_predictions: int,
):
    """
    :param pose_confidence_threshold: Pose detection confidence threshold
    :param nms_iou_threshold:         IoU threshold for NMS step.
    :param pre_nms_max_predictions:   Number of predictions participating in NMS step
    :param post_nms_max_predictions:  Maximum number of boxes to return after NMS step
    """
    if post_nms_max_predictions > pre_nms_max_predictions:
        raise ValueError("post_nms_max_predictions must be less than pre_nms_max_predictions")

    super().__init__()
    self.pose_confidence_threshold = pose_confidence_threshold
    self.nms_iou_threshold = nms_iou_threshold
    self.pre_nms_max_predictions = pre_nms_max_predictions
    self.post_nms_max_predictions = post_nms_max_predictions

YoloNASPose

Bases: CustomizableDetector, ExportablePoseEstimationModel, SupportsInputShapeCheck

YoloNASPose model

Exported model support matrix

Batch Size Format OnnxRuntime 1.13.1 TensorRT 8.4.2 TensorRT 8.5.3 TensorRT 8.6.1
1 Flat Yes Yes Yes Yes
>1 Flat Yes Yes Yes Yes
1 Batch Yes No No Yes
>1 Batch Yes No No Yes

ONNX files generated with PyTorch 2.0.1 for ONNX opset_version=14
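
A minimal sketch of how the support matrix above is typically exercised, assuming the standard SuperGradients entry points (models.get and the export() method provided by ExportablePoseEstimationModel); any export arguments beyond the output path are intentionally omitted here and should be taken from the export documentation:

from super_gradients.training import models

# Any YoloNASPose variant works the same way; the checkpoint name is one of the published COCO pose weights
model = models.get("yolo_nas_pose_l", pretrained_weights="coco_pose")

# Export to ONNX; the resulting file can then be run with OnnxRuntime or TensorRT
# according to the batch-size / output-format matrix above.
export_result = model.export("yolo_nas_pose_l.onnx")
print(export_result)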

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
class YoloNASPose(CustomizableDetector, ExportablePoseEstimationModel, SupportsInputShapeCheck):
    """
    YoloNASPose model

    Exported model support matrix

    | Batch Size | Format | OnnxRuntime 1.13.1 | TensorRT 8.4.2 | TensorRT 8.5.3 | TensorRT 8.6.1 |
    |------------|--------|--------------------|----------------|----------------|----------------|
    | 1          | Flat   | Yes                | Yes            | Yes            | Yes            |
    | >1         | Flat   | Yes                | Yes            | Yes            | Yes            |
    | 1          | Batch  | Yes                | No             | No             | Yes            |
    | >1         | Batch  | Yes                | No             | No             | Yes            |

    ONNX files generated with PyTorch 2.0.1 for ONNX opset_version=14
    """

    def __init__(
        self,
        backbone: Union[str, dict, HpmStruct, DictConfig],
        heads: Union[str, dict, HpmStruct, DictConfig],
        neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
        num_classes: int = None,
        bn_eps: Optional[float] = None,
        bn_momentum: Optional[float] = None,
        inplace_act: Optional[bool] = True,
        in_channels: int = 3,
    ):
        super().__init__(
            backbone=backbone,
            heads=heads,
            neck=neck,
            num_classes=num_classes,
            bn_eps=bn_eps,
            bn_momentum=bn_momentum,
            inplace_act=inplace_act,
            in_channels=in_channels,
        )
        self._edge_links = None
        self._edge_colors = None
        self._keypoint_colors = None
        self._image_processor = None
        self._default_nms_conf = None
        self._default_nms_iou = None
        self._default_pre_nms_max_predictions = None
        self._default_post_nms_max_predictions = None

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractPoseEstimationDecodingModule:
        return YoloNASPoseDecodingModule(num_pre_nms_predictions)

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        pre_nms_max_predictions: Optional[int] = None,
        post_nms_max_predictions: Optional[int] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> PoseEstimationPrediction:
        """Predict an image or a list of images.

        :param images:     Images to predict.
        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
        :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                           If None, the default value associated with the training is used.
        :param batch_size: Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            pre_nms_max_predictions=pre_nms_max_predictions,
            post_nms_max_predictions=post_nms_max_predictions,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            fp16=fp16,
        )
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        pre_nms_max_predictions: Optional[int] = None,
        post_nms_max_predictions: Optional[int] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ):
        """Predict using webcam.

        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
        :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                           If None, the default value associated with the training is used.
        :param batch_size: Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.

        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            pre_nms_max_predictions=pre_nms_max_predictions,
            post_nms_max_predictions=post_nms_max_predictions,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            fp16=fp16,
        )
        pipeline.predict_webcam()

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        pre_nms_max_predictions: Optional[int] = None,
        post_nms_max_predictions: Optional[int] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> PoseEstimationPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
        :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                           If None, the default value associated with the training is used.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.
        """
        if None in (self._image_processor, self._default_nms_iou, self._default_nms_conf, self._edge_links):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = iou or self._default_nms_iou
        conf = conf or self._default_nms_conf
        pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions
        post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=KeypointsAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = PoseEstimationPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(
                iou=iou,
                conf=conf,
                pre_nms_max_predictions=pre_nms_max_predictions,
                post_nms_max_predictions=post_nms_max_predictions,
            ),
            fuse_model=fuse_model,
            edge_links=self._edge_links,
            edge_colors=self._edge_colors,
            keypoint_colors=self._keypoint_colors,
            fp16=fp16,
        )
        return pipeline

    @classmethod
    def get_post_prediction_callback(
        cls, conf: float, iou: float, pre_nms_max_predictions=1000, post_nms_max_predictions=300
    ) -> YoloNASPosePostPredictionCallback:
        return YoloNASPosePostPredictionCallback(
            pose_confidence_threshold=conf,
            nms_iou_threshold=iou,
            pre_nms_max_predictions=pre_nms_max_predictions,
            post_nms_max_predictions=post_nms_max_predictions,
        )

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        edge_links: Union[np.ndarray, List[Tuple[int, int]]],
        edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        image_processor: Optional[Processing] = None,
        conf: Optional[float] = None,
        iou: Optional[float] = 0.7,
        pre_nms_max_predictions=300,
        post_nms_max_predictions=100,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param conf:            (Optional) Below the confidence threshold, predictions are discarded
        """
        self._edge_links = edge_links or self._edge_links
        self._edge_colors = edge_colors or self._edge_colors
        self._keypoint_colors = keypoint_colors or self._keypoint_colors
        self._image_processor = image_processor or self._image_processor
        self._default_nms_conf = conf or self._default_nms_conf
        self._default_nms_iou = iou or self._default_nms_iou
        self._default_pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions
        self._default_post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions

    def get_input_shape_steps(self) -> Tuple[int, int]:
        """
        Returns the step (multiple) that the input height and width must be divisible by.
        For this model the step is 32x32, which corresponds to the largest stride in the encoder part of the model.
        """
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        """
        Returns the minimum input shape size that the model can accept.
        For this model the default is 32x32, which corresponds to the largest stride in the encoder part of the model.
        """
        return 32, 32

get_input_shape_steps()

Returns the step (multiple) that the input height and width must be divisible by. For this model the step is 32x32, which corresponds to the largest stride in the encoder part of the model.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def get_input_shape_steps(self) -> Tuple[int, int]:
    """
    Returns the step (multiple) that the input height and width must be divisible by.
    For this model the step is 32x32, which corresponds to the largest stride in the encoder part of the model.
    """
    return 32, 32

get_minimum_input_shape_size()

Returns the minimum input shape size that the model can accept. For this model the default is 32x32, which corresponds to the largest stride in the encoder part of the model.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def get_minimum_input_shape_size(self) -> Tuple[int, int]:
    """
    Returns the minimum input shape size that the model can accept.
    For this model the default is 32x32, which corresponds to the largest stride in the encoder part of the model.
    """
    return 32, 32

predict(images, iou=None, conf=None, pre_nms_max_predictions=None, post_nms_max_predictions=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    pre_nms_max_predictions: Optional[int] = None,
    post_nms_max_predictions: Optional[int] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> PoseEstimationPrediction:
    """Predict an image or a list of images.

    :param images:     Images to predict.
    :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
    :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                       If None, the default value associated with the training is used.
    :param batch_size: Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        pre_nms_max_predictions=pre_nms_max_predictions,
        post_nms_max_predictions=post_nms_max_predictions,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        fp16=fp16,
    )
    return pipeline(images, batch_size=batch_size)  # type: ignore
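
A short usage sketch for predict(), assuming a model obtained through models.get with pretrained weights (so the dataset processing parameters are already attached); the image path is a placeholder:

from super_gradients.training import models

model = models.get("yolo_nas_pose_l", pretrained_weights="coco_pose")

# conf / iou are optional; when omitted, the defaults stored by set_dataset_processing_params() are used
prediction = model.predict("path/to/image.jpg", conf=0.5, iou=0.7)
prediction.show()  # visualization helpers such as show()/save() are available on SuperGradients prediction objects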

predict_webcam(iou=None, conf=None, pre_nms_max_predictions=None, post_nms_max_predictions=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def predict_webcam(
    self,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    pre_nms_max_predictions: Optional[int] = None,
    post_nms_max_predictions: Optional[int] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
):
    """Predict using webcam.

    :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
    :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                       If None, the default value associated with the training is used.
    :param batch_size: Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:       If True, use mixed precision for inference.

    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        pre_nms_max_predictions=pre_nms_max_predictions,
        post_nms_max_predictions=post_nms_max_predictions,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        fp16=fp16,
    )
    pipeline.predict_webcam()

set_dataset_processing_params(edge_links, edge_colors, keypoint_colors, image_processor=None, conf=None, iou=0.7, pre_nms_max_predictions=300, post_nms_max_predictions=100)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded

None
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    edge_links: Union[np.ndarray, List[Tuple[int, int]]],
    edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    image_processor: Optional[Processing] = None,
    conf: Optional[float] = None,
    iou: Optional[float] = 0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=100,
) -> None:
    """Set the processing parameters for the dataset.

    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param conf:            (Optional) Below the confidence threshold, predictions are discarded
    """
    self._edge_links = edge_links or self._edge_links
    self._edge_colors = edge_colors or self._edge_colors
    self._keypoint_colors = keypoint_colors or self._keypoint_colors
    self._image_processor = image_processor or self._image_processor
    self._default_nms_conf = conf or self._default_nms_conf
    self._default_nms_iou = iou or self._default_nms_iou
    self._default_pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions
    self._default_post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions
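
A hedged example of wiring in custom dataset parameters for a hypothetical three-keypoint model (nose, left eye, right eye); the skeleton, colors, and thresholds below are illustrative only, and a real setup would also pass the training-time image_processor:

from super_gradients.training import models

# Hypothetical custom pose model with 3 keypoints (num_classes = number of keypoints)
model = models.get("yolo_nas_pose_l", num_classes=3)

model.set_dataset_processing_params(
    edge_links=[(0, 1), (0, 2)],                              # skeleton edges between keypoint indices
    edge_colors=[(0, 255, 0), (0, 0, 255)],                   # one RGB color per edge
    keypoint_colors=[(255, 0, 0), (0, 255, 0), (0, 0, 255)],  # one RGB color per keypoint
    image_processor=None,                                     # pass the training-time Processing object here in practice
    conf=0.5,
    iou=0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=100,
)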

YoloNASPoseDecodingModule

Bases: AbstractPoseEstimationDecodingModule

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
class YoloNASPoseDecodingModule(AbstractPoseEstimationDecodingModule):
    __constants__ = ["num_pre_nms_predictions"]

    def __init__(
        self,
        num_pre_nms_predictions: int = 1000,
    ):
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, inputs: Any) -> int:
        """

        :param inputs: YoloNASPose model outputs
        :return: Total number of predictions per image (the size of the anchors dimension of the model output)
        """
        if torch.jit.is_tracing():
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
        else:
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

        return pred_bboxes_xyxy.size(1)

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]):
        """
        Decode YoloNASPose model outputs into bounding boxes, confidence scores and pose coordinates and scores

        :param inputs: YoloNASPose model outputs
        :return: Tuple of (pred_bboxes, pred_scores, pred_joints)
        - pred_bboxes: [Batch, num_pre_nms_predictions, 4] Bounding boxes associated with the pose, in XYXY format
        - pred_scores: [Batch, num_pre_nms_predictions, 1] Confidence scores [0..1] for entire pose
        - pred_joints: [Batch, num_pre_nms_predictions, Num Joints, 3] Joints in (x,y,confidence) format
        """
        if torch.jit.is_tracing():
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
        else:
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_bboxes_conf.size()

        topk_candidates = torch.topk(pred_bboxes_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_bboxes_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1, 1)
        flat_indices = torch.flatten(indices_with_offset)

        pred_poses_and_scores = torch.cat([pred_pose_coords, pred_pose_scores.unsqueeze(3)], dim=3)

        output_pred_bboxes = pred_bboxes_xyxy.reshape(-1, pred_bboxes_xyxy.size(2))[flat_indices, :].reshape(
            pred_bboxes_xyxy.size(0), nms_top_k, pred_bboxes_xyxy.size(2)
        )
        output_pred_scores = pred_bboxes_conf.reshape(-1, pred_bboxes_conf.size(2))[flat_indices, :].reshape(
            pred_bboxes_conf.size(0), nms_top_k, pred_bboxes_conf.size(2)
        )
        output_pred_joints = pred_poses_and_scores.reshape(-1, pred_poses_and_scores.size(2), 3)[flat_indices, :, :].reshape(
            pred_poses_and_scores.size(0), nms_top_k, pred_poses_and_scores.size(2), pred_poses_and_scores.size(3)
        )

        return output_pred_bboxes, output_pred_scores, output_pred_joints
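
The decoding is essentially a batched top-k gather: the top num_pre_nms_predictions boxes by confidence are selected, and the matching poses are gathered with the same flattened indices. Below is a minimal usage sketch on randomly generated, YoloNASPose-shaped outputs; the batch/anchor/joint sizes and the import path (taken from the source location above) are assumptions for illustration.

import torch
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose.yolo_nas_pose_variants import YoloNASPoseDecodingModule

batch_size, num_anchors, num_joints = 2, 8400, 17  # assumed sizes, for illustration only
pred_bboxes_xyxy = torch.rand(batch_size, num_anchors, 4)
pred_bboxes_conf = torch.rand(batch_size, num_anchors, 1)
pred_pose_coords = torch.rand(batch_size, num_anchors, num_joints, 2)
pred_pose_scores = torch.rand(batch_size, num_anchors, num_joints)

decoding = YoloNASPoseDecodingModule(num_pre_nms_predictions=1000)
# Outside of tracing, the module reads the raw predictions from the first element of the outputs tuple
outputs = ((pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores), ())
pred_bboxes, pred_scores, pred_joints = decoding(outputs)

print(pred_bboxes.shape)  # torch.Size([2, 1000, 4])
print(pred_scores.shape)  # torch.Size([2, 1000, 1])
print(pred_joints.shape)  # torch.Size([2, 1000, 17, 3])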

forward(inputs)

Decode YoloNASPose model outputs into bounding boxes, confidence scores and pose coordinates and scores

Parameters:

Name Type Description Default
inputs Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]

YoloNASPose model outputs

required

Returns:

Type Description

Tuple of (pred_bboxes, pred_scores, pred_joints) - pred_bboxes: [Batch, num_pre_nms_predictions, 4] Bounding boxes associated with each pose, in XYXY format - pred_scores: [Batch, num_pre_nms_predictions, 1] Confidence scores [0..1] for entire pose - pred_joints: [Batch, num_pre_nms_predictions, Num Joints, 3] Joints in (x,y,confidence) format

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]):
    """
    Decode YoloNASPose model outputs into bounding boxes, confidence scores and pose coordinates and scores

    :param inputs: YoloNASPose model outputs
    :return: Tuple of (pred_bboxes, pred_scores, pred_joints)
    - pred_bboxes: [Batch, num_pre_nms_predictions, 4] Bounding boxes associated with each pose, in XYXY format
    - pred_scores: [Batch, num_pre_nms_predictions, 1] Confidence scores [0..1] for entire pose
    - pred_joints: [Batch, num_pre_nms_predictions, Num Joints, 3] Joints in (x,y,confidence) format
    """
    if torch.jit.is_tracing():
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
    else:
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

    nms_top_k = self.num_pre_nms_predictions
    batch_size, num_anchors, _ = pred_bboxes_conf.size()

    topk_candidates = torch.topk(pred_bboxes_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

    offsets = num_anchors * torch.arange(batch_size, device=pred_bboxes_conf.device)
    indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1, 1)
    flat_indices = torch.flatten(indices_with_offset)

    pred_poses_and_scores = torch.cat([pred_pose_coords, pred_pose_scores.unsqueeze(3)], dim=3)

    output_pred_bboxes = pred_bboxes_xyxy.reshape(-1, pred_bboxes_xyxy.size(2))[flat_indices, :].reshape(
        pred_bboxes_xyxy.size(0), nms_top_k, pred_bboxes_xyxy.size(2)
    )
    output_pred_scores = pred_bboxes_conf.reshape(-1, pred_bboxes_conf.size(2))[flat_indices, :].reshape(
        pred_bboxes_conf.size(0), nms_top_k, pred_bboxes_conf.size(2)
    )
    output_pred_joints = pred_poses_and_scores.reshape(-1, pred_poses_and_scores.size(2), 3)[flat_indices, :, :].reshape(
        pred_poses_and_scores.size(0), nms_top_k, pred_poses_and_scores.size(2), pred_poses_and_scores.size(3)
    )

    return output_pred_bboxes, output_pred_scores, output_pred_joints

infer_total_number_of_predictions(inputs)

Parameters:

Name Type Description Default
inputs Any

YoloNASPose model outputs

required

Returns:

Type Description
int

Total number of predictions per image.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
@torch.jit.ignore
def infer_total_number_of_predictions(self, inputs: Any) -> int:
    """

    :param inputs: YoloNASPose model outputs
    :return:
    """
    if torch.jit.is_tracing():
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
    else:
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

    return pred_bboxes_xyxy.size(1)

SegmentationHead

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/common.py
class SegmentationHead(nn.Module):
    def __init__(self, in_channels: int, mid_channels: int, num_classes: int, dropout: float):
        super(SegmentationHead, self).__init__()
        self.seg_head = nn.Sequential(
            ConvBNReLU(in_channels, mid_channels, kernel_size=3, padding=1, stride=1, bias=False),
            nn.Dropout(dropout),
            nn.Conv2d(mid_channels, num_classes, kernel_size=1, bias=False),
        )

    def forward(self, x):
        return self.seg_head(x)

    def replace_num_classes(self, num_classes: int):
        """
        This method replaces the last classification Conv layer to output a different number of classes.
        Note that the weights of the new layer are randomly initialized.
        """
        old_cls_conv = self.seg_head[-1]
        self.seg_head[-1] = nn.Conv2d(old_cls_conv.in_channels, num_classes, kernel_size=1, bias=False)

replace_num_classes(num_classes)

This method replaces the last classification Conv layer to output a different number of classes. Note that the weights of the new layer are randomly initialized.

Source code in src/super_gradients/training/models/segmentation_models/common.py
def replace_num_classes(self, num_classes: int):
    """
    This method replaces the last classification Conv layer to output a different number of classes.
    Note that the weights of the new layer are randomly initialized.
    """
    old_cls_conv = self.seg_head[-1]
    self.seg_head[-1] = nn.Conv2d(old_cls_conv.in_channels, num_classes, kernel_size=1, bias=False)
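
To illustrate the replacement pattern, here is a minimal, self-contained sketch that mimics the head above with plain torch modules and swaps only the last 1x1 classification conv. The ToyHead class is hypothetical and only stands in for SegmentationHead (which additionally uses ConvBNReLU and dropout).

import torch
import torch.nn as nn


class ToyHead(nn.Module):
    """Hypothetical stand-in for SegmentationHead: a conv block followed by a 1x1 classification conv."""

    def __init__(self, in_channels: int, mid_channels: int, num_classes: int):
        super().__init__()
        self.seg_head = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, num_classes, kernel_size=1, bias=False),
        )

    def replace_num_classes(self, num_classes: int):
        # Same idea as SegmentationHead.replace_num_classes: swap only the last 1x1 conv,
        # keeping its in_channels while its weights are re-initialized randomly.
        old_cls_conv = self.seg_head[-1]
        self.seg_head[-1] = nn.Conv2d(old_cls_conv.in_channels, num_classes, kernel_size=1, bias=False)


head = ToyHead(in_channels=64, mid_channels=32, num_classes=21)
head.replace_num_classes(num_classes=3)
print(head.seg_head(torch.rand(1, 64, 16, 16)).shape)  # torch.Size([1, 3, 16, 16])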

ASPP

Bases: AbstractContextModule

ASPP bottleneck block. Splits the input into len(dilation_list) + 1 heads: one 1x1 conv plus one dilated 3x3 conv per dilation rate. The heads are concatenated, and each head outputs input channels / (len(dilation_list) + 1) channels, so the total number of output channels matches the number of input channels.

Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
class ASPP(AbstractContextModule):
    """
    ASPP bottleneck block. Splits the input into len(dilation_list) + 1 heads (one 1x1 conv plus one dilated 3x3 conv
    per dilation rate). The heads are concatenated, and each head outputs
    input_channels / (len(dilation_list) + 1) channels so that the total output width matches the input width.
    """

    def __init__(self, in_channels: int, dilation_list: List[int], in_out_ratio: float = 1.0, use_bias: bool = False, **kwargs):
        """
        :param dilation_list: list of dilation rates, the num of dilation branches should be set so that there is a
            whole division of the input channels, see assertion below.
        :param in_out_ratio: output / input num of channels ratio.
        :param use_bias: legacy parameter to support PascalVOC frontier checkpoints that were trained by mistake with
            extra redundant biases before batchnorm operators. should be set to `False` for new training processes.
        """
        super().__init__()
        num_dilation_branches = len(dilation_list) + 1
        inter_ratio = num_dilation_branches / in_out_ratio
        assert in_channels % inter_ratio == 0
        inter_channels = int(in_channels / inter_ratio)

        self.dilated_conv_list = nn.ModuleList(
            [
                ConvBNReLU(in_channels, inter_channels, kernel_size=1, dilation=1, bias=use_bias),
                *[ConvBNReLU(in_channels, inter_channels, kernel_size=3, dilation=d, padding=d, bias=use_bias) for d in dilation_list],
            ]
        )

        self.out_channels = inter_channels * num_dilation_branches

    def output_channels(self):
        return self.out_channels

    def forward(self, x):
        x = torch.cat([dilated_conv(x) for dilated_conv in self.dilated_conv_list], dim=1)
        return x

__init__(in_channels, dilation_list, in_out_ratio=1.0, use_bias=False, **kwargs)

Parameters:

Name Type Description Default
dilation_list List[int]

list of dilation rates; the number of dilation branches should be chosen so that the input channels are divided evenly (see the assertion in the source code).

required
in_out_ratio float

output / input num of channels ratio.

1.0
use_bias bool

legacy parameter to support PascalVOC frontier checkpoints that were trained by mistake with extra redundant biases before batchnorm operators. should be set to False for new training processes.

False
Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
def __init__(self, in_channels: int, dilation_list: List[int], in_out_ratio: float = 1.0, use_bias: bool = False, **kwargs):
    """
    :param dilation_list: list of dilation rates, the num of dilation branches should be set so that there is a
        whole division of the input channels, see assertion below.
    :param in_out_ratio: output / input num of channels ratio.
    :param use_bias: legacy parameter to support PascalVOC frontier checkpoints that were trained by mistake with
        extra redundant biases before batchnorm operators. should be set to `False` for new training processes.
    """
    super().__init__()
    num_dilation_branches = len(dilation_list) + 1
    inter_ratio = num_dilation_branches / in_out_ratio
    assert in_channels % inter_ratio == 0
    inter_channels = int(in_channels / inter_ratio)

    self.dilated_conv_list = nn.ModuleList(
        [
            ConvBNReLU(in_channels, inter_channels, kernel_size=1, dilation=1, bias=use_bias),
            *[ConvBNReLU(in_channels, inter_channels, kernel_size=3, dilation=d, padding=d, bias=use_bias) for d in dilation_list],
        ]
    )

    self.out_channels = inter_channels * num_dilation_branches

SPPM

Bases: AbstractContextModule

Simple Pyramid Pooling context Module.

Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
class SPPM(AbstractContextModule):
    """
    Simple Pyramid Pooling context Module.
    """

    def __init__(
        self,
        in_channels: int,
        inter_channels: int,
        out_channels: int,
        pool_sizes: List[Union[int, Tuple[int, int]]],
        upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
        align_corners: bool = False,
    ):
        """
        :param inter_channels: num channels in each pooling branch.
        :param out_channels: The number of output channels after pyramid pooling module.
        :param pool_sizes: spatial output sizes of the pooled feature maps.
        """
        super().__init__()
        self.branches = nn.ModuleList(
            [
                nn.Sequential(
                    nn.AdaptiveAvgPool2d(pool_size),
                    ConvBNReLU(in_channels, inter_channels, kernel_size=1, bias=False),
                )
                for pool_size in pool_sizes
            ]
        )
        self.conv_out = ConvBNReLU(inter_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.out_channels = out_channels
        self.upsample_mode = upsample_mode
        self.align_corners = align_corners
        self.pool_sizes = pool_sizes

    def forward(self, x):
        out = None
        input_shape = x.shape[2:]
        for branch in self.branches:
            y = branch(x)
            y = F.interpolate(y, size=input_shape, mode=self.upsample_mode, align_corners=self.align_corners)
            out = y if out is None else out + y
        out = self.conv_out(out)
        return out

    def output_channels(self):
        return self.out_channels

    def prep_model_for_conversion(self, input_size: Union[tuple, list], stride_ratio: int = 32, **kwargs):
        """
        Replace Global average pooling with fixed kernels Average pooling, since dynamic kernel sizes are not supported
        when compiling to ONNX: `Unsupported: ONNX export of operator adaptive_avg_pool2d, input size not accessible.`
        """
        input_size = [x / stride_ratio for x in input_size[-2:]]
        for branch in self.branches:
            global_pool: nn.AdaptiveAvgPool2d = branch[0]
            # If not a global average pooling skip this. The module might be already converted to average pooling
            # modules.
            if not isinstance(global_pool, nn.AdaptiveAvgPool2d):
                continue
            out_size = global_pool.output_size
            out_size = out_size if isinstance(out_size, (tuple, list)) else (out_size, out_size)
            kernel_size = [int(i / o) for i, o in zip(input_size, out_size)]
            branch[0] = nn.AvgPool2d(kernel_size=kernel_size, stride=kernel_size)
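
A stripped-down sketch of the same forward pattern written with plain torch (widths and pool sizes are assumptions): each branch pools to a fixed size, projects with a 1x1 conv, is resized back to the input resolution, and the branch outputs are summed before a final 3x3 conv. The real SPPM uses ConvBNReLU blocks instead of bare convolutions.

import torch
import torch.nn as nn
import torch.nn.functional as F

in_channels, inter_channels, out_channels = 128, 64, 128  # assumed widths
pool_sizes = [1, 2, 4]                                    # assumed pooled output sizes

branches = nn.ModuleList(
    nn.Sequential(nn.AdaptiveAvgPool2d(p), nn.Conv2d(in_channels, inter_channels, kernel_size=1, bias=False))
    for p in pool_sizes
)
conv_out = nn.Conv2d(inter_channels, out_channels, kernel_size=3, padding=1, bias=False)

x = torch.rand(1, in_channels, 16, 16)
out = None
for branch in branches:
    y = F.interpolate(branch(x), size=x.shape[2:], mode="bilinear", align_corners=False)
    out = y if out is None else out + y  # branch outputs are summed, not concatenated
out = conv_out(out)
print(out.shape)  # torch.Size([1, 128, 16, 16])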

__init__(in_channels, inter_channels, out_channels, pool_sizes, upsample_mode=UpsampleMode.BILINEAR, align_corners=False)

Parameters:

Name Type Description Default
inter_channels int

num channels in each pooling branch.

required
out_channels int

The number of output channels after pyramid pooling module.

required
pool_sizes List[Union[int, Tuple[int, int]]]

spatial output sizes of the pooled feature maps.

required
Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
def __init__(
    self,
    in_channels: int,
    inter_channels: int,
    out_channels: int,
    pool_sizes: List[Union[int, Tuple[int, int]]],
    upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
    align_corners: bool = False,
):
    """
    :param inter_channels: num channels in each pooling branch.
    :param out_channels: The number of output channels after pyramid pooling module.
    :param pool_sizes: spatial output sizes of the pooled feature maps.
    """
    super().__init__()
    self.branches = nn.ModuleList(
        [
            nn.Sequential(
                nn.AdaptiveAvgPool2d(pool_size),
                ConvBNReLU(in_channels, inter_channels, kernel_size=1, bias=False),
            )
            for pool_size in pool_sizes
        ]
    )
    self.conv_out = ConvBNReLU(inter_channels, out_channels, kernel_size=3, padding=1, bias=False)
    self.out_channels = out_channels
    self.upsample_mode = upsample_mode
    self.align_corners = align_corners
    self.pool_sizes = pool_sizes

prep_model_for_conversion(input_size, stride_ratio=32, **kwargs)

Replace global average pooling with fixed-kernel average pooling, since dynamic kernel sizes are not supported when compiling to ONNX: Unsupported: ONNX export of operator adaptive_avg_pool2d, input size not accessible.

Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
def prep_model_for_conversion(self, input_size: Union[tuple, list], stride_ratio: int = 32, **kwargs):
    """
    Replace Global average pooling with fixed kernels Average pooling, since dynamic kernel sizes are not supported
    when compiling to ONNX: `Unsupported: ONNX export of operator adaptive_avg_pool2d, input size not accessible.`
    """
    input_size = [x / stride_ratio for x in input_size[-2:]]
    for branch in self.branches:
        global_pool: nn.AdaptiveAvgPool2d = branch[0]
        # If not a global average pooling skip this. The module might be already converted to average pooling
        # modules.
        if not isinstance(global_pool, nn.AdaptiveAvgPool2d):
            continue
        out_size = global_pool.output_size
        out_size = out_size if isinstance(out_size, (tuple, list)) else (out_size, out_size)
        kernel_size = [int(i / o) for i, o in zip(input_size, out_size)]
        branch[0] = nn.AvgPool2d(kernel_size=kernel_size, stride=kernel_size)
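
To make the kernel-size arithmetic concrete, here is a small sketch for an assumed 512x512 network input, stride_ratio=32 and pool_sizes=[1, 2, 4] (illustrative values, not library defaults):

# Kernel sizes used when replacing AdaptiveAvgPool2d with fixed AvgPool2d (illustrative values)
input_size = (512, 512)  # assumed network input resolution
stride_ratio = 32        # SPPM sees a feature map 32x smaller than the input
pool_sizes = [1, 2, 4]   # assumed adaptive pooling output sizes

feature_size = [s / stride_ratio for s in input_size[-2:]]  # 16 x 16 feature map
for pool_size in pool_sizes:
    out_size = pool_size if isinstance(pool_size, (tuple, list)) else (pool_size, pool_size)
    kernel_size = [int(i / o) for i, o in zip(feature_size, out_size)]
    print(pool_size, kernel_size)
# 1 [16, 16]  -> global pooling becomes a fixed 16x16 average pooling
# 2 [8, 8]
# 4 [4, 4]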

DAPPMBranch

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DAPPMBranch(nn.Module):
    def __init__(self, kernel_size: int, stride: int, in_planes: int, branch_planes: int, inter_mode: str = "bilinear"):
        """
        A DAPPM branch
        :param kernel_size: the kernel size for the average pooling
                when stride=0 this parameter is omitted and AdaptiveAvgPool2d over all the input is performed
        :param stride: stride for the average pooling
                when stride=0: an AdaptiveAvgPool2d over all the input is performed (output is 1x1)
                when stride=1: no average pooling is performed
                when stride>1: average pooling is performed (scaling the input down and up again)
        :param in_planes: number of input channels
        :param branch_planes: width after the first convolution
        :param inter_mode: interpolation mode for upscaling
        """

        super().__init__()
        down_list = []
        if stride == 0:
            # when stride is 0 average pool all the input to 1x1
            down_list.append(nn.AdaptiveAvgPool2d((1, 1)))
        elif stride == 1:
            # when stride is 1 no average pooling is used
            pass
        else:
            down_list.append(nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=stride))

        down_list.append(nn.BatchNorm2d(in_planes))
        down_list.append(nn.ReLU(inplace=True))
        down_list.append(nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False))

        self.down_scale = nn.Sequential(*down_list)
        self.up_scale = UpscaleOnline(inter_mode)

        if stride != 1:
            self.process = nn.Sequential(
                nn.BatchNorm2d(branch_planes),
                nn.ReLU(inplace=True),
                nn.Conv2d(branch_planes, branch_planes, kernel_size=3, padding=1, bias=False),
            )

    def forward(self, x):
        """
        All branches of the DAPPM but the first one receive the output of the previous branch as a second input
        :param x: in branch 0 - the original input of the DAPPM. in other branches - a list containing the original
        input and the output of the previous branch.
        """

        if isinstance(x, list):
            output_of_prev_branch = x[1]
            x = x[0]
        else:
            output_of_prev_branch = None

        in_width = x.shape[-1]
        in_height = x.shape[-2]
        out = self.down_scale(x)
        out = self.up_scale(out, output_height=in_height, output_width=in_width)

        if output_of_prev_branch is not None:
            out = self.process(out + output_of_prev_branch)

        return out

__init__(kernel_size, stride, in_planes, branch_planes, inter_mode='bilinear')

A DAPPM branch

Parameters:

Name Type Description Default
kernel_size int

the kernel size for the average pooling when stride=0 this parameter is omitted and AdaptiveAvgPool2d over all the input is performed

required
stride int

stride for the average pooling when stride=0: an AdaptiveAvgPool2d over all the input is performed (output is 1x1) when stride=1: no average pooling is performed when stride>1: average pooling is performed (scaling the input down and up again)

required
in_planes int

number of input channels

required
branch_planes int

width after the first convolution

required
inter_mode str

interpolation mode for upscaling

'bilinear'
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(self, kernel_size: int, stride: int, in_planes: int, branch_planes: int, inter_mode: str = "bilinear"):
    """
    A DAPPM branch
    :param kernel_size: the kernel size for the average pooling
            when stride=0 this parameter is omitted and AdaptiveAvgPool2d over all the input is performed
    :param stride: stride for the average pooling
            when stride=0: an AdaptiveAvgPool2d over all the input is performed (output is 1x1)
            when stride=1: no average pooling is performed
            when stride>1: average pooling is performed (scaling the input down and up again)
    :param in_planes: number of input channels
    :param branch_planes: width after the first convolution
    :param inter_mode: interpolation mode for upscaling
    """

    super().__init__()
    down_list = []
    if stride == 0:
        # when stride is 0 average pool all the input to 1x1
        down_list.append(nn.AdaptiveAvgPool2d((1, 1)))
    elif stride == 1:
        # when stride is 1 no average pooling is used
        pass
    else:
        down_list.append(nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=stride))

    down_list.append(nn.BatchNorm2d(in_planes))
    down_list.append(nn.ReLU(inplace=True))
    down_list.append(nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False))

    self.down_scale = nn.Sequential(*down_list)
    self.up_scale = UpscaleOnline(inter_mode)

    if stride != 1:
        self.process = nn.Sequential(
            nn.BatchNorm2d(branch_planes),
            nn.ReLU(inplace=True),
            nn.Conv2d(branch_planes, branch_planes, kernel_size=3, padding=1, bias=False),
        )
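
The following sketch shows the down-scale / up-scale pattern a stride>1 branch applies, written with plain torch modules rather than the library's ConvBNReLU/UpscaleOnline helpers; the widths, kernel size and input shape are assumptions for illustration.

import torch
import torch.nn as nn
import torch.nn.functional as F

in_planes, branch_planes = 128, 96  # assumed widths
kernel_size, stride = 5, 2          # a stride>1 branch: pool down, then project to branch_planes

x = torch.rand(1, in_planes, 32, 32)

# Down-scale: average pooling (note padding=stride, as in DAPPMBranch), then BN + ReLU + 1x1 conv
down_scale = nn.Sequential(
    nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=stride),
    nn.BatchNorm2d(in_planes),
    nn.ReLU(inplace=True),
    nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False),
)
out = down_scale(x)
print(out.shape)  # torch.Size([1, 96, 16, 16]) - spatially reduced, branch_planes channels

# Up-scale back online to the original spatial size, as UpscaleOnline does
out = F.interpolate(out, size=x.shape[-2:], mode="bilinear")
print(out.shape)  # torch.Size([1, 96, 32, 32])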

forward(x)

All branches of the DAPPM but the first one receive the output of the previous branch as a second input

Parameters:

Name Type Description Default
x

In branch 0: the original input of the DAPPM. In other branches: a list containing the original input and the output of the previous branch.

required
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def forward(self, x):
    """
    All branches of the DAPPM but the first one receive the output of the previous branch as a second input
    :param x: in branch 0 - the original input of the DAPPM. in other branches - a list containing the original
    input and the output of the previous branch.
    """

    if isinstance(x, list):
        output_of_prev_branch = x[1]
        x = x[0]
    else:
        output_of_prev_branch = None

    in_width = x.shape[-1]
    in_height = x.shape[-2]
    out = self.down_scale(x)
    out = self.up_scale(out, output_height=in_height, output_width=in_width)

    if output_of_prev_branch is not None:
        out = self.process(out + output_of_prev_branch)

    return out

DDRBackBoneBase

Bases: nn.Module, SupportsReplaceInputChannels, ABC

A base class defining functions that must be supported by DDRBackBones

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DDRBackBoneBase(nn.Module, SupportsReplaceInputChannels, ABC):
    """A base class defining functions that must be supported by DDRBackBones"""

    def validate_backbone_attributes(self):
        expected_attributes = ["stem", "layer1", "layer2", "layer3", "layer4", "input_channels"]
        for attribute in expected_attributes:
            assert hasattr(self, attribute), f"Invalid backbone - attribute '{attribute}' is missing"

    def get_backbone_output_number_of_channels(self):
        """Return a dictionary of the shapes of each output of the backbone to determine the in_channels of the
        skip and compress layers"""
        output_shapes = {}
        x = torch.randn(1, self.input_channels, 320, 320)
        x = self.stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        output_shapes["layer2"] = x.shape[1]
        for layer in self.layer3:
            x = layer(x)
        output_shapes["layer3"] = x.shape[1]
        x = self.layer4(x)
        output_shapes["layer4"] = x.shape[1]
        return output_shapes

get_backbone_output_number_of_channels()

Return a dictionary with the number of output channels of each backbone stage, used to determine the in_channels of the skip and compress layers

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def get_backbone_output_number_of_channels(self):
    """Return a dictionary of the shapes of each output of the backbone to determine the in_channels of the
    skip and compress layers"""
    output_shapes = {}
    x = torch.randn(1, self.input_channels, 320, 320)
    x = self.stem(x)
    x = self.layer1(x)
    x = self.layer2(x)
    output_shapes["layer2"] = x.shape[1]
    for layer in self.layer3:
        x = layer(x)
    output_shapes["layer3"] = x.shape[1]
    x = self.layer4(x)
    output_shapes["layer4"] = x.shape[1]
    return output_shapes
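
The probing trick in isolation: run a dummy tensor through the stages and record the channel count after each one. The toy layers below are hypothetical and only demonstrate the mechanism.

import torch
import torch.nn as nn

# Hypothetical toy backbone stages, just to show the channel probing
stem = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1)
layer1 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
layer2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
layer3 = nn.ModuleList([nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)])
layer4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

output_shapes = {}
x = torch.randn(1, 3, 320, 320)  # dummy probe input, same size as in the method above
x = stem(x)
x = layer1(x)
x = layer2(x)
output_shapes["layer2"] = x.shape[1]
for layer in layer3:
    x = layer(x)
output_shapes["layer3"] = x.shape[1]
x = layer4(x)
output_shapes["layer4"] = x.shape[1]

print(output_shapes)  # {'layer2': 128, 'layer3': 256, 'layer4': 512}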

DDRNet

Bases: SegmentationModule, ExportableSegmentationModel

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DDRNet(SegmentationModule, ExportableSegmentationModel):
    def __init__(
        self,
        backbone: DDRBackBoneBase.__class__,
        additional_layers: list,
        upscale_module: nn.Module,
        num_classes: int,
        highres_planes: int,
        spp_width: int,
        head_width: int,
        use_aux_heads: bool = False,
        ssp_inter_mode: str = "bilinear",
        segmentation_inter_mode: str = "bilinear",
        skip_block: nn.Module.__class__ = None,
        layer5_block: nn.Module.__class__ = Bottleneck,
        layer5_bottleneck_expansion: int = 2,
        classification_mode=False,
        spp_kernel_sizes: list = [1, 5, 9, 17, 0],
        spp_strides: list = [1, 2, 4, 8, 0],
        layer3_repeats: int = 1,
    ):
        """

        :param backbone: the low resolution branch of DDR, expected to have specific attributes in the class
        :param additional_layers: list of num blocks for the highres stage and layer5
        :param upscale_module: upscale to use in the backbone (DAPPM and Segmentation head are using bilinear interpolation)
        :param num_classes: number of classes
        :param highres_planes: number of channels in the high resolution net
        :param use_aux_heads: add a second segmentation head (fed from after compress3 + upscale). this head can be used
        during training (see paper https://arxiv.org/pdf/2101.06085.pdf for details)
        :param ssp_inter_mode: the interpolation used in the SPP block
        :param segmentation_inter_mode: the interpolation used in the segmentation head
        :param skip_block: allows specifying a different block (from 'block') for the skip layer
        :param layer5_block: type of block to use in layer5 and layer5_skip
        :param layer5_bottleneck_expansion: determines the expansion rate for Bottleneck block
        :param spp_kernel_sizes: list of kernel sizes for the spp module pooling
        :param spp_strides: list of strides for the spp module pooling
        :param layer3_repeats: number of times to repeat the 3rd stage of ddr model, including the paths interchange
         modules.
        """

        super().__init__(use_aux_heads=use_aux_heads)
        self.use_aux_heads = use_aux_heads
        self.upscale = upscale_module
        self.ssp_inter_mode = ssp_inter_mode
        self.segmentation_inter_mode = segmentation_inter_mode
        self.relu = nn.ReLU(inplace=False)
        self.classification_mode = classification_mode
        self.layer3_repeats = layer3_repeats
        self.num_classes = num_classes

        assert not (use_aux_heads and classification_mode), "auxiliary head cannot be used in classification mode"

        assert isinstance(backbone, DDRBackBoneBase), "The backbone must inherit from AbstractDDRBackBone"
        self._backbone: DDRBackBoneBase = backbone
        self._backbone.validate_backbone_attributes()
        out_chan_backbone = self._backbone.get_backbone_output_number_of_channels()

        # Repeat the layer3 stage (and its fusion modules) layer3_repeats times
        self.compression3, self.down3, self.layer3_skip = nn.ModuleList(), nn.ModuleList(), nn.ModuleList()
        for i in range(layer3_repeats):
            self.compression3.append(ConvBN(in_channels=out_chan_backbone["layer3"], out_channels=highres_planes, kernel_size=1, bias=False))
            self.down3.append(ConvBN(in_channels=highres_planes, out_channels=out_chan_backbone["layer3"], kernel_size=3, stride=2, padding=1, bias=False))
            self.layer3_skip.append(
                _make_layer(
                    in_planes=out_chan_backbone["layer2"] if i == 0 else highres_planes,
                    planes=highres_planes,
                    block=skip_block,
                    num_blocks=additional_layers[1],
                )
            )

        self.compression4 = ConvBN(in_channels=out_chan_backbone["layer4"], out_channels=highres_planes, kernel_size=1, bias=False)

        self.down4 = nn.Sequential(
            ConvBN(in_channels=highres_planes, out_channels=highres_planes * 2, kernel_size=3, stride=2, padding=1, bias=False, add_relu=True),
            ConvBN(in_channels=highres_planes * 2, out_channels=out_chan_backbone["layer4"], kernel_size=3, stride=2, padding=1, bias=False),
        )
        self.layer4_skip = _make_layer(block=skip_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[2])
        self.layer5_skip = _make_layer(
            block=layer5_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[3], expansion=layer5_bottleneck_expansion
        )

        # when training the backbones on Imagenet:
        #  - layer 5 has stride 1
        #  - a new high_to_low_fusion is added with two 3x3 convs with stride 2 (and double the width)
        #  - a classification head is placed instead of the segmentation head
        if self.classification_mode:
            self.layer5 = _make_layer(
                block=layer5_block,
                in_planes=out_chan_backbone["layer4"],
                planes=out_chan_backbone["layer4"],
                num_blocks=additional_layers[0],
                expansion=layer5_bottleneck_expansion,
            )

            highres_planes_out = highres_planes * layer5_bottleneck_expansion
            self.high_to_low_fusion = nn.Sequential(
                ConvBN(in_channels=highres_planes_out, out_channels=highres_planes_out * 2, kernel_size=3, stride=2, padding=1, add_relu=True),
                ConvBN(
                    in_channels=highres_planes_out * 2,
                    out_channels=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    add_relu=True,
                ),
            )

            self.average_pool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(in_features=out_chan_backbone["layer4"] * layer5_bottleneck_expansion, out_features=num_classes)

        else:
            self.layer5 = _make_layer(
                block=layer5_block,
                in_planes=out_chan_backbone["layer4"],
                planes=out_chan_backbone["layer4"],
                num_blocks=additional_layers[0],
                stride=2,
                expansion=layer5_bottleneck_expansion,
            )

            self.spp = DAPPM(
                in_planes=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
                branch_planes=spp_width,
                out_planes=highres_planes * layer5_bottleneck_expansion,
                inter_mode=self.ssp_inter_mode,
                kernel_sizes=spp_kernel_sizes,
                strides=spp_strides,
            )

            self.final_layer = SegmentHead(highres_planes * layer5_bottleneck_expansion, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

            if self.use_aux_heads:
                self.seghead_extra = SegmentHead(highres_planes, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

        self.highres_planes = highres_planes
        self.layer5_bottleneck_expansion = layer5_bottleneck_expansion
        self.head_width = head_width
        self.init_params()

    @property
    def backbone(self):
        """
        Create a fake backbone module to load backbone pre-trained weights.
        """
        return nn.Sequential(
            OrderedDict(
                [
                    ("_backbone", self._backbone),
                    ("compression3", self.compression3),
                    ("compression4", self.compression4),
                    ("down3", self.down3),
                    ("down4", self.down4),
                    ("layer3_skip", self.layer3_skip),
                    ("layer4_skip", self.layer4_skip),
                    ("layer4_skip", self.layer4_skip),
                    ("layer5_skip", self.layer5_skip),
                ]
            )
        )

    def forward(self, x):
        width_output = x.shape[-1] // 8
        height_output = x.shape[-2] // 8

        x = self._backbone.stem(x)
        x = self._backbone.layer1(x)
        x = self._backbone.layer2(self.relu(x))

        # Repeat layer 3
        x_skip = x
        for i in range(self.layer3_repeats):
            out_layer3 = self._backbone.layer3[i](self.relu(x))
            out_layer3_skip = self.layer3_skip[i](self.relu(x_skip))

            x = out_layer3 + self.down3[i](self.relu(out_layer3_skip))
            x_skip = out_layer3_skip + self.upscale(self.compression3[i](self.relu(out_layer3)), height_output, width_output)

        # save for auxiliary head
        if self.use_aux_heads:
            temp = x_skip

        out_layer4 = self._backbone.layer4(self.relu(x))
        out_layer4_skip = self.layer4_skip(self.relu(x_skip))

        x = out_layer4 + self.down4(self.relu(out_layer4_skip))
        x_skip = out_layer4_skip + self.upscale(self.compression4(self.relu(out_layer4)), height_output, width_output)

        out_layer5_skip = self.layer5_skip(self.relu(x_skip))

        if self.classification_mode:
            x_skip = self.high_to_low_fusion(self.relu(out_layer5_skip))
            x = self.layer5(self.relu(x))
            x = self.average_pool(x + x_skip)
            x = self.fc(x.squeeze())
            return x
        else:
            x = self.upscale(self.spp(self.layer5(self.relu(x))), height_output, width_output)

            x = self.final_layer(x + out_layer5_skip)

            if self.use_aux_heads:
                x_extra = self.seghead_extra(temp)
                return x, x_extra
            else:
                return x

    def replace_head(self, new_num_classes=None, new_head=None, new_aux_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_aux_head is not None:
            self.seghead_extra = new_aux_head
        if new_head is not None:
            self.final_layer = new_head
            self.num_classes = None
        else:
            self.final_layer = SegmentHead(
                self.highres_planes * self.layer5_bottleneck_expansion, self.head_width, new_num_classes, 8, inter_mode=self.segmentation_inter_mode
            )
            if self.use_aux_heads:
                self.seghead_extra = SegmentHead(self.highres_planes, self.head_width, new_num_classes, 8, inter_mode=self.segmentation_inter_mode)
            self.num_classes = new_num_classes

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        lr_dict = {"final_layer": lr, "default": 0}
        if self.use_aux_heads:
            lr_dict["seghead_extra"] = lr
        return lr_dict

    def _remove_auxiliary_heads(self):
        if hasattr(self, "seghead_extra"):
            del self.seghead_extra

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
            - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]
        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate backbone params from the rest.
        :return: iterators of groups named_parameters.
        """
        backbone_names = [n for n, p in self.backbone.named_parameters()]
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if name in backbone_names:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self._backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self._backbone.get_input_channels()

backbone property

Create a fake backbone module to load backbone pre-trained weights.

__init__(backbone, additional_layers, upscale_module, num_classes, highres_planes, spp_width, head_width, use_aux_heads=False, ssp_inter_mode='bilinear', segmentation_inter_mode='bilinear', skip_block=None, layer5_block=Bottleneck, layer5_bottleneck_expansion=2, classification_mode=False, spp_kernel_sizes=[1, 5, 9, 17, 0], spp_strides=[1, 2, 4, 8, 0], layer3_repeats=1)

Parameters:

Name Type Description Default
backbone DDRBackBoneBase.__class__

the low resolution branch of DDR, expected to have specific attributes in the class

required
additional_layers list

list of num blocks for the highres stage and layer5

required
upscale_module nn.Module

upscale to use in the backbone (DAPPM and Segmentation head are using bilinear interpolation)

required
num_classes int

number of classes

required
highres_planes int

number of channels in the high resolution net

required
use_aux_heads bool

add a second segmentation head (fed from after compress3 + upscale). This head can be used during training (see the paper https://arxiv.org/pdf/2101.06085.pdf for details)

False
ssp_inter_mode str

the interpolation used in the SPP block

'bilinear'
segmentation_inter_mode str

the interpolation used in the segmentation head

'bilinear'
skip_block nn.Module.__class__

allows specifying a different block (from 'block') for the skip layer

None
layer5_block nn.Module.__class__

type of block to use in layer5 and layer5_skip

Bottleneck
layer5_bottleneck_expansion int

determines the expansion rate for Bottleneck block

2
spp_kernel_sizes list

list of kernel sizes for the spp module pooling

[1, 5, 9, 17, 0]
spp_strides list

list of strides for the spp module pooling

[1, 2, 4, 8, 0]
layer3_repeats int

number of times to repeat the 3rd stage of ddr model, including the paths interchange modules.

1
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(
    self,
    backbone: DDRBackBoneBase.__class__,
    additional_layers: list,
    upscale_module: nn.Module,
    num_classes: int,
    highres_planes: int,
    spp_width: int,
    head_width: int,
    use_aux_heads: bool = False,
    ssp_inter_mode: str = "bilinear",
    segmentation_inter_mode: str = "bilinear",
    skip_block: nn.Module.__class__ = None,
    layer5_block: nn.Module.__class__ = Bottleneck,
    layer5_bottleneck_expansion: int = 2,
    classification_mode=False,
    spp_kernel_sizes: list = [1, 5, 9, 17, 0],
    spp_strides: list = [1, 2, 4, 8, 0],
    layer3_repeats: int = 1,
):
    """

    :param backbone: the low resolution branch of DDR, expected to have specific attributes in the class
    :param additional_layers: list of num blocks for the highres stage and layer5
    :param upscale_module: upscale to use in the backbone (DAPPM and Segmentation head are using bilinear interpolation)
    :param num_classes: number of classes
    :param highres_planes: number of channels in the high resolution net
    :param use_aux_heads: add a second segmentation head (fed from after compress3 + upscale). this head can be used
    during training (see paper https://arxiv.org/pdf/2101.06085.pdf for details)
    :param ssp_inter_mode: the interpolation used in the SPP block
    :param segmentation_inter_mode: the interpolation used in the segmentation head
    :param skip_block: allows specifying a different block (from 'block') for the skip layer
    :param layer5_block: type of block to use in layer5 and layer5_skip
    :param layer5_bottleneck_expansion: determines the expansion rate for Bottleneck block
    :param spp_kernel_sizes: list of kernel sizes for the spp module pooling
    :param spp_strides: list of strides for the spp module pooling
    :param layer3_repeats: number of times to repeat the 3rd stage of ddr model, including the paths interchange
     modules.
    """

    super().__init__(use_aux_heads=use_aux_heads)
    self.use_aux_heads = use_aux_heads
    self.upscale = upscale_module
    self.ssp_inter_mode = ssp_inter_mode
    self.segmentation_inter_mode = segmentation_inter_mode
    self.relu = nn.ReLU(inplace=False)
    self.classification_mode = classification_mode
    self.layer3_repeats = layer3_repeats
    self.num_classes = num_classes

    assert not (use_aux_heads and classification_mode), "auxiliary head cannot be used in classification mode"

    assert isinstance(backbone, DDRBackBoneBase), "The backbone must inherit from AbstractDDRBackBone"
    self._backbone: DDRBackBoneBase = backbone
    self._backbone.validate_backbone_attributes()
    out_chan_backbone = self._backbone.get_backbone_output_number_of_channels()

    # Repeat the layer3 stage (and its fusion modules) layer3_repeats times
    self.compression3, self.down3, self.layer3_skip = nn.ModuleList(), nn.ModuleList(), nn.ModuleList()
    for i in range(layer3_repeats):
        self.compression3.append(ConvBN(in_channels=out_chan_backbone["layer3"], out_channels=highres_planes, kernel_size=1, bias=False))
        self.down3.append(ConvBN(in_channels=highres_planes, out_channels=out_chan_backbone["layer3"], kernel_size=3, stride=2, padding=1, bias=False))
        self.layer3_skip.append(
            _make_layer(
                in_planes=out_chan_backbone["layer2"] if i == 0 else highres_planes,
                planes=highres_planes,
                block=skip_block,
                num_blocks=additional_layers[1],
            )
        )

    self.compression4 = ConvBN(in_channels=out_chan_backbone["layer4"], out_channels=highres_planes, kernel_size=1, bias=False)

    self.down4 = nn.Sequential(
        ConvBN(in_channels=highres_planes, out_channels=highres_planes * 2, kernel_size=3, stride=2, padding=1, bias=False, add_relu=True),
        ConvBN(in_channels=highres_planes * 2, out_channels=out_chan_backbone["layer4"], kernel_size=3, stride=2, padding=1, bias=False),
    )
    self.layer4_skip = _make_layer(block=skip_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[2])
    self.layer5_skip = _make_layer(
        block=layer5_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[3], expansion=layer5_bottleneck_expansion
    )

    # when training the backbones on Imagenet:
    #  - layer 5 has stride 1
    #  - a new high_to_low_fusion is added with two 3x3 convs with stride 2 (and double the width)
    #  - a classification head is placed instead of the segmentation head
    if self.classification_mode:
        self.layer5 = _make_layer(
            block=layer5_block,
            in_planes=out_chan_backbone["layer4"],
            planes=out_chan_backbone["layer4"],
            num_blocks=additional_layers[0],
            expansion=layer5_bottleneck_expansion,
        )

        highres_planes_out = highres_planes * layer5_bottleneck_expansion
        self.high_to_low_fusion = nn.Sequential(
            ConvBN(in_channels=highres_planes_out, out_channels=highres_planes_out * 2, kernel_size=3, stride=2, padding=1, add_relu=True),
            ConvBN(
                in_channels=highres_planes_out * 2,
                out_channels=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
                kernel_size=3,
                stride=2,
                padding=1,
                add_relu=True,
            ),
        )

        self.average_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(in_features=out_chan_backbone["layer4"] * layer5_bottleneck_expansion, out_features=num_classes)

    else:
        self.layer5 = _make_layer(
            block=layer5_block,
            in_planes=out_chan_backbone["layer4"],
            planes=out_chan_backbone["layer4"],
            num_blocks=additional_layers[0],
            stride=2,
            expansion=layer5_bottleneck_expansion,
        )

        self.spp = DAPPM(
            in_planes=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
            branch_planes=spp_width,
            out_planes=highres_planes * layer5_bottleneck_expansion,
            inter_mode=self.ssp_inter_mode,
            kernel_sizes=spp_kernel_sizes,
            strides=spp_strides,
        )

        self.final_layer = SegmentHead(highres_planes * layer5_bottleneck_expansion, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

        if self.use_aux_heads:
            self.seghead_extra = SegmentHead(highres_planes, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

    self.highres_planes = highres_planes
    self.layer5_bottleneck_expansion = layer5_bottleneck_expansion
    self.head_width = head_width
    self.init_params()

initialize_param_groups(lr, training_params)

Custom param groups for training: - Different lr for backbone and the rest, if multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
        - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]
    return param_groups
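
A small sketch of the resulting learning rates for an assumed base lr and multiply_head_lr value (the numbers are illustrative only):

# Illustrative numbers: how multiply_head_lr splits the learning rates between the two groups
lr = 0.01
multiply_head_lr = 10  # assumed value passed via training_params

param_groups = [
    {"name": "no_multiply_params", "lr": lr},                     # backbone parameters
    {"name": "multiply_lr_params", "lr": lr * multiply_head_lr},  # everything outside the backbone
]
for group in param_groups:
    print(group["name"], group["lr"])
# no_multiply_params 0.01
# multiply_lr_params 0.1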

DDRNetCustom

Bases: DDRNet

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DDRNetCustom(DDRNet):
    def __init__(self, arch_params: HpmStruct):
        """Parse arch_params and translate the parameters to build the original DDRNet architecture"""
        if get_param(arch_params, "aux_heads") is not None:
            message = "arch_params.aux_heads is deprecated in 3.1.1 and will be removed in 3.2.0."
            if get_param(arch_params, "use_aux_heads") is not None:
                message += "\n using arch_params.use_aux_heads instead."

            else:
                message += "\n use arch_params.use_aux_heads instead."
            warnings.warn(message, DeprecationWarning)
            use_aux_heads = get_param(arch_params, "aux_heads")
        else:
            use_aux_heads = get_param(arch_params, "use_aux_heads")
        super().__init__(
            backbone=arch_params.backbone,
            additional_layers=arch_params.additional_layers,
            upscale_module=arch_params.upscale_module,
            num_classes=arch_params.num_classes,
            highres_planes=arch_params.highres_planes,
            spp_width=arch_params.spp_planes,
            head_width=arch_params.head_planes,
            use_aux_heads=use_aux_heads,
            ssp_inter_mode=arch_params.ssp_inter_mode,
            segmentation_inter_mode=arch_params.segmentation_inter_mode,
            skip_block=arch_params.skip_block,
            layer5_block=arch_params.layer5_block,
            layer5_bottleneck_expansion=arch_params.layer5_bottleneck_expansion,
            classification_mode=arch_params.classification_mode,
            spp_kernel_sizes=arch_params.spp_kernel_sizes,
            spp_strides=arch_params.spp_strides,
            layer3_repeats=arch_params.layer3_repeats,
        )

__init__(arch_params)

Parse arch_params and translate the parameters to build the original DDRNet architecture

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(self, arch_params: HpmStruct):
    """Parse arch_params and translate the parameters to build the original DDRNet architecture"""
    if get_param(arch_params, "aux_heads") is not None:
        message = "arch_params.aux_heads is deprecated in 3.1.1 and will be removed in 3.2.0."
        if get_param(arch_params, "use_aux_heads") is not None:
            message += "\n using arch_params.use_aux_heads instead."

        else:
            message += "\n use arch_params.use_aux_heads instead."
        warnings.warn(message, DeprecationWarning)
        use_aux_heads = get_param(arch_params, "aux_heads")
    else:
        use_aux_heads = get_param(arch_params, "use_aux_heads")
    super().__init__(
        backbone=arch_params.backbone,
        additional_layers=arch_params.additional_layers,
        upscale_module=arch_params.upscale_module,
        num_classes=arch_params.num_classes,
        highres_planes=arch_params.highres_planes,
        spp_width=arch_params.spp_planes,
        head_width=arch_params.head_planes,
        use_aux_heads=use_aux_heads,
        ssp_inter_mode=arch_params.ssp_inter_mode,
        segmentation_inter_mode=arch_params.segmentation_inter_mode,
        skip_block=arch_params.skip_block,
        layer5_block=arch_params.layer5_block,
        layer5_bottleneck_expansion=arch_params.layer5_bottleneck_expansion,
        classification_mode=arch_params.classification_mode,
        spp_kernel_sizes=arch_params.spp_kernel_sizes,
        spp_strides=arch_params.spp_strides,
        layer3_repeats=arch_params.layer3_repeats,
    )

RegnetDDRBackBone

Bases: DDRBackBoneBase

Translation of Regnet to fit DDR model

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class RegnetDDRBackBone(DDRBackBoneBase):
    """
    Translation of Regnet to fit DDR model
    """

    def __init__(self, regnet_module: nn.Module.__class__):
        super().__init__()
        self.input_channels = regnet_module.net.stem.conv.in_channels
        self.stem = regnet_module.net.stem
        self.layer1 = regnet_module.net.stage_0
        self.layer2 = regnet_module.net.stage_1
        self.layer3 = nn.ModuleList([regnet_module.net.stage_2])
        self.layer4 = regnet_module.net.stage_3

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        if isinstance(self.stem, SupportsReplaceInputChannels):
            self.stem.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        else:
            raise NotImplementedError(f"`{self.stem.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        if isinstance(self.stem, SupportsReplaceInputChannels):
            return self.stem.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.stem.__class__.__name__}` does not support `replace_input_channels`")

SegmentHead

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class SegmentHead(nn.Module):
    def __init__(self, in_planes: int, inter_planes: int, out_planes: int, scale_factor: int, inter_mode: str = "bilinear"):
        """
        Last stage of the segmentation network.
        Reduces the number of output planes (usually to num_classes) while increasing the size by scale_factor
        :param in_planes: width of input
        :param inter_planes: width of internal conv. must be a multiple of scale_factor^2 when inter_mode=pixel_shuffle
        :param out_planes: output width
        :param scale_factor: scaling factor
        :param inter_mode: one of nearest, linear, bilinear, bicubic, trilinear, area or pixel_shuffle.
        when set to pixel_shuffle, an nn.PixelShuffle will be used for scaling
        """
        super().__init__()

        if inter_mode == "pixel_shuffle":
            assert inter_planes % (scale_factor**2) == 0, "when using pixel_shuffle, inter_planes must be a multiple of scale_factor^2"

        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inter_planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inter_planes)
        self.relu = nn.ReLU(inplace=True)

        if inter_mode == "pixel_shuffle":
            self.conv2 = nn.Conv2d(inter_planes, inter_planes, kernel_size=1, padding=0, bias=True)
            self.upscale = nn.PixelShuffle(scale_factor)
        else:
            self.conv2 = nn.Conv2d(inter_planes, out_planes, kernel_size=1, padding=0, bias=True)
            self.upscale = nn.Upsample(scale_factor=scale_factor, mode=inter_mode)

        self.scale_factor = scale_factor

    def forward(self, x):
        x = self.conv1(self.relu(self.bn1(x)))
        out = self.conv2(self.relu(self.bn2(x)))
        out = self.upscale(out)

        return out

__init__(in_planes, inter_planes, out_planes, scale_factor, inter_mode='bilinear')

Last stage of the segmentation network. Reduces the number of output planes (usually to num_classes) while increasing the size by scale_factor

Parameters:

Name Type Description Default
in_planes int

width of input

required
inter_planes int

width of internal conv. must be a multiple of scale_factor^2 when inter_mode=pixel_shuffle

required
out_planes int

output width

required
scale_factor int

scaling factor

required
inter_mode str

one of nearest, linear, bilinear, bicubic, trilinear, area or pixel_shuffle. when set to pixel_shuffle, an nn.PixelShuffle will be used for scaling

'bilinear'
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(self, in_planes: int, inter_planes: int, out_planes: int, scale_factor: int, inter_mode: str = "bilinear"):
    """
    Last stage of the segmentation network.
    Reduces the number of output planes (usually to num_classes) while increasing the size by scale_factor
    :param in_planes: width of input
    :param inter_planes: width of internal conv. must be a multiple of scale_factor^2 when inter_mode=pixel_shuffle
    :param out_planes: output width
    :param scale_factor: scaling factor
    :param inter_mode: one of nearest, linear, bilinear, bicubic, trilinear, area or pixel_shuffle.
    when set to pixel_shuffle, an nn.PixelShuffle will be used for scaling
    """
    super().__init__()

    if inter_mode == "pixel_shuffle":
        assert inter_planes % (scale_factor**2) == 0, "when using pixel_shuffle, inter_planes must be a multiple of scale_factor^2"

    self.bn1 = nn.BatchNorm2d(in_planes)
    self.conv1 = nn.Conv2d(in_planes, inter_planes, kernel_size=3, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(inter_planes)
    self.relu = nn.ReLU(inplace=True)

    if inter_mode == "pixel_shuffle":
        self.conv2 = nn.Conv2d(inter_planes, inter_planes, kernel_size=1, padding=0, bias=True)
        self.upscale = nn.PixelShuffle(scale_factor)
    else:
        self.conv2 = nn.Conv2d(inter_planes, out_planes, kernel_size=1, padding=0, bias=True)
        self.upscale = nn.Upsample(scale_factor=scale_factor, mode=inter_mode)

    self.scale_factor = scale_factor

UpscaleOnline

Bases: nn.Module

In some cases, the required scale/size for the upsampling is known only when the input is received. This class supports such cases; only the interpolation mode is set in advance.

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class UpscaleOnline(nn.Module):
    """
    In some cases the required scale/size for the scaling is known only when the input is received.
    This class supports such cases; only the interpolation mode is set in advance.
    """

    def __init__(self, mode="bilinear"):
        super().__init__()
        self.mode = mode

    def forward(self, x, output_height: int, output_width: int):
        return F.interpolate(x, size=[output_height, output_width], mode=self.mode)
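
A short usage sketch (editorial addition, import path assumed from the source path above): the target size is passed per call rather than fixed at construction time.

import torch
from super_gradients.training.models.segmentation_models.ddrnet import UpscaleOnline

up = UpscaleOnline(mode="bilinear")
x = torch.randn(1, 32, 12, 20)
y = up(x, output_height=96, output_width=160)  # output size decided at call time
print(y.shape)  # torch.Size([1, 32, 96, 160])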

DDRNet39Backbone

Bases: DDRNet39

A somewhat frankenstein version of the DDRNet39 model that tries to be a feature extractor module.

Source code in src/super_gradients/training/models/segmentation_models/ddrnet_backbones.py
@register_detection_module()
class DDRNet39Backbone(DDRNet39):
    """
    A somewhat frankenstein version of the DDRNet39 model that tries to be a feature extractor module.
    """

    def __init__(self, arch_params: HpmStruct):
        super().__init__(arch_params)

        # Delete everything that is not needed for feature extraction
        del self.final_layer
        if self.use_aux_heads:
            self.use_aux_heads = False
            del self.aux_head

        if self.classification_mode:
            del self.fc
            del self.average_pool
            del self.high_to_low_fusion
            del self.layer5

        self._out_channels = (self.highres_planes * self.layer5_bottleneck_expansion,)

    def forward(self, x):
        width_output = x.shape[-1] // 8
        height_output = x.shape[-2] // 8

        x = self._backbone.stem(x)
        x = self._backbone.layer1(x)
        x = self._backbone.layer2(self.relu(x))

        # Repeat layer 3
        x_skip = x
        for i in range(self.layer3_repeats):
            out_layer3 = self._backbone.layer3[i](self.relu(x))
            out_layer3_skip = self.layer3_skip[i](self.relu(x_skip))

            x = out_layer3 + self.down3[i](self.relu(out_layer3_skip))
            x_skip = out_layer3_skip + self.upscale(self.compression3[i](self.relu(out_layer3)), height_output, width_output)

        out_layer4 = self._backbone.layer4(self.relu(x))
        out_layer4_skip = self.layer4_skip(self.relu(x_skip))

        x = out_layer4 + self.down4(self.relu(out_layer4_skip))
        x_skip = out_layer4_skip + self.upscale(self.compression4(self.relu(out_layer4)), height_output, width_output)

        out_layer5_skip = self.layer5_skip(self.relu(x_skip))

        x = self.upscale(self.spp(self.layer5(self.relu(x))), height_output, width_output)

        return x + out_layer5_skip

    @property
    def out_channels(self) -> Tuple[int]:
        return self._out_channels

LadderBottleneck

Bases: nn.Module

ResNet Bottleneck

Source code in src/super_gradients/training/models/segmentation_models/laddernet.py
class LadderBottleneck(nn.Module):
    """ResNet Bottleneck"""

    # pylint: disable=unused-argument
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1, norm_layer=None):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False)
        self.bn2 = norm_layer(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm_layer(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def _sum_each(self, x, y):
        assert len(x) == len(y)
        z = []
        for i in range(len(x)):
            z.append(x[i] + y[i])
        return z

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

LadderResNet

Bases: nn.Module

Dilated pre-trained ResNet model, which produces stride-8 feature maps at conv5.

Parameters

block : Block
    Class for the residual block. Options are BasicBlockV1, BottleneckV1.
layers : list of int
    Numbers of layers in each block.
classes : int, default 1000
    Number of classification classes.
dilated : bool, default False
    Applying dilation strategy to pretrained ResNet yielding a stride-8 model, typically used in Semantic Segmentation.
norm_layer : object
    Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`; for Synchronized Cross-GPU BatchNormalization).

Reference:

- He, Kaiming, et al. "Deep residual learning for image recognition."
    Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.

- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
Source code in src/super_gradients/training/models/segmentation_models/laddernet.py
class LadderResNet(nn.Module):
    """Dilated Pre-trained ResNet Model, which preduces the stride of 8 featuremaps at conv5.

    Parameters
    ----------
    block : Block
        Class for the residual block. Options are BasicBlockV1, BottleneckV1.
    layers : list of int
        Numbers of layers in each block
    classes : int, default 1000
        Number of classification classes.
    dilated : bool, default False
        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
        typically used in Semantic Segmentation.
    norm_layer : object
        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
        for Synchronized Cross-GPU BatchNormalization).

    Reference:

        - He, Kaiming, et al. "Deep residual learning for image recognition."
            Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.

        - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """

    # pylint: disable=unused-variable
    # def __init__(self, block, layers, num_classes=1000, dilated=False, norm_layer=SyncBatchNorm): # FIXME - ORIGINAL CODE
    def __init__(self, block, layers, num_classes=1000, dilated=False, norm_layer=nn.BatchNorm2d):  # FIXME - TIME MEASUREMENT CODE
        self.inplanes = 64
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
        if dilated:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
        else:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2, norm_layer=norm_layer)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=2, norm_layer=norm_layer)
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            import math

            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, norm_layer):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                norm_layer(planes * block.expansion),
            )

        layers = []
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, dilation=1, downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
        elif dilation == 4:
            layers.append(block(self.inplanes, planes, stride, dilation=2, downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))

        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation, norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
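
A hedged sanity-check sketch (editorial addition; the import path and layer layout are assumptions): running the stem and stages by hand shows that dilated=True keeps a stride of 8 at the last stage, as the docstring states.

import torch
from super_gradients.training.models.segmentation_models.laddernet import LadderBottleneck, LadderResNet

# Hypothetical ResNet-50-style layout.
net = LadderResNet(LadderBottleneck, layers=[3, 4, 6, 3], dilated=True)
net.eval()
with torch.no_grad():
    x = torch.randn(1, 3, 224, 224)
    x = net.maxpool(net.relu(net.bn1(net.conv1(x))))       # stem: stride 4
    x = net.layer4(net.layer3(net.layer2(net.layer1(x))))  # dilated stages keep stride 8
print(x.shape)  # torch.Size([1, 2048, 28, 28]) -- 224 / 8 == 28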

conv3x3(in_planes, out_planes, stride=1)

3x3 convolution with padding

Source code in src/super_gradients/training/models/segmentation_models/laddernet.py
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True)

PPLiteSegBase

Bases: SegmentationModule

The PP_LiteSeg implementation based on PaddlePaddle. The original article refers to "Juncai Peng, Yi Liu, Shiyu Tang, Yuying Hao, Lutao Chu, Guowei Chen, Zewu Wu, Zeyu Chen, Zhiliang Yu, Yuning Du, Qingqing Dang,Baohua Lai, Qiwen Liu, Xiaoguang Hu, Dianhai Yu, Yanjun Ma. PP-LiteSeg: A Superior Real-Time Semantic Segmentation Model. https://arxiv.org/abs/2204.02681".

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class PPLiteSegBase(SegmentationModule):
    """
    The PP_LiteSeg implementation based on PaddlePaddle.
    The original article refers to "Juncai Peng, Yi Liu, Shiyu Tang, Yuying Hao, Lutao Chu,
    Guowei Chen, Zewu Wu, Zeyu Chen, Zhiliang Yu, Yuning Du, Qingqing Dang,Baohua Lai,
    Qiwen Liu, Xiaoguang Hu, Dianhai Yu, Yanjun Ma. PP-LiteSeg: A Superior Real-Time Semantic
    Segmentation Model. https://arxiv.org/abs/2204.02681".
    """

    def __init__(
        self,
        num_classes,
        backbone: AbstractSTDCBackbone,
        projection_channels_list: List[int],
        sppm_inter_channels: int,
        sppm_out_channels: int,
        sppm_pool_sizes: List[int],
        sppm_upsample_mode: Union[UpsampleMode, str],
        align_corners: bool,
        decoder_up_factors: List[int],
        decoder_channels: List[int],
        decoder_upsample_mode: Union[UpsampleMode, str],
        head_scale_factor: int,
        head_upsample_mode: Union[UpsampleMode, str],
        head_mid_channels: int,
        dropout: float,
        use_aux_heads: bool,
        aux_hidden_channels: List[int],
        aux_scale_factors: List[int],
    ):
        """
        :param backbone: Backbone nn.Module should implement the abstract class `AbstractSTDCBackbone`.
        :param projection_channels_list: channels list to project encoder features before fusing with the decoder
            stream.
        :param sppm_inter_channels: num channels in each sppm pooling branch.
        :param sppm_out_channels: The number of output channels after sppm module.
        :param sppm_pool_sizes: spatial output sizes of the pooled feature maps.
        :param sppm_upsample_mode: Upsample mode to original size after pooling.
        :param decoder_up_factors: list upsample factor per decoder stage.
        :param decoder_channels: list of num_channels per decoder stage.
        :param decoder_upsample_mode: upsample mode in decoder stages, see UpsampleMode for valid options.
        :param head_scale_factor: scale factor for the final segmentation head logits.
        :param head_upsample_mode: upsample mode to final prediction sizes, see UpsampleMode for valid options.
        :param head_mid_channels: num of hidden channels in segmentation head.
        :param use_aux_heads: set True when training, output extra Auxiliary feature maps from the encoder module.
        :param aux_hidden_channels: List of hidden channels in auxiliary segmentation heads.
        :param aux_scale_factors: list of upsample factors for final auxiliary heads logits.
        """
        super().__init__(use_aux_heads=use_aux_heads)

        # Init Encoder
        backbone_out_channels = backbone.get_backbone_output_number_of_channels()
        assert len(backbone_out_channels) == len(projection_channels_list), (
            f"The length of backbone outputs ({backbone_out_channels}) should match the length of projection channels" f"({len(projection_channels_list)})."
        )
        context = SPPM(
            in_channels=backbone_out_channels[-1],
            inter_channels=sppm_inter_channels,
            out_channels=sppm_out_channels,
            pool_sizes=sppm_pool_sizes,
            upsample_mode=sppm_upsample_mode,
            align_corners=align_corners,
        )
        self.encoder = PPLiteSegEncoder(backbone=backbone, context_module=context, projection_channels_list=projection_channels_list)
        encoder_channels = self.encoder.get_output_number_of_channels()

        # Init Decoder
        self.decoder = PPLiteSegDecoder(
            encoder_channels=encoder_channels,
            up_factors=decoder_up_factors,
            out_channels=decoder_channels,
            upsample_mode=decoder_upsample_mode,
            align_corners=align_corners,
        )

        # Init Segmentation classification heads
        self.seg_head = nn.Sequential(
            SegmentationHead(in_channels=decoder_channels[-1], mid_channels=head_mid_channels, num_classes=num_classes, dropout=dropout),
            make_upsample_module(scale_factor=head_scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
        )
        # Auxiliary heads
        if self.use_aux_heads:
            encoder_out_channels = projection_channels_list
            self.aux_heads = nn.ModuleList(
                [
                    nn.Sequential(
                        SegmentationHead(backbone_ch, hidden_ch, num_classes, dropout=dropout),
                        make_upsample_module(scale_factor=scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
                    )
                    for backbone_ch, hidden_ch, scale_factor in zip(encoder_out_channels, aux_hidden_channels, aux_scale_factors)
                ]
            )
        self.init_params()
        self.num_classes = num_classes

    def _remove_auxiliary_heads(self):
        if hasattr(self, "aux_heads"):
            del self.aux_heads

    @property
    def backbone(self) -> nn.Module:
        """
        Support SG load backbone when training.
        """
        return self.encoder.backbone

    def forward(self, x):
        feats = self.encoder(x)
        if self.use_aux_heads:
            enc_feats = feats[:-1]
        x = self.decoder(feats)
        x = self.seg_head(x)
        if not self.use_aux_heads:
            return x
        aux_feats = [aux_head(feat) for feat, aux_head in zip(enc_feats, self.aux_heads)]
        return tuple([x] + aux_feats)

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
            - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]
        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate backbone params from the rest.
        :return: iterators of groups named_parameters.
        """
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if "encoder.backbone" in name:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def prep_model_for_conversion(self, input_size: Union[tuple, list], stride_ratio: int = 32, **kwargs):
        if not torch_version_is_greater_or_equal(1, 11):
            raise RuntimeError("PPLiteSeg model ONNX export requires torch => 1.11, torch installed: " + str(torch.__version__))
        super().prep_model_for_conversion(input_size, **kwargs)
        if isinstance(self.encoder.context_module, SPPM):
            self.encoder.context_module.prep_model_for_conversion(input_size=input_size, stride_ratio=stride_ratio)

    def replace_head(self, new_num_classes: int, **kwargs):
        for module in self.modules():
            if isinstance(module, SegmentationHead):
                module.replace_num_classes(new_num_classes)
        self.num_classes = new_num_classes

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        lr_dict = {"seg_head": lr, "default": 0}
        if self.use_aux_heads:
            lr_dict["aux_heads"] = lr
        return lr_dict

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.encoder.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.encoder.get_input_channels()
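
A minimal end-to-end sketch (editorial addition): the pre-registered PP-LiteSeg model name (Models.PP_LITE_T_SEG75) and the default arch params are assumptions, not taken from this page.

import torch
from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.PP_LITE_T_SEG75, num_classes=19)  # hypothetical variant and class count
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 1024))
# A single logits tensor is returned when use_aux_heads is False;
# a tuple (main_logits, *aux_logits) is returned when the model was built with use_aux_heads=True.
print(out.shape if isinstance(out, torch.Tensor) else [o.shape for o in out])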

backbone: nn.Module property

Support SG load backbone when training.

__init__(num_classes, backbone, projection_channels_list, sppm_inter_channels, sppm_out_channels, sppm_pool_sizes, sppm_upsample_mode, align_corners, decoder_up_factors, decoder_channels, decoder_upsample_mode, head_scale_factor, head_upsample_mode, head_mid_channels, dropout, use_aux_heads, aux_hidden_channels, aux_scale_factors)

Parameters:

Name Type Description Default
backbone AbstractSTDCBackbone

Backbone nn.Module should implement the abstract class AbstractSTDCBackbone.

required
projection_channels_list List[int]

channels list to project encoder features before fusing with the decoder stream.

required
sppm_inter_channels int

num channels in each sppm pooling branch.

required
sppm_out_channels int

The number of output channels after sppm module.

required
sppm_pool_sizes List[int]

spatial output sizes of the pooled feature maps.

required
sppm_upsample_mode Union[UpsampleMode, str]

Upsample mode to original size after pooling.

required
decoder_up_factors List[int]

list upsample factor per decoder stage.

required
decoder_channels List[int]

list of num_channels per decoder stage.

required
decoder_upsample_mode Union[UpsampleMode, str]

upsample mode in decoder stages, see UpsampleMode for valid options.

required
head_scale_factor int

scale factor for the final segmentation head logits.

required
head_upsample_mode Union[UpsampleMode, str]

upsample mode to final prediction sizes, see UpsampleMode for valid options.

required
head_mid_channels int

num of hidden channels in segmentation head.

required
use_aux_heads bool

set True when training, output extra Auxiliary feature maps from the encoder module.

required
aux_hidden_channels List[int]

List of hidden channels in auxiliary segmentation heads.

required
aux_scale_factors List[int]

list of upsample factors for final auxiliary heads logits.

required
Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def __init__(
    self,
    num_classes,
    backbone: AbstractSTDCBackbone,
    projection_channels_list: List[int],
    sppm_inter_channels: int,
    sppm_out_channels: int,
    sppm_pool_sizes: List[int],
    sppm_upsample_mode: Union[UpsampleMode, str],
    align_corners: bool,
    decoder_up_factors: List[int],
    decoder_channels: List[int],
    decoder_upsample_mode: Union[UpsampleMode, str],
    head_scale_factor: int,
    head_upsample_mode: Union[UpsampleMode, str],
    head_mid_channels: int,
    dropout: float,
    use_aux_heads: bool,
    aux_hidden_channels: List[int],
    aux_scale_factors: List[int],
):
    """
    :param backbone: Backbone nn.Module should implement the abstract class `AbstractSTDCBackbone`.
    :param projection_channels_list: channels list to project encoder features before fusing with the decoder
        stream.
    :param sppm_inter_channels: num channels in each sppm pooling branch.
    :param sppm_out_channels: The number of output channels after sppm module.
    :param sppm_pool_sizes: spatial output sizes of the pooled feature maps.
    :param sppm_upsample_mode: Upsample mode to original size after pooling.
    :param decoder_up_factors: list upsample factor per decoder stage.
    :param decoder_channels: list of num_channels per decoder stage.
    :param decoder_upsample_mode: upsample mode in decoder stages, see UpsampleMode for valid options.
    :param head_scale_factor: scale factor for the final segmentation head logits.
    :param head_upsample_mode: upsample mode to final prediction sizes, see UpsampleMode for valid options.
    :param head_mid_channels: num of hidden channels in segmentation head.
    :param use_aux_heads: set True when training, output extra Auxiliary feature maps from the encoder module.
    :param aux_hidden_channels: List of hidden channels in auxiliary segmentation heads.
    :param aux_scale_factors: list of upsample factors for final auxiliary heads logits.
    """
    super().__init__(use_aux_heads=use_aux_heads)

    # Init Encoder
    backbone_out_channels = backbone.get_backbone_output_number_of_channels()
    assert len(backbone_out_channels) == len(projection_channels_list), (
        f"The length of backbone outputs ({backbone_out_channels}) should match the length of projection channels" f"({len(projection_channels_list)})."
    )
    context = SPPM(
        in_channels=backbone_out_channels[-1],
        inter_channels=sppm_inter_channels,
        out_channels=sppm_out_channels,
        pool_sizes=sppm_pool_sizes,
        upsample_mode=sppm_upsample_mode,
        align_corners=align_corners,
    )
    self.encoder = PPLiteSegEncoder(backbone=backbone, context_module=context, projection_channels_list=projection_channels_list)
    encoder_channels = self.encoder.get_output_number_of_channels()

    # Init Decoder
    self.decoder = PPLiteSegDecoder(
        encoder_channels=encoder_channels,
        up_factors=decoder_up_factors,
        out_channels=decoder_channels,
        upsample_mode=decoder_upsample_mode,
        align_corners=align_corners,
    )

    # Init Segmentation classification heads
    self.seg_head = nn.Sequential(
        SegmentationHead(in_channels=decoder_channels[-1], mid_channels=head_mid_channels, num_classes=num_classes, dropout=dropout),
        make_upsample_module(scale_factor=head_scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
    )
    # Auxiliary heads
    if self.use_aux_heads:
        encoder_out_channels = projection_channels_list
        self.aux_heads = nn.ModuleList(
            [
                nn.Sequential(
                    SegmentationHead(backbone_ch, hidden_ch, num_classes, dropout=dropout),
                    make_upsample_module(scale_factor=scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
                )
                for backbone_ch, hidden_ch, scale_factor in zip(encoder_out_channels, aux_hidden_channels, aux_scale_factors)
            ]
        )
    self.init_params()
    self.num_classes = num_classes

initialize_param_groups(lr, training_params)

Custom param groups for training: a different lr for the backbone and the rest, if the multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
        - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]
    return param_groups

PPLiteSegDecoder

Bases: nn.Module

PPLiteSegDecoder using UAFM blocks to fuse feature maps.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class PPLiteSegDecoder(nn.Module):
    """
    PPLiteSegDecoder using UAFM blocks to fuse feature maps.
    """

    def __init__(self, encoder_channels: List[int], up_factors: List[int], out_channels: List[int], upsample_mode, align_corners: bool):
        super().__init__()
        # Make a copy of channels list, to prevent out of scope changes.
        encoder_channels = encoder_channels.copy()
        encoder_channels.reverse()
        in_channels = encoder_channels.pop(0)

        # TODO - assert argument length
        self.up_stages = nn.ModuleList()
        for skip_ch, up_factor, out_ch in zip(encoder_channels, up_factors, out_channels):
            self.up_stages.append(
                UAFM(
                    in_channels=in_channels,
                    skip_channels=skip_ch,
                    out_channels=out_ch,
                    up_factor=up_factor,
                    upsample_mode=upsample_mode,
                    align_corners=align_corners,
                )
            )
            in_channels = out_ch

    def forward(self, feats: List[torch.Tensor]):
        feats.reverse()
        x = feats.pop(0)
        for up_stage, skip in zip(self.up_stages, feats):
            x = up_stage(x, skip)
        return x

PPLiteSegEncoder

Bases: nn.Module, SupportsReplaceInputChannels

Encoder for PPLiteSeg, which includes a backbone followed by a context module.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class PPLiteSegEncoder(nn.Module, SupportsReplaceInputChannels):
    """
    Encoder for PPLiteSeg, include backbone followed by a context module.
    """

    def __init__(self, backbone: AbstractSTDCBackbone, projection_channels_list: List[int], context_module: nn.Module):
        super().__init__()
        self.backbone = backbone
        self.context_module = context_module
        feats_channels = backbone.get_backbone_output_number_of_channels()
        self.proj_convs = nn.ModuleList(
            [ConvBNReLU(feat_ch, proj_ch, kernel_size=3, padding=1, bias=False) for feat_ch, proj_ch in zip(feats_channels, projection_channels_list)]
        )
        self.projection_channels_list = projection_channels_list

    def get_output_number_of_channels(self) -> List[int]:
        channels_list = self.projection_channels_list
        if hasattr(self.context_module, "out_channels"):
            channels_list.append(self.context_module.out_channels)
        return channels_list

    def forward(self, x):
        feats = self.backbone(x)
        y = self.context_module(feats[-1])
        feats = [conv(f) for conv, f in zip(self.proj_convs, feats)]
        return feats + [y]

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        if isinstance(self.backbone, SupportsReplaceInputChannels):
            return self.backbone.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.backbone.__class__.__name__}` does not support `get_input_channels`")

UAFM

Bases: nn.Module

Unified Attention Fusion Module, which uses mean and max values across the spatial dimensions.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class UAFM(nn.Module):
    """
    Unified Attention Fusion Module, which uses mean and max values across the spatial dimensions.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
        align_corners: bool = False,
    ):
        """
        :param in_channels: num_channels of input feature map.
        :param skip_channels: num_channels of skip connection feature map.
        :param out_channels: num out channels after features fusion.
        :param up_factor: upsample scale factor of the input feature map.
        :param upsample_mode: see UpsampleMode for valid options.
        """
        super().__init__()
        self.conv_atten = nn.Sequential(
            ConvBNReLU(4, 2, kernel_size=3, padding=1, bias=False), ConvBNReLU(2, 1, kernel_size=3, padding=1, bias=False, use_activation=False)
        )

        self.proj_skip = nn.Identity() if skip_channels == in_channels else ConvBNReLU(skip_channels, in_channels, kernel_size=3, padding=1, bias=False)
        self.up_x = nn.Identity() if up_factor == 1 else make_upsample_module(scale_factor=up_factor, upsample_mode=upsample_mode, align_corners=align_corners)
        self.conv_out = ConvBNReLU(in_channels, out_channels, kernel_size=3, padding=1, bias=False)

    def forward(self, x, skip):
        """
        :param x: input feature map to upsample before fusion.
        :param skip: skip connection feature map.
        """
        x = self.up_x(x)
        skip = self.proj_skip(skip)

        atten = torch.cat([*self._avg_max_spatial_reduce(x, use_concat=False), *self._avg_max_spatial_reduce(skip, use_concat=False)], dim=1)
        atten = self.conv_atten(atten)
        atten = torch.sigmoid(atten)

        out = x * atten + skip * (1 - atten)
        out = self.conv_out(out)
        return out

    @staticmethod
    def _avg_max_spatial_reduce(x, use_concat: bool = False):
        reduced = [torch.mean(x, dim=1, keepdim=True), torch.max(x, dim=1, keepdim=True)[0]]
        if use_concat:
            reduced = torch.cat(reduced, dim=1)
        return reduced
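
A plain-torch sketch (editorial addition, not the library module) of the gating idea: channel-wise mean and max maps from each branch form a 4-channel attention input, and a learned sigmoid gate blends the two branches.

import torch

x = torch.randn(2, 64, 32, 32)     # upsampled input branch
skip = torch.randn(2, 64, 32, 32)  # skip branch, projected to the same width

def avg_max_reduce(t):
    # mean and max over the channel dimension -> two single-channel spatial maps
    return [t.mean(dim=1, keepdim=True), t.max(dim=1, keepdim=True)[0]]

atten_in = torch.cat(avg_max_reduce(x) + avg_max_reduce(skip), dim=1)
print(atten_in.shape)  # torch.Size([2, 4, 32, 32]) -- matches ConvBNReLU(4, 2, ...) above

atten = torch.sigmoid(torch.randn(2, 1, 32, 32))  # stand-in for the conv_atten output
fused = x * atten + skip * (1 - atten)            # convex combination of the two branches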

__init__(in_channels, skip_channels, out_channels, up_factor, upsample_mode=UpsampleMode.BILINEAR, align_corners=False)

Parameters:

Name Type Description Default
in_channels int

num_channels of input feature map.

required
skip_channels int

num_channels of skip connection feature map.

required
out_channels int

num out channels after features fusion.

required
up_factor int

upsample scale factor of the input feature map.

required
upsample_mode Union[UpsampleMode, str]

see UpsampleMode for valid options.

UpsampleMode.BILINEAR
Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def __init__(
    self,
    in_channels: int,
    skip_channels: int,
    out_channels: int,
    up_factor: int,
    upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
    align_corners: bool = False,
):
    """
    :param in_channels: num_channels of input feature map.
    :param skip_channels: num_channels of skip connection feature map.
    :param out_channels: num out channels after features fusion.
    :param up_factor: upsample scale factor of the input feature map.
    :param upsample_mode: see UpsampleMode for valid options.
    """
    super().__init__()
    self.conv_atten = nn.Sequential(
        ConvBNReLU(4, 2, kernel_size=3, padding=1, bias=False), ConvBNReLU(2, 1, kernel_size=3, padding=1, bias=False, use_activation=False)
    )

    self.proj_skip = nn.Identity() if skip_channels == in_channels else ConvBNReLU(skip_channels, in_channels, kernel_size=3, padding=1, bias=False)
    self.up_x = nn.Identity() if up_factor == 1 else make_upsample_module(scale_factor=up_factor, upsample_mode=upsample_mode, align_corners=align_corners)
    self.conv_out = ConvBNReLU(in_channels, out_channels, kernel_size=3, padding=1, bias=False)

forward(x, skip)

Parameters:

Name Type Description Default
x

input feature map to upsample before fusion.

required
skip

skip connection feature map.

required
Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def forward(self, x, skip):
    """
    :param x: input feature map to upsample before fusion.
    :param skip: skip connection feature map.
    """
    x = self.up_x(x)
    skip = self.proj_skip(skip)

    atten = torch.cat([*self._avg_max_spatial_reduce(x, use_concat=False), *self._avg_max_spatial_reduce(skip, use_concat=False)], dim=1)
    atten = self.conv_atten(atten)
    atten = torch.sigmoid(atten)

    out = x * atten + skip * (1 - atten)
    out = self.conv_out(out)
    return out

Implementation of the paper "Rethink Dilated Convolution for Real-time Semantic Segmentation", https://arxiv.org/pdf/2111.09957.pdf. Based on the original implementation https://github.com/RolandGao/RegSeg, cloned 23/12/2021, commit c07a833.

AdaptiveShortcutBlock

Bases: nn.Module

Adaptive shortcut makes the following adaptations, if needed: it applies pooling if stride > 1, and a 1x1 conv if in/out channels differ or if pooling was applied. If stride is 1 and in/out channels are the same, the shortcut is just an identity.

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class AdaptiveShortcutBlock(nn.Module):
    """
    Adaptive shortcut makes the following adaptations, if needed:
    Applying pooling if stride > 1
    Applying 1x1 conv if in/out channels are different or if pooling was applied
    If stride is 1 and in/out channels are the same, then the shortcut is just an identity
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int):
        super().__init__()
        shortcut_layers = [nn.Identity()]
        if stride != 1:
            shortcut_layers[0] = nn.AvgPool2d(stride, stride, ceil_mode=True)  # override the identity layer
        if in_channels != out_channels or stride != 1:
            shortcut_layers.append(ConvBNReLU(in_channels, out_channels, kernel_size=1, bias=False, use_activation=False))
        self.shortcut = nn.Sequential(*shortcut_layers)

    def forward(self, x):
        return self.shortcut(x)
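
A short usage sketch (editorial addition, import path assumed from the source path above) covering the identity and downsampling cases.

import torch
from super_gradients.training.models.segmentation_models.regseg import AdaptiveShortcutBlock

# Identity case: same channels, stride 1.
print(AdaptiveShortcutBlock(64, 64, stride=1)(torch.randn(1, 64, 32, 32)).shape)   # [1, 64, 32, 32]
# Downsampling case: AvgPool2d followed by a 1x1 projection.
print(AdaptiveShortcutBlock(64, 128, stride=2)(torch.randn(1, 64, 32, 32)).shape)  # [1, 128, 16, 16]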

DBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class DBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, dilations: List[int], group_width: int, stride: int, se_ratio: int = 4):
        """
        :param dilations:           a list specifying the required dilations.
                                    the input will be split into len(dilations) groups,
                                    group [i] will be convolved with grouped dilated (dilations[i]) convolution
        :param group_width:         the group width for the dilated convolution(s)
        :param se_ratio:            the ratio of the squeeze-and-excitation block w.r.t in_channels (as in the paper)
                                    for example: a value of 4 translates to in_channels // 4
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.dilations = dilations
        self.group_width = group_width
        self.stride = stride
        self.se_ratio = se_ratio
        self.shortcut = AdaptiveShortcutBlock(in_channels, out_channels, stride)
        groups = out_channels // group_width

        if len(dilations) == 1:  # minor optimization: no need to split if we only have 1 dilation group
            dilation = dilations[0]
            dilated_conv = nn.Conv2d(out_channels, out_channels, 3, stride=stride, groups=groups, padding=dilation, dilation=dilation, bias=False)
        else:
            dilated_conv = SplitDilatedGroupConvBlock(out_channels, dilations, group_width_per_split=group_width, stride=stride, bias=False)

        self.d_block_path = nn.Sequential(
            ConvBNReLU(in_channels, out_channels, kernel_size=1, bias=False),
            dilated_conv,
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            # the ratio of se block applied to `in_channels` as in the original paper
            SqueezeAndExcitationBlock(out_channels, in_channels // se_ratio),
            ConvBNReLU(out_channels, out_channels, 1, use_activation=False, bias=False),
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x1 = self.shortcut(x)
        x2 = self.d_block_path(x)
        out = self.relu(x1 + x2)
        return out

    def __str__(self):
        return (
            f"{self.__class__.__name__}_in{self.in_channels}_out{self.out_channels}" f"_d{self.dilations}_gw{self.group_width}_s{self.stride}_se{self.se_ratio}"
        )
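
A hedged usage sketch (editorial addition; import path and widths are assumptions): a strided D-block with two dilation groups halves the spatial size, and both the shortcut and the main path are downsampled before the residual sum.

import torch
from super_gradients.training.models.segmentation_models.regseg import DBlock

block = DBlock(in_channels=64, out_channels=128, dilations=[1, 2], group_width=16, stride=2, se_ratio=4)
out = block(torch.randn(1, 64, 64, 64))
print(out.shape)  # torch.Size([1, 128, 32, 32])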

__init__(in_channels, out_channels, dilations, group_width, stride, se_ratio=4)

Parameters:

Name Type Description Default
dilations List[int]

a list specifying the required dilations. the input will be split into len(dilations) groups, group [i] will be convolved with grouped dilated (dilations[i]) convolution

required
group_width int

the group width for the dilated convolution(s)

required
se_ratio int

the ratio of the squeeze-and-excitation block w.r.t in_channels (as in the paper) for example: a value of 4 translates to in_channels // 4

4
Source code in src/super_gradients/training/models/segmentation_models/regseg.py
def __init__(self, in_channels: int, out_channels: int, dilations: List[int], group_width: int, stride: int, se_ratio: int = 4):
    """
    :param dilations:           a list specifying the required dilations.
                                the input will be split into len(dilations) groups,
                                group [i] will be convolved with grouped dilated (dilations[i]) convolution
    :param group_width:         the group width for the dilated convolution(s)
    :param se_ratio:            the ratio of the squeeze-and-excitation block w.r.t in_channels (as in the paper)
                                for example: a value of 4 translates to in_channels // 4
    """
    super().__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.dilations = dilations
    self.group_width = group_width
    self.stride = stride
    self.se_ratio = se_ratio
    self.shortcut = AdaptiveShortcutBlock(in_channels, out_channels, stride)
    groups = out_channels // group_width

    if len(dilations) == 1:  # minor optimization: no need to split if we only have 1 dilation group
        dilation = dilations[0]
        dilated_conv = nn.Conv2d(out_channels, out_channels, 3, stride=stride, groups=groups, padding=dilation, dilation=dilation, bias=False)
    else:
        dilated_conv = SplitDilatedGroupConvBlock(out_channels, dilations, group_width_per_split=group_width, stride=stride, bias=False)

    self.d_block_path = nn.Sequential(
        ConvBNReLU(in_channels, out_channels, kernel_size=1, bias=False),
        dilated_conv,
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
        # the ratio of se block applied to `in_channels` as in the original paper
        SqueezeAndExcitationBlock(out_channels, in_channels // se_ratio),
        ConvBNReLU(out_channels, out_channels, 1, use_activation=False, bias=False),
    )
    self.relu = nn.ReLU(inplace=True)

RegSegDecoder

Bases: nn.Module

This implementation follows the paper. There is no repeating 'pattern' in this decoder, so it is specific to 3 stages.

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class RegSegDecoder(nn.Module):
    """
    This implementation follows the paper. No 'pattern' in this decoder, so it is specific to 3 stages
    """

    def __init__(self, backbone_output_channels: List[int], decoder_config: dict):
        super().__init__()
        projection_out_channels = decoder_config["projection_out_channels"]

        assert len(backbone_output_channels) == len(projection_out_channels) == 3, "This decoder is specific for 3 stages"

        self.projections = nn.ModuleList(
            [ConvBNReLU(in_channels, out_channels, 1, bias=False) for in_channels, out_channels in zip(backbone_output_channels, projection_out_channels)]
        )
        self.upsample = nn.Upsample(scale_factor=2, mode=decoder_config["interpolation"], align_corners=True)
        mid_channels = projection_out_channels[1]
        self.conv_bn_relu = ConvBNReLU(in_channels=mid_channels, out_channels=mid_channels // 2, kernel_size=3, padding=1, bias=False)
        self.out_channels = mid_channels // 2 + projection_out_channels[0]  # original implementation: concat

    def forward(self, x_stages):
        proj2 = self.projections[2](x_stages[2])
        proj2 = self.upsample(proj2)
        proj1 = self.projections[1](x_stages[1])
        proj1 = proj1 + proj2
        proj1 = self.conv_bn_relu(proj1)
        proj1 = self.upsample(proj1)
        proj0 = self.projections[0](x_stages[0])
        proj0 = torch.cat((proj1, proj0), dim=1)
        return proj0

SplitDilatedGroupConvBlock

Bases: nn.Module

Splits the input into "dilation groups", then applies a grouped convolution with a different dilation for each group.

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class SplitDilatedGroupConvBlock(nn.Module):
    """
    Splits the input to "dilation groups", following grouped convolution with different dilation for each group
    """

    def __init__(self, in_channels: int, split_dilations: List[int], group_width_per_split: int, stride: int, bias: bool):
        """
        :param split_dilations:         a list specifying the required dilations.
                                        the input will be split into len(dilations) groups,
                                        group [i] will be convolved with grouped dilated (dilations[i]) convolution
        :param group_width_per_split:   the group width for the *inner* dilated convolution
        """
        super().__init__()
        self.num_splits = len(split_dilations)
        assert in_channels % self.num_splits == 0, f"Cannot split {in_channels} to {self.num_splits} groups with equal size."
        group_channels = in_channels // self.num_splits
        assert group_channels % group_width_per_split == 0, (
            f"Cannot split {group_channels} channels ({in_channels} / {self.num_splits} splits)" f" to groups with {group_width_per_split} channels per group."
        )
        inner_groups = group_channels // group_width_per_split
        self.convs = nn.ModuleList(
            nn.Conv2d(group_channels, group_channels, 3, padding=d, dilation=d, stride=stride, bias=bias, groups=inner_groups) for d in split_dilations
        )
        self._splits = [in_channels // self.num_splits] * self.num_splits

    def forward(self, x):
        x = torch.split(x, self._splits, dim=1)
        return torch.cat([self.convs[i](x[i]) for i in range(self.num_splits)], dim=1)
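
A short sketch (editorial addition; import path and numbers are assumptions): 64 channels split into two dilation groups of 32 channels each, with an inner group width of 16; padding == dilation keeps the spatial size.

import torch
from super_gradients.training.models.segmentation_models.regseg import SplitDilatedGroupConvBlock

block = SplitDilatedGroupConvBlock(in_channels=64, split_dilations=[1, 2], group_width_per_split=16, stride=1, bias=False)
out = block(torch.randn(1, 64, 40, 40))
print(out.shape)  # torch.Size([1, 64, 40, 40])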

__init__(in_channels, split_dilations, group_width_per_split, stride, bias)

Parameters:

Name Type Description Default
split_dilations List[int]

a list specifying the required dilations. the input will be split into len(dilations) groups, group [i] will be convolved with grouped dilated (dilations[i]) convolution

required
group_width_per_split int

the group width for the inner dilated convolution

required
Source code in src/super_gradients/training/models/segmentation_models/regseg.py
def __init__(self, in_channels: int, split_dilations: List[int], group_width_per_split: int, stride: int, bias: bool):
    """
    :param split_dilations:         a list specifying the required dilations.
                                    the input will be split into len(dilations) groups,
                                    group [i] will be convolved with grouped dilated (dilations[i]) convolution
    :param group_width_per_split:   the group width for the *inner* dilated convolution
    """
    super().__init__()
    self.num_splits = len(split_dilations)
    assert in_channels % self.num_splits == 0, f"Cannot split {in_channels} to {self.num_splits} groups with equal size."
    group_channels = in_channels // self.num_splits
    assert group_channels % group_width_per_split == 0, (
        f"Cannot split {group_channels} channels ({in_channels} / {self.num_splits} splits)" f" to groups with {group_width_per_split} channels per group."
    )
    inner_groups = group_channels // group_width_per_split
    self.convs = nn.ModuleList(
        nn.Conv2d(group_channels, group_channels, 3, padding=d, dilation=d, stride=stride, bias=bias, groups=inner_groups) for d in split_dilations
    )
    self._splits = [in_channels // self.num_splits] * self.num_splits

EfficientSelfAttention

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py
class EfficientSelfAttention(nn.Module):
    def __init__(self, dim: int, head: int, sr_ratio: int):
        """
        Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf)
        :param dim: embedding dimension
        :param head: number of attention heads
        :param sr_ratio: the reduction ratio of the efficient self-attention
        """

        super().__init__()

        self.head = head
        self.sr_ratio = sr_ratio
        self.scale = (dim // head) ** -0.5
        self.q = nn.Linear(dim, dim)
        self.kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)

        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio)
            self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        b, n, c = x.shape
        q = self.q(x).reshape(b, n, self.head, c // self.head).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            x = x.permute(0, 2, 1).reshape(b, c, h, w)
            x = self.sr(x).reshape(b, c, -1).permute(0, 2, 1)
            x = self.norm(x)

        k, v = self.kv(x).reshape(b, -1, 2, self.head, c // self.head).permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        x = (attn @ v).transpose(1, 2).reshape(b, n, c)
        x = self.proj(x)
        return x
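
A shape-level sketch (editorial addition; import path and sizes are assumptions): queries are computed for all h*w tokens, while keys and values come from a sequence reduced by sr_ratio in each spatial dimension.

import torch
from super_gradients.training.models.segmentation_models.segformer import EfficientSelfAttention

attn = EfficientSelfAttention(dim=64, head=2, sr_ratio=4)
x = torch.randn(2, 16 * 16, 64)  # (batch, h*w tokens, dim)
out = attn(x, h=16, w=16)
print(out.shape)  # torch.Size([2, 256, 64])
# Keys/values are derived from only (16 / 4) * (16 / 4) = 16 tokens, which is the
# reduction that keeps the attention cheap at large spatial resolutions.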

__init__(dim, head, sr_ratio)

Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

Name Type Description Default
dim int

embedding dimension

required
head int

number of attention heads

required
sr_ratio int

the reduction ratio of the efficient self-attention

required
Source code in src/super_gradients/training/models/segmentation_models/segformer.py
def __init__(self, dim: int, head: int, sr_ratio: int):
    """
    Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf)
    :param dim: embedding dimension
    :param head: number of attention heads
    :param sr_ratio: the reduction ratio of the efficient self-attention
    """

    super().__init__()

    self.head = head
    self.sr_ratio = sr_ratio
    self.scale = (dim // head) ** -0.5
    self.q = nn.Linear(dim, dim)
    self.kv = nn.Linear(dim, dim * 2)
    self.proj = nn.Linear(dim, dim)

    if sr_ratio > 1:
        self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio)
        self.norm = nn.LayerNorm(dim)

EncoderBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py
class EncoderBlock(nn.Module):
    def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float):
        """
        A single encoder block (https://arxiv.org/pdf/2105.15203.pdf)
        :param dim: embedding dimension
        :param head: number of attention heads
        :param sr_ratio: the reduction ratio of the efficient self-attention
        :param dpr: drop-path ratio
        """

        super().__init__()

        self.attn = EfficientSelfAttention(dim, head, sr_ratio)

        self.drop_path = DropPath(dpr) if dpr > 0.0 else nn.Identity()

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.mlp = MixFFN(in_dim=dim, inter_dim=dim * 4)

    def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        x = x + self.drop_path(self.attn(self.norm1(x), h, w))
        x = x + self.drop_path(self.mlp(self.norm2(x), h, w))

        return x

__init__(dim, head, sr_ratio, dpr)

A single encoder block (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

Name Type Description Default
dim int

embedding dimension

required
head int

number of attention heads

required
sr_ratio int

the reduction ratio of the efficient self-attention

required
dpr float

drop-path ratio

required
Source code in src/super_gradients/training/models/segmentation_models/segformer.py
def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float):
    """
    A single encoder block (https://arxiv.org/pdf/2105.15203.pdf)
    :param dim: embedding dimension
    :param head: number of attention heads
    :param sr_ratio: the reduction ratio of the efficient self-attention
    :param dpr: drop-path ratio
    """

    super().__init__()

    self.attn = EfficientSelfAttention(dim, head, sr_ratio)

    self.drop_path = DropPath(dpr) if dpr > 0.0 else nn.Identity()

    self.norm1 = nn.LayerNorm(dim)
    self.norm2 = nn.LayerNorm(dim)

    self.mlp = MixFFN(in_dim=dim, inter_dim=dim * 4)

MLP

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py
class MLP(nn.Module):
    def __init__(self, dim: int, embed_dim: int):
        """
        A single Linear layer, with shape pre-processing
        :param dim: input dimension
        :param embed_dim: output dimension
        """

        super().__init__()

        self.proj = nn.Linear(dim, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)

        return x

__init__(dim, embed_dim)

A single Linear layer, with shape pre-processing

Parameters:

dim (int): input dimension. Required.
embed_dim (int): output dimension. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 270-279
def __init__(self, dim: int, embed_dim: int):
    """
    A single Linear layer, with shape pre-processing
    :param dim: input dimension
    :param embed_dim: output dimension
    """

    super().__init__()

    self.proj = nn.Linear(dim, embed_dim)
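
The forward pass (shown in the class listing above) flattens a (B, C, H, W) feature map into (B, H*W, C) tokens and projects each token to embed_dim. A torch-only sketch with illustrative sizes:

import torch
import torch.nn as nn

dim, embed_dim = 160, 256
proj = nn.Linear(dim, embed_dim)                 # same layer as self.proj above
feat = torch.randn(2, dim, 16, 16)
tokens = feat.flatten(2).transpose(1, 2)         # (2, 256, 160)
print(proj(tokens).shape)                        # torch.Size([2, 256, 256])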

MiTBackBone

Bases: nn.Module, SupportsReplaceInputChannels

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 163-264
class MiTBackBone(nn.Module, SupportsReplaceInputChannels):
    def __init__(
        self,
        embed_dims: List[int],
        encoder_layers: List[int],
        eff_self_att_reduction_ratio: List[int],
        eff_self_att_heads: List[int],
        overlap_patch_size: List[int],
        overlap_patch_stride: List[int],
        overlap_patch_pad: List[int],
        in_channels: int,
    ):
        """
        Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf)
        :param embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
        :param encoder_layers: the number of encoder layers in each encoder stage
        :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
        :param eff_self_att_heads: number of efficient self-attention heads in each stage
        :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
        :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
        :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
        :param in_channels:  number of input channels
        """

        super().__init__()

        if not (
            len(embed_dims)
            == len(encoder_layers)
            == len(eff_self_att_reduction_ratio)
            == len(eff_self_att_heads)
            == len(overlap_patch_size)
            == len(overlap_patch_stride)
            == len(overlap_patch_pad)
        ):
            raise ValueError("All backbone hyper-parameters should be lists of the same length")

        # Patch embeddings
        self.patch_embed = []
        for stage_num in range(len(embed_dims)):
            self.patch_embed.append(
                PatchEmbedding(
                    in_channels=in_channels if stage_num == 0 else embed_dims[stage_num - 1],
                    out_channels=embed_dims[stage_num],
                    patch_size=overlap_patch_size[stage_num],
                    stride=overlap_patch_stride[stage_num],
                    padding=overlap_patch_pad[stage_num],
                )
            )
            self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num])

        drop_path_rate = 0.1
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))]

        self.blocks = []
        self.norms = []

        layer_idx = 0
        for stage_num in range(len(embed_dims)):
            self.blocks.append(
                nn.ModuleList(
                    [
                        EncoderBlock(
                            dim=embed_dims[stage_num],
                            head=eff_self_att_heads[stage_num],
                            sr_ratio=eff_self_att_reduction_ratio[stage_num],
                            dpr=dpr[layer_idx + i],
                        )
                        for i in range(encoder_layers[stage_num])
                    ]
                )
            )
            self.norms.append(nn.LayerNorm(embed_dims[stage_num]))

            self.add_module(f"block{stage_num + 1}", self.blocks[stage_num])
            self.add_module(f"norm{stage_num + 1}", self.norms[stage_num])

            layer_idx += encoder_layers[stage_num]

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        b_size = x.shape[0]

        features = []
        for stage_num in range(len(self.patch_embed)):
            x, h, w = self.patch_embed[stage_num](x)

            for enc_block in self.blocks[stage_num]:
                x = enc_block(x, h, w)
            x = self.norms[stage_num](x)
            x = x.reshape(b_size, h, w, -1).permute(0, 3, 1, 2)

            features.append(x)

        return features

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        first_patch: PatchEmbedding = self.patch_embed[0]
        first_patch.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        first_patch: PatchEmbedding = self.patch_embed[0]
        return first_patch.get_input_channels()

__init__(embed_dims, encoder_layers, eff_self_att_reduction_ratio, eff_self_att_heads, overlap_patch_size, overlap_patch_stride, overlap_patch_pad, in_channels)

Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

embed_dims (List[int]): the patch embedding dimensions (number of output channels in each encoder stage). Required.
encoder_layers (List[int]): the number of encoder layers in each encoder stage. Required.
eff_self_att_reduction_ratio (List[int]): the reduction ratios of the efficient self-attention in each stage. Required.
eff_self_att_heads (List[int]): number of efficient self-attention heads in each stage. Required.
overlap_patch_size (List[int]): the patch size of the overlapping patch embedding in each stage. Required.
overlap_patch_stride (List[int]): the patch stride of the overlapping patch embedding in each stage. Required.
overlap_patch_pad (List[int]): the patch padding of the overlapping patch embedding in each stage. Required.
in_channels (int): number of input channels. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 164-240
def __init__(
    self,
    embed_dims: List[int],
    encoder_layers: List[int],
    eff_self_att_reduction_ratio: List[int],
    eff_self_att_heads: List[int],
    overlap_patch_size: List[int],
    overlap_patch_stride: List[int],
    overlap_patch_pad: List[int],
    in_channels: int,
):
    """
    Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf)
    :param embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
    :param encoder_layers: the number of encoder layers in each encoder stage
    :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
    :param eff_self_att_heads: number of efficient self-attention heads in each stage
    :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
    :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
    :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
    :param in_channels:  number of input channels
    """

    super().__init__()

    if not (
        len(embed_dims)
        == len(encoder_layers)
        == len(eff_self_att_reduction_ratio)
        == len(eff_self_att_heads)
        == len(overlap_patch_size)
        == len(overlap_patch_stride)
        == len(overlap_patch_pad)
    ):
        raise ValueError("All backbone hyper-parameters should be lists of the same length")

    # Patch embeddings
    self.patch_embed = []
    for stage_num in range(len(embed_dims)):
        self.patch_embed.append(
            PatchEmbedding(
                in_channels=in_channels if stage_num == 0 else embed_dims[stage_num - 1],
                out_channels=embed_dims[stage_num],
                patch_size=overlap_patch_size[stage_num],
                stride=overlap_patch_stride[stage_num],
                padding=overlap_patch_pad[stage_num],
            )
        )
        self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num])

    drop_path_rate = 0.1
    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))]

    self.blocks = []
    self.norms = []

    layer_idx = 0
    for stage_num in range(len(embed_dims)):
        self.blocks.append(
            nn.ModuleList(
                [
                    EncoderBlock(
                        dim=embed_dims[stage_num],
                        head=eff_self_att_heads[stage_num],
                        sr_ratio=eff_self_att_reduction_ratio[stage_num],
                        dpr=dpr[layer_idx + i],
                    )
                    for i in range(encoder_layers[stage_num])
                ]
            )
        )
        self.norms.append(nn.LayerNorm(embed_dims[stage_num]))

        self.add_module(f"block{stage_num + 1}", self.blocks[stage_num])
        self.add_module(f"norm{stage_num + 1}", self.norms[stage_num])

        layer_idx += encoder_layers[stage_num]
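
A construction sketch, assuming the class is importable from the module path shown above; the hyper-parameter values follow the common MiT-B0 configuration and are illustrative only (one entry per encoder stage):

import torch
from super_gradients.training.models.segmentation_models.segformer import MiTBackBone

backbone = MiTBackBone(
    embed_dims=[32, 64, 160, 256],
    encoder_layers=[2, 2, 2, 2],
    eff_self_att_reduction_ratio=[8, 4, 2, 1],
    eff_self_att_heads=[1, 2, 5, 8],
    overlap_patch_size=[7, 3, 3, 3],
    overlap_patch_stride=[4, 2, 2, 2],
    overlap_patch_pad=[3, 1, 1, 1],
    in_channels=3,
)

# Four multi-scale feature maps at strides 4, 8, 16 and 32:
for f in backbone(torch.randn(2, 3, 512, 512)):
    print(tuple(f.shape))
# (2, 32, 128, 128), (2, 64, 64, 64), (2, 160, 32, 32), (2, 256, 16, 16)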

MixFFN

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 108-132
class MixFFN(nn.Module):
    def __init__(self, in_dim: int, inter_dim: int):
        """
        MixFFN block (https://arxiv.org/pdf/2105.15203.pdf)
        :param in_dim: input dimension
        :param inter_dim: intermediate dimension
        """

        super().__init__()

        self.fc1 = nn.Linear(in_dim, inter_dim)
        self.dwconv = nn.Conv2d(in_channels=inter_dim, out_channels=inter_dim, kernel_size=3, stride=1, padding=1, groups=inter_dim)
        self.fc2 = nn.Linear(inter_dim, in_dim)

    def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        x = self.fc1(x)

        b, _, c = x.shape
        x = x.transpose(1, 2).view(b, c, h, w)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)

        x = self.fc2(F.gelu(x))

        return x

__init__(in_dim, inter_dim)

MixFFN block (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

in_dim (int): input dimension. Required.
inter_dim (int): intermediate dimension. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 109-120
def __init__(self, in_dim: int, inter_dim: int):
    """
    MixFFN block (https://arxiv.org/pdf/2105.15203.pdf)
    :param in_dim: input dimension
    :param inter_dim: intermediate dimension
    """

    super().__init__()

    self.fc1 = nn.Linear(in_dim, inter_dim)
    self.dwconv = nn.Conv2d(in_channels=inter_dim, out_channels=inter_dim, kernel_size=3, stride=1, padding=1, groups=inter_dim)
    self.fc2 = nn.Linear(inter_dim, in_dim)
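
The notable detail of Mix-FFN is the depthwise 3x3 convolution between the two linear layers: tokens are reshaped back to a spatial grid so neighbouring positions get mixed, which is what lets SegFormer drop explicit positional encodings. A torch-only sketch of that round trip (illustrative sizes):

import torch
import torch.nn as nn

b, h, w, inter_dim = 1, 32, 32, 256
dwconv = nn.Conv2d(inter_dim, inter_dim, kernel_size=3, padding=1, groups=inter_dim)

tokens = torch.randn(b, h * w, inter_dim)
grid = tokens.transpose(1, 2).view(b, inter_dim, h, w)   # tokens -> (B, C, H, W)
mixed = dwconv(grid).flatten(2).transpose(1, 2)          # back to (B, H*W, C)
print(mixed.shape)                                       # torch.Size([1, 1024, 256])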

PatchEmbedding

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 31-62
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int):
        """
        Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf)
        :param in_channels: number of input channels
        :param out_channels: number of output channels (embedding dimension)
        :param patch_size: patch size (k for size (k, k))
        :param stride: patch stride (k for size (k, k))
        :param padding:  patch padding (k for size (k, k))
        """

        super().__init__()

        self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding)
        self.norm = nn.LayerNorm(out_channels)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
        x = self.proj(x)
        _, _, h, w = x.shape

        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, h, w

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.proj = replace_conv2d_input_channels(conv=self.proj, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.proj.in_channels

__init__(in_channels, out_channels, patch_size, stride, padding)

Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

in_channels (int): number of input channels. Required.
out_channels (int): number of output channels (embedding dimension). Required.
patch_size (int): patch size (k for size (k, k)). Required.
stride (int): patch stride (k for size (k, k)). Required.
padding (int): patch padding (k for size (k, k)). Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 32-45
def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int):
    """
    Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf)
    :param in_channels: number of input channels
    :param out_channels: number of output channels (embedding dimension)
    :param patch_size: patch size (k for size (k, k))
    :param stride: patch stride (k for size (k, k))
    :param padding:  patch padding (k for size (k, k))
    """

    super().__init__()

    self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding)
    self.norm = nn.LayerNorm(out_channels)
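
Overlapped patch merging is just a strided convolution followed by token flattening. A torch-only sketch with typical first-stage settings (patch_size=7, stride=4, padding=3), used here purely as an illustration:

import torch
import torch.nn as nn

proj = nn.Conv2d(3, 32, kernel_size=7, stride=4, padding=3)   # same layer as self.proj above
x = proj(torch.randn(1, 3, 512, 512))                         # (1, 32, 128, 128)
tokens = x.flatten(2).transpose(1, 2)                         # (1, 16384, 32), then LayerNorm
print(tokens.shape)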

SegFormer

Bases: SegmentationModule

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 325-474
class SegFormer(SegmentationModule):
    def __init__(
        self,
        num_classes: int,
        encoder_embed_dims: List[int],
        encoder_layers: List[int],
        eff_self_att_reduction_ratio: List[int],
        eff_self_att_heads: List[int],
        decoder_embed_dim: int,
        overlap_patch_size: List[int],
        overlap_patch_stride: List[int],
        overlap_patch_pad: List[int],
        in_channels: int = 3,
        sliding_window_crop_size: Tuple[int, int] = (1024, 1024),
        sliding_window_stride: Tuple[int, int] = (768, 768),
    ):
        """
        :param num_classes: number of classes
        :param encoder_embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
        :param encoder_layers: the number of encoder layers in each encoder stage
        :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
        :param eff_self_att_heads: number of efficient self-attention heads in each stage
        :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
        :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
        :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
        :param in_channels:  number of input channels
        :param sliding_window_crop_size:  (height, width) the crop size to take from the image for forward with sliding window
        :param sliding_window_stride:  (height, width) the stride size between crops for forward with sliding window

        """

        super().__init__(use_aux_heads=False)

        self.encoder_embed_dims = encoder_embed_dims

        self.decoder_embed_dim = decoder_embed_dim

        self._backbone = MiTBackBone(
            embed_dims=encoder_embed_dims,
            encoder_layers=encoder_layers,
            eff_self_att_reduction_ratio=eff_self_att_reduction_ratio,
            eff_self_att_heads=eff_self_att_heads,
            overlap_patch_size=overlap_patch_size,
            overlap_patch_stride=overlap_patch_stride,
            overlap_patch_pad=overlap_patch_pad,
            in_channels=in_channels,
        )

        self.decode_head = SegFormerHead(encoder_dims=encoder_embed_dims, embed_dim=decoder_embed_dim, num_classes=num_classes)

        self.init_params()

        self.num_classes = num_classes

        self.use_sliding_window_validation = False
        self.sliding_window_crop_size = tuple(sliding_window_crop_size)
        self.sliding_window_stride = tuple(sliding_window_stride)

    def init_params(self):

        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def enable_sliding_window_validation(self):
        self.use_sliding_window_validation = True

    def disable_sliding_window_validation(self):
        self.use_sliding_window_validation = False

    @property
    def backbone(self):
        return self._backbone

    def _remove_auxiliary_heads(self):
        pass

    def replace_head(self, new_num_classes: int):
        self.decode_head = SegFormerHead(encoder_dims=self.encoder_embed_dims, embed_dim=self.decoder_embed_dim, num_classes=new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {}

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        features = self._backbone(x)
        out = self.decode_head(features)
        out = F.interpolate(out, size=x.shape[2:], mode="bilinear", align_corners=False)
        return out

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.use_sliding_window_validation:
            return forward_with_sliding_window_wrapper(
                forward=self._forward,
                img=x,
                sliding_window_stride=self.sliding_window_stride,
                sliding_window_crop_size=self.sliding_window_crop_size,
                num_classes=self.num_classes,
            )
        else:
            return self._forward(x)

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
        - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]
        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate backbone params from the rest.
        :return: iterators of groups named_parameters.
        """
        backbone_names = [n for n, p in self.backbone.named_parameters()]
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if any([backbone_name in name for backbone_name in backbone_names]):
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self._backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self._backbone.get_input_channels()

__init__(num_classes, encoder_embed_dims, encoder_layers, eff_self_att_reduction_ratio, eff_self_att_heads, decoder_embed_dim, overlap_patch_size, overlap_patch_stride, overlap_patch_pad, in_channels=3, sliding_window_crop_size=(1024, 1024), sliding_window_stride=(768, 768))

Parameters:

num_classes (int): number of classes. Required.
encoder_embed_dims (List[int]): the patch embedding dimensions (number of output channels in each encoder stage). Required.
encoder_layers (List[int]): the number of encoder layers in each encoder stage. Required.
eff_self_att_reduction_ratio (List[int]): the reduction ratios of the efficient self-attention in each stage. Required.
eff_self_att_heads (List[int]): number of efficient self-attention heads in each stage. Required.
overlap_patch_size (List[int]): the patch size of the overlapping patch embedding in each stage. Required.
overlap_patch_stride (List[int]): the patch stride of the overlapping patch embedding in each stage. Required.
overlap_patch_pad (List[int]): the patch padding of the overlapping patch embedding in each stage. Required.
in_channels (int): number of input channels. Default: 3.
sliding_window_crop_size (Tuple[int, int]): (height, width) the crop size to take from the image for forward with sliding window. Default: (1024, 1024).
sliding_window_stride (Tuple[int, int]): (height, width) the stride size between crops for forward with sliding window. Default: (768, 768).
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 326-381
def __init__(
    self,
    num_classes: int,
    encoder_embed_dims: List[int],
    encoder_layers: List[int],
    eff_self_att_reduction_ratio: List[int],
    eff_self_att_heads: List[int],
    decoder_embed_dim: int,
    overlap_patch_size: List[int],
    overlap_patch_stride: List[int],
    overlap_patch_pad: List[int],
    in_channels: int = 3,
    sliding_window_crop_size: Tuple[int, int] = (1024, 1024),
    sliding_window_stride: Tuple[int, int] = (768, 768),
):
    """
    :param num_classes: number of classes
    :param encoder_embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
    :param encoder_layers: the number of encoder layers in each encoder stage
    :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
    :param eff_self_att_heads: number of efficient self-attention heads in each stage
    :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
    :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
    :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
    :param in_channels:  number of input channels
    :param sliding_window_crop_size:  (height, width) the crop size to take from the image for forward with sliding window
    :param sliding_window_stride:  (height, width) the stride size between crops for forward with sliding window

    """

    super().__init__(use_aux_heads=False)

    self.encoder_embed_dims = encoder_embed_dims

    self.decoder_embed_dim = decoder_embed_dim

    self._backbone = MiTBackBone(
        embed_dims=encoder_embed_dims,
        encoder_layers=encoder_layers,
        eff_self_att_reduction_ratio=eff_self_att_reduction_ratio,
        eff_self_att_heads=eff_self_att_heads,
        overlap_patch_size=overlap_patch_size,
        overlap_patch_stride=overlap_patch_stride,
        overlap_patch_pad=overlap_patch_pad,
        in_channels=in_channels,
    )

    self.decode_head = SegFormerHead(encoder_dims=encoder_embed_dims, embed_dim=decoder_embed_dim, num_classes=num_classes)

    self.init_params()

    self.num_classes = num_classes

    self.use_sliding_window_validation = False
    self.sliding_window_crop_size = tuple(sliding_window_crop_size)
    self.sliding_window_stride = tuple(sliding_window_stride)
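
A construction sketch, assuming the class is importable from the module path shown above; the hyper-parameters mirror the B0-style values used elsewhere on this page and are illustrative only:

import torch
from super_gradients.training.models.segmentation_models.segformer import SegFormer

model = SegFormer(
    num_classes=19,
    encoder_embed_dims=[32, 64, 160, 256],
    encoder_layers=[2, 2, 2, 2],
    eff_self_att_reduction_ratio=[8, 4, 2, 1],
    eff_self_att_heads=[1, 2, 5, 8],
    decoder_embed_dim=256,
    overlap_patch_size=[7, 3, 3, 3],
    overlap_patch_stride=[4, 2, 2, 2],
    overlap_patch_pad=[3, 1, 1, 1],
)

logits = model(torch.randn(1, 3, 512, 512))
print(logits.shape)                      # torch.Size([1, 19, 512, 512]), upsampled to the input size

# For large validation images, predictions can instead be stitched from overlapping crops
# of sliding_window_crop_size taken every sliding_window_stride pixels:
model.enable_sliding_window_validation()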

initialize_param_groups(lr, training_params)

Custom param groups for training:
- Different lr for backbone and the rest, if multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 435-446
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
    - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]
    return param_groups
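
Continuing the construction sketch above, the snippet below shows how multiply_head_lr splits the parameters into two groups; the HpmStruct import path is an assumption to verify against your installed version:

from super_gradients.training.utils import HpmStruct

# `model` is the SegFormer instance from the construction sketch above.
training_params = HpmStruct(multiply_head_lr=10.0)
param_groups = model.initialize_param_groups(lr=1e-4, training_params=training_params)
for group in param_groups:
    print(group["name"], group["lr"])
# no_multiply_params 0.0001   (backbone)
# multiply_lr_params 0.001    (decode head, 10x the base lr)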

SegFormerB0

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 536-546
@register_model(Models.SEGFORMER_B0)
class SegFormerB0(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B0 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B0 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 538-546
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B0 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)
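
In practice the B0-B5 variants are usually built through the model factory rather than instantiated directly. A minimal sketch, assuming the standard models.get entry point and the Models name registry:

from super_gradients.common.object_names import Models
from super_gradients.training import models

# Builds SegFormer-B0 from its default arch_params recipe, overriding only the class count.
model = models.get(Models.SEGFORMER_B0, num_classes=19)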

SegFormerB1

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 549-559
@register_model(Models.SEGFORMER_B1)
class SegFormerB1(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B1 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B1 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 551-559
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B1 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB2

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 562-572
@register_model(Models.SEGFORMER_B2)
class SegFormerB2(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B2 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B2 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 564-572
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B2 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB3

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 575-585
@register_model(Models.SEGFORMER_B3)
class SegFormerB3(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B3 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B3 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 577-585
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B3 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB4

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 588-598
@register_model(Models.SEGFORMER_B4)
class SegFormerB4(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B4 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B4 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 590-598
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B4 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB5

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 601-611
@register_model(Models.SEGFORMER_B5)
class SegFormerB5(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B5 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B5 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 603-611
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B5 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerCustom

Bases: SegFormer

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 477-497
class SegFormerCustom(SegFormer):
    def __init__(self, arch_params: HpmStruct):
        """
        Parse arch_params and translate the parameters to build the SegFormer architecture
        :param arch_params: architecture parameters
        """

        super().__init__(
            num_classes=arch_params.num_classes,
            encoder_embed_dims=arch_params.encoder_embed_dims,
            encoder_layers=arch_params.encoder_layers,
            eff_self_att_reduction_ratio=arch_params.eff_self_att_reduction_ratio,
            eff_self_att_heads=arch_params.eff_self_att_heads,
            decoder_embed_dim=arch_params.decoder_embed_dim,
            overlap_patch_size=arch_params.overlap_patch_size,
            overlap_patch_stride=arch_params.overlap_patch_stride,
            overlap_patch_pad=arch_params.overlap_patch_pad,
            in_channels=arch_params.in_channels,
            sliding_window_crop_size=arch_params.sliding_window_crop_size,
            sliding_window_stride=arch_params.sliding_window_stride,
        )

__init__(arch_params)

Parse arch_params and translate the parameters to build the SegFormer architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 478-497
def __init__(self, arch_params: HpmStruct):
    """
    Parse arch_params and translate the parameters to build the SegFormer architecture
    :param arch_params: architecture parameters
    """

    super().__init__(
        num_classes=arch_params.num_classes,
        encoder_embed_dims=arch_params.encoder_embed_dims,
        encoder_layers=arch_params.encoder_layers,
        eff_self_att_reduction_ratio=arch_params.eff_self_att_reduction_ratio,
        eff_self_att_heads=arch_params.eff_self_att_heads,
        decoder_embed_dim=arch_params.decoder_embed_dim,
        overlap_patch_size=arch_params.overlap_patch_size,
        overlap_patch_stride=arch_params.overlap_patch_stride,
        overlap_patch_pad=arch_params.overlap_patch_pad,
        in_channels=arch_params.in_channels,
        sliding_window_crop_size=arch_params.sliding_window_crop_size,
        sliding_window_stride=arch_params.sliding_window_stride,
    )

SegFormerHead

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 288-321
class SegFormerHead(nn.Module):
    def __init__(self, encoder_dims: List[int], embed_dim: int, num_classes: int):
        """
        SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf)
        :param encoder_dims: list of encoder embedding dimensions
        :param embed_dim: unified embedding dimension
        :param num_classes: number of predicted classes
        """
        super().__init__()

        self.linear_layers = []
        for idx, dim in enumerate(encoder_dims):
            self.linear_layers.append(MLP(dim, embed_dim))
            self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx])

        self.linear_fuse = ConvBNReLU(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim, kernel_size=1, bias=False, inplace=True)
        self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1)

        self.dropout = nn.Dropout2d(0.1)

    def forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        b, _, h, w = features[0].shape

        out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])]

        for i, feature in enumerate(features[1:]):
            out = self.linear_layers[i + 1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:])
            out = F.interpolate(out, size=(h, w), mode="bilinear", align_corners=False)
            out_lst.append(out)

        out = self.linear_fuse(torch.cat(out_lst[::-1], dim=1))
        out = self.linear_pred(self.dropout(out))

        return out

__init__(encoder_dims, embed_dim, num_classes)

SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

encoder_dims (List[int]): list of encoder embedding dimensions. Required.
embed_dim (int): unified embedding dimension. Required.
num_classes (int): number of predicted classes. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 289-306
def __init__(self, encoder_dims: List[int], embed_dim: int, num_classes: int):
    """
    SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf)
    :param encoder_dims: list of encoder embedding dimensions
    :param embed_dim: unified embedding dimension
    :param num_classes: number of predicted classes
    """
    super().__init__()

    self.linear_layers = []
    for idx, dim in enumerate(encoder_dims):
        self.linear_layers.append(MLP(dim, embed_dim))
        self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx])

    self.linear_fuse = ConvBNReLU(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim, kernel_size=1, bias=False, inplace=True)
    self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1)

    self.dropout = nn.Dropout2d(0.1)
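
A shape sketch for the head, assuming the class is importable from the module path shown above; the four input feature maps mimic a B0-style backbone output and are illustrative only:

import torch
from super_gradients.training.models.segmentation_models.segformer import SegFormerHead

head = SegFormerHead(encoder_dims=[32, 64, 160, 256], embed_dim=256, num_classes=19)

# Each map is projected to embed_dim, upsampled to the highest resolution, fused and classified.
feats = [torch.randn(1, c, s, s) for c, s in zip([32, 64, 160, 256], [128, 64, 32, 16])]
print(head(feats).shape)                 # torch.Size([1, 19, 128, 128])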

SegmentationModule

Bases: SgModule, ABC, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel

Base SegmentationModule class

Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 16-147
class SegmentationModule(SgModule, ABC, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel):
    """
    Base SegmentationModule class
    """

    def __init__(self, use_aux_heads: bool):
        super().__init__()
        self._use_aux_heads = use_aux_heads

        # Processing params
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None

    @property
    def use_aux_heads(self):
        return self._use_aux_heads

    @use_aux_heads.setter
    def use_aux_heads(self, use_aux: bool):
        """
        public setter for self._use_aux_heads, called every time an assignment to self.use_aux_heads is applied.
        if use_aux is False, `_remove_auxiliary_heads` is called to delete auxiliary and detail heads.
        if use_aux is True, and self._use_aux_heads was already set to False a ValueError is raised, recreating
            aux and detail heads outside init method is not allowed, and the module should be recreated.
        """
        if use_aux is True and self._use_aux_heads is False:
            raise ValueError(
                "Cant turn use_aux_heads from False to True. Try initiating the module again with"
                " `use_aux_heads=True` or initiating the auxiliary heads modules manually."
            )
        if not use_aux:
            self._remove_auxiliary_heads()
        self._use_aux_heads = use_aux

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        # set to false and delete auxiliary and detail heads modules.
        self.use_aux_heads = False

    @abstractmethod
    def _remove_auxiliary_heads(self):
        raise NotImplementedError()

    @property
    @abstractmethod
    def backbone(self) -> nn.Module:
        """
        For SgTrainer load_backbone compatibility.
        """
        raise NotImplementedError()

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(self, fuse_model: bool = True, fp16: bool = True) -> SegmentationPipeline:
        """Instantiate the segmentation pipeline of this model.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        pipeline = SegmentationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
        """Predict an image or a list of images.
        :param images:  Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:                        If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
        """Predict using webcam.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        pipeline.predict_webcam()

    def get_input_shape_steps(self) -> Tuple[int, int]:
        """
        Returns the minimum input shape size that the model can accept.
        For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
        """
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        """
        Returns the minimum input shape size that the model can accept.
        For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
        """
        return 32, 32

    def get_processing_params(self):
        return self._image_processor

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

backbone: nn.Module abstractmethod property

For SgTrainer load_backbone compatibility.

get_input_shape_steps()

Returns the minimum input shape size that the model can accept. For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model

Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 127-132
def get_input_shape_steps(self) -> Tuple[int, int]:
    """
    Returns the minimum input shape size that the model can accept.
    For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
    """
    return 32, 32

get_minimum_input_shape_size()

Returns the minimum input shape size that the model can accept. For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model

Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 134-139
def get_minimum_input_shape_size(self) -> Tuple[int, int]:
    """
    Returns the minimum input shape size that the model can accept.
    For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
    """
    return 32, 32

predict(images, batch_size=32, fuse_model=True, fp16=True)

Predict an image or a list of images.

Parameters:

images (ImageSource): Images to predict. Required.
batch_size (int): Maximum number of images to process at the same time. Default: 32.
fuse_model (bool): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. Default: True.
fp16 (bool): If True, use mixed precision for inference. Default: True.
Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 109-117
def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
    """Predict an image or a list of images.
    :param images:  Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:                        If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore
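
A minimal usage sketch, assuming the model already carries its class names and image processor (either restored from a trained checkpoint or set via set_dataset_processing_params); the image paths are placeholders:

predictions = model.predict(["/path/to/img_1.jpg", "/path/to/img_2.jpg"], batch_size=8, fp16=False)
predictions.show()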

predict_webcam(fuse_model=True, fp16=True)

Predict using webcam.

Parameters:

fuse_model (bool): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. Default: True.
fp16 (bool): If True, use mixed precision for inference. Default: True.
Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 119-125
def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
    """Predict using webcam.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    pipeline.predict_webcam()

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

class_names (Optional[List[str]]): (Optional) Names of the dataset the model was trained on. Default: None.
image_processor (Optional[Processing]): (Optional) Image processing objects to reproduce the dataset preprocessing used for training. Default: None.
Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 76-88
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor

Shelfnet

Paper: https://arxiv.org/abs/1811.11254
Based on: https://github.com/juntang-zhuang/ShelfNet

DecoderHW

Bases: DecoderBase

DecoderHW - The Decoder for the Heavy-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 184-214
class DecoderHW(DecoderBase):
    """
    DecoderHW - The Decoder for the Heavy-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(
                nn.ConvTranspose2d(
                    planes * 2 ** (layers - 1 - i), planes * 2 ** max(0, layers - i - 2), kernel_size=3, stride=2, padding=1, output_padding=1, bias=True
                )
            )
            self.up_dense_list.append(block(planes * 2 ** max(0, layers - i - 2), planes * 2 ** max(0, layers - i - 2)))

    def forward(self, x):
        # BOTTOM BRANCH
        out = self.bottom(x[-1])
        bottom = out

        # UP BRANCH
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out) + x[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

DecoderLW

Bases: DecoderBase

DecoderLW - The Decoder for the Light-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 217-245
class DecoderLW(DecoderBase):
    """
    DecoderLW - The Decoder for the Light-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(AttentionRefinementModule(planes * 2 ** (layers - 1 - i), planes * 2 ** max(0, layers - i - 2)))
            self.up_dense_list.append(ConvBNReLU(in_chan=planes * 2 ** max(0, layers - i - 2), out_chan=planes * 2 ** max(0, layers - i - 2), ks=3, stride=1))

    def forward(self, x):
        # BOTTOM BRANCH
        out = self.bottom(x[-1])
        bottom = out

        # UP BRANCH
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out)
            out_interpolate = F.interpolate(out, (out.size(2) * 2, out.size(3) * 2), mode="nearest")
            out = out_interpolate + x[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

LadderBlockHW

Bases: LadderBlockBase

LadderBlockHW - LadderBlock for the Heavy-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 306-349
class LadderBlockHW(LadderBlockBase):
    """
    LadderBlockHW - LadderBlock for the Heavy-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(
                nn.ConvTranspose2d(
                    planes * 2 ** (layers - i - 1), planes * 2 ** max(0, layers - i - 2), kernel_size=3, stride=2, padding=1, output_padding=1, bias=True
                )
            )

            self.up_dense_list.append(block(planes * 2 ** max(0, layers - i - 2), planes * 2 ** max(0, layers - i - 2)))

    def forward(self, x):
        out = self.inconv(x[-1])

        down_out = []
        # down branch
        for i in range(0, self.layers - 1):
            out = out + x[-i - 1]
            out = self.down_module_list[i](out)
            down_out.append(out)

            out = self.down_conv_list[i](out)
            out = F.relu(out)

        # bottom branch
        out = self.bottom(out)
        bottom = out

        # up branch
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out) + down_out[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

LadderBlockLW

Bases: LadderBlockBase

LadderBlockLW - LadderBlock for the Light-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 352-391
class LadderBlockLW(LadderBlockBase):
    """
    LadderBlockLW - LadderBlock for the Light-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(AttentionRefinementModule(planes * 2 ** (layers - 1 - i), planes * 2 ** max(0, layers - i - 2)))
            self.up_dense_list.append(ConvBNReLU(in_chan=planes * 2 ** max(0, layers - i - 2), out_chan=planes * 2 ** max(0, layers - i - 2), ks=3, stride=1))

    def forward(self, x):
        out = self.inconv(x[-1])

        down_out = []
        # DOWN BRANCH
        for i in range(0, self.layers - 1):
            out = out + x[-i - 1]
            out = self.down_module_list[i](out)
            down_out.append(out)

            out = self.down_conv_list[i](out)
            out = F.relu(out)

        # BOTTOM BRANCH
        out = self.bottom(out)
        bottom = out

        # UP BRANCH
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out)
            out = F.interpolate(out, (out.size(2) * 2, out.size(3) * 2), mode="nearest") + down_out[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

ShelfBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfBlock(nn.Module):
    def __init__(self, in_planes: int, planes: int, stride: int = 1, dropout: float = 0.25):
        """
        S-Block implementation from the ShelfNet paper
            :param in_planes:   input planes
            :param planes:      output planes
            :param stride:      convolution stride
            :param dropout:     dropout percentage
        """
        super().__init__()
        if in_planes != planes:
            self.conv0 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=1, padding=1, bias=True)
            self.relu0 = nn.ReLU(inplace=True)

        self.in_planes = in_planes
        self.planes = planes

        self.conv1 = nn.Conv2d(self.planes, self.planes, kernel_size=3, stride=stride, padding=1, bias=True)
        self.bn1 = nn.BatchNorm2d(self.planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout2d(p=dropout)
        self.bn2 = nn.BatchNorm2d(self.planes)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        if self.in_planes != self.planes:
            x = self.conv0(x)
            x = self.relu0(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.dropout(out)
        out = self.conv1(out)
        out = self.bn2(out)
        out = out + x

        return self.relu2(out)

__init__(in_planes, planes, stride=1, dropout=0.25)

S-Block implementation from the ShelfNet paper

:param in_planes:   input planes
:param planes:      output planes
:param stride:      convolution stride
:param dropout:     dropout percentage

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def __init__(self, in_planes: int, planes: int, stride: int = 1, dropout: float = 0.25):
    """
    S-Block implementation from the ShelfNet paper
        :param in_planes:   input planes
        :param planes:      output planes
        :param stride:      convolution stride
        :param dropout:     dropout percentage
    """
    super().__init__()
    if in_planes != planes:
        self.conv0 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=1, padding=1, bias=True)
        self.relu0 = nn.ReLU(inplace=True)

    self.in_planes = in_planes
    self.planes = planes

    self.conv1 = nn.Conv2d(self.planes, self.planes, kernel_size=3, stride=stride, padding=1, bias=True)
    self.bn1 = nn.BatchNorm2d(self.planes)
    self.relu1 = nn.ReLU(inplace=True)
    self.dropout = nn.Dropout2d(p=dropout)
    self.bn2 = nn.BatchNorm2d(self.planes)
    self.relu2 = nn.ReLU(inplace=True)
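
A minimal, hedged usage sketch (values and import path are illustrative assumptions, the path follows the source location above): with stride=1 the block keeps the spatial size and maps in_planes to planes through the projection conv0 before the residual path. Note that conv1 is applied twice in forward, i.e. the two convolution steps intentionally share weights.

import torch
from super_gradients.training.models.segmentation_models.shelfnet import ShelfBlock  # path per the source location above (assumption)

block = ShelfBlock(in_planes=64, planes=128, stride=1, dropout=0.25)
x = torch.randn(1, 64, 32, 32)
y = block(x)
print(y.shape)   # torch.Size([1, 128, 32, 32])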

ShelfNetBase

Bases: ShelfNetModuleBase

ShelfNetBase - ShelfNet Base Generic Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetBase(ShelfNetModuleBase):
    """
    ShelfNetBase - ShelfNet Base Generic Architecture
    """

    def __init__(
        self,
        backbone: ShelfResNetBackBone,
        planes: int,
        layers: int,
        num_classes: int = 21,
        image_size: int = 512,
        net_output_mid_channels_num: int = 64,
        arch_params: HpmStruct = None,
    ):
        self.num_classes = arch_params.num_classes if (arch_params and hasattr(arch_params, "num_classes")) else num_classes
        self.image_size = arch_params.image_size if (arch_params and hasattr(arch_params, "image_size")) else image_size

        super().__init__()
        self.net_output_mid_channels_num = net_output_mid_channels_num
        self.backbone = backbone(self.num_classes)
        self.layers = layers
        self.planes = planes

        # INITIALIZE WITH AUXILARY HEAD OUTPUTS ONN -> TURN IT OFF TO RUN A FORWARD PASS WITHOUT THE AUXILARY HEADS
        self.auxilary_head_outputs = True

        # DECODER AND LADDER SHOULD BE IMPLEMENTED BY THE INHERITING CLASS
        self.decoder = None
        self.ladder = None

        # BUILD THE CONV_OUT LIST BASED ON THE AMOUNT OF LAYERS IN THE SHELFNET
        self.conv_out_list = torch.nn.ModuleList()

    def forward(self, x):
        raise NotImplementedError

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        """
        update_optimizer_for_param_groups - Updates the specific parameters with different LR
        """
        # LEARNING RATE FOR THE BACKBONE IS lr
        param_groups[0]["lr"] = lr
        for i in range(1, len(param_groups)):
            # LEARNING RATE FOR OTHER SHELFNET PARAMS IS lr * 10
            param_groups[i]["lr"] = lr * 10

        return param_groups

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()

update_param_groups(param_groups, lr, epoch, iter, training_params, total_batch)

update_optimizer_for_param_groups - Updates the specific parameters with different LR

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
    """
    update_optimizer_for_param_groups - Updates the specific parameters with different LR
    """
    # LEARNING RATE FOR THE BACKBONE IS lr
    param_groups[0]["lr"] = lr
    for i in range(1, len(param_groups)):
        # LEARNING RATE FOR OTHER SHELFNET PARAMS IS lr * 10
        param_groups[i]["lr"] = lr * 10

    return param_groups
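
As a hedged illustration of the learning-rate policy encoded above (group names and values are assumptions for the example, not library defaults): the first parameter group is the backbone and keeps the base lr, while every other ShelfNet group is scheduled at 10x that value.

# Hedged sketch of the 10x LR split applied by update_param_groups.
lr = 0.1
param_groups = [
    {"name": "backbone", "lr": None},
    {"name": "decoder_ladder_heads", "lr": None},
]
param_groups[0]["lr"] = lr              # backbone trains at the base LR
for group in param_groups[1:]:
    group["lr"] = lr * 10               # remaining ShelfNet groups train at 10x the base LR
print(param_groups)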

ShelfNetHW

Bases: ShelfNetBase

ShelfNetHW - Heavy-Weight Version of ShelfNet

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetHW(ShelfNetBase):
    """
    ShelfNetHW - Heavy-Weight Version of ShelfNet
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ladder = LadderBlockHW(planes=self.net_output_mid_channels_num, layers=self.layers)
        self.decoder = DecoderHW(planes=self.net_output_mid_channels_num, layers=self.layers)
        self.se_layer = nn.Linear(self.net_output_mid_channels_num * 2**3, self.num_classes)
        self.aux_head = FCNHead(1024, self.num_classes)
        self.final = nn.Conv2d(self.net_output_mid_channels_num, self.num_classes, 1)

        # THE MID CHANNELS NUMBER OF THE NET OUTPUT BLOCK
        net_out_planes = self.planes
        mid_channels_num = self.net_output_mid_channels_num

        # INITIALIZE THE conv_out_list
        for i in range(self.layers):
            self.conv_out_list.append(ConvBNReLU(in_chan=net_out_planes, out_chan=mid_channels_num, ks=1, padding=0))

            mid_channels_num *= 2
            net_out_planes *= 2

    def forward(self, x):
        image_size = x.size()[2:]

        backbone_features_list = list(self.backbone(x))
        conv_bn_relu_results_list = []

        for feature, conv_bn_relu in zip(backbone_features_list, self.conv_out_list):
            out = conv_bn_relu(feature)
            conv_bn_relu_results_list.append(out)

        decoder_out_list = self.decoder(conv_bn_relu_results_list)
        ladder_out_list = self.ladder(decoder_out_list)

        preds = [self.final(ladder_out_list[-1])]

        # SE_LOSS ENCODING
        enc = F.max_pool2d(ladder_out_list[0], kernel_size=ladder_out_list[0].size()[2:])
        enc = torch.squeeze(enc, -1)
        enc = torch.squeeze(enc, -1)
        se = self.se_layer(enc)
        preds.append(se)

        # UP SAMPLING THE TOP LAYER FOR PREDICTION
        preds[0] = F.interpolate(preds[0], image_size, mode="bilinear", align_corners=True)

        # AUXILARY HEAD OUTPUT (ONLY RELEVANT FOR LOSS CALCULATION) - USE self.auxilary_head_outputs=FALSE FOR INFERENCE
        if self.auxilary_head_outputs or self.training:
            aux_out = self.aux_head(backbone_features_list[2])
            aux_out = F.interpolate(aux_out, image_size, mode="bilinear", align_corners=True)
            preds.append(aux_out)

            return tuple(preds)
        else:
            return preds[0]

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        initialize_optimizer_for_model_param_groups - Initializes the weights of the optimizer
                                                      Initializes the Backbone, the Output and the Auxilary Head
                                                      differently
            :param optimizer_cls:   The nn.optim (optimizer class) to initialize
            :param lr:              lr to set for the optimizer
            :param training_params:
            :return: list of dictionaries with named params and optimizer attributes
        """
        # OPTIMIZER PARAMETER GROUPS
        params_list = []

        # OPTIMIZE BACKBONE USING DIFFERENT LR
        params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

        # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
        params_list.append(
            {
                "named_params": list(self.ladder.named_parameters())
                + list(self.decoder.named_parameters())
                + list(self.se_layer.named_parameters())
                + list(self.conv_out_list.named_parameters())
                + list(self.final.named_parameters())
                + list(self.aux_head.named_parameters()),
                "lr": lr * 10,
            }
        )

        return params_list

initialize_param_groups(lr, training_params)

initialize_optimizer_for_model_param_groups - Initializes the optimizer parameter groups,
treating the Backbone, the Output and the Auxiliary Head differently.

:param optimizer_cls:   The nn.optim (optimizer class) to initialize
:param lr:              lr to set for the optimizer
:param training_params:
:return: list of dictionaries with named params and optimizer attributes

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    initialize_optimizer_for_model_param_groups - Initializes the weights of the optimizer
                                                  Initializes the Backbone, the Output and the Auxilary Head
                                                  differently
        :param optimizer_cls:   The nn.optim (optimizer class) to initialize
        :param lr:              lr to set for the optimizer
        :param training_params:
        :return: list of dictionaries with named params and optimizer attributes
    """
    # OPTIMIZER PARAMETER GROUPS
    params_list = []

    # OPTIMIZE BACKBONE USING DIFFERENT LR
    params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

    # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
    params_list.append(
        {
            "named_params": list(self.ladder.named_parameters())
            + list(self.decoder.named_parameters())
            + list(self.se_layer.named_parameters())
            + list(self.conv_out_list.named_parameters())
            + list(self.final.named_parameters())
            + list(self.aux_head.named_parameters()),
            "lr": lr * 10,
        }
    )

    return params_list

ShelfNetLW

Bases: ShelfNetBase

ShelfNetLW - Light-Weight Implementation for ShelfNet

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetLW(ShelfNetBase):
    """
    ShelfNetLW - Light-Weight Implementation for ShelfNet
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.net_output_list = nn.ModuleList()
        self.ladder = LadderBlockLW(planes=self.planes, layers=self.layers)
        self.decoder = DecoderLW(planes=self.planes, layers=self.layers)

    def forward(self, x):
        H, W = x.size()[2:]

        # SHELFNET LW ARCHITECTURE USES ONLY LAST 3 PARTIAL OUTPUTs OF THE BACKBONE'S 4 OUTPUT LAYERS
        backbone_features_tuple = self.backbone(x)[1:]

        if isinstance(self, ShelfNet18_LW):
            # FOR SHELFNET18 USE 1x1 CONVS AFTER THE BACKBONE'S FORWARD PASS TO MANIPULATE THE CHANNELS FOR THE DECODER
            conv_bn_relu_results_list = []

            for feature, conv_bn_relu in zip(backbone_features_tuple, self.conv_out_list):
                out = conv_bn_relu(feature)
                conv_bn_relu_results_list.append(out)

        else:
            # FOR SHELFNET34 THE CHANNELS ARE ALREADY ALIGNED
            conv_bn_relu_results_list = list(backbone_features_tuple)

        decoder_out_list = self.decoder(conv_bn_relu_results_list)
        ladder_out_list = self.ladder(decoder_out_list)

        # GET THE LAST ELEMENTS OF THE LADDER_BLOCK BASED ON THE AMOUNT OF SHELVES IN THE ARCHITECTURE AND REVERSE LIST
        feat_cp_list = list(reversed(ladder_out_list[(-1 * self.layers) :]))

        feat_out = self.net_output_list[0](feat_cp_list[0])
        feat_out = F.interpolate(feat_out, (H, W), mode="bilinear", align_corners=True)

        if self.auxilary_head_outputs or self.training:
            features_out_list = [feat_out]
            for conv_output_layer, feat_cp in zip(self.net_output_list[1:], feat_cp_list[1:]):
                feat_out_res = conv_output_layer(feat_cp)
                feat_out_res = F.interpolate(feat_out_res, (H, W), mode="bilinear", align_corners=True)
                features_out_list.append(feat_out_res)

            return tuple(features_out_list)

        else:
            # THIS DOES NOT CALCULATE THE AUXILARY HEADS THAT ARE CRITICAL FOR THE LOSS (USED MAINLY FOR INFERENCE)
            return feat_out

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, with 10x learning rate
                                                      for all but the backbone

            :param lr:              lr to set for the backbone
            :param training_params:
            :return: list of dictionaries with named params and optimizer attributes
        """
        # OPTIMIZER PARAMETER GROUPS
        params_list = []

        # OPTIMIZE BACKBONE USING DIFFERENT LR
        params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

        # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
        params_list.append(
            {
                "named_params": list(self.ladder.named_parameters()) + list(self.decoder.named_parameters()) + list(self.conv_out_list.named_parameters()),
                "lr": lr * 10,
            }
        )

        return params_list

initialize_param_groups(lr, training_params)

initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, with 10x learning rate for all but the backbone

:param lr:              lr to set for the backbone
:param training_params:
:return: list of dictionaries with named params and optimizer attributes
Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, with 10x learning rate
                                                  for all but the backbone

        :param lr:              lr to set for the backbone
        :param training_params:
        :return: list of dictionaries with named params and optimizer attributes
    """
    # OPTIMIZER PARAMETER GROUPS
    params_list = []

    # OPTIMIZE BACKBONE USING DIFFERENT LR
    params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

    # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
    params_list.append(
        {
            "named_params": list(self.ladder.named_parameters()) + list(self.decoder.named_parameters()) + list(self.conv_out_list.named_parameters()),
            "lr": lr * 10,
        }
    )

    return params_list

ShelfNetModuleBase

Bases: SgModule

ShelfNetModuleBase - Base class for the different Modules of the ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetModuleBase(SgModule):
    """
    ShelfNetModuleBase - Base class for the different Modules of the ShelfNet Architecture
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        raise NotImplementedError

    def get_params(self):
        wd_params, nowd_params = [], []
        for name, module in self.named_modules():
            if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params += list(module.parameters())
        return wd_params, nowd_params
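
The get_params split above is typically used to exempt biases and BatchNorm parameters from weight decay. A hedged usage sketch follows; the model variable is a placeholder for any ShelfNetModuleBase subclass, and the optimizer settings are illustrative assumptions.

import torch

wd_params, nowd_params = model.get_params()        # `model`: placeholder for any ShelfNetModuleBase subclass
optimizer = torch.optim.SGD(
    [
        {"params": wd_params, "weight_decay": 5e-4},   # conv / linear weights get weight decay
        {"params": nowd_params, "weight_decay": 0.0},  # biases and BatchNorm params do not
    ],
    lr=0.01,
    momentum=0.9,
)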

ShelfResNetBackBone

Bases: ResNet

ShelfResNetBackBone - Inherits from the original ResNet class and modifies the forward pass to create a backbone for the ShelfNet architecture, returning multi-scale feature maps

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfResNetBackBone(ResNet):
    """
    ShelfResNetBackBone - A class that Inherits from the original ResNet class and manipulates the forward pass,
                          to create a backbone for the ShelfNet architecture
    """

    def __init__(self, block, num_blocks, num_classes=10, width_mult=1):
        super().__init__(block=block, num_blocks=num_blocks, num_classes=num_classes, width_mult=width_mult, backbone_mode=True)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.maxpool(out)
        feat4 = self.layer1(out)  # 1/4
        feat8 = self.layer2(feat4)  # 1/8
        feat16 = self.layer3(feat8)  # 1/16
        feat32 = self.layer4(feat16)  # 1/32
        return feat4, feat8, feat16, feat32
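
As indicated by the inline comments, the backbone returns feature maps at strides 4, 8, 16 and 32. A small hedged sketch of the resulting spatial sizes for an example 512x512 input (the arithmetic below is illustrative, not a guarantee for every configuration):

# Spatial sizes of (feat4, feat8, feat16, feat32) for a 512x512 input.
height = width = 512                      # example input size (assumption)
for stride in (4, 8, 16, 32):
    print(f"stride {stride}: {height // stride} x {width // stride}")
# stride 4: 128 x 128
# stride 8: 64 x 64
# stride 16: 32 x 32
# stride 32: 16 x 16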

Implementation of paper: "Rethinking BiSeNet For Real-time Semantic Segmentation", https://arxiv.org/abs/2104.13188
Based on original implementation: https://github.com/MichaelFan01/STDC-Seg, cloned 23/08/2021, commit 59ff37f

AbstractSTDCBackbone

Bases: nn.Module, SupportsReplaceInputChannels, ABC

All backbones for STDC segmentation models must implement this class.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class AbstractSTDCBackbone(nn.Module, SupportsReplaceInputChannels, ABC):
    """
    All backbones for STDC segmentation models must implement this class.
    """

    def validate_backbone(self):
        if len(self.get_backbone_output_number_of_channels()) != 3:
            raise ValueError(f"Backbone for STDC segmentation must output 3 feature maps," f" found: {len(self.get_backbone_output_number_of_channels())}.")

    @abstractmethod
    def get_backbone_output_number_of_channels(self) -> List[int]:
        """
        :return: list on stages num channels.
        """
        raise NotImplementedError()

get_backbone_output_number_of_channels() abstractmethod

Returns:

Type Description
List[int]

List of the number of output channels for each backbone stage.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
@abstractmethod
def get_backbone_output_number_of_channels(self) -> List[int]:
    """
    :return: list on stages num channels.
    """
    raise NotImplementedError()
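
A minimal, hedged sketch (not part of the library) of a custom backbone implementing this interface: forward must return exactly three feature maps and get_backbone_output_number_of_channels must report their widths, otherwise validate_backbone raises a ValueError. The class name, layer choices and trivial input-channel methods below are illustrative assumptions; the import path follows the source location above.

import torch.nn as nn
from super_gradients.training.models.segmentation_models.stdc import AbstractSTDCBackbone  # path per the source location above (assumption)


class TinySTDCBackbone(AbstractSTDCBackbone):        # hypothetical example backbone
    def __init__(self):
        super().__init__()
        # three downsampling stages producing feature maps at strides 8, 16 and 32
        self.stage8 = nn.Conv2d(3, 64, kernel_size=3, stride=8, padding=1)
        self.stage16 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.stage32 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        feat8 = self.stage8(x)
        feat16 = self.stage16(feat8)
        feat32 = self.stage32(feat16)
        return feat8, feat16, feat32

    def get_backbone_output_number_of_channels(self):
        return [64, 128, 256]

    # Kept trivial for the sketch; required by the SupportsReplaceInputChannels base listed above.
    def get_input_channels(self) -> int:
        return self.stage8.in_channels

    def replace_input_channels(self, in_channels, compute_new_weights_fn=None):
        raise NotImplementedError("out of scope for this sketch")


backbone = TinySTDCBackbone()
backbone.validate_backbone()                          # passes: exactly 3 output channel widths are reported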

AttentionRefinementModule

Bases: nn.Module

AttentionRefinementModule to apply on the last two backbone stages.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class AttentionRefinementModule(nn.Module):
    """
    AttentionRefinementModule to apply on the last two backbone stages.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super(AttentionRefinementModule, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.conv_first = ConvBNReLU(in_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.attention_block = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), ConvBNReLU(out_channels, out_channels, kernel_size=1, bias=False, use_activation=False), nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_first(x)
        y = self.attention_block(x)
        return torch.mul(x, y)
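
A hedged usage sketch (channel and spatial sizes are illustrative assumptions; the import path follows the source location above): the module first projects to out_channels with a 3x3 convolution, then re-weights the result channel-wise with a sigmoid-gated global descriptor, so the spatial size is preserved.

import torch
from super_gradients.training.models.segmentation_models.stdc import AttentionRefinementModule  # assumed import path

arm = AttentionRefinementModule(in_channels=512, out_channels=128)
feat = torch.randn(1, 512, 16, 16)
refined = arm(feat)
print(refined.shape)    # torch.Size([1, 128, 16, 16])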

ContextEmbedding

Bases: nn.Module

ContextEmbedding module that uses global average pooling down to 1x1 to extract context information, and then upsamples back to the original input size.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class ContextEmbedding(nn.Module):
    """
    ContextEmbedding module that use global average pooling to 1x1 to extract context information, and then upsample
    to original input size.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super(ContextEmbedding, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.context_embedding = nn.Sequential(nn.AdaptiveAvgPool2d(1), ConvBNReLU(in_channels, out_channels, kernel_size=1, stride=1, bias=False))
        self.fixed_size = False

    def forward(self, x):
        out_height, out_width = x.size()[2:]
        x = self.context_embedding(x)
        return F.interpolate(x, size=(out_height, out_width), mode="nearest")

    def to_fixed_size(self, upsample_size: Union[list, tuple]):
        if self.fixed_size:
            return
        self.fixed_size = True

        self.context_embedding.add_module("upsample", nn.Upsample(scale_factor=upsample_size, mode="nearest"))

        self.forward = self.context_embedding.forward
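
A hedged usage sketch (sizes and import path are assumptions, the path follows the source location above): the global context is pooled to 1x1, embedded with a 1x1 ConvBNReLU, and broadcast back to the input resolution by nearest-neighbour upsampling, so the output keeps the input's spatial size but with out_channels channels.

import torch
from super_gradients.training.models.segmentation_models.stdc import ContextEmbedding  # assumed import path

ce = ContextEmbedding(in_channels=1024, out_channels=128)
feat32 = torch.randn(1, 1024, 16, 16)
context = ce(feat32)
print(context.shape)    # torch.Size([1, 128, 16, 16])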

ContextPath

Bases: nn.Module

ContextPath in STDC outputs both the spatial path and the context path. This module includes an STDCBackbone and outputs the stage3 feature map with down_ratio = 8 as the spatial feature map, together with a context feature map obtained by upsampling and fusing the context embedding, stage5 and stage4 after the ARM modules; the context feature map has the same resolution as the spatial feature map, down_ratio = 8.

Parameters:

Name Type Description Default
backbone AbstractSTDCBackbone

Backbone of type AbstractSTDCBackbone that returns info about the backbone output channels.

required
fuse_channels int

num channels of the fused context path.

required
use_aux_heads bool

Set True when training to output extra auxiliary feature maps of the two last stages of the backbone.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class ContextPath(nn.Module):
    """
    ContextPath in STDC output both the Spatial path and Context path. This module include a STDCBackbone and output
    the stage3 feature map with down_ratio = 8 as the spatial feature map, and context feature map which is a result of
    upsampling and fusion of context embedding, stage5 and stage4 after Arm modules, Which is also with same resolution
    of the spatial feature map, down_ration = 8.
    :param backbone: Backbone of type AbstractSTDCBackbone that return info about backbone output channels.
    :param fuse_channels: num channels of the fused context path.
    :param use_aux_heads: set True when training, output extra Auxiliary feature maps of the two last stages of the
     backbone.
    """

    def __init__(self, backbone: AbstractSTDCBackbone, fuse_channels: int, use_aux_heads: bool):
        super(ContextPath, self).__init__()

        self.fuse_channels = fuse_channels
        self.use_aux_heads = use_aux_heads

        self.backbone = backbone
        # get num of channels for two last stages
        channels16, channels32 = self.backbone.get_backbone_output_number_of_channels()[-2:]

        self.context_embedding = ContextEmbedding(channels32, fuse_channels)

        self.arm32 = AttentionRefinementModule(channels32, fuse_channels)
        self.upsample32 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode="nearest"), ConvBNReLU(fuse_channels, fuse_channels, kernel_size=3, padding=1, stride=1, bias=False)
        )

        self.arm16 = AttentionRefinementModule(channels16, fuse_channels)
        self.upsample16 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode="nearest"), ConvBNReLU(fuse_channels, fuse_channels, kernel_size=3, padding=1, stride=1, bias=False)
        )

    def forward(self, x):
        feat8, feat16, feat32 = self.backbone(x)

        ce_feats = self.context_embedding(feat32)
        feat32_arm = self.arm32(feat32)
        feat32_arm = feat32_arm + ce_feats

        feat32_up = self.upsample32(feat32_arm)

        feat16_arm = self.arm16(feat16)
        feat16_arm = feat16_arm + feat32_up
        feat16_up = self.upsample16(feat16_arm)

        if self.use_aux_heads:
            return feat8, feat16_up, feat16, feat32
        return feat8, feat16_up

    def prep_for_conversion(self, input_size):
        if input_size[-2] % 32 != 0 or input_size[-1] % 32 != 0:
            raise ValueError(f"Expected image dimensions to be divisible by 32, got {input_size[-2]}x{input_size[-1]}")

        context_embedding_up_size = (input_size[-2] // 32, input_size[-1] // 32)
        self.context_embedding.to_fixed_size(context_embedding_up_size)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()
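
prep_for_conversion requires input dimensions divisible by 32, because the context embedding is frozen to the 1/32 feature-map size. A hedged sketch of that arithmetic (the input size is an illustrative assumption):

# The fixed upsample size handed to ContextEmbedding.to_fixed_size is the 1/32 resolution.
input_size = (1, 3, 512, 768)                                    # example NCHW shape (assumption)
assert input_size[-2] % 32 == 0 and input_size[-1] % 32 == 0     # otherwise prep_for_conversion raises a ValueError
context_embedding_up_size = (input_size[-2] // 32, input_size[-1] // 32)
print(context_embedding_up_size)    # (16, 24)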

CustomSTDCSegmentation

Bases: STDCSegmentationBase

Fully customized STDC Segmentation factory module.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
@register_model(Models.STDC_CUSTOM)
@register_model("custom_stdc")  # deprecated naming convention. will be dropped in v4
class CustomSTDCSegmentation(STDCSegmentationBase):
    """
    Fully customized STDC Segmentation factory module.
    """

    def __init__(self, arch_params: HpmStruct):
        super().__init__(
            backbone=get_param(arch_params, "backbone"),
            num_classes=get_param(arch_params, "num_classes"),
            context_fuse_channels=get_param(arch_params, "context_fuse_channels", 128),
            ffm_channels=get_param(arch_params, "ffm_channels", 256),
            aux_head_channels=get_param(arch_params, "aux_head_channels", 64),
            detail_head_channels=get_param(arch_params, "detail_head_channels", 64),
            use_aux_heads=get_param(arch_params, "use_aux_heads", True),
            dropout=get_param(arch_params, "dropout", 0.2),
        )
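
A hedged construction sketch. The parameter names follow the get_param calls above; the backbone variable is a placeholder for any AbstractSTDCBackbone implementation, and the HpmStruct import path and chosen values are assumptions for illustration, not verified defaults beyond those shown in the code.

from super_gradients.training.utils import HpmStruct   # assumed import path
from super_gradients.training.models.segmentation_models.stdc import CustomSTDCSegmentation  # assumed import path

arch_params = HpmStruct(
    backbone=backbone,            # any AbstractSTDCBackbone instance (placeholder)
    num_classes=19,               # example class count (assumption)
    context_fuse_channels=128,
    ffm_channels=256,
    aux_head_channels=64,
    detail_head_channels=64,
    use_aux_heads=True,
    dropout=0.2,
)
model = CustomSTDCSegmentation(arch_params)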

FeatureFusionModule

Bases: nn.Module

Fuses features from the higher-resolution spatial feature map with features from the lower-resolution, semantically richer context feature map.

Parameters:

Name Type Description Default
spatial_channels int

num channels of input from spatial path.

required
context_channels int

num channels of input from context path.

required
out_channels int

num channels of feature fusion module.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class FeatureFusionModule(nn.Module):
    """
    Fuse features from higher resolution aka, spatial feature map with features from lower resolution with high
     semantic information aka, context feature map.
    :param spatial_channels: num channels of input from spatial path.
    :param context_channels: num channels of input from context path.
    :param out_channels: num channels of feature fusion module.
    """

    def __init__(self, spatial_channels: int, context_channels: int, out_channels: int):
        super(FeatureFusionModule, self).__init__()
        self.spatial_channels = spatial_channels
        self.context_channels = context_channels
        self.out_channels = out_channels

        self.pw_conv = ConvBNReLU(spatial_channels + context_channels, out_channels, kernel_size=1, stride=1, bias=False)
        # TODO - used without bias in convolutions by mistake, try to reproduce with bias=True
        self.attention_block = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            ConvBNReLU(in_channels=out_channels, out_channels=out_channels // 4, kernel_size=1, use_normalization=False, bias=False),
            nn.Conv2d(in_channels=out_channels // 4, out_channels=out_channels, kernel_size=1, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, spatial_feats, context_feats):
        feat = torch.cat([spatial_feats, context_feats], dim=1)
        feat = self.pw_conv(feat)
        atten = self.attention_block(feat)
        feat_atten = torch.mul(feat, atten)
        feat_out = feat_atten + feat
        return feat_out
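
A hedged usage sketch (channel counts, sizes and import path are illustrative assumptions): the spatial and context inputs must share the same spatial resolution, since they are concatenated along the channel dimension before the 1x1 fusion convolution and the channel-attention re-weighting.

import torch
from super_gradients.training.models.segmentation_models.stdc import FeatureFusionModule  # assumed import path

ffm = FeatureFusionModule(spatial_channels=256, context_channels=128, out_channels=256)
spatial_feats = torch.randn(1, 256, 64, 64)   # stride-8 spatial path features
context_feats = torch.randn(1, 128, 64, 64)   # stride-8 context path features
fused = ffm(spatial_feats, context_feats)
print(fused.shape)    # torch.Size([1, 256, 64, 64])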

STDCBackbone

Bases: AbstractSTDCBackbone

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCBackbone(AbstractSTDCBackbone):
    def __init__(
        self,
        block_types: list,
        ch_widths: list,
        num_blocks: list,
        stdc_steps: int = 4,
        stdc_downsample_mode: str = "avg_pool",
        in_channels: int = 3,
        out_down_ratios: Union[tuple, list] = (32,),
    ):
        """
        :param block_types: list of block type for each stage, supported `conv` for ConvBNRelu with 3x3 kernel.
        :param ch_widths: list of output num of channels for each stage.
        :param num_blocks: list of the number of repeating blocks in each stage.
        :param stdc_steps: num of convs steps in each block.
        :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
         `dw_conv` for depthwise-convolution.
        :param in_channels: num channels of the input image.
        :param out_down_ratios: down ratio of output feature maps required from the backbone,
            default (32,) for classification.
        """
        super(STDCBackbone, self).__init__()
        if not (len(block_types) == len(ch_widths) == len(num_blocks)):
            raise ValueError(
                f"STDC architecture configuration, block_types, ch_widths, num_blocks, must be defined for the same number"
                f" of stages, found: {len(block_types)} for block_type, {len(ch_widths)} for ch_widths, "
                f"{len(num_blocks)} for num_blocks"
            )

        self.out_widths = []
        self.stages = nn.ModuleDict()
        self.out_stage_keys = []
        down_ratio = 2
        for block_type, width, blocks in zip(block_types, ch_widths, num_blocks):
            block_name = f"block_s{down_ratio}"
            self.stages[block_name] = self._make_stage(
                in_channels=in_channels,
                out_channels=width,
                block_type=block_type,
                num_blocks=blocks,
                stdc_steps=stdc_steps,
                stdc_downsample_mode=stdc_downsample_mode,
            )
            if down_ratio in out_down_ratios:
                self.out_stage_keys.append(block_name)
                self.out_widths.append(width)
            in_channels = width
            down_ratio *= 2

    def _make_stage(self, in_channels: int, out_channels: int, block_type: str, num_blocks: int, stdc_downsample_mode: str, stdc_steps: int = 4):
        """
        :param in_channels: input channels of stage.
        :param out_channels: output channels of stage.
        :param block_type: stage building block, supported `conv` for 3x3 ConvBNRelu, or `stdc` for STDCBlock.
        :param num_blocks: num of blocks in each stage.
        :param stdc_steps: number of conv3x3 steps in each STDC block, referred as `num blocks` in paper.
        :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
         `dw_conv` for depthwise-convolution.
        :return: nn.Module
        """
        if block_type == "conv":
            block = ConvBNReLU
            kwargs = {"kernel_size": 3, "padding": 1, "bias": False}
        elif block_type == "stdc":
            block = STDCBlock
            kwargs = {"steps": stdc_steps, "stdc_downsample_mode": stdc_downsample_mode}
        else:
            raise ValueError(f"Block type not supported: {block_type}, excepted: `conv` or `stdc`")

        # first block to apply stride 2.
        blocks = nn.ModuleList([block(in_channels, out_channels, stride=2, **kwargs)])
        # build rest of blocks
        for i in range(num_blocks - 1):
            blocks.append(block(out_channels, out_channels, stride=1, **kwargs))

        return nn.Sequential(*blocks)

    def forward(self, x):
        outputs = []
        for stage_name, stage in self.stages.items():
            x = stage(x)
            if stage_name in self.out_stage_keys:
                outputs.append(x)
        return tuple(outputs)

    def get_backbone_output_number_of_channels(self) -> List[int]:
        return self.out_widths

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.module_interfaces import SupportsReplaceInputChannels

        first_stage: nn.Sequential = next(iter(self.stages.values()))  # noqa
        first_block = first_stage[0]

        if isinstance(first_block, SupportsReplaceInputChannels):
            first_block.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        else:
            raise NotImplementedError(f"`{first_block.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        first_stage: nn.Sequential = next(iter(self.stages.values()))  # noqa
        first_block = first_stage[0]
        if isinstance(first_block, SupportsReplaceInputChannels):
            return first_block.get_input_channels()
        else:
            raise NotImplementedError(f"`{first_block.__class__.__name__}` does not support `get_input_channels`")

__init__(block_types, ch_widths, num_blocks, stdc_steps=4, stdc_downsample_mode='avg_pool', in_channels=3, out_down_ratios=(32,))

Parameters:

Name Type Description Default
block_types list

list of block type for each stage, supported conv for ConvBNRelu with 3x3 kernel.

required
ch_widths list

list of output num of channels for each stage.

required
num_blocks list

list of the number of repeating blocks in each stage.

required
stdc_steps int

num of convs steps in each block.

4
stdc_downsample_mode str

downsample mode in stdc block, supported avg_pool for average-pooling and dw_conv for depthwise-convolution.

'avg_pool'
in_channels int

num channels of the input image.

3
out_down_ratios Union[tuple, list]

down ratio of output feature maps required from the backbone, default (32,) for classification.

(32,)
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
def __init__(
    self,
    block_types: list,
    ch_widths: list,
    num_blocks: list,
    stdc_steps: int = 4,
    stdc_downsample_mode: str = "avg_pool",
    in_channels: int = 3,
    out_down_ratios: Union[tuple, list] = (32,),
):
    """
    :param block_types: list of block type for each stage, supported `conv` for ConvBNRelu with 3x3 kernel.
    :param ch_widths: list of output num of channels for each stage.
    :param num_blocks: list of the number of repeating blocks in each stage.
    :param stdc_steps: num of convs steps in each block.
    :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
     `dw_conv` for depthwise-convolution.
    :param in_channels: num channels of the input image.
    :param out_down_ratios: down ratio of output feature maps required from the backbone,
        default (32,) for classification.
    """
    super(STDCBackbone, self).__init__()
    if not (len(block_types) == len(ch_widths) == len(num_blocks)):
        raise ValueError(
            f"STDC architecture configuration, block_types, ch_widths, num_blocks, must be defined for the same number"
            f" of stages, found: {len(block_types)} for block_type, {len(ch_widths)} for ch_widths, "
            f"{len(num_blocks)} for num_blocks"
        )

    self.out_widths = []
    self.stages = nn.ModuleDict()
    self.out_stage_keys = []
    down_ratio = 2
    for block_type, width, blocks in zip(block_types, ch_widths, num_blocks):
        block_name = f"block_s{down_ratio}"
        self.stages[block_name] = self._make_stage(
            in_channels=in_channels,
            out_channels=width,
            block_type=block_type,
            num_blocks=blocks,
            stdc_steps=stdc_steps,
            stdc_downsample_mode=stdc_downsample_mode,
        )
        if down_ratio in out_down_ratios:
            self.out_stage_keys.append(block_name)
            self.out_widths.append(width)
        in_channels = width
        down_ratio *= 2
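
A hedged construction sketch. The widths, block types and repeat counts below roughly mimic an STDC1-style configuration, but they are assumptions for illustration rather than verified recipe values; only the constructor arguments themselves come from the code above, and the import path follows the source location.

import torch
from super_gradients.training.models.segmentation_models.stdc import STDCBackbone  # assumed import path

backbone = STDCBackbone(
    block_types=["conv", "conv", "stdc", "stdc", "stdc"],   # per-stage block types (assumed config)
    ch_widths=[32, 64, 256, 512, 1024],                     # per-stage output channels (assumed config)
    num_blocks=[1, 1, 2, 2, 2],                             # per-stage repeats (assumed config)
    stdc_steps=4,
    stdc_downsample_mode="avg_pool",
    in_channels=3,
    out_down_ratios=(8, 16, 32),                            # request the stride-8/16/32 maps for segmentation
)
feat8, feat16, feat32 = backbone(torch.randn(1, 3, 512, 512))
print(feat8.shape, feat16.shape, feat32.shape)
# torch.Size([1, 256, 64, 64]) torch.Size([1, 512, 32, 32]) torch.Size([1, 1024, 16, 16])
print(backbone.get_backbone_output_number_of_channels())    # [256, 512, 1024]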

STDCBlock

Bases: nn.Module

STDC building block, known as the Short Term Dense Concatenate module. In the STDC module, the kernel size of the first block is 1, and the rest are simply set to 3.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCBlock(nn.Module):
    """
    STDC building block, known as Short Term Dense Concatenate module.
    In STDC module, the kernel size of first block is 1, and the rest of them are simply set as 3.
    """

    def __init__(self, in_channels: int, out_channels: int, steps: int, stdc_downsample_mode: str, stride: int):
        """
        :param steps: The total number of convs in this module, 1 conv 1x1 and (steps - 1) conv3x3.
        :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
         `dw_conv` for depthwise-convolution.
        """
        super().__init__()
        if steps not in [2, 3, 4]:
            raise ValueError(f"only 2, 3, 4 steps number are supported, found: {steps}")

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.steps = steps
        self.stdc_downsample_mode = stdc_downsample_mode
        self.stride = stride
        self.conv_list = nn.ModuleList()
        # build first step conv 1x1.
        self.conv_list.append(ConvBNReLU(in_channels, out_channels // 2, kernel_size=1, bias=False))
        # build skip connection after first convolution.
        if stride == 1:
            self.skip_step1 = Residual()
        elif stdc_downsample_mode == "avg_pool":
            self.skip_step1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
        elif stdc_downsample_mode == "dw_conv":
            self.skip_step1 = ConvBNReLU(
                out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, bias=False, groups=out_channels // 2, use_activation=False
            )
        else:
            raise ValueError(f"stdc_downsample mode is not supported: found {stdc_downsample_mode}," f" must be in [avg_pool, dw_conv]")

        in_channels = out_channels // 2
        mid_channels = in_channels
        # build rest conv3x3 layers.
        for idx in range(1, steps):
            if idx < steps - 1:
                mid_channels //= 2
            conv = ConvBNReLU(in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False)
            self.conv_list.append(conv)
            in_channels = mid_channels

        # add dw conv before second step for down sample if stride = 2.
        if stride == 2:
            self.conv_list[1] = nn.Sequential(
                ConvBNReLU(
                    out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, groups=out_channels // 2, use_activation=False, bias=False
                ),
                self.conv_list[1],
            )

    def forward(self, x):
        out_list = []
        # run first conv
        x = self.conv_list[0](x)
        out_list.append(self.skip_step1(x))

        for conv in self.conv_list[1:]:
            x = conv(x)
            out_list.append(x)

        out = torch.cat(out_list, dim=1)
        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        first_conv: ConvBNReLU = self.conv_list[0]  # noqa
        first_conv.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        self.in_channels = self.get_input_channels()

    def get_input_channels(self) -> int:
        first_conv: ConvBNReLU = self.conv_list[0]  # noqa
        return first_conv.get_input_channels()

__init__(in_channels, out_channels, steps, stdc_downsample_mode, stride)

Parameters:

Name Type Description Default
steps int

The total number of convs in this module, 1 conv 1x1 and (steps - 1) conv3x3.

required
stdc_downsample_mode str

downsample mode in stdc block, supported avg_pool for average-pooling and dw_conv for depthwise-convolution.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
def __init__(self, in_channels: int, out_channels: int, steps: int, stdc_downsample_mode: str, stride: int):
    """
    :param steps: The total number of convs in this module, 1 conv 1x1 and (steps - 1) conv3x3.
    :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
     `dw_conv` for depthwise-convolution.
    """
    super().__init__()
    if steps not in [2, 3, 4]:
        raise ValueError(f"only 2, 3, 4 steps number are supported, found: {steps}")

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.steps = steps
    self.stdc_downsample_mode = stdc_downsample_mode
    self.stride = stride
    self.conv_list = nn.ModuleList()
    # build first step conv 1x1.
    self.conv_list.append(ConvBNReLU(in_channels, out_channels // 2, kernel_size=1, bias=False))
    # build skip connection after first convolution.
    if stride == 1:
        self.skip_step1 = Residual()
    elif stdc_downsample_mode == "avg_pool":
        self.skip_step1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
    elif stdc_downsample_mode == "dw_conv":
        self.skip_step1 = ConvBNReLU(
            out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, bias=False, groups=out_channels // 2, use_activation=False
        )
    else:
        raise ValueError(f"stdc_downsample mode is not supported: found {stdc_downsample_mode}," f" must be in [avg_pool, dw_conv]")

    in_channels = out_channels // 2
    mid_channels = in_channels
    # build rest conv3x3 layers.
    for idx in range(1, steps):
        if idx < steps - 1:
            mid_channels //= 2
        conv = ConvBNReLU(in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.conv_list.append(conv)
        in_channels = mid_channels

    # add dw conv before second step for down sample if stride = 2.
    if stride == 2:
        self.conv_list[1] = nn.Sequential(
            ConvBNReLU(
                out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, groups=out_channels // 2, use_activation=False, bias=False
            ),
            self.conv_list[1],
        )
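
To make the concatenation arithmetic explicit, here is a hedged sketch (values and import path are illustrative assumptions) showing that the per-step channel widths of an STDC block always sum back to out_channels, which is why the module can both downsample and widen in a single pass.

import torch
from super_gradients.training.models.segmentation_models.stdc import STDCBlock  # assumed import path

block = STDCBlock(in_channels=64, out_channels=256, steps=4, stdc_downsample_mode="avg_pool", stride=2)
x = torch.randn(1, 64, 64, 64)
y = block(x)
print(y.shape)    # torch.Size([1, 256, 32, 32])
# Channel budget with steps=4: skip of the 1x1 conv output (128) + 64 + 32 + 32 = 256 = out_channels.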

STDCClassificationBase

Bases: SgModule

Base module for classification models based on STDC backbones

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCClassificationBase(SgModule):
    """
    Base module for classification model based on STDCs backbones
    """

    def __init__(self, backbone: STDCBackbone, num_classes: int, dropout: float):
        super(STDCClassificationBase, self).__init__()
        self.backbone = backbone
        last_channels = self.backbone.out_widths[-1]
        head_channels = max(1024, last_channels)

        self.conv_last = ConvBNReLU(last_channels, head_channels, 1, 1, bias=False)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(head_channels, head_channels, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(head_channels, num_classes, bias=False)
        self.init_params()

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        out = self.backbone(x)[-1]
        # original implementation, why to use power?
        out = self.conv_last(out).pow(2)
        out = self.gap(out).flatten(1)
        out = self.fc(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.linear(out)
        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()
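
A hedged usage sketch combining the two classes above (widths mimic an STDC1-style setup and are assumptions, not verified recipe values; import paths follow the source location above): for classification the backbone only needs the deepest, stride-32 feature map.

import torch
from super_gradients.training.models.segmentation_models.stdc import STDCBackbone, STDCClassificationBase  # assumed import path

backbone = STDCBackbone(
    block_types=["conv", "conv", "stdc", "stdc", "stdc"],
    ch_widths=[32, 64, 256, 512, 1024],
    num_blocks=[1, 1, 2, 2, 2],
    out_down_ratios=(32,),          # classification only consumes the stride-32 map
)
classifier = STDCClassificationBase(backbone=backbone, num_classes=1000, dropout=0.2)
logits = classifier(torch.randn(2, 3, 224, 224))
print(logits.shape)    # torch.Size([2, 1000])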

STDCSegmentationBase

Bases: SgModule, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel

Base STDC Segmentation Module.

Parameters:

Name Type Description Default
backbone AbstractSTDCBackbone

Backbone of type AbstractSTDCBackbone that returns info about the backbone output channels.

required
num_classes int

num of dataset classes, excluding the ignore label.

required
context_fuse_channels int

num of output channels in ContextPath ARM feature fusion.

required
ffm_channels int

num of output channels of Feature Fusion Module.

required
aux_head_channels int

Num of hidden channels in Auxiliary segmentation heads.

required
detail_head_channels int

Num of hidden channels in Detail segmentation heads.

required
use_aux_heads bool

Set True when training to attach the Auxiliary and Detail heads. For compilation / inference mode, set False.

required
dropout float

segmentation heads dropout.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCSegmentationBase(SgModule, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel):
    """
    Base STDC Segmentation Module.
    :param backbone: Backbone of type AbstractSTDCBackbone that return info about backbone output channels.
    :param num_classes: num of dataset classes, exclude ignore label.
    :param context_fuse_channels: num of output channels in ContextPath ARM feature fusion.
    :param ffm_channels: num of output channels of Feature Fusion Module.
    :param aux_head_channels: Num of hidden channels in Auxiliary segmentation heads.
    :param detail_head_channels: Num of hidden channels in Detail segmentation heads.
    :param use_aux_heads: set True when training, attach Auxiliary and Detail heads. For compilation / inference mode
        set False.
    :param dropout: segmentation heads dropout.
    """

    @resolve_param("backbone", BaseFactory({"STDCBackbone": STDCBackbone}))
    def __init__(
        self,
        backbone: AbstractSTDCBackbone,
        num_classes: int,
        context_fuse_channels: int,
        ffm_channels: int,
        aux_head_channels: int,
        detail_head_channels: int,
        use_aux_heads: bool,
        dropout: float,
    ):
        super(STDCSegmentationBase, self).__init__()
        backbone.validate_backbone()
        self._use_aux_heads = use_aux_heads
        self.num_classes = num_classes
        self.cp = ContextPath(backbone, context_fuse_channels, use_aux_heads=use_aux_heads)

        stage3_s8_channels, stage4_s16_channels, stage5_s32_channels = backbone.get_backbone_output_number_of_channels()

        self.ffm = FeatureFusionModule(spatial_channels=stage3_s8_channels, context_channels=context_fuse_channels, out_channels=ffm_channels)
        # Main segmentation head
        self.segmentation_head = nn.Sequential(
            SegmentationHead(ffm_channels, ffm_channels, num_classes, dropout=dropout), nn.Upsample(scale_factor=8, mode="bilinear", align_corners=True)
        )

        if self._use_aux_heads:
            # Auxiliary heads
            self.aux_head_s16 = nn.Sequential(
                SegmentationHead(stage4_s16_channels, aux_head_channels, num_classes, dropout=dropout),
                nn.Upsample(scale_factor=16, mode="bilinear", align_corners=True),
            )
            self.aux_head_s32 = nn.Sequential(
                SegmentationHead(stage5_s32_channels, aux_head_channels, num_classes, dropout=dropout),
                nn.Upsample(scale_factor=32, mode="bilinear", align_corners=True),
            )
            # Detail head
            self.detail_head8 = nn.Sequential(
                SegmentationHead(stage3_s8_channels, detail_head_channels, 1, dropout=dropout), nn.Upsample(scale_factor=8, mode="bilinear", align_corners=True)
            )

        self.init_params()
        # Processing params
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare model for conversion, force use_aux_heads mode False and delete auxiliary and detail heads. Replace
        ContextEmbeddingOnline which cause compilation issues and not supported in some compilations,
        to ContextEmbeddingFixedSize.
        """
        # set to false and delete auxiliary and detail heads modules.
        self.use_aux_heads = False

        self.cp.prep_for_conversion(input_size)

    def _remove_auxiliary_and_detail_heads(self):
        attributes_to_delete = ["aux_head_s16", "aux_head_s32", "detail_head8"]
        for attr in attributes_to_delete:
            if hasattr(self, attr):
                delattr(self, attr)

    @property
    def use_aux_heads(self):
        return self._use_aux_heads

    @use_aux_heads.setter
    def use_aux_heads(self, use_aux: bool):
        """
        private setter for self._use_aux_heads, called every time an assignment to self._use_aux_heads is applied.
        if use_aux is False, `_remove_auxiliary_and_detail_heads` is called to delete auxiliary and detail heads.
        if use_aux is True, and self._use_aux_heads was already set to False a ValueError is raised, recreating
            aux and detail heads outside init method is not allowed, and the module should be recreated.
        """
        if use_aux is True and self._use_aux_heads is False:
            raise ValueError("Cant turn use_aux_heads from False to True, you should initiate the module again with" " `use_aux_heads=True`")
        if not use_aux:
            self._remove_auxiliary_and_detail_heads()
        self.cp.use_aux_heads = use_aux
        self._use_aux_heads = use_aux

    @property
    def backbone(self):
        """
        For Trainer load_backbone compatibility.
        """
        return self.cp.backbone

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        cp_outs = self.cp(x)
        feat8, feat_cp8 = cp_outs[0], cp_outs[1]
        # fuse stage 3 with result of context path after ARM modules.
        feat_out = self.ffm(spatial_feats=feat8, context_feats=feat_cp8)
        feat_out = self.segmentation_head(feat_out)

        if not self.use_aux_heads:
            return feat_out
        feat16, feat32 = cp_outs[2], cp_outs[3]
        detail_out8 = self.detail_head8(feat8)

        aux_out_s16 = self.aux_head_s16(feat16)
        aux_out_s32 = self.aux_head_s32(feat32)

        return feat_out, aux_out_s32, aux_out_s16, detail_out8

    def replace_head(self, new_num_classes: int, **kwargs):
        ffm_channels = self.ffm.attention_block[-2].out_channels
        dropout = self.segmentation_head[0].seg_head[1].p

        # Output layer replacement - the first module in each sequence is the SegmentationHead module.
        self.segmentation_head[0] = SegmentationHead(ffm_channels, ffm_channels, new_num_classes, dropout=dropout)
        self.num_classes = new_num_classes
        if self.use_aux_heads:
            stage3_s8_channels, stage4_s16_channels, stage5_s32_channels = self.backbone.get_backbone_output_number_of_channels()
            aux_head_channels = self.aux_head_s16[0].seg_head[-1].in_channels
            detail_head_channels = self.detail_head8[0].seg_head[-1].in_channels

            self.aux_head_s16[0] = SegmentationHead(stage4_s16_channels, aux_head_channels, new_num_classes, dropout=dropout)

            self.aux_head_s32[0] = SegmentationHead(stage5_s32_channels, aux_head_channels, new_num_classes, dropout=dropout)
            # Detail head
            self.detail_head8[0] = SegmentationHead(stage3_s8_channels, detail_head_channels, 1, dropout=dropout)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        lr_dict = {"segmentation_head": lr, "default": 0}
        if self.use_aux_heads:
            lr_dict["aux_head"] = lr
            lr_dict["detail_head"] = lr
        return lr_dict

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for STDC training:
            - Different lr for context path and heads, if `multiply_head_lr` key is in `training_params`.
            - Add extra Detail loss params to optimizer.
        """

        extra_train_params = training_params.loss.get_train_named_params() if hasattr(training_params.loss, "get_train_named_params") else None
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]

        if extra_train_params is not None:
            param_groups.append({"named_params": extra_train_params, "lr": lr, "weight_decay": 0.0, "name": "detail_params"})

        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate ContextPath params from the rest.
        :return: iterators of groups named_parameters.
        """
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if "cp." in name:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.cp.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.cp.get_input_channels()

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(1)
    def _get_pipeline(self, fuse_model: bool = True, fp16: bool = True) -> SegmentationPipeline:
        """Instantiate the segmentation pipeline of this model.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        pipeline = SegmentationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
        )
        return pipeline

    def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
        """Predict an image or a list of images.
        :param images:  Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
        """Predict using webcam.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        pipeline.predict_webcam()

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32

    def get_processing_params(self):
        return self._image_processor

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module
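
The following is a minimal usage sketch (not taken from the library docs) showing how the head replacement and forward logic above fit together. It assumes a SuperGradients installation where the STDC1_SEG50 variant and Cityscapes pretrained weights are available through models.get; the target class count of 5 is arbitrary.

import torch
from super_gradients.common.object_names import Models
from super_gradients.training import models

# Assumption: STDC1_SEG50 with Cityscapes weights is available in this installation.
model = models.get(Models.STDC1_SEG50, pretrained_weights="cityscapes")
model.replace_head(new_num_classes=5)  # swaps the SegmentationHead modules in place
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 1024))  # spatial dims should be divisible by 32

# When use_aux_heads is True the forward returns (main, aux_s32, aux_s16, detail_s8).
main_out = out[0] if isinstance(out, tuple) else out
print(main_out.shape)  # expected: torch.Size([1, 5, 512, 1024])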

backbone property

For Trainer load_backbone compatibility.

initialize_param_groups(lr, training_params)

Custom param groups for STDC training:

- Different lr for context path and heads, if multiply_head_lr key is in training_params.
- Add extra Detail loss params to optimizer.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 601-620)
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for STDC training:
        - Different lr for context path and heads, if `multiply_head_lr` key is in `training_params`.
        - Add extra Detail loss params to optimizer.
    """

    extra_train_params = training_params.loss.get_train_named_params() if hasattr(training_params.loss, "get_train_named_params") else None
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]

    if extra_train_params is not None:
        param_groups.append({"named_params": extra_train_params, "lr": lr, "weight_decay": 0.0, "name": "detail_params"})

    return param_groups
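
As a rough illustration (hypothetical values; the HpmStruct import path is assumed), the sketch below builds the param groups with a 10x learning-rate multiplier on the heads, using a loss that exposes no extra trainable parameters. `model` is an STDC segmentation model instance as in the earlier sketch.

import torch.nn as nn
from super_gradients.training.utils import HpmStruct  # assumed import path

training_params = HpmStruct(loss=nn.CrossEntropyLoss(), multiply_head_lr=10)
param_groups = model.initialize_param_groups(lr=0.01, training_params=training_params)
for group in param_groups:
    print(group["name"], group["lr"])
# expected: no_multiply_params 0.01, multiply_lr_params 0.1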

predict(images, batch_size=32, fuse_model=True, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 681-689)
def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
    """Predict an image or a list of images.
    :param images:  Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore
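
A minimal sketch of calling predict on a single local image (the file path is hypothetical). It assumes the model was loaded with pretrained weights, so class names and preprocessing are already configured; otherwise set_dataset_processing_params must be called first.

predictions = model.predict("path/to/street_scene.jpg", batch_size=1, fuse_model=False)
predictions.show()  # ImagesSegmentationPrediction results can be visualized or saved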

predict_webcam(fuse_model=True, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 691-697)
def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
    """Predict using webcam.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    pipeline.predict_webcam()

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model for conversion: force use_aux_heads to False and delete the auxiliary and detail heads. Replace ContextEmbeddingOnline, which causes compilation issues and is not supported in some compilation targets, with ContextEmbeddingFixedSize.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 502-511)
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model for conversion: force use_aux_heads to False and delete the auxiliary and detail heads.
    Replace ContextEmbeddingOnline, which causes compilation issues and is not supported in some compilation
    targets, with ContextEmbeddingFixedSize.
    """
    # set to false and delete auxiliary and detail heads modules.
    self.use_aux_heads = False

    self.cp.prep_for_conversion(input_size)
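
A sketch of a typical export flow under the assumption that input_size is the (height, width) the model will be exported for (512x1024 is a hypothetical choice): prepare the model, then export with plain torch.onnx.

import torch

model.prep_model_for_conversion(input_size=(512, 1024))  # drops aux/detail heads, fixes the context embedding size
dummy_input = torch.randn(1, 3, 512, 1024)
torch.onnx.export(model, dummy_input, "stdc_seg.onnx", opset_version=13)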

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 649-661)
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor
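
A minimal sketch of wiring custom class names while reusing the preprocessing of an already-configured model; the class names and the `pretrained_model` variable are hypothetical.

# Reuse the image processor of a model that was loaded with pretrained weights.
model.set_dataset_processing_params(
    class_names=["background", "crack", "spall"],
    image_processor=pretrained_model.get_processing_params(),
)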

UNet

Bases: UNetCustom

Implementation of "U-Net: Convolutional Networks for Biomedical Image Segmentation", https://arxiv.org/pdf/1505.04597.pdf. The upsample operation uses bilinear interpolation, which is reported to give better results.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 224-234)
@register_model(Models.UNET)
class UNet(UNetCustom):
    """
    implementation of:
     "U-Net: Convolutional Networks for Biomedical Image Segmentation", https://arxiv.org/pdf/1505.04597.pdf
    The upsample operation is done by using bilinear interpolation which is reported to show better results.
    """

    def __init__(self, arch_params: HpmStruct):
        arch_params = HpmStruct(**models.get_arch_params("unet_arch_params.yaml", arch_params.to_dict()))
        super().__init__(arch_params)
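
A sketch of instantiating UNet directly, overriding only the number of classes on top of the defaults loaded from unet_arch_params.yaml. Whether num_classes is the only key you need to override depends on that recipe; the HpmStruct import path is assumed.

from super_gradients.training.utils import HpmStruct  # assumed import path

# Remaining constructor arguments are taken from the default unet_arch_params recipe.
unet = UNet(arch_params=HpmStruct(num_classes=2))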

UNetBase

Bases: SegmentationModule

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 20-202)
class UNetBase(SegmentationModule):
    @resolve_param("context_module", ContextModulesFactory())
    def __init__(
        self,
        num_classes: int,
        use_aux_heads: bool,
        final_upsample_factor: int,
        head_hidden_channels: Optional[int],
        head_upsample_mode: Union[UpsampleMode, str],
        align_corners: bool,
        backbone_params: dict,
        context_module: AbstractContextModule,
        decoder_params: dict,
        aux_heads_params: dict,
        dropout: float,
    ):
        """
        :param num_classes: num classes to predict.
        :param use_aux_heads: Whether to use auxiliary heads.
        :param final_upsample_factor: Final upsample scale factor after the segmentation head.
        :param head_hidden_channels: num channels before the last classification layer. see `mid_channels` in
            `SegmentationHead` class.
        :param head_upsample_mode: UpsampleMode of segmentation and auxiliary heads.
        :param align_corners: align_corners arg of segmentation and auxiliary heads.
        :param backbone_params: params to build a `UNetBackboneBase`, include the following keys:
            - strides_list: List[int], list of stride per stage.
            - width_list: List[int], list of num channels per stage.
            - num_blocks_list: List[int], list of num blocks per stage.
            - block_types_list: List[Union[DownBlockType, int]], list of block types per stage.
            - is_out_feature_list: List[bool], list of flags whether stage features should be an output.
            - in_channels: int, num channels of the input to the backbone module.
            - block_params: dict, argument to be passed to the block types constructors. i.e for `RegnetXStage`
                block_params should include bottleneck_ratio, group_width and se_ratio.
        :param decoder_params: params to build a `Decoder`, include the following keys:
            - up_block_repeat_list: List[int], num of blocks per decoder stage, the `block` implementation depends on
                the up-block type.
            - skip_expansion: float, skip expansion ratio value, before fusing the skip features from the encoder with
                the decoder features, a projection convolution is applied upon the encoder features to project the
                num_channels by skip_expansion.
            - decoder_scale: float, num_channels width ratio between encoder stages and decoder stages.
            - up_blocks: List[Type[AbstractUpFuseBlock]], list of AbstractUpFuseBlock types.
            - is_skip_list: List[bool], List of flags whether to use feature-map from encoder stage as skip connection
                or not.
        :param aux_heads_params: params to initiate auxiliary heads, include the following keys:
            - use_aux_list: List[bool], whether to append to auxiliary head per encoder stage.
            - aux_heads_factor: List[int], Upsample factor per encoder stage.
            - aux_hidden_channels: List[int], Hidden num channels before last classification layer, per encoder stage.
            - aux_out_channels: List[int], Output channels (can be referred to as num_classes) of the auxiliary head per encoder
                stage.
        :param dropout: dropout probability of segmentation and auxiliary heads.
        """
        super().__init__(use_aux_heads=use_aux_heads)
        self.num_classes = num_classes
        # Init Backbone
        backbone = UNetBackboneBase(**backbone_params)
        # Init Encoder
        self.encoder = Encoder(backbone, context_module)
        # Init Decoder
        self.decoder = Decoder(skip_channels_list=self.encoder.get_output_number_of_channels(), **decoder_params)
        # Init Segmentation Head
        self.seg_head = nn.Sequential(
            SegmentationHead(
                in_channels=self.decoder.up_channels_list[-1],
                mid_channels=head_hidden_channels or self.decoder.up_channels_list[-1],
                num_classes=self.num_classes,
                dropout=dropout,
            ),
            nn.Identity()
            if final_upsample_factor == 1
            else make_upsample_module(scale_factor=final_upsample_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
        )
        # Init Aux Heads
        if self.use_aux_heads:
            # Aux heads are applied if both conditions are true, use_aux_list is set as True and the correspondent
            # backbone features are outputted and set as True in backbone is_out_feature_list.
            aux_heads_params["use_aux_list"] = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"])]
            self.aux_heads = self.init_aux_heads(
                in_channels_list=self.encoder.get_all_number_of_channels(),
                upsample_mode=head_upsample_mode,
                align_corners=align_corners,
                dropout=dropout,
                **aux_heads_params,
            )
            self.use_aux_feats = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"]) if b]
        self.init_params()

    @staticmethod
    def init_aux_heads(
        in_channels_list: List[int],
        use_aux_list: List[bool],
        aux_heads_factor: List[int],
        aux_hidden_channels: List[int],
        aux_out_channels: List[int],
        dropout: float,
        upsample_mode: Union[str, UpsampleMode],
        align_corners: Optional[bool] = None,
    ):
        """
        :param use_aux_list: whether to append to auxiliary head per encoder stage.
        :param in_channels_list: list of input channels to the auxiliary segmentation heads.
        :param aux_heads_factor: list of upsample scale factors to apply at the end of the auxiliary segmentation heads.
        :param aux_hidden_channels: list of segmentation heads hidden channels.
        :param aux_out_channels: list of segmentation heads out channels, usually set as num_classes or 1 for detail
            edge heads.
        :param dropout: dropout probability factor.
        :param upsample_mode: see UpsampleMode for supported options.
        :return: nn.ModuleList
        """
        heads = nn.ModuleList(
            [
                nn.Sequential(
                    SegmentationHead(ch, hid_ch, out_ch, dropout=dropout),
                    make_upsample_module(scale_factor=scale, upsample_mode=upsample_mode, align_corners=align_corners),
                )
                for ch, scale, hid_ch, out_ch, use_aux in zip(in_channels_list, aux_heads_factor, aux_hidden_channels, aux_out_channels, use_aux_list)
                if use_aux
            ]
        )
        return heads

    def forward(self, x):
        encoder_feats = self.encoder(x)
        x = self.decoder(encoder_feats)
        x = self.seg_head(x)
        if not self.use_aux_heads:
            return x
        encoder_feats = [f for i, f in enumerate(encoder_feats) if self.use_aux_feats[i]]
        aux_feats = [aux_head(feat) for feat, aux_head in zip(encoder_feats[-len(self.aux_heads) :], self.aux_heads)]
        aux_feats.reverse()
        return tuple([x] + aux_feats)

    def _remove_auxiliary_heads(self):
        if hasattr(self, "aux_heads"):
            del self.aux_heads

    @property
    def backbone(self):
        return self.encoder

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
            - Different lr for head and rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]

        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if "backbone." in name:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, append_sigmoid: bool = False, append_softmax: bool = False, **kwargs):
        super().prep_model_for_conversion(input_size=input_size, **kwargs)
        fuse_repvgg_blocks_residual_branches(self)
        if append_sigmoid:
            self.seg_head.add_module("sigmoid", nn.Sigmoid())
        if append_softmax:
            self.seg_head.add_module("softmax", nn.Softmax(dim=1))

    def replace_head(self, new_num_classes: int, **kwargs):
        for module in self.modules():
            if isinstance(module, SegmentationHead):
                module.replace_num_classes(new_num_classes)
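
The gating of the auxiliary heads above can be summarized with plain Python: a head is only created for stages where use_aux_list and the backbone's is_out_feature_list are both True. The lists below are hypothetical.

use_aux_list = [False, True, True, True]
is_out_feature_list = [False, False, True, True]

effective_use_aux = [a and b for a, b in zip(use_aux_list, is_out_feature_list)]
print(effective_use_aux)  # [False, False, True, True] -> aux heads only on the last two stages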

__init__(num_classes, use_aux_heads, final_upsample_factor, head_hidden_channels, head_upsample_mode, align_corners, backbone_params, context_module, decoder_params, aux_heads_params, dropout)

Parameters:

Name Type Description Default
num_classes int

num classes to predict.

required
use_aux_heads bool

Whether to use auxiliary heads.

required
final_upsample_factor int

Final upsample scale factor after the segmentation head.

required
head_hidden_channels Optional[int]

num channels before the last classification layer. see mid_channels in SegmentationHead class.

required
head_upsample_mode Union[UpsampleMode, str]

UpsampleMode of segmentation and auxiliary heads.

required
align_corners bool

align_corners arg of segmentation and auxiliary heads.

required
backbone_params dict

params to build a UNetBackboneBase, include the following keys:

- strides_list: List[int], list of stride per stage.
- width_list: List[int], list of num channels per stage.
- num_blocks_list: List[int], list of num blocks per stage.
- block_types_list: List[Union[DownBlockType, int]], list of block types per stage.
- is_out_feature_list: List[bool], list of flags whether stage features should be an output.
- in_channels: int, num channels of the input to the backbone module.
- block_params: dict, arguments to be passed to the block type constructors, e.g. for RegnetXStage block_params should include bottleneck_ratio, group_width and se_ratio.

required
decoder_params dict

params to build a Decoder, include the following keys:

- up_block_repeat_list: List[int], num of blocks per decoder stage, the block implementation depends on the up-block type.
- skip_expansion: float, skip expansion ratio value; before fusing the skip features from the encoder with the decoder features, a projection convolution is applied upon the encoder features to project the num_channels by skip_expansion.
- decoder_scale: float, num_channels width ratio between encoder stages and decoder stages.
- up_blocks: List[Type[AbstractUpFuseBlock]], list of AbstractUpFuseBlock types.
- is_skip_list: List[bool], list of flags whether to use the feature-map from an encoder stage as a skip connection or not.

required
aux_heads_params dict

params to initiate auxiliary heads, include the following keys:

- use_aux_list: List[bool], whether to append an auxiliary head per encoder stage.
- aux_heads_factor: List[int], upsample factor per encoder stage.
- aux_hidden_channels: List[int], hidden num channels before the last classification layer, per encoder stage.
- aux_out_channels: List[int], output channels (can be referred to as num_classes) of the auxiliary head per encoder stage.

required
dropout float

dropout probability of segmentation and auxiliary heads.

required
Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 21-104)
@resolve_param("context_module", ContextModulesFactory())
def __init__(
    self,
    num_classes: int,
    use_aux_heads: bool,
    final_upsample_factor: int,
    head_hidden_channels: Optional[int],
    head_upsample_mode: Union[UpsampleMode, str],
    align_corners: bool,
    backbone_params: dict,
    context_module: AbstractContextModule,
    decoder_params: dict,
    aux_heads_params: dict,
    dropout: float,
):
    """
    :param num_classes: num classes to predict.
    :param use_aux_heads: Whether to use auxiliary heads.
    :param final_upsample_factor: Final upsample scale factor after the segmentation head.
    :param head_hidden_channels: num channels before the last classification layer. see `mid_channels` in
        `SegmentationHead` class.
    :param head_upsample_mode: UpsampleMode of segmentation and auxiliary heads.
    :param align_corners: align_corners arg of segmentation and auxiliary heads.
    :param backbone_params: params to build a `UNetBackboneBase`, include the following keys:
        - strides_list: List[int], list of stride per stage.
        - width_list: List[int], list of num channels per stage.
        - num_blocks_list: List[int], list of num blocks per stage.
        - block_types_list: List[Union[DownBlockType, int]], list of block types per stage.
        - is_out_feature_list: List[bool], list of flags whether stage features should be an output.
        - in_channels: int, num channels of the input to the backbone module.
        - block_params: dict, argument to be passed to the block types constructors. i.e for `RegnetXStage`
            block_params should include bottleneck_ratio, group_width and se_ratio.
    :param decoder_params: params to build a `Decoder`, include the following keys:
        - up_block_repeat_list: List[int], num of blocks per decoder stage, the `block` implementation depends on
            the up-block type.
        - skip_expansion: float, skip expansion ratio value, before fusing the skip features from the encoder with
            the decoder features, a projection convolution is applied upon the encoder features to project the
            num_channels by skip_expansion.
        - decoder_scale: float, num_channels width ratio between encoder stages and decoder stages.
        - up_blocks: List[Type[AbstractUpFuseBlock]], list of AbstractUpFuseBlock types.
        - is_skip_list: List[bool], List of flags whether to use feature-map from encoder stage as skip connection
            or not.
    :param aux_heads_params: params to initiate auxiliary heads, include the following keys:
        - use_aux_list: List[bool], whether to append to auxiliary head per encoder stage.
        - aux_heads_factor: List[int], Upsample factor per encoder stage.
        - aux_hidden_channels: List[int], Hidden num channels before last classification layer, per encoder stage.
        - aux_out_channels: List[int], Output channels (can be referred to as num_classes) of the auxiliary head per encoder
            stage.
    :param dropout: dropout probability of segmentation and auxiliary heads.
    """
    super().__init__(use_aux_heads=use_aux_heads)
    self.num_classes = num_classes
    # Init Backbone
    backbone = UNetBackboneBase(**backbone_params)
    # Init Encoder
    self.encoder = Encoder(backbone, context_module)
    # Init Decoder
    self.decoder = Decoder(skip_channels_list=self.encoder.get_output_number_of_channels(), **decoder_params)
    # Init Segmentation Head
    self.seg_head = nn.Sequential(
        SegmentationHead(
            in_channels=self.decoder.up_channels_list[-1],
            mid_channels=head_hidden_channels or self.decoder.up_channels_list[-1],
            num_classes=self.num_classes,
            dropout=dropout,
        ),
        nn.Identity()
        if final_upsample_factor == 1
        else make_upsample_module(scale_factor=final_upsample_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
    )
    # Init Aux Heads
    if self.use_aux_heads:
        # Aux heads are applied if both conditions are true, use_aux_list is set as True and the correspondent
        # backbone features are outputted and set as True in backbone is_out_feature_list.
        aux_heads_params["use_aux_list"] = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"])]
        self.aux_heads = self.init_aux_heads(
            in_channels_list=self.encoder.get_all_number_of_channels(),
            upsample_mode=head_upsample_mode,
            align_corners=align_corners,
            dropout=dropout,
            **aux_heads_params,
        )
        self.use_aux_feats = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"]) if b]
    self.init_params()

init_aux_heads(in_channels_list, use_aux_list, aux_heads_factor, aux_hidden_channels, aux_out_channels, dropout, upsample_mode, align_corners=None) staticmethod

Parameters:

Name Type Description Default
use_aux_list List[bool]

whether to append to auxiliary head per encoder stage.

required
in_channels_list List[int]

list of input channels to the auxiliary segmentation heads.

required
aux_heads_factor List[int]

list of upsample scale factors to apply at the end of the auxiliary segmentation heads.

required
aux_hidden_channels List[int]

list of segmentation heads hidden channels.

required
aux_out_channels List[int]

list of segmentation heads out channels, usually set as num_classes or 1 for detail edge heads.

required
dropout float

dropout probability factor.

required
upsample_mode Union[str, UpsampleMode]

see UpsampleMode for supported options.

required

Returns:

Type Description

nn.ModuleList

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 106-138)
@staticmethod
def init_aux_heads(
    in_channels_list: List[int],
    use_aux_list: List[bool],
    aux_heads_factor: List[int],
    aux_hidden_channels: List[int],
    aux_out_channels: List[int],
    dropout: float,
    upsample_mode: Union[str, UpsampleMode],
    align_corners: Optional[bool] = None,
):
    """
    :param use_aux_list: whether to append to auxiliary head per encoder stage.
    :param in_channels_list: list of input channels to the auxiliary segmentation heads.
    :param aux_heads_factor: list of upsample scale factors to apply at the end of the auxiliary segmentation heads.
    :param aux_hidden_channels: list of segmentation heads hidden channels.
    :param aux_out_channels: list of segmentation heads out channels, usually set as num_classes or 1 for detail
        edge heads.
    :param dropout: dropout probability factor.
    :param upsample_mode: see UpsampleMode for supported options.
    :return: nn.ModuleList
    """
    heads = nn.ModuleList(
        [
            nn.Sequential(
                SegmentationHead(ch, hid_ch, out_ch, dropout=dropout),
                make_upsample_module(scale_factor=scale, upsample_mode=upsample_mode, align_corners=align_corners),
            )
            for ch, scale, hid_ch, out_ch, use_aux in zip(in_channels_list, aux_heads_factor, aux_hidden_channels, aux_out_channels, use_aux_list)
            if use_aux
        ]
    )
    return heads

initialize_param_groups(lr, training_params)

Custom param groups for training:

- Different lr for head and rest, if multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 159-172)
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
        - Different lr for head and rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]

    return param_groups

AbstractUpFuseBlock

Bases: nn.Module, ABC

Abstract class for upsample and fuse UNet decoder building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 16-56)
class AbstractUpFuseBlock(nn.Module, ABC):
    """
    Abstract class for upsample and fuse UNet decoder building block.
    """

    def __init__(self, in_channels: int, skip_channels: int, out_channels: int, **kwargs):
        """
        :param in_channels: num_channels of the feature map to be upsample.
        :param skip_channels: num_channels of the skip feature map from higher resolution.
        :param out_channels: num_channels of the output features.
        """
        super().__init__()

    @abstractmethod
    def forward(self, x, skip):
        raise NotImplementedError()

    @staticmethod
    def validate_upsample_mode(
        in_channels: int, up_factor: int, upsample_mode: Union[UpsampleMode, str], fallback_mode: Optional[Union[UpsampleMode, str]] = None
    ) -> Tuple[Union[UpsampleMode, str], int]:
        """
        Validate whether the upsample_mode is supported, and returns the upsample path output channels.
        :return: tuple of upsample_mode and out_channels of the upsample module
        """
        out_channels = in_channels
        upsample_mode = upsample_mode.value if isinstance(upsample_mode, UpsampleMode) else upsample_mode
        if upsample_mode in [UpsampleMode.PIXEL_SHUFFLE.value, UpsampleMode.NN_PIXEL_SHUFFLE.value]:
            # Check if in_channels is divisible by (up_factor ** 2) for pixel shuffle, else fallback to fallback_mode.
            _in_ch = in_channels / (up_factor**2)
            if _in_ch % 1 == 0:
                out_channels = int(_in_ch)
            elif fallback_mode is not None:
                upsample_mode = fallback_mode
            else:
                raise ValueError(
                    f"Upsample mode: {upsample_mode} can't be used, due to in_channels: {in_channels} "
                    f"is not divisible by (up_factor ** 2) for up_factor: {up_factor}.\n"
                    f"Consider setting a `fallback_mode`."
                )
        return upsample_mode, out_channels

__init__(in_channels, skip_channels, out_channels, **kwargs)

Parameters:

Name Type Description Default
in_channels int

num_channels of the feature map to be upsample.

required
skip_channels int

num_channels of the skip feature map from higher resolution.

required
out_channels int

num_channels of the output features.

required
Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 21-27)
def __init__(self, in_channels: int, skip_channels: int, out_channels: int, **kwargs):
    """
    :param in_channels: num_channels of the feature map to be upsample.
    :param skip_channels: num_channels of the skip feature map from higher resolution.
    :param out_channels: num_channels of the output features.
    """
    super().__init__()

validate_upsample_mode(in_channels, up_factor, upsample_mode, fallback_mode=None) staticmethod

Validate whether the upsample_mode is supported, and returns the upsample path output channels.

Returns:

Type Description
Tuple[Union[UpsampleMode, str], int]

tuple of upsample_mode and out_channels of the upsample module

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 33-56)
@staticmethod
def validate_upsample_mode(
    in_channels: int, up_factor: int, upsample_mode: Union[UpsampleMode, str], fallback_mode: Optional[Union[UpsampleMode, str]] = None
) -> Tuple[Union[UpsampleMode, str], int]:
    """
    Validate whether the upsample_mode is supported, and returns the upsample path output channels.
    :return: tuple of upsample_mode and out_channels of the upsample module
    """
    out_channels = in_channels
    upsample_mode = upsample_mode.value if isinstance(upsample_mode, UpsampleMode) else upsample_mode
    if upsample_mode in [UpsampleMode.PIXEL_SHUFFLE.value, UpsampleMode.NN_PIXEL_SHUFFLE.value]:
        # Check if in_channels is divisible by (up_factor ** 2) for pixel shuffle, else fallback to fallback_mode.
        _in_ch = in_channels / (up_factor**2)
        if _in_ch % 1 == 0:
            out_channels = int(_in_ch)
        elif fallback_mode is not None:
            upsample_mode = fallback_mode
        else:
            raise ValueError(
                f"Upsample mode: {upsample_mode} can't be used, due to in_channels: {in_channels} "
                f"is not divisible by (up_factor ** 2) for up_factor: {up_factor}.\n"
                f"Consider setting a `fallback_mode`."
            )
    return upsample_mode, out_channels
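
A worked example of the pixel-shuffle channel check (pure arithmetic, no framework calls): with 64 input channels and an up-factor of 2 the mode is valid and the upsample path outputs 16 channels, while 50 input channels would trigger the fallback (or the ValueError if no fallback_mode is given).

in_channels, up_factor = 64, 2
out_channels = in_channels / up_factor**2
print(out_channels % 1 == 0, int(out_channels))  # True 16 -> pixel shuffle is applicable

in_channels = 50
print((in_channels / up_factor**2) % 1 == 0)  # False -> fall back to `fallback_mode`, or raise if it is None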

Decoder

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 163-239)
class Decoder(nn.Module):
    @resolve_param("up_block_types", ListFactory(TypeFactory(UP_FUSE_BLOCKS)))
    def __init__(
        self,
        skip_channels_list: List[int],
        up_block_repeat_list: List[int],
        skip_expansion: float,
        decoder_scale: float,
        up_block_types: List[Type[AbstractUpFuseBlock]],
        is_skip_list: List[bool],
        min_decoder_channels: int = 1,
        **up_block_kwargs,
    ):
        """

        :param skip_channels_list: num_channels list of skip feature maps from the encoder.
        :param up_block_repeat_list: `num_repeats` arg list to be passed to the UpFuseBlocks.
        :param skip_expansion: skip expansion ratio value, before fusing the skip features from the encoder with the
            decoder features, a projection convolution is applied upon the encoder features to project the num_channels
            by skip_expansion as follows: `num_channels = skip_channels * skip_expansion
        :param decoder_scale: num_channels width ratio between encoder stages and decoder stages.
        :param min_decoder_channels: The minimum num_channels of decoder stages. Useful i.e if we want to keep the width
            above the num of classes. The num_channels of a decoder stage is determined as follows:
                `decoder_channels = max(encoder_channels * decoder_scale, min_decoder_channels)`
        :param up_block_types: list of AbstractUpFuseBlock.
        :param is_skip_list: List of flags whether to use feature-map from encoder stage as skip connection or not. Used
            to not apply projection convolutions if a certain encoder feature is not aggregate with the decoder.
        :param up_block_kwargs: init parameters for fuse blocks.
        """
        super().__init__()
        # num_channels list after encoder features projections.
        self.up_channels_list = [max(int(ch * decoder_scale), min_decoder_channels) for ch in skip_channels_list]
        # Reverse order to up-bottom order, i.e [stage4_ch, stage3_ch, ... , stage1_ch]
        self.up_channels_list.reverse()
        # Remove last stage num_channels, as it is the input to the decoder.
        self.up_channels_list.pop(0)

        is_skip_list.reverse()
        is_skip_list += [False]

        self.projection_blocks, skip_channels_list = self._make_skip_projection(skip_channels_list, skip_expansion, is_skip_list, min_decoder_channels)
        skip_channels_list = skip_channels_list.copy()
        skip_channels_list.reverse()

        self.up_stages = nn.ModuleList()
        in_channels = skip_channels_list.pop(0)
        skip_channels_list.append(None)
        for i in range(len(up_block_types)):
            self.up_stages.append(
                up_block_types[i](in_channels, skip_channels_list[i], self.up_channels_list[i], num_repeats=up_block_repeat_list[i], **up_block_kwargs)
            )
            in_channels = self.up_channels_list[i]

    def _make_skip_projection(self, skip_channels_list: list, skip_expansion: float, is_skip_list: list, min_decoder_channels: int):
        if skip_expansion == 1.0:
            return nn.ModuleList([CrossModelSkipConnection()] * len(skip_channels_list)), skip_channels_list

        projection_channels = [max(int(ch * skip_expansion), min_decoder_channels) for ch in skip_channels_list]
        blocks = nn.ModuleList()
        for i in range(len(skip_channels_list)):
            if not is_skip_list[i]:
                blocks.append(nn.Identity())
                projection_channels[i] = skip_channels_list[i]
            else:
                blocks.append(ConvBNReLU(skip_channels_list[i], projection_channels[i], kernel_size=1, bias=False, use_activation=False))

        return blocks, projection_channels

    def forward(self, feats: List[torch.Tensor]):
        feats = [adapt_conv(feat) for feat, adapt_conv in zip(feats, self.projection_blocks)]
        # Reverse order to up-bottom order, i.e [stage4_ch, stage3_ch, ... , stage1_ch]
        feats.reverse()
        # Remove last stage feature map, as it is the input to the decoder and not a skip connection.
        x = feats.pop(0)
        for up_stage, skip in zip(self.up_stages, feats):
            x = up_stage(x, skip)
        return x
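
The decoder width bookkeeping above can be traced with a small numeric example; the encoder widths, decoder_scale and min_decoder_channels below are hypothetical.

skip_channels_list = [64, 128, 256, 512]
decoder_scale, min_decoder_channels = 0.5, 32

up_channels_list = [max(int(ch * decoder_scale), min_decoder_channels) for ch in skip_channels_list]
up_channels_list.reverse()   # up-bottom order: [256, 128, 64, 32]
up_channels_list.pop(0)      # the deepest stage feeds the decoder, so it is not an output width
print(up_channels_list)      # [128, 64, 32]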

__init__(skip_channels_list, up_block_repeat_list, skip_expansion, decoder_scale, up_block_types, is_skip_list, min_decoder_channels=1, **up_block_kwargs)

Parameters:

Name Type Description Default
skip_channels_list List[int]

num_channels list of skip feature maps from the encoder.

required
up_block_repeat_list List[int]

num_repeats arg list to be passed to the UpFuseBlocks.

required
skip_expansion float

skip expansion ratio value; before fusing the skip features from the encoder with the decoder features, a projection convolution is applied upon the encoder features to project the num_channels by skip_expansion as follows: `num_channels = skip_channels * skip_expansion`

required
decoder_scale float

num_channels width ratio between encoder stages and decoder stages.

required
min_decoder_channels int

The minimum num_channels of decoder stages. Useful e.g. if we want to keep the width above the number of classes. The num_channels of a decoder stage is determined as follows: decoder_channels = max(encoder_channels * decoder_scale, min_decoder_channels)

1
up_block_types List[Type[AbstractUpFuseBlock]]

list of AbstractUpFuseBlock.

required
is_skip_list List[bool]

List of flags whether to use the feature-map from an encoder stage as a skip connection or not. Used to avoid applying projection convolutions if a certain encoder feature is not aggregated with the decoder.

required
up_block_kwargs

init parameters for fuse blocks.

{}
Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 164-214)
@resolve_param("up_block_types", ListFactory(TypeFactory(UP_FUSE_BLOCKS)))
def __init__(
    self,
    skip_channels_list: List[int],
    up_block_repeat_list: List[int],
    skip_expansion: float,
    decoder_scale: float,
    up_block_types: List[Type[AbstractUpFuseBlock]],
    is_skip_list: List[bool],
    min_decoder_channels: int = 1,
    **up_block_kwargs,
):
    """

    :param skip_channels_list: num_channels list of skip feature maps from the encoder.
    :param up_block_repeat_list: `num_repeats` arg list to be passed to the UpFuseBlocks.
    :param skip_expansion: skip expansion ratio value, before fusing the skip features from the encoder with the
        decoder features, a projection convolution is applied upon the encoder features to project the num_channels
        by skip_expansion as follows: `num_channels = skip_channels * skip_expansion
    :param decoder_scale: num_channels width ratio between encoder stages and decoder stages.
    :param min_decoder_channels: The minimum num_channels of decoder stages. Useful i.e if we want to keep the width
        above the num of classes. The num_channels of a decoder stage is determined as follows:
            `decoder_channels = max(encoder_channels * decoder_scale, min_decoder_channels)`
    :param up_block_types: list of AbstractUpFuseBlock.
    :param is_skip_list: List of flags whether to use feature-map from encoder stage as skip connection or not. Used
        to not apply projection convolutions if a certain encoder feature is not aggregate with the decoder.
    :param up_block_kwargs: init parameters for fuse blocks.
    """
    super().__init__()
    # num_channels list after encoder features projections.
    self.up_channels_list = [max(int(ch * decoder_scale), min_decoder_channels) for ch in skip_channels_list]
    # Reverse order to up-bottom order, i.e [stage4_ch, stage3_ch, ... , stage1_ch]
    self.up_channels_list.reverse()
    # Remove last stage num_channels, as it is the input to the decoder.
    self.up_channels_list.pop(0)

    is_skip_list.reverse()
    is_skip_list += [False]

    self.projection_blocks, skip_channels_list = self._make_skip_projection(skip_channels_list, skip_expansion, is_skip_list, min_decoder_channels)
    skip_channels_list = skip_channels_list.copy()
    skip_channels_list.reverse()

    self.up_stages = nn.ModuleList()
    in_channels = skip_channels_list.pop(0)
    skip_channels_list.append(None)
    for i in range(len(up_block_types)):
        self.up_stages.append(
            up_block_types[i](in_channels, skip_channels_list[i], self.up_channels_list[i], num_repeats=up_block_repeat_list[i], **up_block_kwargs)
        )
        in_channels = self.up_channels_list[i]

UpCatBlock

Bases: AbstractUpFuseBlock

Fuse features with concatenation followed by convolutions.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 91-122)
@register_unet_up_block()
class UpCatBlock(AbstractUpFuseBlock):
    """
    Fuse features with concatenation followed by convolutions.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        mode: Union[UpsampleMode, str],
        num_repeats: int,
        fallback_mode: Optional[Union[UpsampleMode, str]] = None,
        **kwargs,
    ):
        super().__init__(in_channels=in_channels, skip_channels=skip_channels, out_channels=out_channels)

        mode, up_out_channels = self.validate_upsample_mode(in_channels, up_factor=up_factor, upsample_mode=mode, fallback_mode=fallback_mode)

        self.up_path = make_upsample_module(scale_factor=up_factor, upsample_mode=mode, align_corners=False)

        self.last_convs = nn.Sequential(
            ConvBNReLU(up_out_channels + skip_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.Sequential(*[ConvBNReLU(out_channels, out_channels, kernel_size=3, padding=1, bias=False) for _ in range(num_repeats - 1)]),
        )

    def forward(self, x, skip):
        x = self.up_path(x)
        x = torch.cat([x, skip], dim=1)
        return self.last_convs(x)
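
A minimal shape sketch for UpCatBlock. The import path follows the "Source code in" header above, and the "bilinear" mode string is assumed to be a supported upsample mode; the channel and resolution values are arbitrary.

import torch
from super_gradients.training.models.segmentation_models.unet.unet_decoder import UpCatBlock

block = UpCatBlock(in_channels=128, skip_channels=64, out_channels=64, up_factor=2, mode="bilinear", num_repeats=2)
x = torch.randn(1, 128, 16, 16)    # low-resolution decoder features
skip = torch.randn(1, 64, 32, 32)  # higher-resolution encoder skip features
print(block(x, skip).shape)        # expected: torch.Size([1, 64, 32, 32])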

UpFactorBlock

Bases: AbstractUpFuseBlock

Ignore skip features; simply apply upsampling and ConvBNReLU layers.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 59-88)
@register_unet_up_block()
class UpFactorBlock(AbstractUpFuseBlock):
    """
    Ignore skip features; simply apply upsampling and ConvBNReLU layers.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        mode: Union[UpsampleMode, str],
        num_repeats: int,
        fallback_mode: Optional[Union[UpsampleMode, str]] = None,
        **kwargs,
    ):
        super().__init__(in_channels=in_channels, skip_channels=0, out_channels=out_channels)

        mode, up_out_channels = self.validate_upsample_mode(in_channels, up_factor=up_factor, upsample_mode=mode, fallback_mode=fallback_mode)
        self.up_path = make_upsample_module(scale_factor=up_factor, upsample_mode=mode, align_corners=False)

        self.last_convs = nn.Sequential(
            ConvBNReLU(up_out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.Sequential(*[ConvBNReLU(out_channels, out_channels, kernel_size=3, padding=1, bias=False) for _ in range(num_repeats - 1)]),
        )

    def forward(self, x, skip):
        x = self.up_path(x)
        return self.last_convs(x)

UpSumBlock

Bases: AbstractUpFuseBlock

Fuse features with summation followed by convolutions.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 125-160)
@register_unet_up_block()
class UpSumBlock(AbstractUpFuseBlock):
    """
    Fuse features with summation followed by convolutions.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        mode: Union[UpsampleMode, str],
        num_repeats: int,
        fallback_mode: Optional[Union[UpsampleMode, str]] = None,
        **kwargs,
    ):
        super().__init__(in_channels=in_channels, skip_channels=skip_channels, out_channels=out_channels)
        mode, up_out_channels = self.validate_upsample_mode(in_channels, up_factor=up_factor, upsample_mode=mode, fallback_mode=fallback_mode)

        self.up_path = make_upsample_module(scale_factor=up_factor, upsample_mode=mode, align_corners=False)

        self.proj_conv = (
            Residual() if skip_channels == up_out_channels else ConvBNReLU(skip_channels, up_out_channels, kernel_size=1, bias=False, use_activation=False)
        )

        self.last_convs = nn.Sequential(
            ConvBNReLU(up_out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.Sequential(*[ConvBNReLU(out_channels, out_channels, kernel_size=3, padding=1, bias=False) for _ in range(num_repeats - 1)]),
        )

    def forward(self, x, skip):
        skip = self.proj_conv(skip)
        x = self.up_path(x)
        x = x + skip
        return self.last_convs(x)

AbstractUNetBackbone

Bases: nn.Module, ABC

All backbones for UNet segmentation models must implement this class.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 29-52)
class AbstractUNetBackbone(nn.Module, ABC):
    """
    All backbones for UNet segmentation models must implement this class.
    """

    @abstractmethod
    def get_backbone_output_number_of_channels(self) -> List[int]:
        """
        :return: list of stages num channels.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_all_number_of_channels(self) -> List[int]:
        """
        :return: list of stages num channels.
        """
        raise NotImplementedError()

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """
        :return: list of skip features from different resolutions to be fused by the decoder.
        """
        raise NotImplementedError()

forward(x)

Returns:

Type Description
List[torch.Tensor]

list of skip features from different resolutions to be fused by the decoder.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 48-52)
def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
    """
    :return: list of skip features from different resolutions to be fused by the decoder.
    """
    raise NotImplementedError()

get_all_number_of_channels() abstractmethod

Returns:

Type Description
List[int]

list of stages num channels.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 41-46)
@abstractmethod
def get_all_number_of_channels(self) -> List[int]:
    """
    :return: list of stages num channels.
    """
    raise NotImplementedError()

get_backbone_output_number_of_channels() abstractmethod

Returns:

Type Description
List[int]

list of stages num channels.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 34-39)
@abstractmethod
def get_backbone_output_number_of_channels(self) -> List[int]:
    """
    :return: list of stages num channels.
    """
    raise NotImplementedError()

BackboneStage

Bases: nn.Module, ABC

BackboneStage abstract class to define a stage in UnetBackbone. Each stage is built from blocks, whose number is defined by num_blocks.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
class BackboneStage(nn.Module, ABC):
    """
    BackboneStage abstract class to define a stage in UnetBackbone. Each stage is built from blocks, whose number is
    defined by `num_blocks`.
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, **kwargs):
        super().__init__()
        self.blocks = self.build_stage(in_channels, out_channels, stride=stride, num_blocks=num_blocks, **kwargs)

    @abstractmethod
    def build_stage(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, **kwargs) -> nn.Sequential:
        raise NotImplementedError()

    def forward(self, x):
        return self.blocks(x)

ConvBaseStage

Bases: BackboneStage, ABC

Base implementation for stages built from a single conv block type, such as the Conv, QARepVGG, and RepVGG stages. Optionally supports different downsample strategies: anti_alias with the AntiAliasDownsample module and max_pool with the nn.MaxPool2d module.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
class ConvBaseStage(BackboneStage, ABC):
    """
    Base implementation for stages built from a single conv block type, such as the Conv, QARepVGG, and RepVGG stages.
    Optionally supports different downsample strategies: `anti_alias` with the `AntiAliasDownsample` module and
    `max_pool` with the `nn.MaxPool2d` module.
    """

    def build_stage(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        num_blocks: int,
        anti_alias: Optional[bool] = None,
        downsample_mode: Optional[Union[str, DownSampleMode]] = None,
        **kwargs,
    ):
        blocks = []
        # Init down-sample module
        if anti_alias is not None:  # captures `False` and `True`
            logger.warning("`anti_alias` argument is deprecated and will be removed in future versions. Please set `downsample_mode='anti_alias'` instead.")
        if anti_alias:
            if downsample_mode is not None:
                raise ValueError(f"Only one argument should set as downsample_mode found: anti_alias: `True`," f" and downsample_mode: {downsample_mode}.")
            downsample_mode = DownSampleMode.ANTI_ALIAS

        if downsample_mode is not None and stride == 2:
            blocks.append(make_downsample_module(in_channels, stride=stride, downsample_mode=downsample_mode))
            stride = 1

        # Conv blocks built by the concrete subclass via build_conv_block()
        blocks.extend(
            [
                self.build_conv_block(in_channels, out_channels, stride=stride),
                *[self.build_conv_block(out_channels, out_channels, stride=1) for _ in range(num_blocks - 1)],
            ]
        )
        return nn.Sequential(*blocks)

    @abstractmethod
    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        raise NotImplementedError()

ConvStage

Bases: ConvBaseStage

Conv stage with ConvBNReLU as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class ConvStage(ConvBaseStage):
    """
    Conv stage with ConvBNReLU as building block.
    """

    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        return ConvBNReLU(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
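
A short, hedged usage sketch of a conv stage built through build_stage above; the values are illustrative, and passing downsample_mode as a plain string is an assumption about how make_downsample_module resolves it.

import torch

# Hypothetical usage; whether downsample_mode accepts the plain string
# "anti_alias" or requires the DownSampleMode enum depends on how
# make_downsample_module resolves it, so treat the argument as a sketch.
stage = ConvStage(in_channels=32, out_channels=64, stride=2, num_blocks=3,
                  downsample_mode="anti_alias")
y = stage(torch.randn(1, 32, 64, 64))
print(y.shape)  # expected: torch.Size([1, 64, 32, 32])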

Encoder

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
class Encoder(nn.Module):
    def __init__(self, backbone: AbstractUNetBackbone, context_module: Optional[nn.Module]):
        super().__init__()
        self.backbone = backbone
        self.context_module = nn.Identity() if context_module is None else context_module

    def forward(self, x):
        feats = self.backbone(x)
        feats[-1] = self.context_module(feats[-1])
        return feats

    def get_output_number_of_channels(self) -> List[int]:
        """
        Return the list of encoder output channels: the backbone output channels, where the last entry is replaced by
        the context module output channels when the context module returns a different number of channels.
        """
        channels_list = self.backbone.get_backbone_output_number_of_channels()
        if hasattr(self.context_module, "out_channels") and self.context_module.out_channels is not None:
            channels_list[-1] = self.context_module.out_channels
        return channels_list

    def get_all_number_of_channels(self) -> List[int]:
        channels_list = self.backbone.get_all_number_of_channels()
        if hasattr(self.context_module, "output_channels"):
            channels_list[-1] = self.context_module.output_channels()
        return channels_list
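
The sketch below wires the hypothetical TinyBackbone from the earlier example into an Encoder without a context module; it is illustrative only.

import torch

# Hypothetical composition reusing the TinyBackbone sketched earlier; passing
# context_module=None makes the Encoder wrap it with nn.Identity.
encoder = Encoder(backbone=TinyBackbone(), context_module=None)
feats = encoder(torch.randn(1, 3, 256, 256))
print([f.shape[1] for f in feats])              # per-stage channel counts, e.g. [16, 32, 64]
print(encoder.get_output_number_of_channels())  # [16, 32, 64] for this toy backbone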

get_output_number_of_channels()

Return the list of encoder output channels: the backbone output channels, where the last entry is replaced by the context module output channels when the context module returns a different number of channels.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
def get_output_number_of_channels(self) -> List[int]:
    """
    Return the list of encoder output channels: the backbone output channels, where the last entry is replaced by
    the context module output channels when the context module returns a different number of channels.
    """
    channels_list = self.backbone.get_backbone_output_number_of_channels()
    if hasattr(self.context_module, "out_channels") and self.context_module.out_channels is not None:
        channels_list[-1] = self.context_module.out_channels
    return channels_list

QARepVGGStage

Bases: ConvBaseStage

QARepVGG stage with QARepVGGBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class QARepVGGStage(ConvBaseStage):
    """
    QARepVGG stage with QARepVGGBlock as building block.
    """

    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        return QARepVGGBlock(in_channels, out_channels, stride=stride, use_residual_connection=(out_channels == in_channels and stride == 1))

RegnetXStage

Bases: BackboneStage

RegNetX stage with XBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class RegnetXStage(BackboneStage):
    """
    RegNetX stage with XBlock as building block.
    """

    def build_stage(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        num_blocks: int,
        bottleneck_ratio: float,
        group_width: int,
        se_ratio: float,
        droppath_prob: float,
        **kwargs,
    ):
        group_width = self._get_divisable_group_width(out_channels, bottleneck_ratio, group_width)
        return nn.Sequential(
            XBlock(in_channels, out_channels, bottleneck_ratio, group_width, stride, se_ratio, droppath_prob),
            *[XBlock(out_channels, out_channels, bottleneck_ratio, group_width, 1, se_ratio, droppath_prob) for _ in range(num_blocks - 1)],
        )

    @staticmethod
    def _get_divisable_group_width(channels: int, bottleneck_ratio: float, group_width: int) -> int:
        """
        Returns a valid value for group_width when the intermediate channels aren't a multiple of group_width.
        """
        inter_channels = channels // bottleneck_ratio
        # if group_width is higher than the Conv channels, fallback to a regular Conv with group_width = channels.
        if group_width > inter_channels:
            return inter_channels
        group_pow = int(math.log2(group_width))
        for pow in range(group_pow, -1, -1):
            if (inter_channels / 2**pow) % 1 == 0:
                return int(2**pow)
        return 1
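
To make the fallback concrete, here is a small worked example with invented values; it only calls the static helper and does not build a stage.

# Worked example of the group-width fallback (values are invented):
#   channels=96, bottleneck_ratio=1 -> inter_channels = 96
#   requested group_width=48        -> int(log2(48)) = 5, so try 2**5 = 32
#   96 is divisible by 32           -> the stage uses group_width = 32
print(RegnetXStage._get_divisable_group_width(channels=96, bottleneck_ratio=1, group_width=48))  # -> 32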

RepVGGStage

Bases: ConvBaseStage

RepVGG stage with RepVGGBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class RepVGGStage(ConvBaseStage):
    """
    RepVGG stage with RepVGGBlock as building block.
    """

    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        return RepVGGBlock(in_channels, out_channels, stride=stride)

STDCStage

Bases: BackboneStage

STDC stage with STDCBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class STDCStage(BackboneStage):
    """
    STDC stage with STDCBlock as building block.
    """

    def build_stage(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, steps: int, stdc_downsample_mode: str, **kwargs):
        """
        :param steps: The total number of convs in this module: one 1x1 conv and (steps - 1) 3x3 convs.
        :param stdc_downsample_mode: downsample mode in the STDC block; supported modes are `avg_pool` for
         average pooling and `dw_conv` for depthwise convolution.
        :return:
        """
        self.assert_divisible_channels(out_channels, steps)
        blocks = []
        # STDC blocks
        blocks.extend(
            [
                STDCBlock(in_channels, out_channels, stride=stride, steps=steps, stdc_downsample_mode=stdc_downsample_mode),
                *[STDCBlock(out_channels, out_channels, stride=1, steps=steps, stdc_downsample_mode=stdc_downsample_mode) for _ in range(num_blocks - 1)],
            ]
        )
        return nn.Sequential(*blocks)

    @staticmethod
    def assert_divisible_channels(num_channels: int, steps: int):
        """
        The STDC block refactors the convolution operator by applying several smaller convolutions whose number of
        filters decreases with the number of steps. The ratio between the stage channels and the smallest number of
        channels is `2 ** (steps - 1)`, so this method asserts that the stage number of channels is divisible by that ratio.
        """
        channels_ratio = 2 ** (steps - 1)
        if num_channels % channels_ratio != 0:
            raise AssertionError(
                f"Num channels: {num_channels}, isn't divisible by the channels width ratio:"
                f" {channels_ratio}, when initiating an STDC block with steps: {steps}"
            )
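
A quick, hedged illustration of the channel constraint described above, using invented channel counts.

# Illustrative check of the divisibility rule: with steps=4 the ratio is
# 2 ** (4 - 1) = 8, so 64 channels pass while 60 channels raise.
STDCStage.assert_divisible_channels(num_channels=64, steps=4)  # passes silently
try:
    STDCStage.assert_divisible_channels(num_channels=60, steps=4)
except AssertionError as error:
    print(error)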

assert_divisible_channels(num_channels, steps) staticmethod

The STDC block refactors the convolution operator by applying several smaller convolutions whose number of filters decreases with the number of steps. The ratio between the stage channels and the smallest number of channels is 2 ** (steps - 1), so this method asserts that the stage number of channels is divisible by that ratio.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@staticmethod
def assert_divisible_channels(num_channels: int, steps: int):
    """
    The STDC block refactors the convolution operator by applying several smaller convolutions whose number of
    filters decreases with the number of steps. The ratio between the stage channels and the smallest number of
    channels is `2 ** (steps - 1)`, so this method asserts that the stage number of channels is divisible by that ratio.
    """
    channels_ratio = 2 ** (steps - 1)
    if num_channels % channels_ratio != 0:
        raise AssertionError(
            f"Num channels: {num_channels}, isn't divisible by the channels width ratio:"
            f" {channels_ratio}, when initiating an STDC block with steps: {steps}"
        )

build_stage(in_channels, out_channels, stride, num_blocks, steps, stdc_downsample_mode, **kwargs)

Parameters:

Name Type Description Default
steps int

The total number of convs in this module: one 1x1 conv and (steps - 1) 3x3 convs.

required
stdc_downsample_mode str

Downsample mode in the STDC block; supported modes are avg_pool for average pooling and dw_conv for depthwise convolution.

required

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
def build_stage(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, steps: int, stdc_downsample_mode: str, **kwargs):
    """
    :param steps: The total number of convs in this module: one 1x1 conv and (steps - 1) 3x3 convs.
    :param stdc_downsample_mode: downsample mode in the STDC block; supported modes are `avg_pool` for
     average pooling and `dw_conv` for depthwise convolution.
    :return:
    """
    self.assert_divisible_channels(out_channels, steps)
    blocks = []
    # STDC blocks
    blocks.extend(
        [
            STDCBlock(in_channels, out_channels, stride=stride, steps=steps, stdc_downsample_mode=stdc_downsample_mode),
            *[STDCBlock(out_channels, out_channels, stride=1, steps=steps, stdc_downsample_mode=stdc_downsample_mode) for _ in range(num_blocks - 1)],
        ]
    )
    return nn.Sequential(*blocks)

SgModule

Bases: nn.Module, SupportsReplaceInputChannels, SupportsFineTune

Source code in src/super_gradients/training/models/sg_module.py
class SgModule(nn.Module, SupportsReplaceInputChannels, SupportsFineTune):
    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """

        :return: list of dictionaries containing the key 'named_params' with a list of named params
        """
        return [{"named_params": self.named_parameters()}]

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        """

        :param param_groups: list of dictionaries containing the params
        :return: list of dictionaries containing the params
        """
        for param_group in param_groups:
            param_group["lr"] = lr
        return param_groups

    def get_include_attributes(self) -> list:
        """
        This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
        are updated to the EMA model along with the model weights.
        By default, all attributes are updated except for private attributes (starting with '_')
        You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
        you override the default behaviour and only attributes named in this list will be updated.
        Note: This will also override the get_exclude_attributes list.
            :return: list of attributes to update from main model to EMA model
        """
        return []

    def get_exclude_attributes(self) -> list:
        """
        This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
        are updated to the EMA model along with the model weights.
        By default, all attributes are updated except for private attributes (starting with '_')
        You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
        you override the default behaviour and attributes named in this list will also be excluded from update.
        Note: if get_include_attributes is not empty, it will override this list.
            :return: list of attributes to not update from main model to EMA model
        """
        return []

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """

    def replace_head(self, **kwargs):
        """
        Replace final layer for pretrained models. Since this varies between architectures, we leave it to the inheriting
        class to implement.
        """

        raise NotImplementedError

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        """
        Returns a dictionary mapping lr to the unfrozen part of the network, in the same fashion as using initial_lr in training_params
         when calling Trainer.train().
        For example:
            def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
                return {"default": 0, "head": lr}

        :param lr: float, learning rate for the part of the network to be tuned.
        :return: learning rate mapping that can be used by
         super_gradients.training.utils.optimizer_utils.initialize_param_groups
        """
        raise NotImplementedError("Finetune is not implemented for this model, it is required to implement get_finetune_lr_dict.")

get_exclude_attributes()

This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training) are updated to the EMA model along with the model weights. By default, all attributes are updated except for private attributes (starting with '_'). You can either set include_attributes or exclude_attributes. By returning a non-empty list from this function, you override the default behaviour, and attributes named in this list will also be excluded from the update. Note: if get_include_attributes is not empty, it will override this list.

Returns:

Type Description
list

list of attributes to not update from the main model to the EMA model

Source code in src/super_gradients/training/models/sg_module.py
def get_exclude_attributes(self) -> list:
    """
    This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
    are updated to the EMA model along with the model weights.
    By default, all attributes are updated except for private attributes (starting with '_')
    You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
    you override the default behaviour and attributes named in this list will also be excluded from update.
    Note: if get_include_attributes is not empty, it will override this list.
        :return: list of attributes to not update from main model to EMA model
    """
    return []

get_finetune_lr_dict(lr)

Returns a dictionary mapping lr to the unfrozen part of the network, in the same fashion as using initial_lr in training_params when calling Trainer.train(). For example:

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"default": 0, "head": lr}

Parameters:

Name Type Description Default
lr float

float, learning rate for the part of the network to be tuned.

required

Returns:

Type Description
Dict[str, float]

learning rate mapping that can be used by super_gradients.training.utils.optimizer_utils.initialize_param_groups

Source code in src/super_gradients/training/models/sg_module.py
def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
    """
    Returns a dictionary mapping lr to the unfrozen part of the network, in the same fashion as using initial_lr in training_params
     when calling Trainer.train().
    For example:
        def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
            return {"default": 0, "head": lr}

    :param lr: float, learning rate for the part of the network to be tuned.
    :return: learning rate mapping that can be used by
     super_gradients.training.utils.optimizer_utils.initialize_param_groups
    """
    raise NotImplementedError("Finetune is not implemented for this model, it is required to implement get_finetune_lr_dict.")

get_include_attributes()

This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training) are updated to the EMA model along with the model weights. By default, all attributes are updated except for private attributes (starting with '_'). You can either set include_attributes or exclude_attributes. By returning a non-empty list from this function, you override the default behaviour and only attributes named in this list will be updated. Note: This will also override the get_exclude_attributes list.

Returns:

Type Description
list

list of attributes to update from the main model to the EMA model

Source code in src/super_gradients/training/models/sg_module.py
def get_include_attributes(self) -> list:
    """
    This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
    are updated to the EMA model along with the model weights.
    By default, all attributes are updated except for private attributes (starting with '_')
    You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
    you override the default behaviour and only attributes named in this list will be updated.
    Note: This will also override the get_exclude_attributes list.
        :return: list of attributes to update from main model to EMA model
    """
    return []

initialize_param_groups(lr, training_params)

Returns:

Type Description
list

list of dictionaries containing the key 'named_params' with a list of named params

Source code in src/super_gradients/training/models/sg_module.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """

    :return: list of dictionaries containing the key 'named_params' with a list of named params
    """
    return [{"named_params": self.named_parameters()}]

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules with convertible substitutes and remove all auxiliary or training related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in src/super_gradients/training/models/sg_module.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """

replace_head(**kwargs)

Replace final layer for pretrained models. Since this varies between architectures, we leave it to the inheriting class to implement.

Source code in src/super_gradients/training/models/sg_module.py
def replace_head(self, **kwargs):
    """
    Replace final layer for pretrained models. Since this varies between architectures, we leave it to the inheriting
    class to implement.
    """

    raise NotImplementedError

update_param_groups(param_groups, lr, epoch, iter, training_params, total_batch)

Parameters:

Name Type Description Default
param_groups list

list of dictionaries containing the params

required

Returns:

Type Description
list

list of dictionaries containing the params

Source code in src/super_gradients/training/models/sg_module.py
def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
    """

    :param param_groups: list of dictionaries containing the params
    :return: list of dictionaries containing the params
    """
    for param_group in param_groups:
        param_group["lr"] = lr
    return param_groups