Models

get_arch_params(config_name, overriding_params=None, recipes_dir_path=None)

Creates an arch-params dictionary, taking defaults from the YAML files in src/super_gradients/recipes/arch_params.

Parameters:

- config_name (str, required): Name of the yaml to load (e.g. "resnet18_cifar_arch_params").
- overriding_params (Dict, default None): Dictionary-like object containing entries to override.
- recipes_dir_path (Optional[str], default None): Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include an "arch_params" folder, which itself should include the config file named after config_name.

Source code in src/super_gradients/training/models/arch_params_factory.py
def get_arch_params(config_name: str, overriding_params: Dict = None, recipes_dir_path: Optional[str] = None) -> DictConfig:
    """
    Class for creating arch parameters dictionary, taking defaults from yaml
     files in src/super_gradients/recipes/arch_params.

    :param config_name:         Name of the yaml to load (e.g. "resnet18_cifar_arch_params")
    :param overriding_params: Dict, dictionary like object containing entries to override.
    :param recipes_dir_path:    Optional. Main directory where every recipe are stored. (e.g. ../super_gradients/recipes)
                                This directory should include a "arch_params" folder,
                                which itself should include the config file named after config_name.
    """
    overriding_params = overriding_params if overriding_params else dict()

    arch_params = load_arch_params(config_name=config_name, recipes_dir_path=recipes_dir_path)
    arch_params = hydra.utils.instantiate(arch_params)

    arch_params.update(**overriding_params)

    return arch_params
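
For example, a minimal call might look like the sketch below. It assumes get_arch_params is re-exported from super_gradients.training.models (otherwise import it from super_gradients.training.models.arch_params_factory), and the "num_classes" override key is shown purely for illustration:

from super_gradients.training.models import get_arch_params

# Load the default ResNet-18 CIFAR architecture parameters shipped with the library
# and override a single entry (use whatever keys the chosen YAML actually defines).
arch_params = get_arch_params("resnet18_cifar_arch_params", overriding_params={"num_classes": 10})
print(arch_params)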

BaseClassifier

Bases: SgModule, HasPredict

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
class BaseClassifier(SgModule, HasPredict):
    def __init__(
        self,
    ):
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        super(BaseClassifier, self).__init__()

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(self, class_names: Optional[List[str]] = None, image_processor: Optional[Processing] = None) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(self, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True) -> ClassificationPipeline:
        """Instantiate the prediction pipeline of this model.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16: If True, use mixed precision for inference.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        if skip_image_resizing:
            raise ValueError("`skip_image_resizing` is not supported for classification models.")

        pipeline = ClassificationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> ImagesClassificationPrediction:
        """Predict an image or a list of images.

        :param images:      Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16: If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True) -> None:
        """Predict using webcam.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16: If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        pipeline.predict_webcam()
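
A typical prediction workflow with a BaseClassifier subclass is sketched below. The model name, weights, and image path are placeholders; models loaded through models.get() with pretrained weights typically already carry their processing params, so the explicit set_dataset_processing_params call is only needed for custom checkpoints:

from super_gradients.training import models
from super_gradients.common.object_names import Models

# Placeholder model / weights: any classification architecture registered in SG works the same way.
model = models.get(Models.RESNET18, pretrained_weights="imagenet")

# For a custom checkpoint, register the preprocessing yourself first, e.g.:
# model.set_dataset_processing_params(class_names=["cat", "dog"], image_processor=my_processor)

predictions = model.predict("path/to/image.jpg", batch_size=32, fp16=True)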

predict(images, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

- images (ImageSource, required): Images to predict.
- batch_size (int, default 32): Maximum number of images to process at the same time.
- fuse_model (bool, default True): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
- skip_image_resizing (bool, default False): If True, the image processor will not resize the images.
- fp16 (bool, default True): If True, use mixed precision for inference.

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
def predict(
    self,
    images: ImageSource,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> ImagesClassificationPrediction:
    """Predict an image or a list of images.

    :param images:      Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16: If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

- fuse_model (bool, default True): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
- skip_image_resizing (bool, default False): If True, the image processor will not resize the images.
- fp16 (bool, default True): If True, use mixed precision for inference.

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
def predict_webcam(self, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True) -> None:
    """Predict using webcam.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16: If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    pipeline.predict_webcam()

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

- class_names (Optional[List[str]], default None): Names of the dataset the model was trained on.
- image_processor (Optional[Processing], default None): Image processing objects to reproduce the dataset preprocessing used for training.

Source code in src/super_gradients/training/models/classification_models/base_classifer.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(self, class_names: Optional[List[str]] = None, image_processor: Optional[Processing] = None) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor

BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)

Model from official source: https://github.com/microsoft/unilm/tree/master/beit

At this point, only the 1k fine-tuned classification weights and model configs have been added; see the original source above for pre-training models and procedure.

Modifications by / Copyright 2021 Ross Wightman, original copyrights below

Beit

Bases: BaseClassifier

Vision Transformer with support for patch or hybrid CNN input stage

Source code in src/super_gradients/training/models/classification_models/beit.py
class Beit(BaseClassifier):
    """Vision Transformer with support for patch or hybrid CNN input stage"""

    def __init__(
        self,
        image_size=(224, 224),
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        global_pool="avg",
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        use_abs_pos_emb=True,
        use_rel_pos_bias=False,
        use_shared_rel_pos_bias=False,
        head_init_scale=0.001,
        **kwargs,
    ):
        super().__init__()
        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.grad_checkpointing = False

        self.image_size = image_size
        self.patch_size = patch_size
        self.patch_embed = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_chans, hidden_dim=self.embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None
        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.grid_size, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    init_values=init_values,
                    window_size=self.patch_embed.grid_size if use_rel_pos_bias else None,
                )
                for i in range(depth)
            ]
        )
        use_fc_norm = self.global_pool == "avg"
        self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else None
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)
        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=0.02)
        trunc_normal_(self.cls_token, std=0.02)
        # trunc_normal_(self.mask_token, std=.02)
        self.fix_init_weight()
        if isinstance(self.head, nn.Linear):
            trunc_normal_(self.head.weight, std=0.02)
            self.head.weight.data.mul_(head_init_scale)
            self.head.bias.data.mul_(head_init_scale)

    def fix_init_weight(self):
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        nwd = {"pos_embed", "cls_token"}
        for n, _ in self.named_parameters():
            if "relative_position_bias_table" in n:
                nwd.add(n)
        return nwd

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        matcher = dict(
            stem=r"^cls_token|pos_embed|patch_embed|rel_pos_bias",  # stem and embed
            blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
        )
        return matcher

    @torch.jit.ignore
    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=None):
        self.num_classes = num_classes
        if global_pool is not None:
            self.global_pool = global_pool
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
            else:
                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
        x = self.norm(x)
        return x

    def forward_head(self, x, pre_logits: bool = False):
        if self.fc_norm is not None:
            x = x[:, 1:].mean(dim=1)
            x = self.fc_norm(x)
        else:
            x = x[:, 0]
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head = nn.Linear(self.head.in_features, new_num_classes)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.patch_embed = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.embed_dim)

    def get_input_channels(self) -> int:
        return self.patch_embed.get_input_channels()

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"head": lr, "default": 0.0}

Mlp

Bases: nn.Module

MLP as used in Vision Transformer, MLP-Mixer and related networks

Source code in src/super_gradients/training/models/classification_models/beit.py
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x

trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0)

Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution $\mathcal{N}(\text{mean}, \text{std}^2)$ with values outside $[a, b]$ redrawn until they are within the bounds. The method used for generating the random values works best when $a \leq \text{mean} \leq b$.

Parameters:

- tensor (required): an n-dimensional torch.Tensor
- mean (default 0.0): the mean of the normal distribution
- std (default 1.0): the standard deviation of the normal distribution
- a (default -2.0): the minimum cutoff value
- b (default 2.0): the maximum cutoff value

Examples:

    >>> w = torch.empty(3, 5)
    >>> nn.init.trunc_normal_(w)

Source code in src/super_gradients/training/models/classification_models/beit.py
def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    # type: (Tensor, float, float, float, float) -> Tensor
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    :param tensor: an n-dimensional `torch.Tensor`
    :param mean: the mean of the normal distribution
    :param std: the standard deviation of the normal distribution
    :param a: the minimum cutoff value
    :param b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)

DenseNet

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/densenet.py
class DenseNet(BaseClassifier):
    def __init__(self, growth_rate: int, structure: list, num_init_features: int, bn_size: int, drop_rate: float, num_classes: int, in_channels: int = 3):
        """
        :param growth_rate:         number of filter to add each layer (noted as 'k' in the paper)
        :param structure:           how many layers in each pooling block - sequentially
        :param num_init_features:   the number of filters to learn in the first convolutional layer
        :param bn_size:             multiplicative factor for the number of bottle neck layers
                                        (i.e. bn_size * k featurs in the bottleneck)
        :param drop_rate:           dropout rate after each dense layer
        :param num_classes:         number of classes in the classification task
        :param in_channels:         number of channels in the input image
        """
        super(DenseNet, self).__init__()

        # First convolution
        self.features = nn.Sequential(
            OrderedDict(
                [
                    ("conv0", nn.Conv2d(in_channels, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                    ("norm0", nn.BatchNorm2d(num_init_features)),
                    ("relu0", nn.ReLU(inplace=True)),
                    ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
                ]
            )
        )

        # Each denseblock
        num_features = num_init_features
        for i, num_layers in enumerate(structure):
            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            self.features.add_module("denseblock%d" % (i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != len(structure) - 1:
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
                self.features.add_module("transition%d" % (i + 1), trans)
                num_features = num_features // 2

        # Final batch norm
        self.features.add_module("norm5", nn.BatchNorm2d(num_features))

        # Linear layer
        self.classifier = nn.Linear(num_features, num_classes)

        # Official init from torch repo.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = torch.flatten(out, 1)
        out = self.classifier(out)
        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.features[0] = replace_conv2d_input_channels(conv=self.features[0], in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.features[0].in_channels
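
For illustration, the structure argument lists the number of dense layers per block; the values below follow the common DenseNet-121 layout (an assumption here), with a placeholder class count:

import torch
from super_gradients.training.models.classification_models.densenet import DenseNet

# DenseNet-121-style configuration: growth rate k=32, blocks of 6/12/24/16 layers.
model = DenseNet(
    growth_rate=32,
    structure=[6, 12, 24, 16],
    num_init_features=64,
    bn_size=4,
    drop_rate=0.0,
    num_classes=10,  # placeholder number of classes
)

logits = model(torch.randn(1, 3, 224, 224))  # shape: (1, 10)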

__init__(growth_rate, structure, num_init_features, bn_size, drop_rate, num_classes, in_channels=3)

Parameters:

- growth_rate (int, required): number of filters to add per layer (noted as 'k' in the paper).
- structure (list, required): how many layers in each pooling block, sequentially.
- num_init_features (int, required): the number of filters to learn in the first convolutional layer.
- bn_size (int, required): multiplicative factor for the number of bottleneck layers (i.e. bn_size * k features in the bottleneck).
- drop_rate (float, required): dropout rate after each dense layer.
- num_classes (int, required): number of classes in the classification task.
- in_channels (int, default 3): number of channels in the input image.

Source code in src/super_gradients/training/models/classification_models/densenet.py
def __init__(self, growth_rate: int, structure: list, num_init_features: int, bn_size: int, drop_rate: float, num_classes: int, in_channels: int = 3):
    """
    :param growth_rate:         number of filter to add each layer (noted as 'k' in the paper)
    :param structure:           how many layers in each pooling block - sequentially
    :param num_init_features:   the number of filters to learn in the first convolutional layer
    :param bn_size:             multiplicative factor for the number of bottle neck layers
                                    (i.e. bn_size * k featurs in the bottleneck)
    :param drop_rate:           dropout rate after each dense layer
    :param num_classes:         number of classes in the classification task
    :param in_channels:         number of channels in the input image
    """
    super(DenseNet, self).__init__()

    # First convolution
    self.features = nn.Sequential(
        OrderedDict(
            [
                ("conv0", nn.Conv2d(in_channels, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                ("norm0", nn.BatchNorm2d(num_init_features)),
                ("relu0", nn.ReLU(inplace=True)),
                ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
            ]
        )
    )

    # Each denseblock
    num_features = num_init_features
    for i, num_layers in enumerate(structure):
        block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
        self.features.add_module("denseblock%d" % (i + 1), block)
        num_features = num_features + num_layers * growth_rate
        if i != len(structure) - 1:
            trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
            self.features.add_module("transition%d" % (i + 1), trans)
            num_features = num_features // 2

    # Final batch norm
    self.features.add_module("norm5", nn.BatchNorm2d(num_features))

    # Linear layer
    self.classifier = nn.Linear(num_features, num_classes)

    # Official init from torch repo.
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.constant_(m.bias, 0)

Dual Path Networks in PyTorch.

Credits: https://github.com/kuangliu/pytorch-cifar/blob/master/models/dpn.py

EfficientNet model class, based on "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" (https://arxiv.org/abs/1905.11946). Code source: https://github.com/lukemelas/EfficientNet-PyTorch. Pre-trained checkpoints, converted to Deci's code base with the reported accuracy, can be found in the S3 repo.

BlockDecoder

Bases: object

Block Decoder for readability, straight from the official TensorFlow repository.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class BlockDecoder(object):
    """Block Decoder for readability, straight from the official TensorFlow repository."""

    @staticmethod
    def _decode_block_string(block_string: str) -> BlockArgs:
        """Get a block through a string notation of arguments.

        :param block_string: A string notation of arguments. Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
        :return:     BlockArgs: The namedtuple defined at the top of this file.
        """
        assert isinstance(block_string, str)

        ops = block_string.split("_")
        options = {}
        for op in ops:
            splits = re.split(r"(\d.*)", op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        # Check stride
        assert ("s" in options and len(options["s"]) == 1) or (len(options["s"]) == 2 and options["s"][0] == options["s"][1])

        return BlockArgs(
            num_repeat=int(options["r"]),
            kernel_size=int(options["k"]),
            stride=[int(options["s"][0])],
            expand_ratio=int(options["e"]),
            input_filters=int(options["i"]),
            output_filters=int(options["o"]),
            se_ratio=float(options["se"]) if "se" in options else None,
            id_skip=("noskip" not in block_string),
        )

    @staticmethod
    def _encode_block_string(block) -> str:
        """Encode a block to a string.

        :param block: A BlockArgs type argument (NamedTuple)
        :return: block_string: A String form of BlockArgs.
        """
        args = [
            "r%d" % block.num_repeat,
            "k%d" % block.kernel_size,
            "s%d%d" % (block.strides[0], block.strides[1]),
            "e%s" % block.expand_ratio,
            "i%d" % block.input_filters,
            "o%d" % block.output_filters,
        ]
        if 0 < block.se_ratio <= 1:
            args.append("se%s" % block.se_ratio)
        if block.id_skip is False:
            args.append("noskip")
        return "_".join(args)

    @staticmethod
    def decode(string_list: List[str]) -> List[BlockArgs]:
        """Decode a list of string notations to specify blocks inside the network.

        :param string_list:     List of strings, each string is a notation of block.
        :return blocks_args:    List of BlockArgs namedtuples of block args.
        """
        assert isinstance(string_list, list)
        blocks_args = []
        for block_string in string_list:
            blocks_args.append(BlockDecoder._decode_block_string(block_string))
        return blocks_args

    @staticmethod
    def encode(blocks_args: List):
        """Encode a list of BlockArgs to a list of strings.

        :param blocks_args: A list of BlockArgs namedtuples of block args. (list[namedtuples])
        :return: block_strings: A list of strings, each string is a notation of block.
        """
        block_strings = []
        for block in blocks_args:
            block_strings.append(BlockDecoder._encode_block_string(block))
        return block_strings
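
A quick decoding example, using the block-string notation from the docstring above:

from super_gradients.training.models.classification_models.efficientnet import BlockDecoder

# Decode a single block string into a BlockArgs namedtuple.
blocks = BlockDecoder.decode(["r1_k3_s11_e1_i32_o16_se0.25_noskip"])
block = blocks[0]
# Per the parsing rules above: num_repeat=1, kernel_size=3, stride=[1], expand_ratio=1,
# input_filters=32, output_filters=16, se_ratio=0.25, id_skip=False ("noskip" present).
print(block)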

decode(string_list) staticmethod

Decode a list of string notations to specify blocks inside the network.

Parameters:

- string_list (List[str], required): List of strings, each string is a notation of block.

Returns:

- List[BlockArgs]: List of BlockArgs namedtuples of block args.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
@staticmethod
def decode(string_list: List[str]) -> List[BlockArgs]:
    """Decode a list of string notations to specify blocks inside the network.

    :param string_list:     List of strings, each string is a notation of block.
    :return blocks_args:    List of BlockArgs namedtuples of block args.
    """
    assert isinstance(string_list, list)
    blocks_args = []
    for block_string in string_list:
        blocks_args.append(BlockDecoder._decode_block_string(block_string))
    return blocks_args

encode(blocks_args) staticmethod

Encode a list of BlockArgs to a list of strings.

Parameters:

- blocks_args (List, required): A list of BlockArgs namedtuples of block args.

Returns:

- block_strings: A list of strings, each string is a notation of block.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
@staticmethod
def encode(blocks_args: List):
    """Encode a list of BlockArgs to a list of strings.

    :param blocks_args: A list of BlockArgs namedtuples of block args. (list[namedtuples])
    :return: block_strings: A list of strings, each string is a notation of block.
    """
    block_strings = []
    for block in blocks_args:
        block_strings.append(BlockDecoder._encode_block_string(block))
    return block_strings

Conv2dDynamicSamePadding

Bases: nn.Conv2d

2D convolutions like TensorFlow's 'SAME' mode, for a dynamic image size. The padding is computed dynamically in the forward function.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class Conv2dDynamicSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow, for a dynamic image size.
    The padding is operated in forward function by calculating dynamically.
    """

    # Tips for 'SAME' mode padding.
    #     Given the following:
    #         i: width or height
    #         s: stride
    #         k: kernel size
    #         d: dilation
    #         p: padding
    #     Output after Conv2d:
    #         o = floor((i+p-((k-1)*d+1))/s+1)
    # If o equals i, i = floor((i+p-((k-1)*d+1))/s+1),
    # => p = (i-1)*s+((k-1)*d+1)-i

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
        super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

    def forward(self, x):
        ih, iw = x.size()[-2:]
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)  # change the output size according to stride ! ! !
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
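
To make the 'SAME' padding rule in the comment above concrete, here is a small arithmetic check of p = (o-1)*s + ((k-1)*d + 1) - i for one spatial dimension (plain Python, independent of the layer itself):

import math

def same_pad(i: int, k: int, s: int, d: int = 1) -> int:
    """Total padding needed along one dimension so that the output size equals ceil(i / s)."""
    o = math.ceil(i / s)
    return max((o - 1) * s + (k - 1) * d + 1 - i, 0)

# A 3x3 conv with stride 2 on a 224-pixel side needs 1 pixel of total padding
# (split as 0 / 1 between the two sides), giving an output of 112 pixels.
print(same_pad(224, k=3, s=2))  # 1
print(same_pad(224, k=3, s=1))  # 2 -> pad 1 on each side, output stays 224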

Conv2dStaticSamePadding

Bases: nn.Conv2d

2D convolutions like TensorFlow's 'SAME' mode, with a given input image size. The padding module is calculated in the constructor, then used in forward.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class Conv2dStaticSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
    The padding mudule is calculated in construction function, then used in forward.
    """

    # With the same calculation as Conv2dDynamicSamePadding

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
        super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

        # Calculate padding based on image size and save it
        assert image_size is not None
        ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            self.static_padding = nn.ZeroPad2d((pad_w - pad_w // 2, pad_w // 2, pad_h - pad_h // 2, pad_h // 2))
        else:
            self.static_padding = Identity()

    def forward(self, x):
        x = self.static_padding(x)
        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
        return x

EfficientNet

Bases: BaseClassifier

EfficientNet model.

References: [1] https://arxiv.org/abs/1905.11946 (EfficientNet)

Parameters:

- width_coefficient (float, required): model's width coefficient. Used as the multiplier.
- depth_coefficient (float, required): model's depth coefficient. Used as the multiplier.
- image_size (int, required): Size of input image.
- dropout_rate (float, required): Dropout probability in the final layer.
- num_classes (int, required): Number of classes.
- batch_norm_momentum (Optional[float], default 0.99): Value used for the running_mean and running_var computation.
- batch_norm_epsilon (Optional[float], default 0.001): Value added to the denominator for numerical stability.
- drop_connect_rate (Optional[float], default 0.2): Connection dropout probability.
- depth_divisor (Optional[int], default 8): Model's depth divisor. Used as the divisor.
- min_depth (Optional[int], default None): Model's minimal depth, if given.
- backbone_mode (Optional[bool], default False): If True, drop the final linear layer.
- blocks_args (Optional[list], default None): List of BlockArgs to construct blocks. (list[namedtuple])

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class EfficientNet(BaseClassifier):
    """
    EfficientNet model.

    References:
        [1] https://arxiv.org/abs/1905.11946 (EfficientNet)


    :param width_coefficient:   model's width coefficient. Used as the multiplier.
    :param depth_coefficient:   model's depth coefficient. Used as the multiplier.
    :param image_size:          Size of input image.
    :param dropout_rate:        Dropout probability in final layer
    :param num_classes:         Number of classes.
    :param batch_norm_momentum: Value used for the running_mean and running_var computation
    :param batch_norm_epsilon:  Value added to the denominator for numerical stability
    :param drop_connect_rate:   Connection dropout probability
    :param depth_divisor:       Model's depth divisor. Used as the divisor.
    :param min_depth:           Model's minimal depth, if given.
    :param backbone_mode:       If true, dropping the final linear layer
    :param blocks_args:         List of BlockArgs to construct blocks. (list[namedtuple])
    """

    def __init__(
        self,
        width_coefficient: float,
        depth_coefficient: float,
        image_size: int,
        dropout_rate: float,
        num_classes: int,
        batch_norm_momentum: Optional[float] = 0.99,
        batch_norm_epsilon: Optional[float] = 1e-3,
        drop_connect_rate: Optional[float] = 0.2,
        depth_divisor: Optional[int] = 8,
        min_depth: Optional[int] = None,
        backbone_mode: Optional[bool] = False,
        blocks_args: Optional[list] = None,
    ):
        super().__init__()
        assert isinstance(blocks_args, list), "blocks_args should be a list"
        assert len(blocks_args) > 0, "block args must be greater than 0"

        self._blocks_args = blocks_args
        self.backbone_mode = backbone_mode
        self.drop_connect_rate = drop_connect_rate

        # Batch norm parameters
        bn_mom = 1 - batch_norm_momentum
        bn_eps = batch_norm_epsilon

        # Get stem static or dynamic convolution depending on image size
        Conv2d = get_same_padding_conv2d(image_size=image_size)

        # Stem
        in_channels = 3  # rgb
        out_channels = round_filters(32, width_coefficient, depth_divisor, min_depth)  # number of output channels
        self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
        image_size = calculate_output_image_size(image_size, 2)

        # Build blocks
        self._blocks = nn.ModuleList([])
        for block_args in self._blocks_args:

            # Update block input and output filters based on depth multiplier.
            block_args = block_args._replace(
                input_filters=round_filters(block_args.input_filters, width_coefficient, depth_divisor, min_depth),
                output_filters=round_filters(block_args.output_filters, width_coefficient, depth_divisor, min_depth),
                num_repeat=round_repeats(block_args.num_repeat, depth_coefficient),
            )

            # The first block needs to take care of stride and filter size increase.
            self._blocks.append(MBConvBlock(block_args, batch_norm_momentum, batch_norm_epsilon, image_size=image_size))
            image_size = calculate_output_image_size(image_size, block_args.stride)
            if block_args.num_repeat > 1:  # modify block_args to keep same output size
                block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
            for _ in range(block_args.num_repeat - 1):
                self._blocks.append(MBConvBlock(block_args, batch_norm_momentum, batch_norm_epsilon, image_size=image_size))
                # image_size = calculate_output_image_size(image_size, block_args.stride)  # stride = 1

        # Head
        in_channels = block_args.output_filters  # output of final block
        out_channels = round_filters(1280, width_coefficient, depth_divisor, min_depth)
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Final linear layer
        if not self.backbone_mode:
            self._avg_pooling = nn.AdaptiveAvgPool2d(1)
            self._dropout = nn.Dropout(dropout_rate)
            self._fc = nn.Linear(out_channels, num_classes)
        self._swish = nn.functional.silu

    def extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Use convolution layer to extract feature.

        :param inputs: Input tensor.
        :return: Output of the final convolution layer in the efficientnet model.
        """

        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x

    def forward(self, inputs):
        """
        EfficientNet's forward function.
        Calls extract_features to extract features, applies final linear layer, and returns logits.

        :param inputs: Input tensor.
        :return: Output of this model after processing.
        """
        bs = inputs.size(0)

        # Convolution layers
        x = self.extract_features(inputs)

        # Pooling and final linear layer, not needed for backbone mode
        if not self.backbone_mode:
            x = self._avg_pooling(x)
            x = x.view(bs, -1)
            x = self._dropout(x)
            x = self._fc(x)

        return x

    def replace_head(self, new_num_classes: Optional[int] = None, new_head: Optional[nn.Module] = None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self._fc = new_head
        else:
            self._fc = nn.Linear(self._fc.in_features, new_num_classes)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self._conv_stem = replace_conv2d_input_channels(conv=self._conv_stem, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self._conv_stem.in_channels

    def load_state_dict(self, state_dict: dict, strict: bool = True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights

            pretrained_model_weights_dict = pretrained_backbone_weights_dict

        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_model_weights_dict, strict)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"_fc": lr, "default": 0.0}

extract_features(inputs)

Uses the convolution layers to extract features.

Parameters:

- inputs (torch.Tensor, required): Input tensor.

Returns:

- torch.Tensor: Output of the final convolution layer in the efficientnet model.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
    """
    Use convolution layer to extract feature.

    :param inputs: Input tensor.
    :return: Output of the final convolution layer in the efficientnet model.
    """

    # Stem
    x = self._swish(self._bn0(self._conv_stem(inputs)))

    # Blocks
    for idx, block in enumerate(self._blocks):
        drop_connect_rate = self.drop_connect_rate
        if drop_connect_rate:
            drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
        x = block(x, drop_connect_rate=drop_connect_rate)

    # Head
    x = self._swish(self._bn1(self._conv_head(x)))

    return x

forward(inputs)

EfficientNet's forward function. Calls extract_features to extract features, applies final linear layer, and returns logits.

Parameters:

- inputs (required): Input tensor.

Returns:

- Output of this model after processing.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def forward(self, inputs):
    """
    EfficientNet's forward function.
    Calls extract_features to extract features, applies final linear layer, and returns logits.

    :param inputs: Input tensor.
    :return: Output of this model after processing.
    """
    bs = inputs.size(0)

    # Convolution layers
    x = self.extract_features(inputs)

    # Pooling and final linear layer, not needed for backbone mode
    if not self.backbone_mode:
        x = self._avg_pooling(x)
        x = x.view(bs, -1)
        x = self._dropout(x)
        x = self._fc(x)

    return x

load_state_dict(state_dict, strict=True)

Overloads the base method and calls it with a state dict that is modified when the model is used as a backbone.

Parameters:

- state_dict (dict, required): The state_dict to load.
- strict (bool, default True): strict loading (see super() docs).

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def load_state_dict(self, state_dict: dict, strict: bool = True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights

        pretrained_model_weights_dict = pretrained_backbone_weights_dict

    # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
    super().load_state_dict(pretrained_model_weights_dict, strict)

Identity

Bases: nn.Module

Identity mapping. Send input to output directly.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class Identity(nn.Module):
    """Identity mapping.
    Send input to output directly.
    """

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input

MBConvBlock

Bases: nn.Module

Mobile Inverted Residual Bottleneck Block.

References:
[1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
[2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
[3] https://arxiv.org/abs/1905.02244 (MobileNet v3)

Parameters:

- block_args (BlockArgs, required): BlockArgs.
- batch_norm_momentum (float, required): Batch norm momentum.
- batch_norm_epsilon (float, required): Batch norm epsilon.
- image_size (Union[Tuple, List], default None): [image_height, image_width].

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
class MBConvBlock(nn.Module):
    """Mobile Inverted Residual Bottleneck Block.

    References:
        [1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
        [2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
        [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)

    :param block_args: BlockArgs.
    :param batch_norm_momentum: Batch norm momentum.
    :param batch_norm_epsilon: Batch norm epsilon.
    :param image_size: [image_height, image_width].
    """

    def __init__(self, block_args: BlockArgs, batch_norm_momentum: float, batch_norm_epsilon: float, image_size: Union[Tuple, List] = None):
        super().__init__()
        self._block_args = block_args
        self._bn_mom = 1 - batch_norm_momentum  # pytorch's difference from tensorflow
        self._bn_eps = batch_norm_epsilon
        self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
        self.id_skip = block_args.id_skip  # whether to use skip connection and drop connect

        # Expansion phase (Inverted Bottleneck)
        inp = self._block_args.input_filters  # number of input channels
        oup = self._block_args.input_filters * self._block_args.expand_ratio  # number of output channels
        if self._block_args.expand_ratio != 1:
            Conv2d = get_same_padding_conv2d(image_size=image_size)
            self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
            self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

        # Depthwise convolution phase
        k = self._block_args.kernel_size
        s = self._block_args.stride
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._depthwise_conv = Conv2d(in_channels=oup, out_channels=oup, groups=oup, kernel_size=k, stride=s, bias=False)  # groups makes it depthwise
        self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
        image_size = calculate_output_image_size(image_size, s)

        # Squeeze and Excitation layer, if desired
        if self.has_se:
            Conv2d = get_same_padding_conv2d(image_size=(1, 1))
            num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
            self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
            self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)

        # Pointwise convolution phase
        final_oup = self._block_args.output_filters
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
        self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
        self._swish = nn.functional.silu

    def forward(self, inputs: torch.Tensor, drop_connect_rate: Optional[float] = None) -> torch.Tensor:
        """MBConvBlock's forward function.

        :param inputs:              Input tensor.
        :param drop_connect_rate:   Drop connect rate (float, between 0 and 1).
        :return:                    Output of this block after processing.
        """

        # Expansion and Depthwise Convolution
        x = inputs
        if self._block_args.expand_ratio != 1:
            x = self._expand_conv(inputs)
            x = self._bn0(x)
            x = self._swish(x)

        x = self._depthwise_conv(x)
        x = self._bn1(x)
        x = self._swish(x)

        # Squeeze and Excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_reduce(x_squeezed)
            x_squeezed = self._swish(x_squeezed)
            x_squeezed = self._se_expand(x_squeezed)
            x = torch.sigmoid(x_squeezed) * x

        # Pointwise Convolution
        x = self._project_conv(x)
        x = self._bn2(x)

        # Skip connection and drop connect
        input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
        if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
            # The combination of skip connection and drop connect brings about stochastic depth.
            if drop_connect_rate:
                x = drop_connect(x, p=drop_connect_rate, training=self.training)
            x = x + inputs  # skip connection
        return x

forward(inputs, drop_connect_rate=None)

MBConvBlock's forward function.

Parameters:

Name Type Description Default
inputs torch.Tensor

Input tensor.

required
drop_connect_rate Optional[float]

Drop connect rate (float, between 0 and 1).

None

Returns:

Type Description
torch.Tensor

Output of this block after processing.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def forward(self, inputs: torch.Tensor, drop_connect_rate: Optional[float] = None) -> torch.Tensor:
    """MBConvBlock's forward function.

    :param inputs:              Input tensor.
    :param drop_connect_rate:   Drop connect rate (float, between 0 and 1).
    :return:                    Output of this block after processing.
    """

    # Expansion and Depthwise Convolution
    x = inputs
    if self._block_args.expand_ratio != 1:
        x = self._expand_conv(inputs)
        x = self._bn0(x)
        x = self._swish(x)

    x = self._depthwise_conv(x)
    x = self._bn1(x)
    x = self._swish(x)

    # Squeeze and Excitation
    if self.has_se:
        x_squeezed = F.adaptive_avg_pool2d(x, 1)
        x_squeezed = self._se_reduce(x_squeezed)
        x_squeezed = self._swish(x_squeezed)
        x_squeezed = self._se_expand(x_squeezed)
        x = torch.sigmoid(x_squeezed) * x

    # Pointwise Convolution
    x = self._project_conv(x)
    x = self._bn2(x)

    # Skip connection and drop connect
    input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
    if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
        # The combination of skip connection and drop connect brings about stochastic depth.
        if drop_connect_rate:
            x = drop_connect(x, p=drop_connect_rate, training=self.training)
        x = x + inputs  # skip connection
    return x

calculate_output_image_size(input_image_size, stride)

Calculates the output image size when using Conv2dSamePadding with a stride. Necessary for static padding. Thanks to mannatsingh for pointing this out.

Parameters:

Name Type Description Default
input_image_size Union[int, Tuple, List]

Size of input image.

required
stride Union[int, Tuple, List]

Conv2d operation's stride.

required

Returns:

Type Description
Optional[List[int]]

output_image_size: A list [H,W].

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def calculate_output_image_size(input_image_size: Union[int, Tuple, List], stride: Union[int, Tuple, List]) -> Optional[List[int]]:
    """Calculates the output image size when using Conv2dSamePadding with a stride.
    Necessary for static padding. Thanks to mannatsingh for pointing this out.

    :param input_image_size:    Size of input image.
    :param stride:              Conv2d operation's stride.
    :return: output_image_size: A list [H,W].
    """
    if input_image_size is None:
        return None
    elif isinstance(input_image_size, int):
        input_image_size = (input_image_size, input_image_size)

    image_height, image_width = input_image_size
    stride = stride if isinstance(stride, int) else stride[0]
    image_height = int(math.ceil(image_height / stride))
    image_width = int(math.ceil(image_width / stride))
    return [image_height, image_width]
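
A quick sanity check of the ceil-division behaviour, as a usage sketch (assuming calculate_output_image_size is imported from the efficientnet module shown above):

# A 224x224 input through a stride-2 "SAME"-padded conv yields a 112x112 map.
assert calculate_output_image_size(224, 2) == [112, 112]
# Odd sizes are rounded up: ceil(15 / 2) == 8.
assert calculate_output_image_size((15, 15), 2) == [8, 8]
# A None image size (dynamic padding) is propagated unchanged.
assert calculate_output_image_size(None, 2) is None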

drop_connect(inputs, p, training)

Drop connect.

Parameters:

Name Type Description Default
inputs

Input of this structure. (tensor: BCWH)

required
p float

Probability of drop connection. (float: 0.0~1.0)

required
training bool

Running mode.

required

Returns:

Type Description
torch.Tensor

output: Output after drop connection.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def drop_connect(inputs: torch.Tensor, p: float, training: bool) -> torch.Tensor:
    """Drop connect.

    :param inputs :     Input of this structure. (tensor: BCWH)
    :param p :          Probability of drop connection. (float: 0.0~1.0)
    :param training:    Running mode.
    :return: output: Output after drop connection.
    """
    assert p >= 0 and p <= 1, "p must be in range of [0,1]"

    if not training:
        return inputs

    batch_size = inputs.shape[0]
    keep_prob = 1 - p

    # generate binary_tensor mask according to probability (p for 0, 1-p for 1)
    random_tensor = keep_prob
    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
    binary_tensor = torch.floor(random_tensor)

    output = inputs / keep_prob * binary_tensor
    return output
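
The combination of per-sample dropping and rescaling is what implements stochastic depth in MBConvBlock above. A minimal sketch of the behaviour (assuming drop_connect is imported from the efficientnet module shown above):

import torch

x = torch.ones(8, 3, 4, 4)

# Eval mode: the tensor passes through unchanged.
assert torch.equal(drop_connect(x, p=0.2, training=False), x)

# Train mode: whole samples are zeroed with probability p, and survivors are
# rescaled by 1 / (1 - p) so the expected activation is preserved.
out = drop_connect(x, p=0.2, training=True)
kept = out.flatten(1).sum(dim=1) > 0  # boolean mask of surviving samples
print(f"{int(kept.sum())} of {x.shape[0]} samples kept, survivors scaled to {out.max().item():.2f}")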

get_same_padding_conv2d(image_size=None)

Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models.

Parameters:

Name Type Description Default
image_size Optional[Union[int, Tuple[int, int]]]

Size of the image.

None

Returns:

Type Description

Conv2dDynamicSamePadding or Conv2dStaticSamePadding.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def get_same_padding_conv2d(image_size: Optional[Union[int, Tuple[int, int]]] = None):
    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
       Static padding is necessary for ONNX exporting of models.

    :param image_size: Size of the image.
    :return: Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
    """
    if image_size is None:
        return Conv2dDynamicSamePadding
    else:
        return partial(Conv2dStaticSamePadding, image_size=image_size)
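
A usage sketch of the two padding variants (assuming the factory and both Conv2d classes are importable from the efficientnet module above, and that they accept the usual nn.Conv2d keyword arguments, as the MBConvBlock source suggests):

import torch

StaticConv = get_same_padding_conv2d(image_size=(224, 224))  # partial of Conv2dStaticSamePadding
DynamicConv = get_same_padding_conv2d(image_size=None)       # Conv2dDynamicSamePadding

static_conv = StaticConv(in_channels=3, out_channels=16, kernel_size=3, stride=2, bias=False)
dynamic_conv = DynamicConv(in_channels=3, out_channels=16, kernel_size=3, stride=2, bias=False)

x = torch.randn(1, 3, 224, 224)
# Both variants emulate TensorFlow "SAME" padding: a stride-2 conv halves the spatial size.
assert static_conv(x).shape == dynamic_conv(x).shape == (1, 16, 112, 112)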

round_filters(filters, width_coefficient, depth_divisor, min_depth)

Calculate and round number of filters based on width multiplier. Use width_coefficient, depth_divisor and min_depth.

Parameters:

Name Type Description Default
filters int

Filters number to be calculated. Params from arch_params:

required
width_coefficient int

model's width coefficient. Used as the multiplier.

required
depth_divisor int

model's depth divisor. Used as the divisor.

required
min_depth int

model's minimal depth, if given.

required

Returns:

Type Description

new_filters: New filters number after calculating.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def round_filters(filters: int, width_coefficient: int, depth_divisor: int, min_depth: int):
    """Calculate and round number of filters based on width multiplier.
       Use width_coefficient, depth_divisor and min_depth.

    :param filters: Filters number to be calculated. Params from arch_params:
    :param width_coefficient: model's width coefficient. Used as the multiplier.
    :param depth_divisor: model's depth divisor. Used as the divisor.
    :param min_depth: model's minimal depth, if given.
    :return: new_filters: New filters number after calculating.
    """
    if not width_coefficient:
        return filters
    min_depth = min_depth
    filters *= width_coefficient
    min_depth = min_depth or depth_divisor  # pay attention to this line when using min_depth
    # follow the formula transferred from official TensorFlow implementation
    new_filters = max(min_depth, int(filters + depth_divisor / 2) // depth_divisor * depth_divisor)
    if new_filters < 0.9 * filters:  # prevent rounding by more than 10%
        new_filters += depth_divisor
    return int(new_filters)
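
A worked example of the rounding rule (the width coefficient 1.2 is only an illustrative value, in the style of the larger EfficientNet variants):

# 32 * 1.2 = 38.4, snapped to the nearest multiple of depth_divisor=8 -> 40
print(round_filters(32, width_coefficient=1.2, depth_divisor=8, min_depth=None))  # 40
# A falsy width_coefficient disables scaling entirely.
print(round_filters(32, width_coefficient=0, depth_divisor=8, min_depth=None))    # 32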

round_repeats(repeats, depth_coefficient)

Calculate module's repeat number of a block based on depth multiplier. Use depth_coefficient.

Parameters:

Name Type Description Default
repeats int

num_repeat to be calculated.

required
depth_coefficient int

the depth coefficient of the model. this func uses it as the multiplier.

required

Returns:

Type Description

new repeat: New repeat number after calculating.

Source code in src/super_gradients/training/models/classification_models/efficientnet.py
def round_repeats(repeats: int, depth_coefficient: int):
    """Calculate module's repeat number of a block based on depth multiplier.
       Use depth_coefficient.

    :param repeats: num_repeat to be calculated.
    :param depth_coefficient: the depth coefficient of the model. this func uses it as the multiplier.
    :return: new repeat: New repeat number after calculating.
    """
    if not depth_coefficient:
        return repeats
    # follow the formula transferred from official TensorFlow implementation
    return int(math.ceil(depth_coefficient * repeats))
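
For example (the depth coefficient 1.8 is an illustrative value):

# A stage of 2 repeats scaled by depth_coefficient=1.8 becomes ceil(1.8 * 2) = 4 repeats.
print(round_repeats(2, depth_coefficient=1.8))  # 4
# A falsy depth_coefficient leaves the repeat count unchanged.
print(round_repeats(2, depth_coefficient=0))    # 2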

GoogLeNet code based on https://pytorch.org/vision/stable/_modules/torchvision/models/googlenet.html

GoogLeNet

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/googlenet.py
class GoogLeNet(BaseClassifier):
    def __init__(self, num_classes=1000, aux_logits=True, init_weights=True, backbone_mode=False, dropout=0.3):
        super(GoogLeNet, self).__init__()

        self.num_classes = num_classes
        self.backbone_mode = backbone_mode

        self.aux_logits = aux_logits
        self.dropout_p = dropout

        self.conv1 = BasicConv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
        self.conv2 = BasicConv2d(64, 64, kernel_size=1)
        self.conv3 = BasicConv2d(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)

        if aux_logits:
            self.aux1 = InceptionAux(512, num_classes)
            self.aux2 = InceptionAux(528, num_classes)
        else:
            self.aux1 = None
            self.aux2 = None

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        if not self.backbone_mode:
            self.dropout = nn.Dropout(self.dropout_p)
            self.fc = nn.Linear(1024, num_classes)

        if init_weights:
            self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                import scipy.stats as stats

                x = stats.truncnorm(-2, 2, scale=0.01)
                values = torch.as_tensor(x.rvs(m.weight.numel()), dtype=m.weight.dtype)
                values = values.view(m.weight.size())
                with torch.no_grad():
                    m.weight.copy_(values)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _forward(self, x):
        # N x 3 x 224 x 224
        x = self.conv1(x)
        # N x 64 x 112 x 112
        x = self.maxpool1(x)
        # N x 64 x 56 x 56
        x = self.conv2(x)
        # N x 64 x 56 x 56
        x = self.conv3(x)
        # N x 192 x 56 x 56
        x = self.maxpool2(x)

        # N x 192 x 28 x 28
        x = self.inception3a(x)
        # N x 256 x 28 x 28
        x = self.inception3b(x)
        # N x 480 x 28 x 28
        x = self.maxpool3(x)
        # N x 480 x 14 x 14
        x = self.inception4a(x)
        # N x 512 x 14 x 14
        aux1 = None
        if self.aux1 is not None and self.training:
            aux1 = self.aux1(x)

        x = self.inception4b(x)
        # N x 512 x 14 x 14
        x = self.inception4c(x)
        # N x 512 x 14 x 14
        x = self.inception4d(x)
        # N x 528 x 14 x 14
        aux2 = None
        if self.aux2 is not None and self.training:
            aux2 = self.aux2(x)

        x = self.inception4e(x)
        # N x 832 x 14 x 14
        x = self.maxpool4(x)
        # N x 832 x 7 x 7
        x = self.inception5a(x)
        # N x 832 x 7 x 7
        x = self.inception5b(x)
        # N x 1024 x 7 x 7

        x = self.avgpool(x)
        # N x 1024 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 1024
        if not self.backbone_mode:
            x = self.dropout(x)
            x = self.fc(x)
        # N x num_classes
        return x, aux2, aux1

    def forward(self, x):
        x, aux1, aux2 = self._forward(x)
        if self.training and self.aux_logits:
            return GoogLeNetOutputs(x, aux2, aux1)
        else:
            return x

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights
            c_temp = torch.nn.Linear(1024, self.num_classes)
            torch.nn.init.xavier_uniform(c_temp.weight)
            pretrained_backbone_weights_dict["fc.weight"] = c_temp.weight
            pretrained_backbone_weights_dict["fc.bias"] = c_temp.bias
            # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
            super().load_state_dict(pretrained_backbone_weights_dict, strict)
        else:
            super().load_state_dict(pretrained_model_weights_dict, strict)

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in src/super_gradients/training/models/classification_models/googlenet.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights
        c_temp = torch.nn.Linear(1024, self.num_classes)
        torch.nn.init.xavier_uniform(c_temp.weight)
        pretrained_backbone_weights_dict["fc.weight"] = c_temp.weight
        pretrained_backbone_weights_dict["fc.bias"] = c_temp.bias
        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_backbone_weights_dict, strict)
    else:
        super().load_state_dict(pretrained_model_weights_dict, strict)
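
A minimal usage sketch of the class above (assuming GoogLeNet is imported from the googlenet module, and that the GoogLeNetOutputs namedtuple it returns in training mode unpacks positionally):

import torch

model = GoogLeNet(num_classes=10, aux_logits=True)
x = torch.randn(2, 3, 224, 224)

# Training mode returns the main logits plus the two auxiliary heads.
model.train()
main_logits, aux2, aux1 = model(x)
print(main_logits.shape)  # torch.Size([2, 10])

# Eval mode returns only the main logits tensor.
model.eval()
print(model(x).shape)     # torch.Size([2, 10])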

LeNet in PyTorch.

https://yann.lecun.com/exdb/lenet/

MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" for more details.

Block

Bases: nn.Module

Depthwise conv + Pointwise conv

Source code in src/super_gradients/training/models/classification_models/mobilenet.py
class Block(nn.Module):
    """Depthwise conv + Pointwise conv"""

    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out
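
A minimal sketch of a single block (assuming Block is imported from the mobilenet module above): a stride-2 depthwise-separable block mapping 32 to 64 channels.

import torch

block = Block(in_planes=32, out_planes=64, stride=2)
x = torch.randn(1, 32, 56, 56)
print(block(x).shape)  # torch.Size([1, 64, 28, 28])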

MobileNet

Bases: BaseClassifier, SupportsReplaceInputChannels

Source code in src/super_gradients/training/models/classification_models/mobilenet.py
class MobileNet(BaseClassifier, SupportsReplaceInputChannels):
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, 128, (128, 2), 256, (256, 2), 512, 512, 512, 512, 512, (512, 2), 1024, (1024, 2)]

    def __init__(self, num_classes=10, backbone_mode=False, up_to_layer=None, in_channels: int = 3):
        super(MobileNet, self).__init__()
        self.backbone_mode = backbone_mode
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32, up_to_layer=up_to_layer if up_to_layer is not None else len(self.cfg))

        if not self.backbone_mode:
            self.linear = nn.Linear(self.cfg[-1], num_classes)

    def _make_layers(self, in_planes, up_to_layer):
        layers = []
        for x in self.cfg[:up_to_layer]:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param up_to_layer: forward through the net layers up to a specific layer. if None, run all layers
        """
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)

        if not self.backbone_mode:
            out = F.avg_pool2d(out, 2)
            out = out.view(out.size(0), -1)
            out = self.linear(out)

        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.conv1 = replace_conv2d_input_channels(conv=self.conv1, in_channels=in_channels)

    def get_input_channels(self) -> int:
        return self.conv1.in_channels

forward(x)

Parameters:

Name Type Description Default
up_to_layer

forward through the net layers up to a specific layer. if None, run all layers

required
Source code in src/super_gradients/training/models/classification_models/mobilenet.py
def forward(self, x):
    """
    :param up_to_layer: forward through the net layers up to a specific layer. if None, run all layers
    """
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.layers(out)

    if not self.backbone_mode:
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)

    return out
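
A minimal sketch of backbone-mode usage (assuming MobileNet is imported from the mobilenet module above); with backbone_mode=True the classifier head is never built and the raw feature map is returned:

import torch

backbone = MobileNet(backbone_mode=True)
x = torch.randn(2, 3, 224, 224)
print(backbone(x).shape)  # torch.Size([2, 1024, 7, 7])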

This is a PyTorch implementation of MobileNetV2 architecture as described in the paper: Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation. https://arxiv.org/pdf/1801.04381

Code taken from https://github.com/tonylins/pytorch-mobilenet-v2 License: Apache Version 2.0, January 2004 http://www.apache.org/licenses/

Pre-trained ImageNet model: 'deci-model-repository/mobilenet_v2/ckpt_best.pth'

CustomMobileNetV2

Bases: MobileNetV2

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.CUSTOM_MOBILENET_V2)
class CustomMobileNetV2(MobileNetV2):
    def __init__(self, arch_params):
        """
        :param arch_params: HpmStruct
            must contain:
                'num_classes': int
                'width_mult': float
                'structure' : list. specify the mobilenetv2 architecture
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=arch_params.width_mult,
            structure=arch_params.structure,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int, 'width_mult': float, 'structure': list - specifies the mobilenetv2 architecture

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    :param arch_params: HpmStruct
        must contain:
            'num_classes': int
            'width_mult': float
            'structure' : list. specify the mobilenetv2 architecture
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=arch_params.width_mult,
        structure=arch_params.structure,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

InvertedResidual

Bases: nn.Module

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, grouped_conv_size=1):
        """
        :param inp: number of input channels
        :param oup: number of output channels
        :param stride: conv stride
        :param expand_ratio: expansion ratio of the hidden layer after pointwise conv
        :grouped_conv_size: number of channels per grouped convolution, for depth-wise-separable convolution, use grouped_conv_size=1
        """
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expand_ratio)
        groups = int(hidden_dim / grouped_conv_size)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)

__init__(inp, oup, stride, expand_ratio, grouped_conv_size=1)

:grouped_conv_size: number of channels per grouped convolution, for depth-wise-separable convolution, use grouped_conv_size=1

Parameters:

Name Type Description Default
inp

number of input channels

required
oup

number of output channels

required
stride

conv stride

required
expand_ratio

expansion ratio of the hidden layer after pointwise conv

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, inp, oup, stride, expand_ratio, grouped_conv_size=1):
    """
    :param inp: number of input channels
    :param oup: number of output channels
    :param stride: conv stride
    :param expand_ratio: expansion ratio of the hidden layer after pointwise conv
    :grouped_conv_size: number of channels per grouped convolution, for depth-wise-separable convolution, use grouped_conv_size=1
    """
    super(InvertedResidual, self).__init__()
    self.stride = stride
    assert stride in [1, 2]

    hidden_dim = int(inp * expand_ratio)
    groups = int(hidden_dim / grouped_conv_size)
    self.use_res_connect = self.stride == 1 and inp == oup

    if expand_ratio == 1:
        self.conv = nn.Sequential(
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        )
    else:
        self.conv = nn.Sequential(
            # pw
            nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        )
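
A minimal sketch of the residual condition (assuming InvertedResidual is imported from the mobilenetv2 module above); the skip connection is only active when stride == 1 and the channel count is preserved:

import torch

block_with_skip = InvertedResidual(inp=32, oup=32, stride=1, expand_ratio=6)
block_no_skip = InvertedResidual(inp=32, oup=64, stride=2, expand_ratio=6)
print(block_with_skip.use_res_connect, block_no_skip.use_res_connect)  # True False

x = torch.randn(1, 32, 28, 28)
print(block_with_skip(x).shape)  # torch.Size([1, 32, 28, 28])
print(block_no_skip(x).shape)    # torch.Size([1, 64, 14, 14])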

MobileNetV2

Bases: MobileNetBase

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
class MobileNetV2(MobileNetBase):
    def __init__(
        self,
        num_classes,
        dropout: float,
        width_mult=1.0,
        structure=None,
        backbone_mode: bool = False,
        grouped_conv_size=1,
        in_channels=3,
    ) -> object:
        super(MobileNetV2, self).__init__()
        self.in_channels = in_channels
        block = InvertedResidual
        last_channel = 1280
        # IF STRUCTURE IS NONE - USE THE DEFAULT STRUCTURE NOTED
        #                                                  t, c,  n, s    stage-0 is the first conv_bn layer
        self.interverted_residual_setting = structure or [
            [1, 16, 1, 1],  # stage-1
            [6, 24, 2, 2],  # stage-2
            [6, 32, 3, 2],  # stage-3
            [6, 64, 4, 2],  # stage-4
            [6, 96, 3, 1],  # stage-5
            [6, 160, 3, 2],  # stage-6
            [6, 320, 1, 1],
        ]  # stage-7
        #                                                                   stage-8  is the last_layer
        self.last_channel = make_divisible(last_channel * width_mult) if width_mult > 1.0 else last_channel

        curr_channels = 32
        self.features = [conv_bn(in_channels, curr_channels, 2)]
        # building inverted residual blocks
        for t, c, n, s in self.interverted_residual_setting:
            output_channel = make_divisible(c * width_mult) if t > 1 else c
            for i in range(n):
                if i == 0:
                    self.features.append(block(curr_channels, output_channel, s, expand_ratio=t, grouped_conv_size=grouped_conv_size))
                else:
                    self.features.append(block(curr_channels, output_channel, 1, expand_ratio=t, grouped_conv_size=grouped_conv_size))
                curr_channels = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(curr_channels, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)
        self.backbone_mode = backbone_mode

        if self.backbone_mode:
            self.classifier = nn.Identity()
            # TODO: remove during migration of YOLOs to the new base
            self.backbone_connection_channels = self._extract_connection_layers_input_channel_size()
        else:
            # building classifier
            self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(self.last_channel, num_classes))
        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        if self.backbone_mode:
            return x
        else:
            x = x.mean(3).mean(2)
            return self.classifier(x)

    def _extract_connection_layers_input_channel_size(self):
        """
        Extracts the number of channels out when using mobilenetV2 as yolo backbone
        """
        curr_layer_input = torch.rand(1, self.in_channels, 320, 320)  # input dims are used to extract number of channels
        layers_num_to_extract = [np.array(self.interverted_residual_setting)[:stage, 2].sum() for stage in [3, 5]]
        connection_layers_input_channel_size = []
        for layer_idx, feature in enumerate(self.features):
            curr_layer_input = feature(curr_layer_input)
            if layer_idx in layers_num_to_extract:
                connection_layers_input_channel_size.append(curr_layer_input.shape[1])
        connection_layers_input_channel_size.append(self.last_channel)
        connection_layers_input_channel_size.reverse()
        return connection_layers_input_channel_size

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.features[0][0] = replace_conv2d_input_channels(conv=self.features[0][0], in_channels=in_channels, fn=compute_new_weights_fn)
        self.in_channels = self.get_input_channels()

    def get_input_channels(self) -> int:
        return self.features[0][0].in_channels
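
A minimal usage sketch of the class above (assuming MobileNetV2 is imported from the mobilenetv2 module; dropout has no default and must be passed explicitly):

import torch

model = MobileNetV2(num_classes=10, dropout=0.2)  # default width_mult=1.0 and ImageNet-style structure
x = torch.randn(2, 3, 224, 224)
print(model(x).shape)  # torch.Size([2, 10])

# Backbone mode replaces the classifier with nn.Identity and returns the 1280-channel feature map.
backbone = MobileNetV2(num_classes=10, dropout=0.0, backbone_mode=True)
print(backbone(x).shape)  # torch.Size([2, 1280, 7, 7])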

MobileNetV2Base

Bases: MobileNetV2

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.MOBILENET_V2)
class MobileNetV2Base(MobileNetV2):
    def __init__(self, arch_params):
        """
        :param arch_params: HpmStruct
            must contain: 'num_classes': int
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=1.0,
            structure=None,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    :param arch_params: HpmStruct
        must contain: 'num_classes': int
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=1.0,
        structure=None,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

MobileNetV2_135

Bases: MobileNetV2

Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.MOBILE_NET_V2_135)
class MobileNetV2_135(MobileNetV2):
    def __init__(self, arch_params):
        """
        This model achieves 75.73% on ImageNet - similar to ResNet50
        :param arch_params: HpmStruct
            must contain: 'num_classes': int
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=1.35,
            structure=None,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

This model achieves 75.73% on ImageNet - similar to ResNet50

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int

required
Source code in src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    This model achieves 75.73% on ImageNet - similar to ResNet50
    :param arch_params: HpmStruct
        must contain: 'num_classes': int
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=1.35,
        structure=None,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

Creates a MobileNetV3 Model as defined in: Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam. (2019). Searching for MobileNetV3 arXiv preprint arXiv:1905.02244.

mobilenetv3_custom

Bases: MobileNetV3

Constructs a MobileNetV3-Customized model

Source code in src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_CUSTOM)
class mobilenetv3_custom(MobileNetV3):
    """
    Constructs a MobileNetV3-Customized model
    """

    def __init__(self, arch_params):
        super().__init__(
            cfgs=arch_params.structure,
            mode=arch_params.mode,
            num_classes=arch_params.num_classes,
            width_mult=arch_params.width_mult,
            in_channels=get_param(arch_params, "in_channels", 3),
        )

mobilenetv3_large

Bases: MobileNetV3

Constructs a MobileNetV3-Large model

Source code in src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_LARGE)
class mobilenetv3_large(MobileNetV3):
    """
    Constructs a MobileNetV3-Large model
    """

    def __init__(self, arch_params):
        width_mult = arch_params.width_mult if hasattr(arch_params, "width_mult") else 1.0
        cfgs = [
            # k, t, c, SE, HS, s
            [3, 1, 16, 0, 0, 1],
            [3, 4, 24, 0, 0, 2],
            [3, 3, 24, 0, 0, 1],
            [5, 3, 40, 1, 0, 2],
            [5, 3, 40, 1, 0, 1],
            [5, 3, 40, 1, 0, 1],
            [3, 6, 80, 0, 1, 2],
            [3, 2.5, 80, 0, 1, 1],
            [3, 2.3, 80, 0, 1, 1],
            [3, 2.3, 80, 0, 1, 1],
            [3, 6, 112, 1, 1, 1],
            [3, 6, 112, 1, 1, 1],
            [5, 6, 160, 1, 1, 2],
            [5, 6, 160, 1, 1, 1],
            [5, 6, 160, 1, 1, 1],
        ]
        super().__init__(cfgs, mode="large", num_classes=arch_params.num_classes, width_mult=width_mult, in_channels=get_param(arch_params, "in_channels", 3))

mobilenetv3_small

Bases: MobileNetV3

Constructs a MobileNetV3-Small model

Source code in src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_SMALL)
class mobilenetv3_small(MobileNetV3):
    """
    Constructs a MobileNetV3-Small model
    """

    def __init__(self, arch_params):
        width_mult = arch_params.width_mult if hasattr(arch_params, "width_mult") else 1.0
        cfgs = [
            # k, t, c, SE, HS, s
            [3, 1, 16, 1, 0, 2],
            [3, 4.5, 24, 0, 0, 2],
            [3, 3.67, 24, 0, 0, 1],
            [5, 4, 40, 1, 1, 2],
            [5, 6, 40, 1, 1, 1],
            [5, 6, 40, 1, 1, 1],
            [5, 3, 48, 1, 1, 1],
            [5, 3, 48, 1, 1, 1],
            [5, 6, 96, 1, 1, 2],
            [5, 6, 96, 1, 1, 1],
            [5, 6, 96, 1, 1, 1],
        ]
        super().__init__(cfgs, mode="small", num_classes=arch_params.num_classes, width_mult=width_mult, in_channels=get_param(arch_params, "in_channels", 3))
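
A minimal construction sketch for the registered variants above (this assumes HpmStruct, the arch-params container used throughout the library, is importable from super_gradients.training.utils; only num_classes is required, while width_mult and in_channels fall back to 1.0 and 3):

import torch
from super_gradients.training.utils import HpmStruct  # assumption: HpmStruct holds arch params as attributes

model = mobilenetv3_large(HpmStruct(num_classes=10))
x = torch.randn(2, 3, 224, 224)
print(model(x).shape)  # torch.Size([2, 10])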

PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search

https://github.com/kuangliu/pytorch-cifar/blob/master/models/pnasnet.py

SepConv

Bases: nn.Module

Separable Convolution.

Source code in src/super_gradients/training/models/classification_models/pnasnet.py
class SepConv(nn.Module):
    """Separable Convolution."""

    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding=(kernel_size - 1) // 2, bias=False, groups=in_planes)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        return self.bn1(self.conv1(x))
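
A minimal sketch (assuming SepConv is imported from the pnasnet module above): a 7x7 separable convolution with stride 2, as used inside PNASNet cells.

import torch

sep = SepConv(in_planes=44, out_planes=44, kernel_size=7, stride=2)
x = torch.randn(1, 44, 32, 32)
print(sep(x).shape)  # torch.Size([1, 44, 16, 16])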

Pre-activation ResNet in PyTorch.

Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Identity Mappings in Deep Residual Networks. arXiv:1603.05027

Based on https://github.com/kuangliu/pytorch-cifar/blob/master/models/preact_resnet.py

PreActBlock

Bases: nn.Module

Pre-activation version of the BasicBlock.

Source code in src/super_gradients/training/models/classification_models/preact_resnet.py
class PreActBlock(nn.Module):
    """Pre-activation version of the BasicBlock."""

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False))

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, "shortcut") else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out += shortcut
        return out

PreActBottleneck

Bases: nn.Module

Pre-activation version of the original Bottleneck module.

Source code in src/super_gradients/training/models/classification_models/preact_resnet.py
class PreActBottleneck(nn.Module):
    """Pre-activation version of the original Bottleneck module."""

    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False))

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, "shortcut") else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.conv3(F.relu(self.bn3(out)))
        out += shortcut
        return out
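
A minimal sketch of the two blocks above (assuming they are imported from the preact_resnet module); the projection shortcut is only created when the stride or channel count changes:

import torch

plain = PreActBlock(in_planes=64, planes=64, stride=1)
downsample = PreActBlock(in_planes=64, planes=128, stride=2)
print(hasattr(plain, "shortcut"), hasattr(downsample, "shortcut"))  # False True

x = torch.randn(1, 64, 32, 32)
print(plain(x).shape)       # torch.Size([1, 64, 32, 32])
print(downsample(x).shape)  # torch.Size([1, 128, 16, 16])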

RegNet - from the paper: Designing Network Design Spaces - https://arxiv.org/pdf/2003.13678.pdf. Implementation of the paradigm described in the paper published by Facebook AI Research (FAIR). @author: Signatrix GmbH. Code taken from: https://github.com/signatrix/regnet - MIT Licence.

CustomAnyNet

Bases: AnyNetX

Source code in src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.CUSTOM_ANYNET)
class CustomAnyNet(AnyNetX):
    def __init__(self, arch_params):
        """All parameters must be provided in arch_params other than SE"""
        super().__init__(
            ls_num_blocks=arch_params.ls_num_blocks,
            ls_block_width=arch_params.ls_block_width,
            ls_bottleneck_ratio=arch_params.ls_bottleneck_ratio,
            ls_group_width=arch_params.ls_group_width,
            stride=arch_params.stride,
            num_classes=arch_params.num_classes,
            se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
            backbone_mode=get_param(arch_params, "backbone_mode", False),
            dropout_prob=get_param(arch_params, "dropout_prob", 0),
            droppath_prob=get_param(arch_params, "droppath_prob", 0),
            input_channels=get_param(arch_params, "input_channels", 3),
        )

__init__(arch_params)

All parameters must be provided in arch_params other than SE

Source code in src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters must be provided in arch_params other than SE"""
    super().__init__(
        ls_num_blocks=arch_params.ls_num_blocks,
        ls_block_width=arch_params.ls_block_width,
        ls_bottleneck_ratio=arch_params.ls_bottleneck_ratio,
        ls_group_width=arch_params.ls_group_width,
        stride=arch_params.stride,
        num_classes=arch_params.num_classes,
        se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
        backbone_mode=get_param(arch_params, "backbone_mode", False),
        dropout_prob=get_param(arch_params, "dropout_prob", 0),
        droppath_prob=get_param(arch_params, "droppath_prob", 0),
        input_channels=get_param(arch_params, "input_channels", 3),
    )

CustomRegNet

Bases: RegNetX

Source code in src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.CUSTOM_REGNET)
class CustomRegNet(RegNetX):
    def __init__(self, arch_params):
        """All parameters must be provided in arch_params other than SE"""
        super().__init__(
            initial_width=arch_params.initial_width,
            slope=arch_params.slope,
            quantized_param=arch_params.quantized_param,
            network_depth=arch_params.network_depth,
            bottleneck_ratio=arch_params.bottleneck_ratio,
            group_width=arch_params.group_width,
            stride=arch_params.stride,
            arch_params=arch_params,
            se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
            input_channels=get_param(arch_params, "input_channels", 3),
        )

__init__(arch_params)

All parameters must be provided in arch_params other than SE

Source code in src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters must be provided in arch_params other than SE"""
    super().__init__(
        initial_width=arch_params.initial_width,
        slope=arch_params.slope,
        quantized_param=arch_params.quantized_param,
        network_depth=arch_params.network_depth,
        bottleneck_ratio=arch_params.bottleneck_ratio,
        group_width=arch_params.group_width,
        stride=arch_params.stride,
        arch_params=arch_params,
        se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
        input_channels=get_param(arch_params, "input_channels", 3),
    )

NASRegNet

Bases: RegNetX

Source code in src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.NAS_REGNET)
class NASRegNet(RegNetX):
    def __init__(self, arch_params):
        """All parameters are provided as a single structure list: arch_params.structure"""
        structure = arch_params.structure
        super().__init__(
            initial_width=structure[0],
            slope=structure[1],
            quantized_param=structure[2],
            network_depth=structure[3],
            bottleneck_ratio=structure[4],
            group_width=structure[5],
            stride=structure[6],
            se_ratio=structure[7] if structure[7] > 0 else None,
            arch_params=arch_params,
        )

__init__(arch_params)

All parameters are provided as a single structure list: arch_params.structure

Source code in src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters are provided as a single structure list: arch_params.structure"""
    structure = arch_params.structure
    super().__init__(
        initial_width=structure[0],
        slope=structure[1],
        quantized_param=structure[2],
        network_depth=structure[3],
        bottleneck_ratio=structure[4],
        group_width=structure[5],
        stride=structure[6],
        se_ratio=structure[7] if structure[7] > 0 else None,
        arch_params=arch_params,
    )

verify_correctness_of_parameters(ls_num_blocks, ls_block_width, ls_bottleneck_ratio, ls_group_width)

VERIFY THAT THE GIVEN PARAMETERS FIT THE SEARCH SPACE DEFINED IN THE REGNET PAPER

Source code in src/super_gradients/training/models/classification_models/regnet.py
def verify_correctness_of_parameters(ls_num_blocks, ls_block_width, ls_bottleneck_ratio, ls_group_width):
    """VERIFY THAT THE GIVEN PARAMETERS FIT THE SEARCH SPACE DEFINED IN THE REGNET PAPER"""
    err_message = "Parameters don't fit"
    assert len(set(ls_bottleneck_ratio)) == 1, f"{err_message} AnyNetXb"
    assert len(set(ls_group_width)) == 1, f"{err_message} AnyNetXc"
    assert all(i <= j for i, j in zip(ls_block_width, ls_block_width[1:])) is True, f"{err_message} AnyNetXd"
    if len(ls_num_blocks) > 2:
        assert all(i <= j for i, j in zip(ls_num_blocks[:-2], ls_num_blocks[1:-1])) is True, f"{err_message} AnyNetXe"
    # For each stage & each layer, number of channels (block width / bottleneck ratio) must be divisible by group width
    for block_width, bottleneck_ratio, group_width in zip(ls_block_width, ls_bottleneck_ratio, ls_group_width):
        assert int(block_width // bottleneck_ratio) % group_width == 0
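
For example, a parameter set that satisfies all of the constraints (constant bottleneck ratio and group width, non-decreasing block widths, and post-bottleneck widths divisible by the group width) passes silently; violating any rule raises an AssertionError. The values below are illustrative only:

verify_correctness_of_parameters(
    ls_num_blocks=[1, 2, 4, 7],
    ls_block_width=[32, 64, 128, 256],
    ls_bottleneck_ratio=[1, 1, 1, 1],
    ls_group_width=[16, 16, 16, 16],
)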

RepVGG PyTorch implementation. This model is trained as a VGG-style network with residual blocks; for inference (deployment mode) the residual branches are fused so the model becomes a plain VGG. Pretrained models: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq References: [1] https://github.com/DingXiaoH/RepVGG [2] https://arxiv.org/pdf/2101.03697.pdf

Based on https://github.com/DingXiaoH/RepVGG

RepVGG

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/repvgg.py
class RepVGG(BaseClassifier):
    def __init__(
        self,
        struct,
        num_classes=1000,
        width_multiplier=None,
        build_residual_branches=True,
        use_se=False,
        backbone_mode=False,
        in_channels=3,
    ):
        """
        :param struct: list containing number of blocks per repvgg stage
        :param num_classes: number of classes if not in backbone mode
        :param width_multiplier: list of per stage width multiplier or float if using single value for all stages
        :param build_residual_branches: whether to add residual connections or not
        :param use_se: use squeeze and excitation layers
        :param backbone_mode: if true, dropping the final linear layer
        :param in_channels: input channels
        """
        super(RepVGG, self).__init__()

        if isinstance(width_multiplier, float):
            width_multiplier = [width_multiplier] * 4
        else:
            assert len(width_multiplier) == 4

        self.build_residual_branches = build_residual_branches
        self.use_se = use_se
        self.backbone_mode = backbone_mode

        self.in_planes = int(64 * width_multiplier[0])

        self.stem = RepVGGBlock(
            in_channels=in_channels,
            out_channels=self.in_planes,
            stride=2,
            build_residual_branches=build_residual_branches,
            activation_type=nn.ReLU,
            activation_kwargs=dict(inplace=True),
            se_type=SEBlock if self.use_se else nn.Identity,
            se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
        )
        self.cur_layer_idx = 1
        self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
        self.stage2 = self._make_stage(int(128 * width_multiplier[1]), struct[1], stride=2)
        self.stage3 = self._make_stage(int(256 * width_multiplier[2]), struct[2], stride=2)
        self.stage4 = self._make_stage(int(512 * width_multiplier[3]), struct[3], stride=2)
        if not self.backbone_mode:
            self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
            self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)

        if not build_residual_branches:
            self.eval()  # fusing has to be made in eval mode. When called in init, model will be built in eval mode
            fuse_repvgg_blocks_residual_branches(self)

        self.final_width_mult = width_multiplier[3]

    def _make_stage(self, planes, struct, stride):
        strides = [stride] + [1] * (struct - 1)
        blocks = []
        for stride in strides:
            blocks.append(
                RepVGGBlock(
                    in_channels=self.in_planes,
                    out_channels=planes,
                    stride=stride,
                    groups=1,
                    build_residual_branches=self.build_residual_branches,
                    activation_type=nn.ReLU,
                    activation_kwargs=dict(inplace=True),
                    se_type=SEBlock if self.use_se else nn.Identity,
                    se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
                )
            )
            self.in_planes = planes
            self.cur_layer_idx += 1
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = self.stem(x)
        out = self.stage1(out)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        if not self.backbone_mode:
            out = self.avgpool(out)
            out = out.view(out.size(0), -1)
            out = self.linear(out)
        return out

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        if self.build_residual_branches:
            fuse_repvgg_blocks_residual_branches(self)

    def train(self, mode: bool = True):

        assert (
            not mode or self.build_residual_branches
        ), "Trying to train a model without residual branches, set arch_params.build_residual_branches to True and retrain the model"
        super(RepVGG, self).train(mode=mode)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.linear = new_head
        else:
            self.linear = nn.Linear(int(512 * self.final_width_mult), new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"linear": lr, "default": 0}

__init__(struct, num_classes=1000, width_multiplier=None, build_residual_branches=True, use_se=False, backbone_mode=False, in_channels=3)

Parameters:

Name Type Description Default
struct

list containing number of blocks per repvgg stage

required
num_classes

number of classes if not in backbone mode

1000
width_multiplier

list of per stage width multiplier or float if using single value for all stages

None
build_residual_branches

whether to add residual connections or not

True
use_se

use squeeze and excitation layers

False
backbone_mode

if true, dropping the final linear layer

False
in_channels

input channels

3
Source code in src/super_gradients/training/models/classification_models/repvgg.py
def __init__(
    self,
    struct,
    num_classes=1000,
    width_multiplier=None,
    build_residual_branches=True,
    use_se=False,
    backbone_mode=False,
    in_channels=3,
):
    """
    :param struct: list containing number of blocks per repvgg stage
    :param num_classes: number of classes if not in backbone mode
    :param width_multiplier: list of per stage width multiplier or float if using single value for all stages
    :param build_residual_branches: whether to add residual connections or not
    :param use_se: use squeeze and excitation layers
    :param backbone_mode: if True, drop the final linear layer
    :param in_channels: input channels
    """
    super(RepVGG, self).__init__()

    if isinstance(width_multiplier, float):
        width_multiplier = [width_multiplier] * 4
    else:
        assert len(width_multiplier) == 4

    self.build_residual_branches = build_residual_branches
    self.use_se = use_se
    self.backbone_mode = backbone_mode

    self.in_planes = int(64 * width_multiplier[0])

    self.stem = RepVGGBlock(
        in_channels=in_channels,
        out_channels=self.in_planes,
        stride=2,
        build_residual_branches=build_residual_branches,
        activation_type=nn.ReLU,
        activation_kwargs=dict(inplace=True),
        se_type=SEBlock if self.use_se else nn.Identity,
        se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
    )
    self.cur_layer_idx = 1
    self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
    self.stage2 = self._make_stage(int(128 * width_multiplier[1]), struct[1], stride=2)
    self.stage3 = self._make_stage(int(256 * width_multiplier[2]), struct[2], stride=2)
    self.stage4 = self._make_stage(int(512 * width_multiplier[3]), struct[3], stride=2)
    if not self.backbone_mode:
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)

    if not build_residual_branches:
        self.eval()  # fusing has to be made in eval mode. When called in init, model will be built in eval mode
        fuse_repvgg_blocks_residual_branches(self)

    self.final_width_mult = width_multiplier[3]
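
As a usage sketch (the struct and width_multiplier values below follow the common RepVGG-A0 configuration and are an assumption for illustration, not taken from this page): build the model with residual branches for training, then fuse every RepVGGBlock into a single convolution before export.

import torch
from super_gradients.training.models.classification_models.repvgg import RepVGG

# Training-time model: multi-branch RepVGG blocks (identity + 1x1 + 3x3).
model = RepVGG(struct=[2, 4, 14, 1], width_multiplier=[0.75, 0.75, 0.75, 2.5], num_classes=1000)

# Deploy-time: fusing must happen in eval mode; prep_model_for_conversion
# collapses each block into a single conv when residual branches are present.
model.eval()
model.prep_model_for_conversion(input_size=(1, 3, 224, 224))

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))  # -> torch.Size([1, 1000])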

ResNet in PyTorch. For Pre-activation ResNet, see 'preact_resnet.py'. Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Deep Residual Learning for Image Recognition. arXiv:1512.03385

Pre-trained ImageNet models: 'deci-model-repository/resnet?/ckpt_best.pth' => ? = the type of resnet (e.g. 18, 34...).
Pre-trained CIFAR10 models: 'deci-model-repository/CIFAR_NAS_#??????/ckpt_best.pth' => ? = num of model, structure, width_mult.

Code adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

ResNet

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/resnet.py
class ResNet(BaseClassifier):
    def __init__(
        self,
        block,
        num_blocks: list,
        num_classes: int = 10,
        width_mult: float = 1,
        expansion: int = 1,
        droppath_prob=0.0,
        input_batchnorm: bool = False,
        backbone_mode: bool = False,
        in_channels: int = 3,
    ):
        super(ResNet, self).__init__()
        self.expansion = expansion
        self.backbone_mode = backbone_mode
        self.structure = [num_blocks, width_mult]
        self.in_planes = width_multiplier(64, width_mult)
        self.input_batchnorm = input_batchnorm

        if self.input_batchnorm:
            self.bn0 = nn.BatchNorm2d(num_features=in_channels)

        self.conv1 = nn.Conv2d(in_channels, width_multiplier(64, width_mult), kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(width_multiplier(64, width_mult))
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, width_multiplier(64, width_mult), num_blocks[0], stride=1, droppath_prob=droppath_prob)
        self.layer2 = self._make_layer(block, width_multiplier(128, width_mult), num_blocks[1], stride=2, droppath_prob=droppath_prob)
        self.layer3 = self._make_layer(block, width_multiplier(256, width_mult), num_blocks[2], stride=2, droppath_prob=droppath_prob)
        self.layer4 = self._make_layer(block, width_multiplier(512, width_mult), num_blocks[3], stride=2, droppath_prob=droppath_prob)

        if not self.backbone_mode:
            # IF RESNET IS IN BACK_BONE MODE WE DON'T NEED THE FINAL CLASSIFIER LAYERS, BUT ONLY THE NET BLOCK STRUCTURE
            self.linear = nn.Linear(width_multiplier(512, width_mult) * self.expansion, num_classes)
            self.avgpool = nn.AdaptiveAvgPool2d(1)

        self.width_mult = width_mult

    def _make_layer(self, block, planes, num_blocks, stride, droppath_prob):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        if num_blocks == 0:
            # When the number of blocks is zero but spatial dimension and/or number of filters about to change we put 1
            # 3X3 conv layer to make this change to the new dimensions.
            if stride != 1 or self.in_planes != planes:
                layers.append(nn.Sequential(nn.Conv2d(self.in_planes, planes, kernel_size=3, stride=stride, bias=False, padding=1), nn.BatchNorm2d(planes)))
                self.in_planes = planes

        else:
            for stride in strides:
                layers.append(block(self.in_planes, planes, stride, droppath_prob=droppath_prob))
                self.in_planes = planes * self.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        if self.input_batchnorm:
            x = self.bn0(x)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        if not self.backbone_mode:
            # IF RESNET IS *NOT* IN BACK_BONE MODE WE  NEED THE FINAL CLASSIFIER LAYERS OUTPUTS
            out = self.avgpool(out)
            out = out.squeeze(dim=2).squeeze(dim=2)
            out = self.linear(out)

        return out

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights

            # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
            super().load_state_dict(pretrained_backbone_weights_dict, strict)
        else:
            super().load_state_dict(pretrained_model_weights_dict, strict)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.linear = new_head
        else:
            self.linear = nn.Linear(width_multiplier(512, self.width_mult) * self.expansion, new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"linear": lr, "default": 0}

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        if self.input_batchnorm:
            self.bn0 = nn.BatchNorm2d(num_features=in_channels)  # rebuild bn0 for the new number of input channels

        self.conv1 = replace_conv2d_input_channels(conv=self.conv1, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.conv1.in_channels

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in src/super_gradients/training/models/classification_models/resnet.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights

        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_backbone_weights_dict, strict)
    else:
        super().load_state_dict(pretrained_model_weights_dict, strict)
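
As a short fine-tuning sketch using the head-replacement helpers documented above (the registered name "resnet18" passed to models.get is an assumption; use whatever architecture name your installation exposes):

import torch
from super_gradients.training import models

model = models.get("resnet18", num_classes=1000)  # returns a ResNet instance

# Swap the classification head for a new task while keeping the backbone weights.
model.replace_head(new_num_classes=10)

# Head-only fine-tuning: non-zero lr for "linear", zero for everything else.
print(model.get_finetune_lr_dict(lr=1e-3))  # {'linear': 0.001, 'default': 0}

with torch.no_grad():
    print(model(torch.randn(2, 3, 224, 224)).shape)  # torch.Size([2, 10])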

ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.

Code adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

GroupedConvBlock

Bases: nn.Module

Grouped convolution block.

Source code in src/super_gradients/training/models/classification_models/resnext.py
class GroupedConvBlock(nn.Module):
    """Grouped convolution block."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None):
        super(GroupedConvBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self.norm_layer = norm_layer
        width = int(planes * (base_width / 64.0)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

conv1x1(in_planes, out_planes, stride=1)

1x1 convolution

Source code in src/super_gradients/training/models/classification_models/resnext.py
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1)

3x3 convolution with padding

Source code in src/super_gradients/training/models/classification_models/resnext.py
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation)
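
To illustrate how the block composes conv1x1/conv3x3, here is a minimal sketch; the ResNeXt-style settings groups=32, base_width=4 and the explicit downsample branch are assumptions chosen so the residual addition has matching shapes:

import torch
from torch import nn
from super_gradients.training.models.classification_models.resnext import GroupedConvBlock, conv1x1

in_planes, planes = 64, 64
out_planes = planes * GroupedConvBlock.expansion  # 256

# The identity path must match the block output, so a 1x1 projection is used.
downsample = nn.Sequential(conv1x1(in_planes, out_planes), nn.BatchNorm2d(out_planes))
block = GroupedConvBlock(in_planes, planes, stride=1, downsample=downsample, groups=32, base_width=4)

x = torch.randn(1, in_planes, 56, 56)
# The grouped 3x3 runs on width = int(64 * 4 / 64) * 32 = 128 channels.
print(block(x).shape)  # torch.Size([1, 256, 56, 56])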

SENet in PyTorch.

SENet won the ImageNet-2017 (ILSVRC 2017) classification challenge. See "Squeeze-and-Excitation Networks" (arXiv:1709.01507) for details.

Code adapted from https://github.com/fastai/imagenet-fast/blob/master/cifar10/models/cifar10/senet.py

ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.

https://github.com/kuangliu/pytorch-cifar/blob/master/models/shufflenet.py

ShuffleBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/classification_models/shufflenet.py
class ShuffleBlock(nn.Module):
    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
        N, C, H, W = x.size()
        g = self.groups
        return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)

forward(x)

Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]

Source code in src/super_gradients/training/models/classification_models/shufflenet.py
def forward(self, x):
    """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
    N, C, H, W = x.size()
    g = self.groups
    return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)

ShuffleNetV2 in PyTorch.

See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. (https://arxiv.org/abs/1807.11164)

Code taken from torchvision/models/shufflenetv2.py

ChannelShuffleInvertedResidual

Bases: nn.Module

Implement Inverted Residual block as in [https://arxiv.org/abs/1807.11164] in Fig.3 (c) & (d):

  • When stride > 1:
      - the whole input goes through branch1,
      - the whole input goes through branch2,
    and an arbitrary number of output channels is produced.
  • When stride == 1:
      - half of the input channels are passed through as identity,
      - the other half of the input channels goes through branch2,
    and the number of output channels after the block remains the same as in the input.

Channel shuffle is performed on a concatenation in both cases.

Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
class ChannelShuffleInvertedResidual(nn.Module):
    """
    Implement Inverted Residual block as in [https://arxiv.org/abs/1807.11164] in Fig.3 (c) & (d):

    * When stride > 1
      - the whole input goes through branch1,
      - the whole input goes through branch2 ,
      and the arbitrary number of output channels are produced.
    * When stride == 1
      - half of input channels in are passed as identity,
      - another half of input channels goes through branch2,
      and the number of output channels after the block remains the same as in input.

    Channel shuffle is performed on a concatenation in both cases.
    """

    def __init__(self, inp: int, out: int, stride: int) -> None:
        super(ChannelShuffleInvertedResidual, self).__init__()

        assert 1 <= stride <= 3, "Illegal stride value"
        assert (stride != 1) or (inp == out), "When stride == 1 num of input channels should be equal to the requested num of out output channels"

        self.stride = stride
        # half of requested out channels will be produced by each branch
        branch_features = out // 2

        if self.stride > 1:
            self.branch1 = nn.Sequential(
                nn.Conv2d(inp, inp, kernel_size=3, stride=self.stride, padding=1, bias=False, groups=inp),
                nn.BatchNorm2d(inp),
                nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(branch_features),
                nn.ReLU(inplace=True),
            )
        else:
            # won't be called if self.stride == 1
            self.branch1 = nn.Identity()

        self.branch2 = nn.Sequential(
            # branch 2 operates on the whole input when stride > 1 and on half of it otherwise
            nn.Conv2d(inp if (self.stride > 1) else inp // 2, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(branch_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1, bias=False, groups=branch_features),
            nn.BatchNorm2d(branch_features),
            nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(branch_features),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def channel_shuffle(x: Tensor, groups: int) -> Tensor:
        """
        From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164):
            A “channel shuffle” operation is then introduced to enable
            information communication between different groups of channels and improve accuracy.

        The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

        Example:
            If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle),
            then activation maps in x are:
            from_B1, from_B1, ... from_B2, from_B2
            After channel_shuffle activation maps in x will be:
            from_B1, from_B2, ... from_B1, from_B2
        """

        batch_size, num_channels, height, width = x.size()
        channels_per_group = num_channels // groups

        # reshape
        x = x.view(batch_size, groups, channels_per_group, height, width)
        x = torch.transpose(x, 1, 2).contiguous()

        # flatten
        x = x.view(batch_size, -1, height, width)
        return x

    def forward(self, x: Tensor) -> Tensor:
        if self.stride == 1:
            # num channels remains the same due to assert that inp == out in __init__
            x1, x2 = x.chunk(2, dim=1)
            out = torch.cat((x1, self.branch2(x2)), dim=1)
        else:
            # inp num channels can change to a requested arbitrary out num channels
            out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)

        out = self.channel_shuffle(out, 2)
        return out
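
A small sketch of the two modes described above (channel counts are illustrative):

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ChannelShuffleInvertedResidual

x = torch.randn(1, 24, 56, 56)

# stride > 1: both branches see the whole input, output channels are arbitrary, spatial size halves.
down = ChannelShuffleInvertedResidual(inp=24, out=116, stride=2)
print(down(x).shape)        # torch.Size([1, 116, 28, 28])

# stride == 1: half the channels pass through as identity, so inp must equal out.
keep = ChannelShuffleInvertedResidual(inp=116, out=116, stride=1)
print(keep(down(x)).shape)  # torch.Size([1, 116, 28, 28])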

channel_shuffle(x, groups) staticmethod

From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164): A “channel shuffle” operation is then introduced to enable information communication between different groups of channels and improve accuracy.

The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

Example: If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle), then activation maps in x are: from_B1, from_B1, ... from_B2, from_B2. After channel_shuffle, activation maps in x will be: from_B1, from_B2, ... from_B1, from_B2.

Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
@staticmethod
def channel_shuffle(x: Tensor, groups: int) -> Tensor:
    """
    From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164):
        A “channel shuffle” operation is then introduced to enable
        information communication between different groups of channels and improve accuracy.

    The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

    Example:
        If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle),
        then activation maps in x are:
        from_B1, from_B1, ... from_B2, from_B2
        After channel_shuffle activation maps in x will be:
        from_B1, from_B2, ... from_B1, from_B2
    """

    batch_size, num_channels, height, width = x.size()
    channels_per_group = num_channels // groups

    # reshape
    x = x.view(batch_size, groups, channels_per_group, height, width)
    x = torch.transpose(x, 1, 2).contiguous()

    # flatten
    x = x.view(batch_size, -1, height, width)
    return x
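
A quick check of the interleaving described in the docstring example:

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ChannelShuffleInvertedResidual

# 4 channels: the first two come from branch 1, the last two from branch 2.
x = torch.arange(4.0).view(1, 4, 1, 1)                       # channel order: [0, 1, 2, 3]
shuffled = ChannelShuffleInvertedResidual.channel_shuffle(x, groups=2)
print(shuffled.view(-1).tolist())                            # [0.0, 2.0, 1.0, 3.0] -> B1, B2, B1, B2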

ShuffleNetV2Base

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
class ShuffleNetV2Base(BaseClassifier):
    def __init__(
        self,
        structure: List[int],
        stages_out_channels: List[int],
        backbone_mode: bool = False,
        num_classes: int = 1000,
        block: nn.Module = ChannelShuffleInvertedResidual,
        in_channels: int = 3,
    ):
        super(ShuffleNetV2Base, self).__init__()

        self.backbone_mode = backbone_mode

        if len(structure) != 3:
            raise ValueError("expected structure as list of 3 positive ints")
        if len(stages_out_channels) != 5:
            raise ValueError("expected stages_out_channels as list of 5 positive ints")
        self.structure = structure
        self.out_channels = stages_out_channels

        output_channels = self.out_channels[0]
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, output_channels, 3, 2, 1, bias=False),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
        )

        input_channels = output_channels

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Static annotations for mypy
        self.layer2 = self._make_layer(block, input_channels, self.out_channels[1], self.structure[0])
        self.layer3 = self._make_layer(block, self.out_channels[1], self.out_channels[2], self.structure[1])
        self.layer4 = self._make_layer(block, self.out_channels[2], self.out_channels[3], self.structure[2])

        input_channels = self.out_channels[3]
        output_channels = self.out_channels[-1]
        self.conv5 = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
        )

        if not self.backbone_mode:
            self.avgpool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(output_channels, num_classes)

    @staticmethod
    def _make_layer(block, input_channels, output_channels, repeats):
        # add first block with stride 2 to downsize the input
        seq = [block(input_channels, output_channels, 2)]

        for _ in range(repeats - 1):
            seq.append(block(output_channels, output_channels, 1))
        return nn.Sequential(*seq)

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # removing fc weights first not to break strict loading
            fc_weights_keys = [k for k in pretrained_model_weights_dict if "fc" in k]

            for key in fc_weights_keys:
                pretrained_model_weights_dict.pop(key)

        super().load_state_dict(pretrained_model_weights_dict, strict)

    def forward(self, x: Tensor) -> Tensor:
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.conv5(x)

        if not self.backbone_mode:
            x = self.avgpool(x)
            x = x.view(x.size(0), -1)
            x = self.fc(x)
        return x

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.conv1[0] = replace_conv2d_input_channels(conv=self.conv1[0], in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.conv1[0].in_channels

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in src/super_gradients/training/models/classification_models/shufflenetv2.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # removing fc weights first not to break strict loading
        fc_weights_keys = [k for k in pretrained_model_weights_dict if "fc" in k]

        for key in fc_weights_keys:
            pretrained_model_weights_dict.pop(key)

    super().load_state_dict(pretrained_model_weights_dict, strict)
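
A construction sketch; the structure and stages_out_channels values below follow the common ShuffleNetV2-x1.0 setup and are an assumption for illustration:

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ShuffleNetV2Base

model = ShuffleNetV2Base(structure=[4, 8, 4], stages_out_channels=[24, 116, 232, 464, 1024], num_classes=1000)
with torch.no_grad():
    print(model(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])

# backbone_mode=True skips avgpool/fc and returns the conv5 feature map instead.
backbone = ShuffleNetV2Base(structure=[4, 8, 4], stages_out_channels=[24, 116, 232, 464, 1024], backbone_mode=True)
with torch.no_grad():
    print(backbone(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1024, 7, 7])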

VGG11/13/16/19 in PyTorch. Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py

Vision Transformer in PyTorch. Reference: [1] Dosovitskiy, Alexey, et al. "An image is worth 16x16 words: Transformers for image recognition at scale." arXiv preprint arXiv:2010.11929 (2020)

Code adapted from https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py

Attention

Bases: nn.Module

self attention layer with residual connection

Source code in src/super_gradients/training/models/classification_models/vit.py
class Attention(nn.Module):
    """
    self attention layer with residual connection
    """

    def __init__(self, hidden_dim, heads=8):
        super().__init__()
        dim_head = hidden_dim // heads
        inner_dim = dim_head * heads

        self.heads = heads
        self.scale = dim_head**-0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(hidden_dim, inner_dim * 3, bias=True)  # Qx, Kx, Vx are calculated at once
        self.proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):

        B, N, C = x.shape
        # computing query, key and value matrices at once
        qkv = self.to_qkv(x).reshape(B, N, 3, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        out = (attn @ v).transpose(1, 2).reshape(B, N, C)

        out = self.proj(out)

        return out

FeedForward

Bases: nn.Module

feed forward block with residual connection

Source code in src/super_gradients/training/models/classification_models/vit.py
class FeedForward(nn.Module):
    """
    feed forward block with residual connection
    """

    def __init__(self, hidden_dim, mlp_dim, dropout=0.0):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, mlp_dim)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(mlp_dim, hidden_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.act(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        return out

PatchEmbed

Bases: nn.Module

2D Image to Patch Embedding Using Conv layers (Faster than rearranging + Linear)

Source code in src/super_gradients/training/models/classification_models/vit.py
class PatchEmbed(nn.Module):
    """
    2D Image to Patch Embedding Using Conv layers (Faster than rearranging + Linear)
    """

    def __init__(self, img_size: tuple, patch_size: tuple, in_channels=3, hidden_dim=768, norm_layer=None, flatten=True):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_channels, hidden_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(hidden_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x

    def get_input_channels(self) -> int:
        return self.proj.in_channels
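
A quick sanity check of the patch-grid arithmetic (224 / 16 = 14 patches per side):

import torch
from super_gradients.training.models.classification_models.vit import PatchEmbed

embed = PatchEmbed(img_size=(224, 224), patch_size=(16, 16), in_channels=3, hidden_dim=768)
print(embed.num_patches)                     # 196 = 14 * 14

tokens = embed(torch.randn(2, 3, 224, 224))  # BCHW -> BNC
print(tokens.shape)                          # torch.Size([2, 196, 768])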

ViT

Bases: BaseClassifier

Source code in src/super_gradients/training/models/classification_models/vit.py
class ViT(BaseClassifier):
    def __init__(
        self,
        image_size: tuple,
        patch_size: tuple,
        num_classes: int,
        hidden_dim: int,
        depth: int,
        heads: int,
        mlp_dim: int,
        in_channels=3,
        dropout_prob=0.0,
        emb_dropout_prob=0.0,
        backbone_mode=False,
    ):
        """
        :param image_size: Image size tuple for data processing into patches done within the model.
        :param patch_size: Patch size tuple for data processing into patches done within the model.
        :param num_classes: Number of classes for the classification head.
        :param hidden_dim: Output dimension of each transformer block.
        :param depth: Number of transformer blocks
        :param heads: Number of attention heads
        :param mlp_dim: Intermediate dimension of the transformer block's feed forward
        :param in_channels: input channels
        :param dropout_prob: Dropout ratio in the feed forward layers.
        :param emb_dropout_prob: Dropout ratio applied after the embedding layer.
        :param backbone_mode: If True output after pooling layer
        """

        super().__init__()
        image_height, image_width = image_size
        patch_height, patch_width = patch_size

        assert image_height % patch_height == 0 and image_width % patch_width == 0, "Image dimensions must be divisible by the patch size."
        assert hidden_dim % heads == 0, "Hidden dimension must be divisible by the number of heads."

        num_patches = (image_height // patch_height) * (image_width // patch_width)

        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_dim = hidden_dim
        self.patch_embedding = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.hidden_dim)

        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
        self.dropout = nn.Dropout(emb_dropout_prob)

        self.transformer = Transformer(hidden_dim, depth, heads, mlp_dim, dropout_prob)

        self.backbone_mode = backbone_mode
        self.pre_head_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
        self.head = nn.Linear(hidden_dim, num_classes)

    def forward(self, img):
        x = self.patch_embedding(img)  # Convert image to patches and embed
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, : (n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = self.pre_head_norm(x)
        x = x[:, 0]
        if self.backbone_mode:
            return x
        else:
            return self.head(x)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head = nn.Linear(self.head.in_features, new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"head": lr, "default": 0}

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.patch_embedding = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.hidden_dim)

    def get_input_channels(self) -> int:
        return self.patch_embedding.get_input_channels()

__init__(image_size, patch_size, num_classes, hidden_dim, depth, heads, mlp_dim, in_channels=3, dropout_prob=0.0, emb_dropout_prob=0.0, backbone_mode=False)

Parameters:

Name Type Description Default
image_size tuple

Image size tuple for data processing into patches done within the model.

required
patch_size tuple

Patch size tuple for data processing into patches done within the model.

required
num_classes int

Number of classes for the classification head.

required
hidden_dim int

Output dimension of each transformer block.

required
depth int

Number of transformer blocks

required
heads int

Number of attention heads

required
mlp_dim int

Intermediate dimension of the transformer block's feed forward

required
in_channels

input channels

3
dropout_prob

Dropout ratio in the feed forward layers.

0.0
emb_dropout_prob

Dropout ratio applied after the embedding layer.

0.0
backbone_mode

If True, return the normalized class-token features instead of the classification head output

False
Source code in src/super_gradients/training/models/classification_models/vit.py
def __init__(
    self,
    image_size: tuple,
    patch_size: tuple,
    num_classes: int,
    hidden_dim: int,
    depth: int,
    heads: int,
    mlp_dim: int,
    in_channels=3,
    dropout_prob=0.0,
    emb_dropout_prob=0.0,
    backbone_mode=False,
):
    """
    :param image_size: Image size tuple for data processing into patches done within the model.
    :param patch_size: Patch size tuple for data processing into patches done within the model.
    :param num_classes: Number of classes for the classification head.
    :param hidden_dim: Output dimension of each transformer block.
    :param depth: Number of transformer blocks
    :param heads: Number of attention heads
    :param mlp_dim: Intermediate dimension of the transformer block's feed forward
    :param in_channels: input channels
    :param dropout_prob: Dropout ratio in the feed forward layers.
    :param emb_dropout_prob: Dropout ratio applied after the embedding layer.
    :param backbone_mode: If True output after pooling layer
    """

    super().__init__()
    image_height, image_width = image_size
    patch_height, patch_width = patch_size

    assert image_height % patch_height == 0 and image_width % patch_width == 0, "Image dimensions must be divisible by the patch size."
    assert hidden_dim % heads == 0, "Hidden dimension must be divisible by the number of heads."

    num_patches = (image_height // patch_height) * (image_width // patch_width)

    self.image_size = image_size
    self.patch_size = patch_size
    self.hidden_dim = hidden_dim
    self.patch_embedding = PatchEmbed(img_size=self.image_size, patch_size=self.patch_size, in_channels=in_channels, hidden_dim=self.hidden_dim)

    self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
    self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
    self.dropout = nn.Dropout(emb_dropout_prob)

    self.transformer = Transformer(hidden_dim, depth, heads, mlp_dim, dropout_prob)

    self.backbone_mode = backbone_mode
    self.pre_head_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
    self.head = nn.Linear(hidden_dim, num_classes)
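
A minimal construction sketch; the hyperparameters below roughly match ViT-Base/16 and are an assumption for illustration:

import torch
from super_gradients.training.models.classification_models.vit import ViT

model = ViT(image_size=(224, 224), patch_size=(16, 16), num_classes=1000, hidden_dim=768, depth=12, heads=12, mlp_dim=3072)

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))  # prediction from the class token
print(logits.shape)  # torch.Size([1, 1000])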

ConvertableCompletePipelineModel

Bases: torch.nn.Module

Exportable nn.Module that wraps the model, preprocessing and postprocessing.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, the main model. Takes input from pre_process's output; its own output feeds post_process.

required
pre_process torch.nn.Module

torch.nn.Module, preprocessing module, its output will be model's input. When None (default), set to Identity().

None
post_process torch.nn.Module

torch.nn.Module, postprocessing module, its output is the final output. When None (default), set to Identity().

None
**prep_model_for_conversion_kwargs

for SgModules- args to be passed to model.prep_model_for_conversion prior to torch.onnx.export call.

{}
Source code in src/super_gradients/training/models/conversion.py
class ConvertableCompletePipelineModel(torch.nn.Module):
    """
    Exportable nn.Module that wraps the model, preprocessing and postprocessing.

    :param model: torch.nn.Module, the main model. Takes input from pre_process's output; its own output feeds post_process.
    :param pre_process: torch.nn.Module, preprocessing module, its output will be model's input. When None (default), set to Identity().
    :param post_process: torch.nn.Module, postprocessing module, its output is the final output. When None (default), set to Identity().
    :param **prep_model_for_conversion_kwargs: for SgModules- args to be passed to model.prep_model_for_conversion
            prior to torch.onnx.export call.
    """

    def __init__(self, model: torch.nn.Module, pre_process: torch.nn.Module = None, post_process: torch.nn.Module = None, **prep_model_for_conversion_kwargs):
        super(ConvertableCompletePipelineModel, self).__init__()
        model.eval()
        pre_process = pre_process or Identity()
        post_process = post_process or Identity()
        if hasattr(model, "prep_model_for_conversion"):
            model.prep_model_for_conversion(**prep_model_for_conversion_kwargs)
        self.model = model
        self.pre_process = pre_process
        self.post_process = post_process

    def forward(self, x):
        return self.post_process(self.model(self.pre_process(x)))
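
A wrapping sketch using plain torch modules for pre/post processing (the tiny model below is an arbitrary stand-in); prep_model_for_conversion kwargs are only forwarded when the wrapped model actually defines prep_model_for_conversion:

import torch
from torch import nn
from super_gradients.training.models.conversion import ConvertableCompletePipelineModel

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))

pipeline = ConvertableCompletePipelineModel(
    model=model,
    pre_process=nn.Identity(),       # e.g. a normalization / resizing module
    post_process=nn.Softmax(dim=1),  # e.g. turn logits into probabilities
)

with torch.no_grad():
    probs = pipeline(torch.randn(1, 3, 64, 64))  # pre_process -> model -> post_process
print(probs.shape)  # torch.Size([1, 10])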

convert_from_config(cfg)

Exports model according to cfg.

See: super_gradients/recipes/conversion_params/default_conversion_params.yaml for the full cfg content documentation, and super_gradients/examples/convert_recipe_example/convert_recipe_example.py for usage.

Parameters:

Name Type Description Default
cfg DictConfig required

Returns:

Type Description
str

out_path, the path of the saved .onnx file.

Source code in src/super_gradients/training/models/conversion.py
def convert_from_config(cfg: DictConfig) -> str:
    """
    Exports model according to cfg.

    See:
     super_gradients/recipes/conversion_params/default_conversion_params.yaml for the full cfg content documentation,
     and super_gradients/examples/convert_recipe_example/convert_recipe_example.py for usage.
    :param cfg:
    :return: out_path, the path of the saved .onnx file.
    """
    cfg, experiment_cfg = prepare_conversion_cfgs(cfg)
    model = models.get(
        model_name=experiment_cfg.architecture,
        num_classes=experiment_cfg.arch_params.num_classes,
        arch_params=experiment_cfg.arch_params,
        strict_load=cfg.strict_load,
        checkpoint_path=cfg.checkpoint_path,
    )
    cfg = parse_args(cfg, models.convert_to_onnx)
    out_path = models.convert_to_onnx(model=model, **cfg)
    logger.info(f"Successfully exported model at {out_path}")
    return out_path

convert_to_coreml(model, out_path, input_size=None, pre_process=None, post_process=None, prep_model_for_conversion_kwargs=None, export_as_ml_program=False, torch_trace_kwargs=None)

Exports a given SG model to CoreML mlprogram or package.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, model to export to CoreML.

required
out_path str

str, destination path for the .mlmodel file.

required
input_size tuple

Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.

None
pre_process torch.nn.Module

torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()

None
post_process torch.nn.Module

torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()

None
prep_model_for_conversion_kwargs

dict, for SgModules- args to be passed to model.prep_model_for_conversion prior to ct.convert call. Supported keys are: - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.

None
export_as_ml_program

Whether to convert to the new program format (better) or legacy coreml proto file (Supports more iOS versions and devices, but this format will be deprecated at some point).

False
torch_trace_kwargs

kwargs for torch.jit.trace

None

Returns:

Type Description

Path

Source code in src/super_gradients/training/models/conversion.py
@resolve_param("pre_process", TransformsFactory())
@resolve_param("post_process", TransformsFactory())
def convert_to_coreml(
    model: torch.nn.Module,
    out_path: str,
    input_size: tuple = None,
    pre_process: torch.nn.Module = None,
    post_process: torch.nn.Module = None,
    prep_model_for_conversion_kwargs=None,
    export_as_ml_program=False,
    torch_trace_kwargs=None,
):
    """
        Exports a given SG model to CoreML mlprogram or package.

        :param model: torch.nn.Module, model to export to CoreML.
        :param out_path: str, destination path for the .mlmodel file.
        :param input_size: Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.
        :param pre_process: torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()
        :param post_process: torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()
        :param prep_model_for_conversion_kwargs: dict, for SgModules- args to be passed to model.prep_model_for_conversion
         prior to ct.convert call. Supported keys are:
        - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.
        :param export_as_ml_program: Whether to convert to the new program format (better) or legacy coreml proto file
                            (Supports more iOS versions and devices, but this format will be deprecated at some point).
        :param torch_trace_kwargs: kwargs for torch.jit.trace
    :return: Path
    """
    if ct is None:
        raise ImportError(
            '"coremltools" is required for CoreML export, but is not installed. Please install CoreML Tools using:\n'
            '   "python3 -m pip install coremltools" and try again (Tested with version 6.3.0);'
        )

    logger.debug("Building model...")
    logger.debug(model)
    logger.debug("Model child nodes:")
    logger.debug(next(model.named_children()))

    if not os.path.isdir(pathlib.Path(out_path).parent.resolve()):
        raise FileNotFoundError(f"Could not find destination directory {out_path} for the CoreML file.")
    torch_trace_kwargs = torch_trace_kwargs or dict()
    prep_model_for_conversion_kwargs = prep_model_for_conversion_kwargs or dict()

    if input_size is not None:
        input_size = (1, *input_size)
        logger.warning(
            f"input_shape is deprecated and will be removed in the next major release."
            f"Use the convert_to_coreml(..., prep_model_for_conversion_kwargs(input_size={input_size})) instead"
        )
        prep_model_for_conversion_kwargs["input_size"] = input_size

    if "input_size" not in prep_model_for_conversion_kwargs:
        raise KeyError("input_size must be provided in prep_model_for_conversion_kwargs")

    input_size = prep_model_for_conversion_kwargs["input_size"]

    # TODO: support more than 1 input when prep_for_conversoin will support it.
    example_inputs = [torch.Tensor(np.zeros(input_size))]

    if not out_path.endswith(".mlpackage") and not out_path.endswith(".mlmodel"):
        out_path += ".mlpackage" if export_as_ml_program else ".mlmodel"

    complete_model = ConvertableCompletePipelineModel(model, pre_process, post_process, **prep_model_for_conversion_kwargs)

    # Set the model in evaluation mode.
    complete_model.eval()

    logger.info("Creating torch jit trace...")
    traced_model = torch.jit.trace(complete_model, example_inputs, **torch_trace_kwargs)
    logger.info("Tracing the model with the provided inputs...")
    out = traced_model(*example_inputs)  # using * because example_inputs is a list
    logger.info(f"Inferred output shapes: {[o.shape for o in out]}")
    if export_as_ml_program:
        coreml_model = ct.convert(
            traced_model, convert_to="mlprogram", inputs=[ct.ImageType(name=f"x_{i + 1}", shape=_.shape) for i, _ in enumerate(example_inputs)]
        )
    else:
        coreml_model = ct.convert(traced_model, inputs=[ct.ImageType(name=f"x_{i + 1}", shape=_.shape) for i, _ in enumerate(example_inputs)])

    spec = coreml_model.get_spec()
    logger.debug(spec.description)

    # Changing the input names:
    #   In CoreML, the input name is compiled into classes (named keyword argument in predict).
    #   We want to re-use the same names among different models to make research easier.
    #   We normalize the inputs names to be x_1, x_2, etc.
    for i, _input in enumerate(spec.description.input):
        new_input_name = "x_" + str(i + 1)
        logger.info(f"Renaming input {_input.name} to {new_input_name}")
        ct.utils.rename_feature(spec, _input.name, new_input_name)

    # Re-Initializing the model with the new spec
    coreml_model = ct.models.MLModel(spec, weights_dir=coreml_model.weights_dir)

    # Saving the model
    coreml_model.save(out_path)
    logger.info(f"CoreML model successfully save to {os.path.abspath(out_path)}")
    return out_path
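
A usage sketch (requires coremltools to be installed; the registered name "resnet18" passed to models.get is an assumption):

from super_gradients.training import models
from super_gradients.training.models.conversion import convert_to_coreml

model = models.get("resnet18", num_classes=1000)

coreml_path = convert_to_coreml(
    model=model,
    out_path="resnet18",  # ".mlpackage" / ".mlmodel" is appended automatically
    prep_model_for_conversion_kwargs={"input_size": (1, 3, 224, 224)},
    export_as_ml_program=True,
)
print(coreml_path)  # resnet18.mlpackage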

convert_to_onnx(model, out_path, input_shape=None, pre_process=None, post_process=None, prep_model_for_conversion_kwargs=None, torch_onnx_export_kwargs=None, simplify=True)

Exports model to ONNX.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, model to export to ONNX.

required
out_path str

str, destination path for the .onnx file.

required
input_shape tuple

Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1. DEPRECATED USE input_size KWARG IN prep_model_for_conversion_kwargs INSTEAD.

None
pre_process torch.nn.Module

torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()

None
post_process torch.nn.Module

torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()

None
prep_model_for_conversion_kwargs

dict, for SgModules- args to be passed to model.prep_model_for_conversion prior to torch.onnx.export call. Supported keys are: - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.

None
torch_onnx_export_kwargs

kwargs to be unpacked in the torch.onnx.export call (excluding the first 3 kwargs: model, args, f)

None
simplify bool

bool, whether to apply the ONNX simplifier, same as `python -m onnxsim onnx_path onnx_sim_path`. When True, the simplified model will be saved in out_path (default=True).

True

Returns:

Type Description

out_path

Source code in src/super_gradients/training/models/conversion.py
@resolve_param("pre_process", TransformsFactory())
@resolve_param("post_process", TransformsFactory())
def convert_to_onnx(
    model: torch.nn.Module,
    out_path: str,
    input_shape: tuple = None,
    pre_process: torch.nn.Module = None,
    post_process: torch.nn.Module = None,
    prep_model_for_conversion_kwargs=None,
    torch_onnx_export_kwargs=None,
    simplify: bool = True,
):
    """
    Exports model to ONNX.

    :param model: torch.nn.Module, model to export to ONNX.
    :param out_path: str, destination path for the .onnx file.
    :param input_shape: Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.
    DEPRECATED USE input_size KWARG IN prep_model_for_conversion_kwargs INSTEAD.
    :param pre_process: torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()
    :param post_process: torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()
    :param prep_model_for_conversion_kwargs: dict, for SgModules- args to be passed to model.prep_model_for_conversion
     prior to torch.onnx.export call. Supported keys are:
    - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.
    :param torch_onnx_export_kwargs: kwargs (EXCLUDING: FIRST 3 KWARGS- MODEL, F, ARGS). to be unpacked in torch.onnx.export call
    :param simplify: bool,whether to apply onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path.
     When true, the simplified model will be saved in out_path (default=True).

    :return: out_path
    """
    if not os.path.isdir(pathlib.Path(out_path).parent.resolve()):
        raise FileNotFoundError(f"Could not find destination directory {out_path} for the ONNX file.")
    torch_onnx_export_kwargs = torch_onnx_export_kwargs or dict()
    prep_model_for_conversion_kwargs = prep_model_for_conversion_kwargs or dict()

    if input_shape is not None:
        input_size = (1, *input_shape)
        logger.warning(
            f"input_shape is deprecated and will be removed in the next major release."
            f"Use the convert_to_onnx(..., prep_model_for_conversion_kwargs(input_size={input_size})) instead"
        )
        prep_model_for_conversion_kwargs["input_size"] = input_size

    if "input_size" not in prep_model_for_conversion_kwargs:
        raise KeyError("input_size must be provided in prep_model_for_conversion_kwargs")

    input_size = prep_model_for_conversion_kwargs["input_size"]

    onnx_input = torch.Tensor(np.zeros(input_size))
    if not out_path.endswith(".onnx"):
        out_path = out_path + ".onnx"
    complete_model = ConvertableCompletePipelineModel(model, pre_process, post_process, **prep_model_for_conversion_kwargs)

    torch.onnx.export(model=complete_model, args=onnx_input, f=out_path, **torch_onnx_export_kwargs)
    if simplify:
        onnx_simplify(out_path, out_path)
    return out_path
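
A usage sketch (the registered name "resnet18" passed to models.get is an assumption; any nn.Module with a matching prep_model_for_conversion can be exported):

from super_gradients.training import models

model = models.get("resnet18", num_classes=1000)

onnx_path = models.convert_to_onnx(
    model=model,
    out_path="resnet18.onnx",
    prep_model_for_conversion_kwargs={"input_size": (1, 3, 224, 224)},
    torch_onnx_export_kwargs={"opset_version": 11},
    simplify=False,  # keep True (the default) to also run the ONNX simplifier on the result
)
print(onnx_path)  # resnet18.onnx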

onnx_simplify(onnx_path, onnx_sim_path)

ONNX simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path`

Parameters:

Name Type Description Default
onnx_path str

path to onnx model

required
onnx_sim_path str

path for output onnx simplified model

required
Source code in src/super_gradients/training/models/conversion.py
def onnx_simplify(onnx_path: str, onnx_sim_path: str):
    """
    onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path
    :param onnx_path: path to onnx model
    :param onnx_sim_path: path for output onnx simplified model
    """
    model_sim, check = simplify(model=onnx_path)
    if not check:
        raise RuntimeError("Simplified ONNX model could not be validated")
    onnx.save_model(model_sim, onnx_sim_path)

prepare_conversion_cfgs(cfg)

Builds the cfg (i.e. conversion_params) and experiment_cfg (i.e. the recipe config according to cfg.experiment_name) to be used by convert_recipe_example

Parameters:

Name Type Description Default
cfg DictConfig

DictConfig, conversion_params config

required

Returns:

Type Description

cfg, experiment_cfg

Source code in src/super_gradients/training/models/conversion.py
def prepare_conversion_cfgs(cfg: DictConfig):
    """
    Builds the cfg (i.e conversion_params) and experiment_cfg (i.e recipe config according to cfg.experiment_name)
     to be used by convert_recipe_example

    :param cfg: DictConfig, converion_params config
    :return: cfg, experiment_cfg
    """
    cfg = hydra.utils.instantiate(cfg)
    # CREATE THE EXPERIMENT CFG

    # Load the latest experiment config
    run_id = get_param(cfg, "run_id")
    if run_id is None:
        run_id = get_latest_run_id(experiment_name=cfg.experiment_name, checkpoints_root_dir=cfg.ckpt_root_dir)
    experiment_cfg = load_experiment_cfg(ckpt_root_dir=cfg.ckpt_root_dir, experiment_name=cfg.experiment_name, run_id=run_id)

    hydra.utils.instantiate(experiment_cfg)
    if cfg.checkpoint_path is None:
        logger.info(
            "checkpoint_params.checkpoint_path was not provided, so the model will be converted using weights from "
            "checkpoints_dir/training_hyperparams.ckpt_name "
        )
        checkpoints_dir = get_checkpoints_dir_path(experiment_name=cfg.experiment_name, ckpt_root_dir=cfg.ckpt_root_dir, run_id=run_id)
        cfg.checkpoint_path = os.path.join(checkpoints_dir, cfg.ckpt_name)

    cfg.out_path = cfg.out_path or cfg.checkpoint_path.replace(".pth", ".onnx")
    logger.info(f"Exporting checkpoint: {cfg.checkpoint_path} to ONNX.")
    return cfg, experiment_cfg

CSP Darknet

CSPLayer

Bases: nn.Module

CSP Bottleneck with 3 convolutions

Parameters:

Name Type Description Default
in_channels int

int, input channels.

required
out_channels int

int, output channels.

required
num_bottlenecks int

int, number of bottleneck conv layers.

required
act Type[nn.Module]

Type[nn.module], activation type.

required
shortcut bool

bool, whether to apply shortcut (i.e add input to result) in bottlenecks (default=True).

True
depthwise bool

bool, whether to use GroupedConvBlock in last conv in bottlenecks (default=False).

False
expansion float

float, determines the number of hidden channels (default=0.5).

0.5
Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
class CSPLayer(nn.Module):
    """
    CSP Bottleneck with 3 convolutions

    :param in_channels: int, input channels.
    :param out_channels: int, output channels.
    :param num_bottlenecks: int, number of bottleneck conv layers.
    :param act: Type[nn.module], activation type.
    :param shortcut: bool, whether to apply shortcut (i.e add input to result) in bottlenecks (default=True).
    :param depthwise: bool, whether to use GroupedConvBlock in last conv in bottlenecks (default=False).
    :param expansion: float, determines the number of hidden channels (default=0.5).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int,
        act: Type[nn.Module],
        shortcut: bool = True,
        depthwise: bool = False,
        expansion: float = 0.5,
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=act)
        self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=act)
        self.conv3 = Conv(2 * hidden_channels, out_channels, 1, stride=1, activation_type=act)
        module_list = [Bottleneck(hidden_channels, hidden_channels, shortcut, act, depthwise) for _ in range(num_bottlenecks)]
        self.bottlenecks = nn.Sequential(*module_list)

    def forward(self, x):
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = torch.cat((x_1, x_2), dim=1)
        return self.conv3(x)
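
For illustration only, a small sketch of constructing a CSPLayer and checking its output shape (channel counts and input size are arbitrary):

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import CSPLayer

layer = CSPLayer(in_channels=64, out_channels=128, num_bottlenecks=2, act=nn.SiLU)
x = torch.randn(2, 64, 32, 32)
y = layer(x)
print(y.shape)  # torch.Size([2, 128, 32, 32]) -- spatial size preserved, channels become out_channels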

GroupedConvBlock

Bases: nn.Module

Grouped Conv KxK -> usual Conv 1x1

Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
class GroupedConvBlock(nn.Module):
    """
    Grouped Conv KxK -> usual Conv 1x1
    """

    def __init__(self, input_channels, output_channels, kernel, stride, activation_type: Type[nn.Module], padding: int = None, groups: int = None):
        """
        :param groups:  num of groups in the first conv; if None depthwise separable conv will be used
                        (groups = input channels)
        """
        super().__init__()

        self.dconv = Conv(input_channels, input_channels, kernel, stride, activation_type, padding, groups=groups or input_channels)
        self.conv = Conv(input_channels, output_channels, 1, 1, activation_type)

    def forward(self, x):
        return self.conv(self.dconv(x))

__init__(input_channels, output_channels, kernel, stride, activation_type, padding=None, groups=None)

Parameters:

Name Type Description Default
groups int

num of groups in the first conv; if None depthwise separable conv will be used (groups = input channels)

None
Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
def __init__(self, input_channels, output_channels, kernel, stride, activation_type: Type[nn.Module], padding: int = None, groups: int = None):
    """
    :param groups:  num of groups in the first conv; if None depthwise separable conv will be used
                    (groups = input channels)
    """
    super().__init__()

    self.dconv = Conv(input_channels, input_channels, kernel, stride, activation_type, padding, groups=groups or input_channels)
    self.conv = Conv(input_channels, output_channels, 1, 1, activation_type)
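
A brief sketch with illustrative values: leaving groups=None makes the first KxK convolution depthwise (groups equals the input channels), so the block acts as a depthwise-separable convolution followed by a 1x1 projection.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import GroupedConvBlock

block = GroupedConvBlock(input_channels=64, output_channels=128, kernel=3, stride=1, activation_type=nn.SiLU)
y = block(torch.randn(1, 64, 40, 40))
print(y.shape)  # channels: 64 -> 128; spatial size preserved for stride=1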

SPP

Bases: BaseDetectionModule

Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
@register_detection_module()
class SPP(BaseDetectionModule):
    # SPATIAL PYRAMID POOLING LAYER
    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(self, in_channels, output_channels, k: Tuple, activation_type: Type[nn.Module]):
        super().__init__(in_channels)
        self._output_channels = output_channels

        hidden_channels = in_channels // 2
        self.cv1 = Conv(in_channels, hidden_channels, 1, 1, activation_type)
        self.cv2 = Conv(hidden_channels * (len(k) + 1), output_channels, 1, 1, activation_type)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))

    @property
    def out_channels(self):
        """
        :return: channels of tensor(s) that will be returned by a module  in forward
        """
        return self._output_channels

out_channels property

Returns:

Type Description

channels of tensor(s) that will be returned by a module in forward
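
A minimal sketch of the SPP block (kernel sizes and channel counts are illustrative): the stride-1 max-pooling branches keep the input resolution, are concatenated with the 1x1-projected input, and are then projected to output_channels.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import SPP

spp = SPP(in_channels=256, output_channels=256, k=(5, 9, 13), activation_type=nn.SiLU)
y = spp(torch.randn(1, 256, 20, 20))
print(y.shape)           # torch.Size([1, 256, 20, 20])
print(spp.out_channels)  # 256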

ViewModule

Bases: nn.Module

Returns a reshaped version of the input, to be used in None-Backbone Mode

Source code in src/super_gradients/training/models/detection_models/csp_darknet53.py
class ViewModule(nn.Module):
    """
    Returns a reshaped version of the input, to be used in None-Backbone Mode
    """

    def __init__(self, features=1024):
        super(ViewModule, self).__init__()
        self.features = features

    def forward(self, x):
        return x.view(-1, self.features)

CSPResNetBackbone

Bases: nn.Module, SupportsReplaceInputChannels

CSPResNet backbone

Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
@register_detection_module()
class CSPResNetBackbone(nn.Module, SupportsReplaceInputChannels):
    """
    CSPResNet backbone
    """

    @resolve_param("activation", ActivationsTypeFactory())
    def __init__(
        self,
        layers: Tuple[int, ...],
        channels: Tuple[int, ...],
        activation: Type[nn.Module],
        return_idx: Tuple[int, int, int],
        use_large_stem: bool,
        width_mult: float,
        depth_mult: float,
        use_alpha: bool,
        pretrained_weights: Optional[str] = None,
        in_channels: int = 3,
    ):
        """

        :param layers: Number of blocks in each stage
        :param channels: Number of channels [stem, stage 0, stage 1, stage 2, ...]
        :param activation: Used activation type for all child modules.
        :param return_idx: Indexes of returned feature maps
        :param use_large_stem: If True, uses 3 conv+bn+act instead of 2 in stem blocks
        :param width_mult: Scaling factor for a number of channels
        :param depth_mult: Scaling factor for a number of blocks in each stage
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
        :param pretrained_weights:
        :param in_channels: Number of input channels. Default: 3
        """
        super().__init__()
        channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
        layers = [max(round(num_layers * depth_mult), 1) for num_layers in layers]

        if use_large_stem:
            self.stem = nn.Sequential(
                collections.OrderedDict(
                    [
                        (
                            "conv1",
                            ConvBNAct(in_channels, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                        ),
                        (
                            "conv2",
                            ConvBNAct(
                                channels[0] // 2,
                                channels[0] // 2,
                                3,
                                stride=1,
                                padding=1,
                                activation_type=activation,
                                bias=False,
                            ),
                        ),
                        (
                            "conv3",
                            ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                        ),
                    ]
                )
            )
        else:
            self.stem = nn.Sequential(
                collections.OrderedDict(
                    [
                        (
                            "conv1",
                            ConvBNAct(3, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                        ),
                        (
                            "conv2",
                            ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                        ),
                    ]
                )
            )

        n = len(channels) - 1
        self.stages = nn.ModuleList(
            [
                CSPResStage(
                    channels[i],
                    channels[i + 1],
                    layers[i],
                    stride=2,
                    activation_type=activation,
                    use_alpha=use_alpha,
                )
                for i in range(n)
            ]
        )

        self._out_channels = channels[1:]
        self._out_strides = [4 * 2**i for i in range(n)]
        self.return_idx = tuple(return_idx)

        if pretrained_weights:
            if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
                state_dict = torch.load(str(pretrained_weights), map_location="cpu")
            elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
                with wait_for_the_master(get_local_rank()):
                    state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
            else:
                raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
            self.load_state_dict(state_dict)

    def forward(self, x: Tensor) -> List[Tensor]:
        x = self.stem(x)
        outs = []
        for idx, stage in enumerate(self.stages):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)

        return outs

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """
        for module in self.modules():
            if isinstance(module, RepVGGBlock):
                module.fuse_block_residual_branches()

    @property
    def out_channels(self) -> Tuple[int]:
        return tuple(self._out_channels)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        first_layer: ConvBNAct = self.stem[0]
        first_layer.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        first_layer: ConvBNAct = self.stem[0]
        return first_layer.get_input_channels()
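
The following sketch builds the backbone with illustrative PP-YOLOE-style settings (these layer/channel values are assumptions, not defaults pulled from a recipe) and shows that forward returns one feature map per index in return_idx:

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBackbone

backbone = CSPResNetBackbone(
    layers=(3, 6, 6, 3),
    channels=(64, 128, 256, 512, 1024),
    activation=nn.SiLU,
    return_idx=(1, 2, 3),
    use_large_stem=True,
    width_mult=1.0,
    depth_mult=1.0,
    use_alpha=False,
)
features = backbone(torch.randn(1, 3, 640, 640))
print([f.shape[1] for f in features])  # [256, 512, 1024] -- feature maps at strides 8, 16, 32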

__init__(layers, channels, activation, return_idx, use_large_stem, width_mult, depth_mult, use_alpha, pretrained_weights=None, in_channels=3)

Parameters:

Name Type Description Default
layers Tuple[int, ...]

Number of blocks in each stage

required
channels Tuple[int, ...]

Number of channels [stem, stage 0, stage 1, stage 2, ...]

required
activation Type[nn.Module]

Used activation type for all child modules.

required
return_idx Tuple[int, int, int]

Indexes of returned feature maps

required
use_large_stem bool

If True, uses 3 conv+bn+act instead of 2 in stem blocks

required
width_mult float

Scaling factor for a number of channels

required
depth_mult float

Scaling factor for a number of blocks in each stage

required
use_alpha bool

If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock

required
pretrained_weights Optional[str] None
in_channels int

Number of input channels. Default: 3

3
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
@resolve_param("activation", ActivationsTypeFactory())
def __init__(
    self,
    layers: Tuple[int, ...],
    channels: Tuple[int, ...],
    activation: Type[nn.Module],
    return_idx: Tuple[int, int, int],
    use_large_stem: bool,
    width_mult: float,
    depth_mult: float,
    use_alpha: bool,
    pretrained_weights: Optional[str] = None,
    in_channels: int = 3,
):
    """

    :param layers: Number of blocks in each stage
    :param channels: Number of channels [stem, stage 0, stage 1, stage 2, ...]
    :param activation: Used activation type for all child modules.
    :param return_idx: Indexes of returned feature maps
    :param use_large_stem: If True, uses 3 conv+bn+act instead of 2 in stem blocks
    :param width_mult: Scaling factor for a number of channels
    :param depth_mult: Scaling factor for a number of blocks in each stage
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
    :param pretrained_weights:
    :param in_channels: Number of input channels. Default: 3
    """
    super().__init__()
    channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
    layers = [max(round(num_layers * depth_mult), 1) for num_layers in layers]

    if use_large_stem:
        self.stem = nn.Sequential(
            collections.OrderedDict(
                [
                    (
                        "conv1",
                        ConvBNAct(in_channels, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                    ),
                    (
                        "conv2",
                        ConvBNAct(
                            channels[0] // 2,
                            channels[0] // 2,
                            3,
                            stride=1,
                            padding=1,
                            activation_type=activation,
                            bias=False,
                        ),
                    ),
                    (
                        "conv3",
                        ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                    ),
                ]
            )
        )
    else:
        self.stem = nn.Sequential(
            collections.OrderedDict(
                [
                    (
                        "conv1",
                        ConvBNAct(3, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                    ),
                    (
                        "conv2",
                        ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                    ),
                ]
            )
        )

    n = len(channels) - 1
    self.stages = nn.ModuleList(
        [
            CSPResStage(
                channels[i],
                channels[i + 1],
                layers[i],
                stride=2,
                activation_type=activation,
                use_alpha=use_alpha,
            )
            for i in range(n)
        ]
    )

    self._out_channels = channels[1:]
    self._out_strides = [4 * 2**i for i in range(n)]
    self.return_idx = tuple(return_idx)

    if pretrained_weights:
        if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
            state_dict = torch.load(str(pretrained_weights), map_location="cpu")
        elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
            with wait_for_the_master(get_local_rank()):
                state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
        else:
            raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
        self.load_state_dict(state_dict)

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules with convertible substitutes and remove all auxiliary or training related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """
    for module in self.modules():
        if isinstance(module, RepVGGBlock):
            module.fuse_block_residual_branches()
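
As a usage note, re-parameterizing the RepVGG blocks is typically done right before export; the snippet below continues the hypothetical backbone sketch from earlier in this section.

import torch

# "backbone" is the CSPResNetBackbone instance from the construction sketch above.
backbone.eval()
backbone.prep_model_for_conversion(input_size=(640, 640))
torch.onnx.export(backbone, torch.randn(1, 3, 640, 640), "csp_resnet_backbone.onnx")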

CSPResNetBasicBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
class CSPResNetBasicBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, activation_type: Type[nn.Module], use_residual_connection: bool = True, use_alpha=False):
        """

        :param in_channels:
        :param out_channels:
        :param activation_type:
        :param use_residual_connection: Whether to add input x to the output
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
        """
        super().__init__()
        if use_residual_connection and in_channels != out_channels:
            raise RuntimeError(
                f"Number of input channels (got {in_channels}) must be equal to the "
                f"number of output channels (got {out_channels}) when use_residual_connection=True"
            )
        self.conv1 = ConvBNAct(in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation_type=activation_type, bias=False)
        self.conv2 = RepVGGBlock(
            out_channels, out_channels, activation_type=activation_type, se_type=nn.Identity, use_residual_connection=False, use_alpha=use_alpha
        )
        self.use_residual_connection = use_residual_connection

    def forward(self, x):
        y = self.conv1(x)
        y = self.conv2(y)
        if self.use_residual_connection:
            return x + y
        else:
            return y
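
A small sketch with an arbitrary channel count: with use_residual_connection=True the block requires in_channels == out_channels, since the input is added to the output.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBasicBlock

block = CSPResNetBasicBlock(in_channels=64, out_channels=64, activation_type=nn.SiLU)
y = block(torch.randn(1, 64, 32, 32))
print(y.shape)  # torch.Size([1, 64, 32, 32])

# CSPResNetBasicBlock(in_channels=64, out_channels=128, activation_type=nn.SiLU) would raise a RuntimeError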

__init__(in_channels, out_channels, activation_type, use_residual_connection=True, use_alpha=False)

Parameters:

Name Type Description Default
in_channels int required
out_channels int required
activation_type Type[nn.Module] required
use_residual_connection bool

Whether to add input x to the output

True
use_alpha

If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock

False
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
def __init__(self, in_channels: int, out_channels: int, activation_type: Type[nn.Module], use_residual_connection: bool = True, use_alpha=False):
    """

    :param in_channels:
    :param out_channels:
    :param activation_type:
    :param use_residual_connection: Whether to add input x to the output
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
    """
    super().__init__()
    if use_residual_connection and in_channels != out_channels:
        raise RuntimeError(
            f"Number of input channels (got {in_channels}) must be equal to the "
            f"number of output channels (got {out_channels}) when use_residual_connection=True"
        )
    self.conv1 = ConvBNAct(in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation_type=activation_type, bias=False)
    self.conv2 = RepVGGBlock(
        out_channels, out_channels, activation_type=activation_type, se_type=nn.Identity, use_residual_connection=False, use_alpha=use_alpha
    )
    self.use_residual_connection = use_residual_connection

CSPResStage

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
class CSPResStage(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks,
        stride: int,
        activation_type: Type[nn.Module],
        use_attention: bool = True,
        use_alpha: bool = False,
    ):
        """

        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param num_blocks: Number of blocks in stage
        :param stride: Desired down-sampling for the stage (Usually 2)
        :param activation_type: Non-linearity type used in child modules.
        :param use_attention: If True, adds EffectiveSEBlock at the end of each stage
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)
        """
        super().__init__()

        mid_channels = (in_channels + out_channels) // 2
        half_mid_channels = mid_channels // 2
        mid_channels = 2 * half_mid_channels

        if stride != 1:
            self.conv_down = ConvBNAct(in_channels, mid_channels, 3, stride=stride, padding=1, activation_type=activation_type, bias=False)
        else:
            self.conv_down = None
        self.conv1 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
        self.conv2 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
        self.blocks = nn.Sequential(
            *[
                CSPResNetBasicBlock(
                    in_channels=half_mid_channels,
                    out_channels=half_mid_channels,
                    activation_type=activation_type,
                    use_alpha=use_alpha,
                )
                for _ in range(num_blocks)
            ]
        )
        if use_attention:
            self.attn = EffectiveSEBlock(mid_channels)
        else:
            self.attn = nn.Identity()

        self.conv3 = ConvBNAct(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)

    def forward(self, x):
        if self.conv_down is not None:
            x = self.conv_down(x)
        y1 = self.conv1(x)
        y2 = self.blocks(self.conv2(x))
        y = torch.cat([y1, y2], dim=1)
        y = self.attn(y)
        y = self.conv3(y)
        return y
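
For illustration, a stage that halves the spatial resolution (stride=2) while going from 128 to 256 channels; the two 1x1 branches each carry half of the intermediate channels before being concatenated and projected:

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_resnet import CSPResStage

stage = CSPResStage(in_channels=128, out_channels=256, num_blocks=3, stride=2, activation_type=nn.SiLU)
y = stage(torch.randn(1, 128, 64, 64))
print(y.shape)  # torch.Size([1, 256, 32, 32])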

__init__(in_channels, out_channels, num_blocks, stride, activation_type, use_attention=True, use_alpha=False)

Parameters:

Name Type Description Default
in_channels int

Number of input channels

required
out_channels int

Number of output channels

required
num_blocks

Number of blocks in stage

required
stride int

Desired down-sampling for the stage (Usually 2)

required
activation_type Type[nn.Module]

Non-linearity type used in child modules.

required
use_attention bool

If True, adds EffectiveSEBlock at the end of each stage

True
use_alpha bool

If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)

False
Source code in src/super_gradients/training/models/detection_models/csp_resnet.py
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    num_blocks,
    stride: int,
    activation_type: Type[nn.Module],
    use_attention: bool = True,
    use_alpha: bool = False,
):
    """

    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    :param num_blocks: Number of blocks in stage
    :param stride: Desired down-sampling for the stage (Usually 2)
    :param activation_type: Non-linearity type used in child modules.
    :param use_attention: If True, adds EffectiveSEBlock at the end of each stage
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)
    """
    super().__init__()

    mid_channels = (in_channels + out_channels) // 2
    half_mid_channels = mid_channels // 2
    mid_channels = 2 * half_mid_channels

    if stride != 1:
        self.conv_down = ConvBNAct(in_channels, mid_channels, 3, stride=stride, padding=1, activation_type=activation_type, bias=False)
    else:
        self.conv_down = None
    self.conv1 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
    self.conv2 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)
    self.blocks = nn.Sequential(
        *[
            CSPResNetBasicBlock(
                in_channels=half_mid_channels,
                out_channels=half_mid_channels,
                activation_type=activation_type,
                use_alpha=use_alpha,
            )
            for _ in range(num_blocks)
        ]
    )
    if use_attention:
        self.attn = EffectiveSEBlock(mid_channels)
    else:
        self.attn = nn.Identity()

    self.conv3 = ConvBNAct(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=False)

A base for a detection network built according to the following scheme:

* constructed from nested arch_params;
* inside arch_params, each nested level (module) has an explicit type and its required parameters;
* each module accepts in_channels and other parameters;
* each module defines an out_channels property on construction.

CustomizableDetector

Bases: HasPredict, SgModule

A customizable detector with backbone -> neck -> heads. Each submodule with its parameters must be defined explicitly. Modules should follow the interface of BaseDetectionModule.

Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
class CustomizableDetector(HasPredict, SgModule):
    """
    A customizable detector with backbone -> neck -> heads
    Each submodule with its parameters must be defined explicitly.
    Modules should follow the interface of BaseDetectionModule
    """

    @arch_params_deprecated
    def __init__(
        self,
        backbone: Union[str, dict, HpmStruct, DictConfig],
        heads: Union[str, dict, HpmStruct, DictConfig],
        neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
        num_classes: int = None,
        bn_eps: Optional[float] = None,
        bn_momentum: Optional[float] = None,
        inplace_act: Optional[bool] = True,
        in_channels: int = 3,
    ):
        """
        :param backbone:    Backbone configuration.
        :param heads:       Head configuration.
        :param neck:        Neck configuration.
        :param num_classes: num classes to predict.
        :param bn_eps:      Epsilon for batch norm.
        :param bn_momentum: Momentum for batch norm.
        :param inplace_act: If True, do the operations in-place when possible.
        :param in_channels: number of input channels
        """
        super().__init__()

        self.heads_params = heads
        self.bn_eps = bn_eps
        self.bn_momentum = bn_momentum
        self.inplace_act = inplace_act
        self.in_channels = in_channels
        factory = det_factory.DetectionModulesFactory()

        # move num_classes into heads params
        if num_classes is not None:
            self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", num_classes)

        self.backbone = factory.get(factory.insert_module_param(backbone, "in_channels", in_channels))
        if neck is not None:
            self.neck = factory.get(factory.insert_module_param(neck, "in_channels", self.backbone.out_channels))
            self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.neck.out_channels))
        else:
            self.neck = nn.Identity()
            self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.backbone.out_channels))

        self._initialize_weights(bn_eps, bn_momentum, inplace_act)

        # Processing params
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        self._default_nms_iou: float = 0.7
        self._default_nms_conf: float = 0.5
        self._default_nms_top_k: int = 1024
        self._default_max_predictions = 300
        self._default_multi_label_per_box = True
        self._default_class_agnostic_nms = False

    def forward(self, x):
        x = self.backbone(x)
        x = self.neck(x)
        return self.heads(x)

    def _initialize_weights(self, bn_eps: Optional[float] = None, bn_momentum: Optional[float] = None, inplace_act: Optional[bool] = True):
        for m in self.modules():
            t = type(m)
            if t is nn.BatchNorm2d:
                m.eps = bn_eps if bn_eps else m.eps
                m.momentum = bn_momentum if bn_momentum else m.momentum
            elif inplace_act and t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, nn.Mish]:
                m.inplace = True

    def prep_model_for_conversion(self, input_size: Optional[Union[tuple, list]] = None, **kwargs):
        for module in self.modules():
            if module != self and hasattr(module, "prep_model_for_conversion"):
                module.prep_model_for_conversion(input_size, **kwargs)

    def replace_head(self, new_num_classes: Optional[int] = None, new_head: Optional[nn.Module] = None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.heads = new_head
        elif isinstance(self.heads, SupportsReplaceNumClasses):
            self.heads.replace_num_classes(new_num_classes, replace_num_classes_with_random_weights)
        else:
            factory = det_factory.DetectionModulesFactory()
            self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", new_num_classes)
            self.heads = factory.get(factory.insert_module_param(self.heads_params, "in_channels", self.neck.out_channels))
            self._initialize_weights(self.bn_eps, self.bn_momentum, self.inplace_act)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        if isinstance(self.backbone, SupportsReplaceInputChannels):
            self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
            self.in_channels = self.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.backbone.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        if isinstance(self.backbone, SupportsReplaceInputChannels):
            return self.backbone.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.backbone.__class__.__name__}` does not support `replace_input_channels`")

    def get_post_prediction_callback(
        self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
    ) -> DetectionPostPredictionCallback:
        """
        Get a post prediction callback for this model.

        :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
        :param iou:                 An IoU threshold for boxes non-maximum suppression.
        :param nms_top_k:           The maximum number of detections to consider for NMS.
        :param max_predictions:     The maximum number of detections to return.
        :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :return:
        """
        raise NotImplementedError

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:         (Optional) Names of the dataset the model was trained on.
        :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param iou:                 (Optional) IoU threshold for the nms algorithm
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        """
        if class_names is not None:
            self._class_names = tuple(class_names)
        if image_processor is not None:
            self._image_processor = image_processor
        if iou is not None:
            self._default_nms_iou = float(iou)
        if conf is not None:
            self._default_nms_conf = float(conf)
        if nms_top_k is not None:
            self._default_nms_top_k = int(nms_top_k)
        if max_predictions is not None:
            self._default_max_predictions = int(max_predictions)
        if multi_label_per_box is not None:
            self._default_multi_label_per_box = bool(multi_label_per_box)
        if class_agnostic_nms is not None:
            self._default_class_agnostic_nms = bool(class_agnostic_nms)

    def get_processing_params(self) -> Optional[Processing]:
        return self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self,
        *,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> DetectionPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, use mixed precision for inference.
        """
        if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = self._default_nms_iou if iou is None else iou
        conf = self._default_nms_conf if conf is None else conf
        nms_top_k = self._default_nms_top_k if nms_top_k is None else nms_top_k
        max_predictions = self._default_max_predictions if max_predictions is None else max_predictions
        multi_label_per_box = self._default_multi_label_per_box if multi_label_per_box is None else multi_label_per_box
        class_agnostic_nms = self._default_class_agnostic_nms if class_agnostic_nms is None else class_agnostic_nms

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = DetectionPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(
                iou=iou,
                conf=conf,
                nms_top_k=nms_top_k,
                max_predictions=max_predictions,
                multi_label_per_box=multi_label_per_box,
                class_agnostic_nms=class_agnostic_nms,
            ),
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> ImagesDetectionPrediction:
        """Predict an image or a list of images.

        :param images:              Images to predict.
        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded.
                                    If None, the default value associated to the training is used.
        :param batch_size:          Maximum number of images to process at the same time.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                        If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ):
        """Predict using webcam.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, predictions are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)

    def get_finetune_lr_dict(self, lr: float):
        return {"heads": lr, "default": 0}

__init__(backbone, heads, neck=None, num_classes=None, bn_eps=None, bn_momentum=None, inplace_act=True, in_channels=3)

Parameters:

Name Type Description Default
backbone Union[str, dict, HpmStruct, DictConfig]

Backbone configuration.

required
heads Union[str, dict, HpmStruct, DictConfig]

Head configuration.

required
neck Optional[Union[str, dict, HpmStruct, DictConfig]]

Neck configuration.

None
num_classes int

num classes to predict.

None
bn_eps Optional[float]

Epsilon for batch norm.

None
bn_momentum Optional[float]

Momentum for batch norm.

None
inplace_act Optional[bool]

If True, do the operations in-place when possible.

True
in_channels int

number of input channels

3
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
@arch_params_deprecated
def __init__(
    self,
    backbone: Union[str, dict, HpmStruct, DictConfig],
    heads: Union[str, dict, HpmStruct, DictConfig],
    neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
    num_classes: int = None,
    bn_eps: Optional[float] = None,
    bn_momentum: Optional[float] = None,
    inplace_act: Optional[bool] = True,
    in_channels: int = 3,
):
    """
    :param backbone:    Backbone configuration.
    :param heads:       Head configuration.
    :param neck:        Neck configuration.
    :param num_classes: num classes to predict.
    :param bn_eps:      Epsilon for batch norm.
    :param bn_momentum: Momentum for batch norm.
    :param inplace_act: If True, do the operations in-place when possible.
    :param in_channels: number of input channels
    """
    super().__init__()

    self.heads_params = heads
    self.bn_eps = bn_eps
    self.bn_momentum = bn_momentum
    self.inplace_act = inplace_act
    self.in_channels = in_channels
    factory = det_factory.DetectionModulesFactory()

    # move num_classes into heads params
    if num_classes is not None:
        self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", num_classes)

    self.backbone = factory.get(factory.insert_module_param(backbone, "in_channels", in_channels))
    if neck is not None:
        self.neck = factory.get(factory.insert_module_param(neck, "in_channels", self.backbone.out_channels))
        self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.neck.out_channels))
    else:
        self.neck = nn.Identity()
        self.heads = factory.get(factory.insert_module_param(heads, "in_channels", self.backbone.out_channels))

    self._initialize_weights(bn_eps, bn_momentum, inplace_act)

    # Processing params
    self._class_names: Optional[List[str]] = None
    self._image_processor: Optional[Processing] = None
    self._default_nms_iou: float = 0.7
    self._default_nms_conf: float = 0.5
    self._default_nms_top_k: int = 1024
    self._default_max_predictions = 300
    self._default_multi_label_per_box = True
    self._default_class_agnostic_nms = False

get_post_prediction_callback(*, conf, iou, nms_top_k, max_predictions, multi_label_per_box, class_agnostic_nms)

Get a post prediction callback for this model.

Parameters:

Name Type Description Default
conf float

A minimum confidence threshold for predictions to be used in post-processing.

required
iou float

An IoU threshold for boxes non-maximum suppression.

required
nms_top_k int

The maximum number of detections to consider for NMS.

required
max_predictions int

The maximum number of detections to return.

required
multi_label_per_box bool

If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

required
class_agnostic_nms bool

If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked). If False NMS is performed separately for each class.

required

Returns:

Type Description
DetectionPostPredictionCallback
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
def get_post_prediction_callback(
    self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
) -> DetectionPostPredictionCallback:
    """
    Get a post prediction callback for this model.

    :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
    :param iou:                 An IoU threshold for boxes non-maximum suppression.
    :param nms_top_k:           The maximum number of detections to consider for NMS.
    :param max_predictions:     The maximum number of detections to return.
    :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :return:
    """
    raise NotImplementedError

predict(images, iou=None, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked). If False NMS is performed separately for each class.

None
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
) -> ImagesDetectionPrediction:
    """Predict an image or a list of images.

    :param images:              Images to predict.
    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                 (Optional) Below the confidence threshold, predictions are discarded.
                                If None, the default value associated to the training is used.
    :param batch_size:          Maximum number of images to process at the same time.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                        If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    return pipeline(images, batch_size=batch_size)  # type: ignore
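
A short, hedged example of calling predict on a pretrained detector (the model name and image path are placeholders); per-call thresholds override the stored defaults, and the returned prediction object can be visualized:

from super_gradients.training import models

model = models.get("yolo_nas_s", pretrained_weights="coco")
predictions = model.predict("path/to/image.jpg", conf=0.4, iou=0.65)
predictions.show()  # draw the predicted boxes on the image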

predict_webcam(iou=None, conf=None, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
def predict_webcam(
    self,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
):
    """Predict using webcam.

    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                If None, the default value associated to the training is used.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    pipeline.predict_webcam()
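
As a usage note, predict_webcam() is blocking and streams annotated frames from the default webcam until the window is closed. A sketch, assuming `model` already has its dataset processing parameters set (e.g. a pretrained checkpoint):

model.predict_webcam(conf=0.5, fuse_model=False)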

set_dataset_processing_params(class_names=None, image_processor=None, iou=None, conf=None, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
iou Optional[float]

(Optional) IoU threshold for the nms algorithm

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded

None
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
Source code in src/super_gradients/training/models/detection_models/customizable_detector.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:         (Optional) Names of the dataset the model was trained on.
    :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param iou:                 (Optional) IoU threshold for the nms algorithm
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    """
    if class_names is not None:
        self._class_names = tuple(class_names)
    if image_processor is not None:
        self._image_processor = image_processor
    if iou is not None:
        self._default_nms_iou = float(iou)
    if conf is not None:
        self._default_nms_conf = float(conf)
    if nms_top_k is not None:
        self._default_nms_top_k = int(nms_top_k)
    if max_predictions is not None:
        self._default_max_predictions = int(max_predictions)
    if multi_label_per_box is not None:
        self._default_multi_label_per_box = bool(multi_label_per_box)
    if class_agnostic_nms is not None:
        self._default_class_agnostic_nms = bool(class_agnostic_nms)
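
A sketch of overriding the stored processing defaults before calling predict(); the class names and thresholds are illustrative assumptions for a custom two-class dataset:

# Assumption: `model` already carries an image processor (e.g. it was loaded from a trained checkpoint).
model.set_dataset_processing_params(
    class_names=["person", "vehicle"],  # hypothetical class names
    iou=0.65,              # default NMS IoU used when predict() is called with iou=None
    conf=0.35,             # default confidence threshold used when predict() is called with conf=None
    max_predictions=100,   # cap on the number of boxes returned per image
)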

DarkResidualBlock

Bases: nn.Module

DarkResidualBlock - The Darknet Residual Block

Source code in src/super_gradients/training/models/detection_models/darknet53.py
class DarkResidualBlock(nn.Module):
    """
    DarkResidualBlock - The Darknet Residual Block
    """

    def __init__(self, in_channels, shortcut=True):
        super(DarkResidualBlock, self).__init__()
        self.shortcut = shortcut
        reduced_channels = int(in_channels / 2)

        self.layer1 = create_conv_module(in_channels, reduced_channels, kernel_size=1)
        self.layer2 = create_conv_module(reduced_channels, in_channels)

    def forward(self, x):
        residual = x

        out = self.layer1(x)
        out = self.layer2(out)
        out += residual if self.shortcut else out
        return out
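
A small sketch showing that the block preserves the input shape, which is what makes the residual sum valid (the import path is assumed from the source location shown above):

import torch
from super_gradients.training.models.detection_models.darknet53 import DarkResidualBlock

block = DarkResidualBlock(in_channels=64)
x = torch.randn(2, 64, 32, 32)
y = block(x)
print(y.shape)  # torch.Size([2, 64, 32, 32]) - same channels and spatial size as the input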

Darknet53

Bases: Darknet53Base

Source code in src/super_gradients/training/models/detection_models/darknet53.py
@register_model(Models.DARKNET53)
class Darknet53(Darknet53Base):
    def __init__(self, arch_params=None, backbone_mode=True, num_classes=None):
        super(Darknet53, self).__init__()

        # IN ORDER TO ALLOW PASSING PARAMETERS WITH ARCH_PARAMS BUT NOT BREAK YOLOV3 INTEGRATION
        self.backbone_mode = get_param(arch_params, "backbone_mode", backbone_mode)
        self.num_classes = get_param(arch_params, "num_classes", num_classes)

        if not self.backbone_mode:
            # IF NOT USED AS A BACKEND BUT AS A CLASSIFIER WE ADD THE CLASSIFICATION LAYERS
            if self.num_classes is not None:
                nn_sequential_block = nn.Sequential()
                nn_sequential_block.add_module("global_avg_pool", nn.AdaptiveAvgPool2d((1, 1)))
                nn_sequential_block.add_module("view", ViewModule(1024))
                nn_sequential_block.add_module("fc", nn.Linear(1024, self.num_classes))
                self.modules_list.append(nn_sequential_block)
            else:
                raise ValueError("num_classes must be specified to use Darknet53 as a classifier")

    def get_modules_list(self):
        return self.modules_list

    def forward(self, x):
        """
        forward - Forward pass on the modules list
            :param x: The input data
            :return: forward pass for backbone pass or classification pass
        """
        return super().forward(x)

forward(x)

forward - Forward pass on the modules list
:param x: The input data
:return: forward pass for backbone pass or classification pass

Source code in src/super_gradients/training/models/detection_models/darknet53.py
def forward(self, x):
    """
    forward - Forward pass on the modules list
        :param x: The input data
        :return: forward pass for backbone pass or classification pass
    """
    return super().forward(x)

ViewModule

Bases: nn.Module

Returns a reshaped version of the input, to be used in non-backbone mode

Source code in src/super_gradients/training/models/detection_models/darknet53.py
class ViewModule(nn.Module):
    """
    Returns a reshaped version of the input, to be used in None-Backbone Mode
    """

    def __init__(self, features=1024):
        super(ViewModule, self).__init__()
        self.features = features

    def forward(self, x):
        return x.view(-1, self.features)
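
A sketch of where ViewModule fits: it flattens the pooled feature map so a linear classification head can consume it (the shapes and the 10-class head are illustrative):

import torch
from torch import nn

pooled = torch.randn(4, 1024, 1, 1)   # output of nn.AdaptiveAvgPool2d((1, 1))
flat = pooled.view(-1, 1024)          # what ViewModule(features=1024) computes
logits = nn.Linear(1024, 10)(flat)    # hypothetical 10-class classifier
print(flat.shape, logits.shape)       # torch.Size([4, 1024]) torch.Size([4, 10])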

PPYoloEPostPredictionCallback

Bases: DetectionPostPredictionCallback

Non-Maximum Suppression (NMS) module

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
class PPYoloEPostPredictionCallback(DetectionPostPredictionCallback):
    """Non-Maximum Suppression (NMS) module"""

    def __init__(
        self,
        *,
        score_threshold: float,
        nms_threshold: float,
        nms_top_k: int,
        max_predictions: int,
        multi_label_per_box: bool = True,
        class_agnostic_nms: bool = False,
    ):
        """
        :param score_threshold:     Predictions confidence threshold.
                                    Predictions with score lower than score_threshold will not participate in Top-K & NMS
        :param nms_threshold:       IoU threshold for NMS step.
        :param nms_top_k:           Number of predictions participating in NMS step
        :param max_predictions:     Maximum number of boxes to return after NMS step
        :param multi_label_per_box: Controls whether to decode multiple labels per box.
                                    True - each anchor can produce multiple labels of different classes
                                           that pass confidence threshold check (default).
                                    False - each anchor can produce only one label of the class with the highest score.
        """
        super(PPYoloEPostPredictionCallback, self).__init__()
        self.score_threshold = score_threshold
        self.nms_threshold = nms_threshold
        self.nms_top_k = nms_top_k
        self.max_predictions = max_predictions
        self.multi_label_per_box = multi_label_per_box
        self.class_agnostic_nms = class_agnostic_nms

    @torch.no_grad()
    def forward(self, outputs: Any, device: str = None) -> List[List[Tensor]]:
        """

        :param outputs: Outputs of model's forward() method
        :param device:  (Deprecated) Not used anymore, exists only for sake of keeping the same interface as in the parent class.
                        Will be removed in the SG 3.7.0.
                        A device parameter in case we want to move tensors to a specific device.
        :return:        List of lists of tensors of shape [Ni, 6] where Ni is the number of detections in i-th image.
                        Format of each row is [x1, y1, x2, y2, confidence, class]
        """
        nms_result = []
        predictions = self._get_decoded_predictions_from_model_output(outputs)

        for pred_bboxes, pred_scores in zip(*predictions):
            # Cast to float to avoid lack of fp16 support in torchvision.ops.boxes.batched_nms when doing CPU inference
            pred_bboxes = pred_bboxes.float()  # [Anchors, 4]
            pred_scores = pred_scores.float()  # [Anchors, C]

            # Filter all predictions by self.score_threshold
            if self.multi_label_per_box:
                i, j = (pred_scores > self.score_threshold).nonzero(as_tuple=False).T
                pred_bboxes = pred_bboxes[i]
                pred_cls_conf = pred_scores[i, j]
                pred_cls_label = j[:]

            else:
                pred_cls_conf, pred_cls_label = torch.max(pred_scores, dim=1)
                conf_mask = pred_cls_conf >= self.score_threshold

                pred_cls_conf = pred_cls_conf[conf_mask]
                pred_cls_label = pred_cls_label[conf_mask]
                pred_bboxes = pred_bboxes[conf_mask, :]

            # Filter all predictions by self.nms_top_k
            if pred_cls_conf.size(0) > self.nms_top_k:
                topk_candidates = torch.topk(pred_cls_conf, k=self.nms_top_k, largest=True)
                pred_cls_conf = pred_cls_conf[topk_candidates.indices]
                pred_cls_label = pred_cls_label[topk_candidates.indices]
                pred_bboxes = pred_bboxes[topk_candidates.indices, :]

            # NMS
            if self.class_agnostic_nms:
                idx_to_keep = torchvision.ops.boxes.nms(pred_bboxes, pred_cls_conf, iou_threshold=self.nms_threshold)
            else:
                idx_to_keep = torchvision.ops.boxes.batched_nms(boxes=pred_bboxes, scores=pred_cls_conf, idxs=pred_cls_label, iou_threshold=self.nms_threshold)

            pred_cls_conf = pred_cls_conf[idx_to_keep].unsqueeze(-1)
            pred_cls_label = pred_cls_label[idx_to_keep].unsqueeze(-1)
            pred_bboxes = pred_bboxes[idx_to_keep, :]

            #  nx6 (x1, y1, x2, y2, confidence, class) in pixel units
            final_boxes = torch.cat([pred_bboxes, pred_cls_conf, pred_cls_label], dim=1)  # [N,6]

            nms_result.append(final_boxes)

        return self._filter_max_predictions(nms_result)

    def _get_decoded_predictions_from_model_output(self, outputs: Any) -> Tuple[Tensor, Tensor]:
        """
        Get the decoded predictions from the PPYoloE/YoloNAS output.
        Depending on the model regime (train/eval) the output format may differ so this method picks the right output.

        :param outputs: Model's forward() return value
        :return:        Tuple of (bboxes, scores) of shape [B, Anchors, 4], [B, Anchors, C]
        """
        if isinstance(outputs, tuple) and len(outputs) == 2:
            if torch.is_tensor(outputs[0]) and torch.is_tensor(outputs[1]) and outputs[0].shape[1] == outputs[1].shape[1] and outputs[0].shape[2] == 4:
                # This path happens when we are using traced model or ONNX model without postprocessing for inference.
                predictions = outputs
            else:
                # First is model predictions, second element of tuple is logits for loss computation
                predictions = outputs[0]
        else:
            raise ValueError(f"Unsupported output format: {outputs}")

        return predictions

    def _filter_max_predictions(self, res: List) -> List:
        res[:] = [im[: self.max_predictions] if (im is not None and im.shape[0] > self.max_predictions) else im for im in res]

        return res
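
A sketch of applying the callback directly to already-decoded predictions. The random tensors stand in for real model outputs and follow the [B, Anchors, 4] / [B, Anchors, C] convention described above; the import path is assumed from the source location shown:

import torch
from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback

callback = PPYoloEPostPredictionCallback(score_threshold=0.25, nms_threshold=0.7, nms_top_k=1000, max_predictions=300)

# Build valid (x1, y1, x2, y2) boxes in pixel units and per-class scores for 2 images, 8400 anchors, 80 classes.
xy1 = torch.rand(2, 8400, 2) * 320
wh = torch.rand(2, 8400, 2) * 320
pred_bboxes = torch.cat([xy1, xy1 + wh], dim=2)
pred_scores = torch.rand(2, 8400, 80)

results = callback((pred_bboxes, pred_scores))  # the (boxes, scores) tuple path handled in forward()
print(len(results), results[0].shape)           # 2 images, each a [N, 6] tensor: x1, y1, x2, y2, conf, class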

__init__(*, score_threshold, nms_threshold, nms_top_k, max_predictions, multi_label_per_box=True, class_agnostic_nms=False)

Parameters:

Name Type Description Default
score_threshold float

Predictions confidence threshold. Predictions with score lower than score_threshold will not participate in Top-K & NMS

required
nms_threshold float

IoU threshold for NMS step.

required
nms_top_k int

Number of predictions participating in NMS step

required
max_predictions int

Maximum number of boxes to return after NMS step

required
multi_label_per_box bool

Controls whether to decode multiple labels per box. True - each anchor can produce multiple labels of different classes that pass confidence threshold check (default). False - each anchor can produce only one label of the class with the highest score.

True
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
def __init__(
    self,
    *,
    score_threshold: float,
    nms_threshold: float,
    nms_top_k: int,
    max_predictions: int,
    multi_label_per_box: bool = True,
    class_agnostic_nms: bool = False,
):
    """
    :param score_threshold:     Predictions confidence threshold.
                                Predictions with score lower than score_threshold will not participate in Top-K & NMS
    :param nms_threshold:       IoU threshold for NMS step.
    :param nms_top_k:           Number of predictions participating in NMS step
    :param max_predictions:     Maximum number of boxes to return after NMS step
    :param multi_label_per_box: Controls whether to decode multiple labels per box.
                                True - each anchor can produce multiple labels of different classes
                                       that pass confidence threshold check (default).
                                False - each anchor can produce only one label of the class with the highest score.
    """
    super(PPYoloEPostPredictionCallback, self).__init__()
    self.score_threshold = score_threshold
    self.nms_threshold = nms_threshold
    self.nms_top_k = nms_top_k
    self.max_predictions = max_predictions
    self.multi_label_per_box = multi_label_per_box
    self.class_agnostic_nms = class_agnostic_nms

forward(outputs, device=None)

Parameters:

Name Type Description Default
outputs Any

Outputs of model's forward() method

required
device str

(Deprecated) Not used anymore; exists only for the sake of keeping the same interface as the parent class. Will be removed in SG 3.7.0. A device parameter in case we want to move tensors to a specific device.

None

Returns:

Type Description
List[List[Tensor]]

List of lists of tensors of shape [Ni, 6] where Ni is the number of detections in i-th image. Format of each row is [x1, y1, x2, y2, confidence, class]

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
@torch.no_grad()
def forward(self, outputs: Any, device: str = None) -> List[List[Tensor]]:
    """

    :param outputs: Outputs of model's forward() method
    :param device:  (Deprecated) Not used anymore, exists only for sake of keeping the same interface as in the parent class.
                    Will be removed in the SG 3.7.0.
                    A device parameter in case we want to move tensors to a specific device.
    :return:        List of lists of tensors of shape [Ni, 6] where Ni is the number of detections in i-th image.
                    Format of each row is [x1, y1, x2, y2, confidence, class]
    """
    nms_result = []
    predictions = self._get_decoded_predictions_from_model_output(outputs)

    for pred_bboxes, pred_scores in zip(*predictions):
        # Cast to float to avoid lack of fp16 support in torchvision.ops.boxes.batched_nms when doing CPU inference
        pred_bboxes = pred_bboxes.float()  # [Anchors, 4]
        pred_scores = pred_scores.float()  # [Anchors, C]

        # Filter all predictions by self.score_threshold
        if self.multi_label_per_box:
            i, j = (pred_scores > self.score_threshold).nonzero(as_tuple=False).T
            pred_bboxes = pred_bboxes[i]
            pred_cls_conf = pred_scores[i, j]
            pred_cls_label = j[:]

        else:
            pred_cls_conf, pred_cls_label = torch.max(pred_scores, dim=1)
            conf_mask = pred_cls_conf >= self.score_threshold

            pred_cls_conf = pred_cls_conf[conf_mask]
            pred_cls_label = pred_cls_label[conf_mask]
            pred_bboxes = pred_bboxes[conf_mask, :]

        # Filter all predictions by self.nms_top_k
        if pred_cls_conf.size(0) > self.nms_top_k:
            topk_candidates = torch.topk(pred_cls_conf, k=self.nms_top_k, largest=True)
            pred_cls_conf = pred_cls_conf[topk_candidates.indices]
            pred_cls_label = pred_cls_label[topk_candidates.indices]
            pred_bboxes = pred_bboxes[topk_candidates.indices, :]

        # NMS
        if self.class_agnostic_nms:
            idx_to_keep = torchvision.ops.boxes.nms(pred_bboxes, pred_cls_conf, iou_threshold=self.nms_threshold)
        else:
            idx_to_keep = torchvision.ops.boxes.batched_nms(boxes=pred_bboxes, scores=pred_cls_conf, idxs=pred_cls_label, iou_threshold=self.nms_threshold)

        pred_cls_conf = pred_cls_conf[idx_to_keep].unsqueeze(-1)
        pred_cls_label = pred_cls_label[idx_to_keep].unsqueeze(-1)
        pred_bboxes = pred_bboxes[idx_to_keep, :]

        #  nx6 (x1, y1, x2, y2, confidence, class) in pixel units
        final_boxes = torch.cat([pred_bboxes, pred_cls_conf, pred_cls_label], dim=1)  # [N,6]

        nms_result.append(final_boxes)

    return self._filter_max_predictions(nms_result)

PPYoloE

Bases: SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
class PPYoloE(SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck):
    def __init__(self, arch_params):
        super().__init__()
        if isinstance(arch_params, HpmStruct):
            arch_params = arch_params.to_dict()

        self.backbone = CSPResNetBackbone(**arch_params["backbone"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
        self.neck = PPYoloECSPPAN(**arch_params["neck"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
        self.head = PPYOLOEHead(**arch_params["head"], width_mult=arch_params["width_mult"], num_classes=arch_params["num_classes"])
        self.in_channels = 3

        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        self._default_nms_iou: float = 0.7
        self._default_nms_conf: float = 0.5
        self._default_nms_top_k: int = 1024
        self._default_max_predictions = 300
        self._default_multi_label_per_box = True
        self._default_class_agnostic_nms = False

    def get_post_prediction_callback(
        self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
    ) -> PPYoloEPostPredictionCallback:
        """
        Get a post prediction callback for this model.

        :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
        :param iou:                 A IoU threshold for boxes non-maximum suppression.
        :param nms_top_k:           The maximum number of detections to consider for NMS.
        :param max_predictions:     The maximum number of detections to return.
        :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :return:
        """
        return PPYoloEPostPredictionCallback(
            score_threshold=conf,
            nms_threshold=iou,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
        )

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        if processing is None:
            raise ModelHasNoPreprocessingParamsException()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    def get_processing_params(self) -> Optional[Processing]:
        return self._image_processor

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:         (Optional) Names of the dataset the model was trained on.
        :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param iou:                 (Optional) IoU threshold for the nms algorithm
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        """
        if class_names is not None:
            self._class_names = tuple(class_names)
        if image_processor is not None:
            self._image_processor = image_processor
        if iou is not None:
            self._default_nms_iou = float(iou)
        if conf is not None:
            self._default_nms_conf = float(conf)
        if nms_top_k is not None:
            self._default_nms_top_k = int(nms_top_k)
        if max_predictions is not None:
            self._default_max_predictions = int(max_predictions)
        if multi_label_per_box is not None:
            self._default_multi_label_per_box = bool(multi_label_per_box)
        if class_agnostic_nms is not None:
            self._default_class_agnostic_nms = bool(class_agnostic_nms)

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self,
        *,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> DetectionPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        """
        if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = self._default_nms_iou if iou is None else iou
        conf = self._default_nms_conf if conf is None else conf
        nms_top_k = self._default_nms_top_k if nms_top_k is None else nms_top_k
        max_predictions = self._default_max_predictions if max_predictions is None else max_predictions
        multi_label_per_box = self._default_multi_label_per_box if multi_label_per_box is None else multi_label_per_box
        class_agnostic_nms = self._default_class_agnostic_nms if class_agnostic_nms is None else class_agnostic_nms

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = DetectionPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(
                iou=iou,
                conf=conf,
                nms_top_k=nms_top_k,
                max_predictions=max_predictions,
                multi_label_per_box=multi_label_per_box,
                class_agnostic_nms=class_agnostic_nms,
            ),
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ) -> ImagesDetectionPrediction:
        """Predict an image or a list of images.

        :param images:              Images to predict.
        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                    If None, the default value associated to the training is used.
        :param batch_size:          Maximum number of images to process at the same time.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, the model will use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        nms_top_k: Optional[int] = None,
        max_predictions: Optional[int] = None,
        multi_label_per_box: Optional[bool] = None,
        class_agnostic_nms: Optional[bool] = None,
        fp16: bool = True,
    ):
        """Predict using webcam.

        :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                    If None, the default value associated to the training is used.
        :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
        :param max_predictions:     (Optional) The maximum number of detections to return.
        :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :param fp16:                If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
            fp16=fp16,
        )
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)

    def forward(self, x: Tensor):
        features = self.backbone(x)
        features = self.neck(features)
        return self.head(features)

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """

        # There is some discrepancy of what input_size is.
        # When exporting to ONNX it is passed as 4-element tuple (B,C,H,W)
        # When called from predict() it is just (H,W)
        # So we take two last elements of the tuple which handles both cases but ultimately we should fix this
        h, w = input_size[-2:]

        self.head.cache_anchors((h, w))

        for module in self.modules():
            if isinstance(module, RepVGGBlock):
                module.fuse_block_residual_branches()

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head.replace_num_classes(new_num_classes)

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractObjectDetectionDecodingModule:
        return PPYoloEDecodingModule(num_pre_nms_predictions=num_pre_nms_predictions)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        self.in_channels = self.get_input_channels()

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"head": lr, "default": 0}

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32
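
To tie the pieces together, a sketch that builds the architecture through the models factory, runs an eval-mode forward pass, and post-processes the raw outputs with the callback returned by get_post_prediction_callback(). The enum member name and the 640x640 input are assumptions:

import torch
from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.PP_YOLOE_S, num_classes=80)  # randomly initialized weights
model.eval()

post_prediction = model.get_post_prediction_callback(
    conf=0.25, iou=0.7, nms_top_k=1000, max_predictions=300,
    multi_label_per_box=True, class_agnostic_nms=False,
)

with torch.no_grad():
    outputs = model(torch.randn(1, 3, 640, 640))  # input size must be a multiple of 32
    detections = post_prediction(outputs)         # list with one [N, 6] tensor per image

print(detections[0].shape)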

get_post_prediction_callback(*, conf, iou, nms_top_k, max_predictions, multi_label_per_box, class_agnostic_nms)

Get a post prediction callback for this model.

Parameters:

Name Type Description Default
conf float

A minimum confidence threshold for predictions to be used in post-processing.

required
iou float

An IoU threshold for boxes non-maximum suppression.

required
nms_top_k int

The maximum number of detections to consider for NMS.

required
max_predictions int

The maximum number of detections to return.

required
multi_label_per_box bool

If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

required
class_agnostic_nms bool

If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

required

Returns:

Type Description
PPYoloEPostPredictionCallback
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def get_post_prediction_callback(
    self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
) -> PPYoloEPostPredictionCallback:
    """
    Get a post prediction callback for this model.

    :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
    :param iou:                 A IoU threshold for boxes non-maximum suppression.
    :param nms_top_k:           The maximum number of detections to consider for NMS.
    :param max_predictions:     The maximum number of detections to return.
    :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :return:
    """
    return PPYoloEPostPredictionCallback(
        score_threshold=conf,
        nms_threshold=iou,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
    )

predict(images, iou=None, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
fp16 bool

If True, the model will use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
) -> ImagesDetectionPrediction:
    """Predict an image or a list of images.

    :param images:              Images to predict.
    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                If None, the default value associated to the training is used.
    :param batch_size:          Maximum number of images to process at the same time.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                If True, the model will use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(iou=None, conf=None, fuse_model=True, skip_image_resizing=False, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded. If None, the default value associated with the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def predict_webcam(
    self,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
    fp16: bool = True,
):
    """Predict using webcam.

    :param iou:                 (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded.
                                If None, the default value associated to the training is used.
    :param fuse_model:          If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :param fp16:                If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
        fp16=fp16,
    )
    pipeline.predict_webcam()

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers that is otherwise flexible, replace some modules with convertible substitutes, and remove all auxiliary or training-related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """

    # There is some discrepancy of what input_size is.
    # When exporting to ONNX it is passed as 4-element tuple (B,C,H,W)
    # When called from predict() it is just (H,W)
    # So we take two last elements of the tuple which handles both cases but ultimately we should fix this
    h, w = input_size[-2:]

    self.head.cache_anchors((h, w))

    for module in self.modules():
        if isinstance(module, RepVGGBlock):
            module.fuse_block_residual_branches()
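
A sketch of where this method fits in a manual ONNX export flow; the file name, opset and input size are assumptions, and the library's higher-level export API can be used instead:

import torch

# Assumption: `model` is a PPYoloE instance (e.g. from models.get) in eval mode.
model.eval()
model.prep_model_for_conversion(input_size=(640, 640))  # caches anchors and fuses RepVGG residual branches

dummy_input = torch.randn(1, 3, 640, 640)
torch.onnx.export(model, dummy_input, "ppyoloe.onnx", opset_version=14)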

set_dataset_processing_params(class_names=None, image_processor=None, iou=None, conf=None, nms_top_k=None, max_predictions=None, multi_label_per_box=None, class_agnostic_nms=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
iou Optional[float]

(Optional) IoU threshold for the nms algorithm

None
conf Optional[float]

(Optional) Predictions below the confidence threshold are discarded

None
nms_top_k Optional[int]

(Optional) The maximum number of detections to consider for NMS.

None
max_predictions Optional[int]

(Optional) The maximum number of detections to return.

None
multi_label_per_box Optional[bool]

(Optional) If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

None
class_agnostic_nms Optional[bool]

(Optional) If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

None
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    nms_top_k: Optional[int] = None,
    max_predictions: Optional[int] = None,
    multi_label_per_box: Optional[bool] = None,
    class_agnostic_nms: Optional[bool] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:         (Optional) Names of the dataset the model was trained on.
    :param image_processor:     (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param iou:                 (Optional) IoU threshold for the nms algorithm
    :param conf:                (Optional) Below the confidence threshold, prediction are discarded
    :param nms_top_k:           (Optional) The maximum number of detections to consider for NMS.
    :param max_predictions:     (Optional) The maximum number of detections to return.
    :param multi_label_per_box: (Optional) If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  (Optional) If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    """
    if class_names is not None:
        self._class_names = tuple(class_names)
    if image_processor is not None:
        self._image_processor = image_processor
    if iou is not None:
        self._default_nms_iou = float(iou)
    if conf is not None:
        self._default_nms_conf = float(conf)
    if nms_top_k is not None:
        self._default_nms_top_k = int(nms_top_k)
    if max_predictions is not None:
        self._default_max_predictions = int(max_predictions)
    if multi_label_per_box is not None:
        self._default_multi_label_per_box = bool(multi_label_per_box)
    if class_agnostic_nms is not None:
        self._default_class_agnostic_nms = bool(class_agnostic_nms)

PPYoloEDecodingModule

Bases: AbstractObjectDetectionDecodingModule

Decoding module for the PPYoloE model. This module is used only to export the model to ONNX/TensorRT and is not used during training.

Takes in the output of the model and returns the decoded boxes in the format Tuple[Tensor, Tensor]:
* boxes [batch_size, number_boxes, 4], boxes are in format (x1, y1, x2, y2)
* scores [batch_size, number_boxes, number_classes]

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
class PPYoloEDecodingModule(AbstractObjectDetectionDecodingModule):
    """
    Decoding module for PPYoloE model. This module used only to export model to ONNX/TensorRT and is not used during training.

    Takes in the output of the model and returns the decoded boxes in the format Tuple[Tensor, Tensor]
    * boxes [batch_size, number_boxes, 4], boxes are in format (x1, y1, x2, y2)
    * scores [batch_size, number_boxes, number_classes]
    """

    __constants__ = ["num_pre_nms_predictions"]

    def __init__(
        self,
        num_pre_nms_predictions: int = 1000,
    ):
        """
        :param num_pre_nms_predictions: Number of predictions to keep before NMS. This is mainly to reject
        low-confidence predictions and thus reduce the number of boxes to process in NMS.

        """
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]) -> Tuple[Tensor, Tensor]:
        """

        :param inputs: Tuple [Tensor, Tensor]
            * boxes [B, N, 4], boxes are in format (x1, y1, x2, y2)
            * scores [B, N, C]
        :return:
            * boxes [B, Nout, 4], boxes are in format (x1, y1, x2, y2)
            * scores [B, Nout, C]
        """
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = inputs
        else:
            pred_bboxes, pred_scores = inputs[0]

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_scores.size()

        pred_cls_conf, _ = torch.max(pred_scores, dim=2)
        topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
        flat_indices = torch.flatten(indices_with_offset)

        output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
        output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

        return output_pred_bboxes, output_pred_scores

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, predictions: Any) -> int:
        """

        :param inputs:
        :return:
        """
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = predictions
        else:
            pred_bboxes, pred_scores = predictions[0]

        return pred_bboxes.size(1)
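
The batched top-k selection in forward flattens per-image indices by adding an offset of num_anchors per batch element before indexing. A small self-contained sketch of that gather, with illustrative shapes only:

import torch

# 2 images, 8 anchors, 3 classes; keep the 4 highest-scoring anchors per image.
pred_bboxes = torch.rand(2, 8, 4)
pred_scores = torch.rand(2, 8, 3)
k = 4

best_cls_conf, _ = pred_scores.max(dim=2)             # [2, 8] best class score per anchor
topk = torch.topk(best_cls_conf, k=k, dim=1)          # per-image indices of the top anchors
offsets = 8 * torch.arange(2).reshape(2, 1)           # shift indices of image i by i * num_anchors
flat = (topk.indices + offsets).flatten()

kept_boxes = pred_bboxes.reshape(-1, 4)[flat].reshape(2, k, 4)
kept_scores = pred_scores.reshape(-1, 3)[flat].reshape(2, k, 3)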

__init__(num_pre_nms_predictions=1000)

Parameters:

Name Type Description Default
num_pre_nms_predictions int

Number of predictions to keep before NMS. This is mainly to reject low-confidence predictions and thus reduce the number of boxes to process in NMS.

1000
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def __init__(
    self,
    num_pre_nms_predictions: int = 1000,
):
    """
    :param num_pre_nms_predictions: Number of predictions to keep before NMS. This is mainly to reject
    low-confidence predictions and thus reduce the number of boxes to process in NMS.

    """
    super().__init__()
    self.num_pre_nms_predictions = num_pre_nms_predictions

forward(inputs)

Parameters:

Name Type Description Default
inputs Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]

Tuple [Tensor, Tensor]

* boxes [B, N, 4], boxes are in format (x1, y1, x2, y2)
* scores [B, N, C]

required

Returns:

Type Description
Tuple[Tensor, Tensor]
  • boxes [B, Nout, 4], boxes are in format (x1, y1, x2, y2)
  • scores [B, Nout, C]
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]) -> Tuple[Tensor, Tensor]:
    """

    :param inputs: Tuple [Tensor, Tensor]
        * boxes [B, N, 4], boxes are in format (x1, y1, x2, y2)
        * scores [B, N, C]
    :return:
        * boxes [B, Nout, 4], boxes are in format (x1, y1, x2, y2)
        * scores [B, Nout, C]
    """
    if torch.jit.is_tracing():
        pred_bboxes, pred_scores = inputs
    else:
        pred_bboxes, pred_scores = inputs[0]

    nms_top_k = self.num_pre_nms_predictions
    batch_size, num_anchors, _ = pred_scores.size()

    pred_cls_conf, _ = torch.max(pred_scores, dim=2)
    topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

    offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
    indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
    flat_indices = torch.flatten(indices_with_offset)

    output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
    output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

    return output_pred_bboxes, output_pred_scores

infer_total_number_of_predictions(predictions)

Parameters:

Name Type Description Default
predictions Any

Model output (decoded boxes and scores), in the same format accepted by forward.

required

Returns:

Type Description
int
Number of predictions per image.
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
@torch.jit.ignore
def infer_total_number_of_predictions(self, predictions: Any) -> int:
    """

    :param inputs:
    :return:
    """
    if torch.jit.is_tracing():
        pred_bboxes, pred_scores = predictions
    else:
        pred_bboxes, pred_scores = predictions[0]

    return pred_bboxes.size(1)

PPYOLOEHead

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
class PPYOLOEHead(nn.Module):
    @resolve_param("activation", ActivationsTypeFactory())
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        activation: Type[nn.Module] = nn.SiLU,
        fpn_strides: Tuple[int, int, int] = (32, 16, 8),
        grid_cell_scale=5.0,
        grid_cell_offset=0.5,
        reg_max=16,
        eval_size: Tuple[int, int] = None,
        width_mult: float = 1.0,
    ):
        """

        :param num_classes: Number of classes to predict.
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param activation: Type of the activation used in module
        :param fpn_strides: Output strides of the feature maps from the neck
        :param grid_cell_scale: Anchor (grid cell) size, expressed as a multiple of the stride.
        :param grid_cell_offset: Offset of the anchor centers within a grid cell; the range is between 0 and 1.
        :param reg_max: Upper bound of the per-side regression range; the head predicts a distribution over reg_max + 1 bins per box side.
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to each entry of in_channels.
        """
        super(PPYOLOEHead, self).__init__()
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.fpn_strides = tuple(fpn_strides)
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size

        # stem
        self.stem_cls = nn.ModuleList()
        self.stem_reg = nn.ModuleList()

        for in_c in self.in_channels:
            self.stem_cls.append(ESEAttn(in_c, activation_type=activation))
            self.stem_reg.append(ESEAttn(in_c, activation_type=activation))
        # pred head
        self.pred_cls = nn.ModuleList()
        self.pred_reg = nn.ModuleList()
        for in_c in self.in_channels:
            self.pred_cls.append(nn.Conv2d(in_c, self.num_classes, 3, padding=1))
            self.pred_reg.append(nn.Conv2d(in_c, 4 * (self.reg_max + 1), 3, padding=1))

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

    @torch.jit.ignore
    def cache_anchors(self, input_size: Tuple[int, int]):
        self.eval_size = list(input_size)[-2:]
        device = infer_model_device(self.pred_cls)
        dtype = infer_model_dtype(self.pred_cls)
        anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
        self.register_buffer("anchor_points", anchor_points, persistent=False)
        self.register_buffer("stride_tensor", stride_tensor, persistent=False)

    @torch.jit.ignore
    def _init_weights(self):
        bias_cls = bias_init_with_prob(0.01)
        for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
            torch.nn.init.constant_(cls_.weight, 0.0)
            torch.nn.init.constant_(cls_.bias, bias_cls)
            torch.nn.init.constant_(reg_.weight, 0.0)
            torch.nn.init.constant_(reg_.bias, 1.0)

        if self.eval_size:
            device = infer_model_device(self.pred_cls)
            dtype = infer_model_dtype(self.pred_cls)

            anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    @torch.jit.ignore
    def replace_num_classes(self, num_classes: int):
        bias_cls = bias_init_with_prob(0.01)
        device = self.pred_cls[0].weight.device
        self.pred_cls = nn.ModuleList()
        self.num_classes = num_classes

        for in_c in self.in_channels:
            predict_layer = nn.Conv2d(in_c, num_classes, 3, padding=1, device=device)
            torch.nn.init.constant_(predict_layer.weight, 0.0)
            torch.nn.init.constant_(predict_layer.bias, bias_cls)
            self.pred_cls.append(predict_layer)

    @torch.jit.ignore
    def forward_train(self, feats: Tuple[Tensor, ...]):
        anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
            feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
        )

        cls_score_list, reg_distri_list = [], []
        for i, feat in enumerate(feats):
            avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
            # cls and reg
            # Note we don't apply sigmoid on class predictions to ensure good numerical stability at loss computation
            cls_score_list.append(torch.permute(cls_logit.flatten(2), [0, 2, 1]))
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))
        cls_score_list = torch.cat(cls_score_list, dim=1)
        reg_distri_list = torch.cat(reg_distri_list, dim=1)

        return cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor

    def forward_eval(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:

        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        decoded_predictions = pred_bboxes, pred_scores

        if torch.jit.is_tracing():
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    def _generate_anchors(self, feats=None, dtype=None, device=None):
        # just use in eval time
        anchor_points = []
        stride_tensor = []

        dtype = dtype or feats[0].dtype
        device = device or feats[0].device

        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)

            # ONNX export does not support arange with float16, so the tensor is created as fp32 and then cast to fp16.
            # This produces correct fp16 weights in the ONNX model when exported.
            shift_x = torch.arange(end=w, dtype=torch.float32, device=device) + self.grid_cell_offset
            shift_y = torch.arange(end=h, dtype=torch.float32, device=device) + self.grid_cell_offset

            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype, device=device))

        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)
        return anchor_points, stride_tensor

    def forward(self, feats: Tuple[Tensor]):
        if self.training:
            return self.forward_train(feats)
        else:
            return self.forward_eval(feats)
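
The proj_conv buffer above implements the distribution-to-distance step used in forward_eval: the 4 * (reg_max + 1) regression channels are softmaxed over the reg_max + 1 bins of each box side and convolved with the values 0..reg_max, which amounts to taking the expected bin index. A toy restatement for a single box side (illustrative values only):

import torch

reg_max = 16
logits = torch.randn(reg_max + 1)                  # raw bin logits for one box side
probs = torch.softmax(logits, dim=0)               # distribution over bins 0..reg_max
bins = torch.linspace(0, reg_max, reg_max + 1)     # same values as the proj_conv buffer
expected_distance = (probs * bins).sum()           # distance in stride units
# forward_eval then converts distances to boxes around the anchor points and multiplies by stride_tensor.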

__init__(num_classes, in_channels, activation=nn.SiLU, fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, eval_size=None, width_mult=1.0)

Parameters:

Name Type Description Default
num_classes int required
in_channels Tuple[int, int, int]

Number of channels for each feature map (See width_mult)

required
activation Type[nn.Module]

Type of the activation used in module

nn.SiLU
fpn_strides Tuple[int, int, int]

Output strides of the feature maps from the neck

(32, 16, 8)
grid_cell_scale 5.0
grid_cell_offset 0.5
reg_max 16
eval_size Tuple[int, int]

(rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed, since anchors will not be regenerated for each forward call.

None
width_mult float

A scaling factor applied to each entry of in_channels.

1.0
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
@resolve_param("activation", ActivationsTypeFactory())
def __init__(
    self,
    num_classes: int,
    in_channels: Tuple[int, int, int],
    activation: Type[nn.Module] = nn.SiLU,
    fpn_strides: Tuple[int, int, int] = (32, 16, 8),
    grid_cell_scale=5.0,
    grid_cell_offset=0.5,
    reg_max=16,
    eval_size: Tuple[int, int] = None,
    width_mult: float = 1.0,
):
    """

    :param num_classes: Number of classes to predict.
    :param in_channels: Number of channels for each feature map (See width_mult)
    :param activation: Type of the activation used in module
    :param fpn_strides: Output strides of the feature maps from the neck
    :param grid_cell_scale: Anchor (grid cell) size, expressed as a multiple of the stride.
    :param grid_cell_offset: Offset of the anchor centers within a grid cell; the range is between 0 and 1.
    :param reg_max: Upper bound of the per-side regression range; the head predicts a distribution over reg_max + 1 bins per box side.
    :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
           since anchors will not be regenerated for each forward call.
    :param width_mult: A scaling factor applied to each entry of in_channels.
    """
    super(PPYOLOEHead, self).__init__()
    in_channels = [max(round(c * width_mult), 1) for c in in_channels]

    self.in_channels = tuple(in_channels)
    self.num_classes = num_classes
    self.fpn_strides = tuple(fpn_strides)
    self.grid_cell_scale = grid_cell_scale
    self.grid_cell_offset = grid_cell_offset
    self.reg_max = reg_max
    self.eval_size = eval_size

    # stem
    self.stem_cls = nn.ModuleList()
    self.stem_reg = nn.ModuleList()

    for in_c in self.in_channels:
        self.stem_cls.append(ESEAttn(in_c, activation_type=activation))
        self.stem_reg.append(ESEAttn(in_c, activation_type=activation))
    # pred head
    self.pred_cls = nn.ModuleList()
    self.pred_reg = nn.ModuleList()
    for in_c in self.in_channels:
        self.pred_cls.append(nn.Conv2d(in_c, self.num_classes, 3, padding=1))
        self.pred_reg.append(nn.Conv2d(in_c, 4 * (self.reg_max + 1), 3, padding=1))

    # Do not apply quantization to this tensor
    proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
    self.register_buffer("proj_conv", proj, persistent=False)

    self._init_weights()

bias_init_with_prob(prior_prob=0.01)

initialize conv/fc bias value according to a given probability value.

Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
def bias_init_with_prob(prior_prob=0.01):
    """initialize conv/fc bias value according to a given probability value."""
    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
    return bias_init
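
In other words, the returned value is the logit of prior_prob, so a sigmoid over the freshly initialized logits starts out near the desired prior. A quick numerical check (illustrative only):

import numpy as np

prior_prob = 0.01
bias = float(-np.log((1 - prior_prob) / prior_prob))
print(1.0 / (1.0 + np.exp(-bias)))  # ~0.01, i.e. sigmoid(bias) recovers prior_prob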

generate_anchors_for_grid_cell(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0.5, dtype=torch.float)

Like ATSS, generate anchors based on grid size.

Parameters:

Name Type Description Default
feats Tuple[Tensor, ...]

shape[s, (b, c, h, w)]

required
fpn_strides Tuple[int, ...]

shape[s], stride for each scale feature

required
grid_cell_size float

anchor size

5.0
grid_cell_offset float

The range is between 0 and 1.

0.5
dtype torch.dtype

Type of the anchors.

torch.float

Returns:

Type Description
Tuple[Tensor, Tensor, List[int], Tensor]
  • anchors: shape[l, 4], "xmin, ymin, xmax, ymax" format.
  • anchor_points: shape[l, 2], "x, y" format.
  • num_anchors_list: shape[s], contains [s_1, s_2, ...].
  • stride_tensor: shape[l, 1], contains the stride for each scale.
Source code in src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
@torch.no_grad()
def generate_anchors_for_grid_cell(
    feats: Tuple[Tensor, ...],
    fpn_strides: Tuple[int, ...],
    grid_cell_size: float = 5.0,
    grid_cell_offset: float = 0.5,
    dtype: torch.dtype = torch.float,
) -> Tuple[Tensor, Tensor, List[int], Tensor]:
    """
    Like ATSS, generate anchors based on grid size.

    :param feats: shape[s, (b, c, h, w)]
    :param fpn_strides: shape[s], stride for each scale feature
    :param grid_cell_size: anchor size
    :param grid_cell_offset: The range is between 0 and 1.
    :param dtype: Type of the anchors.

    :return:
        - anchors: shape[l, 4], "xmin, ymin, xmax, ymax" format.
        - anchor_points: shape[l, 2], "x, y" format.
        - num_anchors_list: shape[s], contains [s_1, s_2, ...].
        - stride_tensor: shape[l, 1], contains the stride for each scale.
    """
    assert len(feats) == len(fpn_strides)
    device = feats[0].device
    anchors = []
    anchor_points = []
    num_anchors_list = []
    stride_tensor = []
    for feat, stride in zip(feats, fpn_strides):
        _, _, h, w = feat.shape
        cell_half_size = grid_cell_size * stride * 0.5
        shift_x = (torch.arange(end=w) + grid_cell_offset) * stride
        shift_y = (torch.arange(end=h) + grid_cell_offset) * stride

        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
        else:
            shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

        anchor = torch.stack(
            [shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size],
            dim=-1,
        ).to(dtype=dtype)
        anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)

        anchors.append(anchor.reshape([-1, 4]))
        anchor_points.append(anchor_point.reshape([-1, 2]))
        num_anchors_list.append(len(anchors[-1]))
        stride_tensor.append(torch.full([num_anchors_list[-1], 1], stride, dtype=dtype))

    anchors = torch.cat(anchors).to(device)
    anchor_points = torch.cat(anchor_points).to(device)
    stride_tensor = torch.cat(stride_tensor).to(device)
    return anchors, anchor_points, num_anchors_list, stride_tensor
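
A minimal usage sketch with dummy feature maps (shapes and strides below are illustrative; in practice the inputs come from the neck, and the function is imported from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head):

import torch

# Three dummy FPN levels for a 256x256 input with strides (8, 16, 32).
feats = (torch.rand(1, 64, 32, 32), torch.rand(1, 128, 16, 16), torch.rand(1, 256, 8, 8))
anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(feats, fpn_strides=(8, 16, 32))
print(num_anchors_list)   # [1024, 256, 64]
print(anchors.shape)      # torch.Size([1344, 4])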

AbstractYoloBackbone

Bases: SupportsReplaceInputChannels

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class AbstractYoloBackbone(SupportsReplaceInputChannels):
    def __init__(self, arch_params):
        # CREATE A LIST CONTAINING THE LAYERS TO EXTRACT FROM THE BACKBONE AND ADD THE FINAL LAYER
        self._layer_idx_to_extract = [idx for sub_l in arch_params.skip_connections_dict.values() for idx in sub_l]
        self._layer_idx_to_extract.append(len(self._modules_list) - 1)

    def forward(self, x):
        """:return A list, the length of self._modules_list containing the output of the layer if specified in
        self._layers_to_extract and None otherwise"""
        extracted_intermediate_layers = []
        for layer_idx, layer_module in enumerate(self._modules_list):
            # PREDICT THE NEXT LAYER'S OUTPUT
            x = layer_module(x)
            # IF INDICATED APPEND THE OUTPUT TO extracted_intermediate_layers O.W. APPEND None
            if layer_idx in self._layer_idx_to_extract:
                extracted_intermediate_layers.append(x)
            else:
                extracted_intermediate_layers.append(None)

        return extracted_intermediate_layers

forward(x)

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def forward(self, x):
    """:return A list, the length of self._modules_list containing the output of the layer if specified in
    self._layers_to_extract and None otherwise"""
    extracted_intermediate_layers = []
    for layer_idx, layer_module in enumerate(self._modules_list):
        # PREDICT THE NEXT LAYER'S OUTPUT
        x = layer_module(x)
        # IF INDICATED APPEND THE OUTPUT TO extracted_intermediate_layers O.W. APPEND None
        if layer_idx in self._layer_idx_to_extract:
            extracted_intermediate_layers.append(x)
        else:
            extracted_intermediate_layers.append(None)

    return extracted_intermediate_layers

Concat

Bases: nn.Module

CONCATENATE A LIST OF TENSORS ALONG DIMENSION

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class Concat(nn.Module):
    """CONCATENATE A LIST OF TENSORS ALONG DIMENSION"""

    def __init__(self, dimension=1):
        super().__init__()
        self.dimension = dimension

    def forward(self, x):
        return torch.cat(x, self.dimension)
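
A trivial usage sketch, concatenating two feature maps along the channel dimension:

import torch

cat = Concat(dimension=1)
a, b = torch.rand(2, 8, 16, 16), torch.rand(2, 24, 16, 16)
print(cat([a, b]).shape)  # torch.Size([2, 32, 16, 16])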

DetectX

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class DetectX(nn.Module):
    def __init__(
        self,
        num_classes: int,
        stride: np.ndarray,
        activation_func_type: type,
        channels: list,
        depthwise=False,
        groups: int = None,
        inter_channels: Union[int, List] = None,
    ):
        """
        :param stride:          strides of each predicting level
        :param channels:        input channels into all detecting layers
                                (from all neck layers that will be used for predicting)
        :param depthwise:       defines conv type in classification and regression branches (Conv or GroupedConvBlock)
                                depthwise is False by default in favor of a usual Conv
        :param groups:          num groups in convs in classification and regression branches;
                                if None default groups will be used according to conv type
                                (1 for Conv and depthwise for GroupedConvBlock)
        :param inter_channels:  channels in classification and regression branches;
                                if None channels[0] will be used by default
        """
        super().__init__()

        self.num_classes = num_classes
        self.detection_layers_num = len(channels)
        self.n_anchors = 1
        self.grid = [torch.zeros(1)] * self.detection_layers_num  # init grid

        if torch.is_tensor(stride):
            stride = stride.clone().detach()
        else:
            stride = torch.tensor(stride)

        self.register_buffer("stride", stride, persistent=False)

        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.obj_preds = nn.ModuleList()
        self.stems = nn.ModuleList()

        ConvBlock = GroupedConvBlock if depthwise else Conv

        inter_channels = inter_channels or channels[0]
        inter_channels = inter_channels if isinstance(inter_channels, list) else [inter_channels] * self.detection_layers_num
        for i in range(self.detection_layers_num):
            self.stems.append(Conv(channels[i], inter_channels[i], 1, 1, activation_func_type))

            self.cls_convs.append(
                nn.Sequential(
                    *[
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ]
                )
            )
            self.reg_convs.append(
                nn.Sequential(
                    *[
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                        ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ]
                )
            )

            self.cls_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * self.num_classes, 1, 1, 0))
            self.reg_preds.append(nn.Conv2d(inter_channels[i], 4, 1, 1, 0))
            self.obj_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * 1, 1, 1, 0))

    def forward(self, inputs):
        outputs = []
        outputs_logits = []
        for i in range(self.detection_layers_num):
            x = self.stems[i](inputs[i])

            cls_feat = self.cls_convs[i](x)
            cls_output = self.cls_preds[i](cls_feat)

            reg_feat = self.reg_convs[i](x)
            reg_output = self.reg_preds[i](reg_feat)
            obj_output = self.obj_preds[i](reg_feat)

            bs, _, ny, nx = reg_feat.shape
            output = torch.cat([reg_output, obj_output, cls_output], 1)
            output = output.view(bs, self.n_anchors, -1, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            if not self.training:
                outputs_logits.append(output.clone())
                if self.grid[i].shape[2:4] != output.shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny, dtype=reg_output.dtype, device=output.device)

                xy = (output[..., :2] + self.grid[i].to(output.device)) * self.stride[i]
                wh = torch.exp(output[..., 2:4]) * self.stride[i]
                output = torch.cat([xy, wh, output[..., 4:].sigmoid()], dim=4)
                output = output.view(bs, -1, output.shape[-1])

            outputs.append(output)

        return outputs if self.training else (torch.cat(outputs, 1), outputs_logits)

    @staticmethod
    def _make_grid(nx: int, ny: int, dtype: torch.dtype, device: torch.device):
        y, x = torch.arange(ny, dtype=torch.float32, device=device), torch.arange(nx, dtype=torch.float32, device=device)

        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            yv, xv = torch.meshgrid([y, x], indexing="ij")
        else:
            yv, xv = torch.meshgrid([y, x])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).to(dtype)
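
At inference time the raw regression output of each cell is decoded on the fly: the x, y offsets are added to the cell grid and scaled by the level stride, while width and height go through an exponential before the same scaling. A toy restatement for a single cell (illustrative numbers only):

import torch

stride = 8.0
grid_xy = torch.tensor([3.0, 5.0])          # cell coordinates on the feature map
raw = torch.tensor([0.4, -0.2, 0.1, 0.3])   # raw (x, y, w, h) regression output for that cell

xy = (raw[:2] + grid_xy) * stride           # box center in input-image pixels
wh = torch.exp(raw[2:]) * stride            # box width/height in input-image pixels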

__init__(num_classes, stride, activation_func_type, channels, depthwise=False, groups=None, inter_channels=None)

Parameters:

Name Type Description Default
stride np.ndarray

strides of each predicting level

required
channels list

input channels into all detecting layers (from all neck layers that will be used for predicting)

required
depthwise

defines conv type in classification and regression branches (Conv or GroupedConvBlock) depthwise is False by default in favor of a usual Conv

False
groups int

num groups in convs in classification and regression branches; if None default groups will be used according to conv type (1 for Conv and depthwise for GroupedConvBlock)

None
inter_channels Union[int, List]

channels in classification and regression branches; if None channels[0] will be used by default

None
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def __init__(
    self,
    num_classes: int,
    stride: np.ndarray,
    activation_func_type: type,
    channels: list,
    depthwise=False,
    groups: int = None,
    inter_channels: Union[int, List] = None,
):
    """
    :param stride:          strides of each predicting level
    :param channels:        input channels into all detecting layers
                            (from all neck layers that will be used for predicting)
    :param depthwise:       defines conv type in classification and regression branches (Conv or GroupedConvBlock)
                            depthwise is False by default in favor of a usual Conv
    :param groups:          num groups in convs in classification and regression branches;
                            if None default groups will be used according to conv type
                            (1 for Conv and depthwise for GroupedConvBlock)
    :param inter_channels:  channels in classification and regression branches;
                            if None channels[0] will be used by default
    """
    super().__init__()

    self.num_classes = num_classes
    self.detection_layers_num = len(channels)
    self.n_anchors = 1
    self.grid = [torch.zeros(1)] * self.detection_layers_num  # init grid

    if torch.is_tensor(stride):
        stride = stride.clone().detach()
    else:
        stride = torch.tensor(stride)

    self.register_buffer("stride", stride, persistent=False)

    self.cls_convs = nn.ModuleList()
    self.reg_convs = nn.ModuleList()
    self.cls_preds = nn.ModuleList()
    self.reg_preds = nn.ModuleList()
    self.obj_preds = nn.ModuleList()
    self.stems = nn.ModuleList()

    ConvBlock = GroupedConvBlock if depthwise else Conv

    inter_channels = inter_channels or channels[0]
    inter_channels = inter_channels if isinstance(inter_channels, list) else [inter_channels] * self.detection_layers_num
    for i in range(self.detection_layers_num):
        self.stems.append(Conv(channels[i], inter_channels[i], 1, 1, activation_func_type))

        self.cls_convs.append(
            nn.Sequential(
                *[
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                ]
            )
        )
        self.reg_convs.append(
            nn.Sequential(
                *[
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                    ConvBlock(inter_channels[i], inter_channels[i], 3, 1, activation_func_type, groups=groups),
                ]
            )
        )

        self.cls_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * self.num_classes, 1, 1, 0))
        self.reg_preds.append(nn.Conv2d(inter_channels[i], 4, 1, 1, 0))
        self.obj_preds.append(nn.Conv2d(inter_channels[i], self.n_anchors * 1, 1, 1, 0))

YoloBase

Bases: SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloBase(SgModule, ExportableObjectDetectionModel, HasPredict, SupportsInputShapeCheck):
    def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True):
        super().__init__()
        # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params
        self.arch_params = HpmStruct(**DEFAULT_YOLO_ARCH_PARAMS)
        # FIXME: REMOVE anchors ATTRIBUTE, WHICH HAS NO MEANING OTHER THAN COMPATIBILITY.
        self.arch_params.anchors = COCO_DETECTION_80_CLASSES_BBOX_ANCHORS
        self.arch_params.override(**arch_params.to_dict())
        self.arch_params.skip_connections_dict = {k: v for k, v in self.arch_params.skip_connections_list}
        self.in_channels = 3

        self.num_classes = self.arch_params.num_classes

        # THE MODEL'S MODULES
        self._backbone = backbone(arch_params=self.arch_params)
        if hasattr(self._backbone, "backbone_connection_channels"):
            self.arch_params.scaled_backbone_width = False
            self.arch_params.backbone_connection_channels = self._backbone.backbone_connection_channels

        self._nms = nn.Identity()

        # A FLAG TO DEFINE augment_forward IN INFERENCE
        self.augmented_inference = False

        if initialize_module:
            self._head = YoloHead(self.arch_params)
            self._initialize_module()

        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None
        self._default_nms_iou: Optional[float] = None
        self._default_nms_conf: Optional[float] = None
        self.register_buffer("strides", torch.tensor(self.arch_params.anchors.stride), persistent=False)

    @staticmethod
    def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback:
        return YoloXPostPredictionCallback(conf=conf, iou=iou)

    def get_processing_params(self) -> Optional[Processing]:
        return self._image_processor

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        if processing is None:
            raise ModelHasNoPreprocessingParamsException()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param iou:             (Optional) IoU threshold for the nms algorithm
        :param conf:            (Optional) Below the confidence threshold, predictions are discarded
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor
        self._default_nms_iou = iou or self._default_nms_iou
        self._default_nms_conf = conf or self._default_nms_conf

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True
    ) -> DetectionPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:     (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:    If True, use mixed precision for inference.
        """
        if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = iou or self._default_nms_iou
        conf = conf or self._default_nms_conf

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = DetectionPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf),
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> ImagesDetectionPrediction:
        """Predict an image or a list of images.

        :param images:      Images to predict.
        :param iou:         (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:        (Optional) Below the confidence threshold, predictions are discarded.
                            If None, the default value associated to the training is used.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:        If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16=True):
        """Predict using webcam.

        :param iou:     (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:    If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)

    def forward(self, x):
        out = self._backbone(x)
        out = self._head(out)
        # THIS HAS NO EFFECT IF add_nms() WAS NOT DONE
        out = self._nms(out)
        return out

    def load_state_dict(self, state_dict, strict=True):
        try:
            keys_dropped_in_sg_320 = {
                "stride",
                "_head.anchors._stride",
                "_head.anchors._anchors",
                "_head.anchors._anchor_grid",
                "_head._modules_list.14.stride",
            }
            state_dict = collections.OrderedDict([(k, v) for k, v in state_dict.items() if k not in keys_dropped_in_sg_320])

            super().load_state_dict(state_dict, strict)
        except RuntimeError as e:
            raise RuntimeError(
                f"Got exception {e}, if a mismatch between expected and given state_dict keys exist, "
                f"checkpoint may have been saved after fusing conv and bn. use fuse_conv_bn before loading."
            )

    def _initialize_module(self):
        self._check_strides()
        self._initialize_biases()
        self._initialize_weights()
        if self.arch_params.add_nms:
            self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou)

    def _check_strides(self):
        m = self._head._modules_list[-1]  # DetectX()
        # Do inference in train mode on a dummy image to get output stride of each head output layer
        s = 128  # twice the minimum acceptable image size
        device = infer_model_device(m)
        dtype = infer_model_dtype(m)

        dummy_input = torch.zeros((1, self.arch_params.channels_in, s, s), device=device, dtype=dtype)
        dummy_input = dummy_input.to(next(self._backbone.parameters()).device)

        stride = torch.tensor([s / x.shape[-2] for x in self.forward(dummy_input)])
        stride = stride.to(m.stride.device)
        if not torch.equal(m.stride, stride):
            raise RuntimeError("Provided anchor strides do not match the model strides")

    def _initialize_biases(self):
        """initialize biases into DetectX()"""
        detect_module = self._head._modules_list[-1]  # DetectX() module
        prior_prob = 1e-2
        for conv in detect_module.cls_preds:
            bias = conv.bias.view(detect_module.n_anchors, -1)
            bias.data.fill_(-math.log((1 - prior_prob) / prior_prob))
            conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)

        for conv in detect_module.obj_preds:
            bias = conv.bias.view(detect_module.n_anchors, -1)
            bias.data.fill_(-math.log((1 - prior_prob) / prior_prob))
            conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eps = 1e-3
                m.momentum = 0.03
            elif isinstance(m, (nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.Hardswish, nn.SiLU)):
                m.inplace = True

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        A method for preparing the Yolo model for conversion to other frameworks (ONNX, CoreML etc)
        :param input_size: expected input size
        :return:
        """
        assert not self.training, "model has to be in eval mode to be converted"

        # Verify dummy_input from converter is of multiple of the grid size
        max_stride = int(max(self.strides))

        # Validate the image size
        image_dims = input_size[-2:]  # assume torch uses channels first layout
        for dim in image_dims:
            res_flag, suggestion = check_img_size_divisibility(dim, max_stride)
            if not res_flag:
                raise ValueError(
                    f"Invalid input size: {input_size}. The input size must be multiple of max stride: "
                    f"{max_stride}. The closest suggestions are: {suggestion[0]}x{suggestion[0]} or "
                    f"{suggestion[1]}x{suggestion[1]}"
                )

    def get_include_attributes(self) -> list:
        return ["grid", "anchors", "anchors_grid"]

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self._head = new_head
        else:
            self.arch_params.num_classes = new_num_classes
            self.num_classes = new_num_classes
            old_detectx = self._head._modules_list[-1]
            _, block, activation_type, width_mult, depth_mult = get_yolo_type_params(
                self.arch_params.yolo_type, self.arch_params.width_mult_factor, self.arch_params.depth_mult_factor
            )

            new_last_layer = DetectX(
                num_classes=new_num_classes,
                stride=self.strides,
                activation_func_type=activation_type,
                channels=[width_mult(v) for v in (256, 512, 1024)],
                depthwise=isinstance(old_detectx.cls_convs[0][0], GroupedConvBlock),
                groups=self.arch_params.xhead_groups,
                inter_channels=self.arch_params.xhead_inter_channels,
            )
            new_last_layer = new_last_layer.to(next(self.parameters()).device)
            self._head._modules_list[-1] = new_last_layer
            self._check_strides()
            self._initialize_biases()
            self._initialize_weights()

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"_head": lr, "default": 0}

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractObjectDetectionDecodingModule:
        return YoloXDecodingModule(num_pre_nms_predictions=num_pre_nms_predictions, **kwargs)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        if isinstance(self._backbone, SupportsReplaceInputChannels):
            self._backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
            self.in_channels = self.get_input_channels()
        else:
            raise NotImplementedError(f"`{self._backbone.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        if isinstance(self._backbone, SupportsReplaceInputChannels):
            return self._backbone.get_input_channels()
        else:
            raise NotImplementedError(f"`{self._backbone.__class__.__name__}` does not support `get_input_channels`")

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32
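
A hedged end-to-end sketch of the predict flow. It assumes a pretrained checkpoint whose processing parameters were stored at training time (otherwise call set_dataset_processing_params first); the image path is hypothetical:

from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.YOLOX_N, pretrained_weights="coco")
model.eval()

# iou/conf override the stored defaults for this call only.
predictions = model.predict("path/to/image.jpg", iou=0.65, conf=0.4, fuse_model=True)
predictions.show()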

predict(images, iou=None, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> ImagesDetectionPrediction:
    """Predict an image or a list of images.

    :param images:      Images to predict.
    :param iou:         (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:        (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:        If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(iou=None, conf=None, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16=True):
    """Predict using webcam.

    :param iou:     (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
    :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                    If None, the default value associated to the training is used.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:    If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    pipeline.predict_webcam()

prep_model_for_conversion(input_size=None, **kwargs)

A method for preparing the Yolo model for conversion to other frameworks (ONNX, CoreML etc)

Parameters:

Name Type Description Default
input_size Union[tuple, list]

expected input size

None

Returns:

Type Description
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    A method for preparing the Yolo model for conversion to other frameworks (ONNX, CoreML etc)
    :param input_size: expected input size
    :return:
    """
    assert not self.training, "model has to be in eval mode to be converted"

    # Verify dummy_input from converter is of multiple of the grid size
    max_stride = int(max(self.strides))

    # Validate the image size
    image_dims = input_size[-2:]  # assume torch uses channels first layout
    for dim in image_dims:
        res_flag, suggestion = check_img_size_divisibility(dim, max_stride)
        if not res_flag:
            raise ValueError(
                f"Invalid input size: {input_size}. The input size must be multiple of max stride: "
                f"{max_stride}. The closest suggestions are: {suggestion[0]}x{suggestion[0]} or "
                f"{suggestion[1]}x{suggestion[1]}"
            )
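
For example, with a maximum stride of 32 (typical for these YOLO variants), 640x640 passes the divisibility check while 650x650 raises a ValueError listing the nearest valid sizes. A hedged sketch, assuming `model` is an already-built Yolo detection model:

model.eval()  # the method asserts the model is not in training mode
model.prep_model_for_conversion(input_size=(1, 3, 640, 640))  # H and W are taken from the last two dims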

set_dataset_processing_params(class_names=None, image_processor=None, iou=None, conf=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
iou Optional[float]

(Optional) IoU threshold for the nms algorithm

None
conf Optional[float]

(Optional) Predictions below this confidence threshold are discarded

None
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param iou:             (Optional) IoU threshold for the nms algorithm
    :param conf:            (Optional) Predictions below this confidence threshold are discarded
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor
    self._default_nms_iou = iou or self._default_nms_iou
    self._default_nms_conf = conf or self._default_nms_conf
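
A hedged example of overriding the defaults used later by `predict`; the class names and thresholds below are purely illustrative:

model.set_dataset_processing_params(
    class_names=["person", "car"],  # hypothetical two-class dataset
    iou=0.5,                        # default NMS IoU threshold used by predict()
    conf=0.25,                      # default confidence threshold used by predict()
)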

YoloDarknetBackbone

Bases: AbstractYoloBackbone, CSPDarknet53

Implements the CSP_Darknet53 module and inherits the forward pass to extract the layers indicated in arch_params

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloDarknetBackbone(AbstractYoloBackbone, CSPDarknet53):
    """Implements the CSP_Darknet53 module and inherits the forward pass to extract the layers indicated in arch_params"""

    def __init__(self, arch_params):
        arch_params.backbone_mode = True
        CSPDarknet53.__init__(self, arch_params)
        AbstractYoloBackbone.__init__(self, arch_params)

    def forward(self, x):
        return AbstractYoloBackbone.forward(self, x)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        CSPDarknet53.replace_input_channels(self, in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return CSPDarknet53.get_input_channels(self)
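
A hedged sketch of adapting the backbone to a different number of input channels (e.g. single-channel grayscale); `backbone` stands for an already-constructed YoloDarknetBackbone instance:

# compute_new_weights_fn is optional; when omitted, the library's default weight-adaptation strategy is assumed to apply.
backbone.replace_input_channels(in_channels=1)
assert backbone.get_input_channels() == 1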

YoloHead

Bases: nn.Module

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloHead(nn.Module):
    def __init__(self, arch_params):
        super().__init__()
        # PARSE arch_params
        num_classes = arch_params.num_classes
        anchors = arch_params.anchors
        depthwise = arch_params.depthwise
        xhead_groups = arch_params.xhead_groups
        xhead_inter_channels = arch_params.xhead_inter_channels

        self._skip_connections_dict = arch_params.skip_connections_dict
        # FLATTEN THE SOURCE LIST INTO A LIST OF INDICES
        self._layer_idx_to_extract = [idx for sub_l in self._skip_connections_dict.values() for idx in sub_l]

        _, block, activation_type, width_mult, depth_mult = get_yolo_type_params(
            arch_params.yolo_type, arch_params.width_mult_factor, arch_params.depth_mult_factor
        )

        backbone_connector = [width_mult(c) if arch_params.scaled_backbone_width else c for c in arch_params.backbone_connection_channels]

        DownConv = GroupedConvBlock if depthwise else Conv

        self._modules_list = nn.ModuleList()
        self._modules_list.append(Conv(backbone_connector[0], width_mult(512), 1, 1, activation_type))  # 10
        self._modules_list.append(nn.Upsample(None, 2, "nearest"))  # 11
        self._modules_list.append(Concat(1))  # 12
        self._modules_list.append(block(backbone_connector[1] + width_mult(512), width_mult(512), depth_mult(3), activation_type, False, depthwise))  # 13

        self._modules_list.append(Conv(width_mult(512), width_mult(256), 1, 1, activation_type))  # 14
        self._modules_list.append(nn.Upsample(None, 2, "nearest"))  # 15
        self._modules_list.append(Concat(1))  # 16
        self._modules_list.append(block(backbone_connector[2] + width_mult(256), width_mult(256), depth_mult(3), activation_type, False, depthwise))  # 17

        self._modules_list.append(DownConv(width_mult(256), width_mult(256), 3, 2, activation_type))  # 18
        self._modules_list.append(Concat(1))  # 19
        self._modules_list.append(block(2 * width_mult(256), width_mult(512), depth_mult(3), activation_type, False, depthwise))  # 20

        self._modules_list.append(DownConv(width_mult(512), width_mult(512), 3, 2, activation_type))  # 21
        self._modules_list.append(Concat(1))  # 22
        self._modules_list.append(block(2 * width_mult(512), width_mult(1024), depth_mult(3), activation_type, False, depthwise))  # 23

        detect_input_channels = [width_mult(v) for v in (256, 512, 1024)]
        strides = anchors.stride
        self._modules_list.append(
            DetectX(
                num_classes,
                strides,
                activation_type,
                channels=detect_input_channels,
                depthwise=depthwise,
                groups=xhead_groups,
                inter_channels=xhead_inter_channels,
            )
        )  # 24

        self._shortcuts = nn.ModuleList([CrossModelSkipConnection() for _ in range(len(self._skip_connections_dict.keys()) - 1)])

        self.width_mult = width_mult

    def forward(self, intermediate_output):
        """
        :param intermediate_output: A list of the intermediate predictions of the backbone layers specified in
        self._layer_idx_to_extract
        """
        # COUNT THE NUMBER OF LAYERS IN THE BACKBONE TO CONTINUE THE COUNTER
        num_layers_in_backbone = len(intermediate_output)
        # INPUT TO HEAD IS THE LAST ELEMENT OF THE BACKBONE'S OUTPUT
        out = intermediate_output[-1]
        # RUN OVER THE MODULE LIST WITHOUT THE FINAL LAYER & START COUNTER FROM THE END OF THE BACKBONE
        i = 0
        for layer_idx, layer_module in enumerate(self._modules_list[:-1], start=num_layers_in_backbone):
            # IF THE LAYER INDEX APPEARS IN THE KEYS, INSERT THE PREVIOUS OUTPUT TOGETHER WITH THE INDICATED SKIP CONNECTION

            if layer_idx in self._skip_connections_dict.keys():
                out = layer_module([out, self._shortcuts[i](intermediate_output[self._skip_connections_dict[layer_idx][0]])])
                i += 1
            else:
                out = layer_module(out)

            # IF THE LAYER INDEX IS LISTED IN self._layer_idx_to_extract APPEND ITS OUTPUT, OTHERWISE APPEND None
            if layer_idx in self._layer_idx_to_extract:
                intermediate_output.append(out)
            else:
                intermediate_output.append(None)

        # INSERT THE REMAINING LAYERS INTO THE Detect LAYER
        last_idx = len(self._modules_list) + num_layers_in_backbone - 1

        return self._modules_list[-1](
            [
                intermediate_output[self._skip_connections_dict[last_idx][0]],
                intermediate_output[self._skip_connections_dict[last_idx][1]],
                out,
            ]
        )

forward(intermediate_output)

Parameters:

Name Type Description Default
intermediate_output

A list of the intermediate predictions of the backbone layers specified in self._layer_idx_to_extract

required
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def forward(self, intermediate_output):
    """
    :param intermediate_output: A list of the intermediate predictions of the backbone layers specified in
    self._layer_idx_to_extract
    """
    # COUNT THE NUMBER OF LAYERS IN THE BACKBONE TO CONTINUE THE COUNTER
    num_layers_in_backbone = len(intermediate_output)
    # INPUT TO HEAD IS THE LAST ELEMENT OF THE BACKBONE'S OUTPUT
    out = intermediate_output[-1]
    # RUN OVER THE MODULE LIST WITHOUT THE FINAL LAYER & START COUNTER FROM THE END OF THE BACKBONE
    i = 0
    for layer_idx, layer_module in enumerate(self._modules_list[:-1], start=num_layers_in_backbone):
        # IF THE LAYER INDEX APPEARS IN THE KEYS, INSERT THE PREVIOUS OUTPUT TOGETHER WITH THE INDICATED SKIP CONNECTION

        if layer_idx in self._skip_connections_dict.keys():
            out = layer_module([out, self._shortcuts[i](intermediate_output[self._skip_connections_dict[layer_idx][0]])])
            i += 1
        else:
            out = layer_module(out)

        # IF THE LAYER INDEX IS LISTED IN self._layer_idx_to_extract APPEND ITS OUTPUT, OTHERWISE APPEND None
        if layer_idx in self._layer_idx_to_extract:
            intermediate_output.append(out)
        else:
            intermediate_output.append(None)

    # INSERT THE REMAINING LAYERS INTO THE Detect LAYER
    last_idx = len(self._modules_list) + num_layers_in_backbone - 1

    return self._modules_list[-1](
        [
            intermediate_output[self._skip_connections_dict[last_idx][0]],
            intermediate_output[self._skip_connections_dict[last_idx][1]],
            out,
        ]
    )

YoloRegnetBackbone

Bases: AbstractYoloBackbone, AnyNetX

Implements the Regnet module and inherits the forward pass to extract layers indicated in arch_params

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloRegnetBackbone(AbstractYoloBackbone, AnyNetX):
    """Implements the Regnet module and inherits the forward pass to extract layers indicated in arch_params"""

    def __init__(self, arch_params):
        backbone_params = {**arch_params.backbone_params, "backbone_mode": True, "num_classes": None}
        backbone_params.pop("spp_kernels", None)
        AnyNetX.__init__(self, **backbone_params)

        # LAST ANYNETX STAGE -> STAGE + SPP IF SPP_KERNELS IS GIVEN
        spp_kernels = get_param(arch_params.backbone_params, "spp_kernels", None)
        if spp_kernels:
            activation_type = nn.SiLU if arch_params.yolo_type == "yoloX" else nn.Hardswish
            self.net.stage_3 = self.add_spp_to_stage(self.net.stage_3, spp_kernels, activation_type=activation_type)
            self.initialize_weight()

        # CREATE A LIST CONTAINING THE LAYERS TO EXTRACT FROM THE BACKBONE AND ADD THE FINAL LAYER
        self._modules_list = nn.ModuleList()
        for layer in self.net:
            self._modules_list.append(layer)

        AbstractYoloBackbone.__init__(self, arch_params)

        # WE KEEP A LIST OF THE OUTPUTS WIDTHS (NUM FEATURES) TO BE CONNECTED TO THE HEAD
        self.backbone_connection_channels = arch_params.backbone_params["ls_block_width"][1:][::-1]

    @staticmethod
    def add_spp_to_stage(anynetx_stage: Stage, spp_kernels: Tuple[int], activation_type):
        """
        Add SPP at the end of an AnyNetX Stage
        """
        # Last block in a Stage -> conv_block_3 -> Conv2d -> out_channels
        out_channels = anynetx_stage.blocks[-1].conv_block_3[0].out_channels
        anynetx_stage.blocks.add_module("spp_block", SPP(out_channels, out_channels, spp_kernels, activation_type=activation_type))
        return anynetx_stage

    def forward(self, x):
        return AbstractYoloBackbone.forward(self, x)

add_spp_to_stage(anynetx_stage, spp_kernels, activation_type) staticmethod

Add SPP at the end of an AnyNetX Stage

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
@staticmethod
def add_spp_to_stage(anynetx_stage: Stage, spp_kernels: Tuple[int], activation_type):
    """
    Add SPP at the end of an AnyNetX Stage
    """
    # Last block in a Stage -> conv_block_3 -> Conv2d -> out_channels
    out_channels = anynetx_stage.blocks[-1].conv_block_3[0].out_channels
    anynetx_stage.blocks.add_module("spp_block", SPP(out_channels, out_channels, spp_kernels, activation_type=activation_type))
    return anynetx_stage

YoloXDecodingModule

Bases: AbstractObjectDetectionDecodingModule

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloXDecodingModule(AbstractObjectDetectionDecodingModule):
    __constants__ = ["num_pre_nms_predictions", "with_confidence"]

    def __init__(self, num_pre_nms_predictions: int, with_confidence: bool = True):
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions
        self.with_confidence = with_confidence

    def forward(self, predictions):
        if isinstance(predictions, (tuple, list)):
            predictions = predictions[0]

        cxcywh = predictions[:, :, :4]
        conf = predictions[:, :, 4:5]
        pred_scores = predictions[:, :, 5:]
        pred_bboxes = convert_cxcywh_bbox_to_xyxy(cxcywh)

        if self.with_confidence:
            pred_scores = pred_scores * conf

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_scores.size()

        pred_cls_conf, _ = torch.max(pred_scores, dim=2)
        topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
        flat_indices = torch.flatten(indices_with_offset)

        output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
        output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

        return output_pred_bboxes, output_pred_scores

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, predictions: Any) -> int:
        """
        Infer the total number of predictions (anchors) produced per image.

        :param predictions: Raw model output (possibly wrapped in a tuple/list); the prediction count is taken from its second dimension.
        :return: Number of predictions per image.
        """
        if isinstance(predictions, (tuple, list)):
            predictions = predictions[0]

        return predictions.size(1)
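
The `forward` above selects the top candidates of each image with a single gather by flattening (batch, anchor) indices into one dimension. A self-contained sketch of that indexing trick, with illustrative shapes:

import torch

batch_size, num_anchors, num_classes, k = 2, 100, 3, 10
pred_scores = torch.rand(batch_size, num_anchors, num_classes)

best_per_anchor, _ = pred_scores.max(dim=2)                          # [B, A] best class confidence per anchor
topk = torch.topk(best_per_anchor, k=k, dim=1, largest=True, sorted=True)

offsets = num_anchors * torch.arange(batch_size)                     # row offset of each image in the flattened view
flat_indices = (topk.indices + offsets.view(-1, 1)).flatten()        # indices into the [B * A, C] view
selected = pred_scores.reshape(-1, num_classes)[flat_indices].reshape(batch_size, k, num_classes)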

infer_total_number_of_predictions(predictions)

Parameters:

Name Type Description Default
predictions Any

Raw model output (possibly wrapped in a tuple/list); the prediction count is taken from its second dimension.

required

Returns:

Type Description
int
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
@torch.jit.ignore
def infer_total_number_of_predictions(self, predictions: Any) -> int:
    """
    Infer the total number of predictions (anchors) produced per image.

    :param predictions: Raw model output (possibly wrapped in a tuple/list); the prediction count is taken from its second dimension.
    :return: Number of predictions per image.
    """
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    return predictions.size(1)

YoloXPostPredictionCallback

Bases: DetectionPostPredictionCallback

Post-prediction callback to decode YoloX model's output and apply Non-Maximum Suppression (NMS) to get the final predictions.

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
class YoloXPostPredictionCallback(DetectionPostPredictionCallback):
    """Post-prediction callback to decode YoloX model's output and apply Non-Maximum Suppression (NMS) to get
    the final predictions.
    """

    def __init__(
        self,
        conf: float = 0.001,
        iou: float = 0.6,
        classes: List[int] = None,
        nms_type: NMS_Type = NMS_Type.ITERATIVE,
        max_predictions: int = 300,
        with_confidence: bool = True,
        class_agnostic_nms: bool = False,
        multi_label_per_box: bool = True,
    ):
        """
        :param conf: confidence threshold
        :param iou: IoU threshold                                       (used in NMS_Type.ITERATIVE)
        :param classes: (optional list) filter by class                 (used in NMS_Type.ITERATIVE)
        :param nms_type: the type of nms to use (iterative or matrix)
        :param max_predictions: maximum number of boxes to output       (used in NMS_Type.MATRIX)
        :param with_confidence: in NMS, whether to multiply objectness  (used in NMS_Type.ITERATIVE)
                                score with class score
        :param class_agnostic_nms: indicates how boxes of different classes will be treated during
                                   NMS step (used in NMS_Type.ITERATIVE and NMS_Type.MATRIX)
                                   True - NMS will be performed on all classes together.
                                   False - NMS will be performed on each class separately (default).
        :param multi_label_per_box: controls whether to decode multiple labels per box (used in NMS_Type.ITERATIVE)
                                    True - each anchor can produce multiple labels of different classes
                                           that pass confidence threshold check (default).
                                    False - each anchor can produce only one label of the class with the highest score.
        """
        super(YoloXPostPredictionCallback, self).__init__()
        self.conf = conf
        self.iou = iou
        self.classes = classes
        self.nms_type = nms_type
        self.max_pred = max_predictions
        self.with_confidence = with_confidence
        self.class_agnostic_nms = class_agnostic_nms
        self.multi_label_per_box = multi_label_per_box

    def forward(self, x: Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]], device: str = None):
        """Apply NMS to the raw output of the model and keep only top `max_predictions` results.

        :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...)
        :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls)
        """
        # Use the main output features in case of multiple outputs.
        if isinstance(x, (tuple, list)):
            x = x[0]

        if self.nms_type == NMS_Type.ITERATIVE:
            nms_result = non_max_suppression(
                x,
                conf_thres=self.conf,
                iou_thres=self.iou,
                with_confidence=self.with_confidence,
                multi_label_per_box=self.multi_label_per_box,
                class_agnostic_nms=self.class_agnostic_nms,
            )
        else:
            nms_result = matrix_non_max_suppression(x, conf_thres=self.conf, max_num_of_detections=self.max_pred, class_agnostic_nms=self.class_agnostic_nms)

        return self._filter_max_predictions(nms_result)

    def _filter_max_predictions(self, res: List) -> List:
        res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res]
        return res
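
A hedged construction example (threshold values are illustrative; the import path follows the source location shown above):

from super_gradients.training.models.detection_models.yolo_base import YoloXPostPredictionCallback

callback = YoloXPostPredictionCallback(conf=0.25, iou=0.6, max_predictions=300)
# decoded = callback(model(images))  # list of per-image tensors with columns (x1, y1, x2, y2, conf, cls)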

__init__(conf=0.001, iou=0.6, classes=None, nms_type=NMS_Type.ITERATIVE, max_predictions=300, with_confidence=True, class_agnostic_nms=False, multi_label_per_box=True)

Parameters:

Name Type Description Default
conf float

confidence threshold

0.001
iou float

IoU threshold (used in NMS_Type.ITERATIVE)

0.6
classes List[int]

(optional list) filter by class (used in NMS_Type.ITERATIVE)

None
nms_type NMS_Type

the type of nms to use (iterative or matrix)

NMS_Type.ITERATIVE
max_predictions int

maximum number of boxes to output (used in NMS_Type.MATRIX)

300
with_confidence bool

in NMS, whether to multiply objectness (used in NMS_Type.ITERATIVE) score with class score

True
class_agnostic_nms bool

indicates how boxes of different classes will be treated during NMS step (used in NMS_Type.ITERATIVE and NMS_Type.MATRIX) True - NMS will be performed on all classes together. False - NMS will be performed on each class separately (default).

False
multi_label_per_box bool

controls whether to decode multiple labels per box (used in NMS_Type.ITERATIVE) True - each anchor can produce multiple labels of different classes that pass confidence threshold check (default). False - each anchor can produce only one label of the class with the highest score.

True
Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def __init__(
    self,
    conf: float = 0.001,
    iou: float = 0.6,
    classes: List[int] = None,
    nms_type: NMS_Type = NMS_Type.ITERATIVE,
    max_predictions: int = 300,
    with_confidence: bool = True,
    class_agnostic_nms: bool = False,
    multi_label_per_box: bool = True,
):
    """
    :param conf: confidence threshold
    :param iou: IoU threshold                                       (used in NMS_Type.ITERATIVE)
    :param classes: (optional list) filter by class                 (used in NMS_Type.ITERATIVE)
    :param nms_type: the type of nms to use (iterative or matrix)
    :param max_predictions: maximum number of boxes to output       (used in NMS_Type.MATRIX)
    :param with_confidence: in NMS, whether to multiply objectness  (used in NMS_Type.ITERATIVE)
                            score with class score
    :param class_agnostic_nms: indicates how boxes of different classes will be treated during
                               NMS step (used in NMS_Type.ITERATIVE and NMS_Type.MATRIX)
                               True - NMS will be performed on all classes together.
                               False - NMS will be performed on each class separately (default).
    :param multi_label_per_box: controls whether to decode multiple labels per box (used in NMS_Type.ITERATIVE)
                                True - each anchor can produce multiple labels of different classes
                                       that pass confidence threshold check (default).
                                False - each anchor can produce only one label of the class with the highest score.
    """
    super(YoloXPostPredictionCallback, self).__init__()
    self.conf = conf
    self.iou = iou
    self.classes = classes
    self.nms_type = nms_type
    self.max_pred = max_predictions
    self.with_confidence = with_confidence
    self.class_agnostic_nms = class_agnostic_nms
    self.multi_label_per_box = multi_label_per_box

forward(x, device=None)

Apply NMS to the raw output of the model and keep only top max_predictions results.

Parameters:

Name Type Description Default
x Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]

Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...)

required

Returns:

Type Description

List of Tensors of shape (x1, y1, x2, y2, conf, cls)

Source code in src/super_gradients/training/models/detection_models/yolo_base.py
def forward(self, x: Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]], device: str = None):
    """Apply NMS to the raw output of the model and keep only top `max_predictions` results.

    :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...)
    :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls)
    """
    # Use the main output features in case of multiple outputs.
    if isinstance(x, (tuple, list)):
        x = x[0]

    if self.nms_type == NMS_Type.ITERATIVE:
        nms_result = non_max_suppression(
            x,
            conf_thres=self.conf,
            iou_thres=self.iou,
            with_confidence=self.with_confidence,
            multi_label_per_box=self.multi_label_per_box,
            class_agnostic_nms=self.class_agnostic_nms,
        )
    else:
        nms_result = matrix_non_max_suppression(x, conf_thres=self.conf, max_num_of_detections=self.max_pred, class_agnostic_nms=self.class_agnostic_nms)

    return self._filter_max_predictions(nms_result)

NDFLHeads

Bases: BaseDetectionModule, SupportsReplaceNumClasses

Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
@register_detection_module()
class NDFLHeads(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        heads_list: Union[str, HpmStruct, DictConfig],
        grid_cell_scale: float = 5.0,
        grid_cell_offset: float = 0.5,
        reg_max: int = 16,
        eval_size: Optional[Tuple[int, int]] = None,
        width_mult: float = 1.0,
    ):
        """
        Initializes the NDFLHeads module.

        :param num_classes: Number of detection classes
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param grid_cell_scale:
        :param grid_cell_offset:
        :param reg_max: Number of bins in the regression head
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to in_channels.
        """
        super(NDFLHeads, self).__init__(in_channels)
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

        factory = det_factory.DetectionModulesFactory()
        heads_list = self._pass_args(heads_list, factory, num_classes, reg_max)

        self.num_heads = len(heads_list)
        fpn_strides: List[int] = []
        for i in range(self.num_heads):
            new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
            fpn_strides.append(new_head.stride)
            setattr(self, f"head{i + 1}", new_head)

        self.fpn_strides = tuple(fpn_strides)

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        for i in range(self.num_heads):
            head = getattr(self, f"head{i + 1}")
            head.replace_num_classes(num_classes, compute_new_weights_fn)

        self.num_classes = num_classes

    @staticmethod
    def _pass_args(heads_list, factory, num_classes, reg_max):
        for i in range(len(heads_list)):
            heads_list[i] = factory.insert_module_param(heads_list[i], "num_classes", num_classes)
            heads_list[i] = factory.insert_module_param(heads_list[i], "reg_max", reg_max)
        return heads_list

    @torch.jit.ignore
    def cache_anchors(self, input_size: Tuple[int, int]):
        self.eval_size = input_size
        device = infer_model_device(self)
        dtype = infer_model_dtype(self)

        anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
        self.register_buffer("anchor_points", anchor_points, persistent=False)
        self.register_buffer("stride_tensor", stride_tensor, persistent=False)

    @torch.jit.ignore
    def _init_weights(self):
        if self.eval_size:
            device = infer_model_device(self)
            dtype = infer_model_dtype(self)

            anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    def forward(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:
        feats = feats[: self.num_heads]
        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            reg_distri, cls_logit = getattr(self, f"head{i + 1}")(feat)
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        decoded_predictions = pred_bboxes, pred_scores

        if torch.jit.is_tracing():
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    @property
    def out_channels(self):
        return None

    def _generate_anchors(self, feats=None, dtype=None, device=None):
        # Only used at eval / inference time
        anchor_points = []
        stride_tensor = []

        dtype = dtype or feats[0].dtype
        device = device or feats[0].device

        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)

            # ONNX export does not support arange with float16, so it is created as fp32 and then cast to fp16
            # This produces correct fp16 weights in the exported ONNX model
            shift_x = torch.arange(end=w, dtype=torch.float32, device=device) + self.grid_cell_offset
            shift_y = torch.arange(end=h, dtype=torch.float32, device=device) + self.grid_cell_offset

            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype, device=device))

        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)
        return anchor_points, stride_tensor
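
A small standalone sketch (plain PyTorch, illustrative shapes) of the distribution-to-distance reduction performed in `forward` above: the per-side distribution over `reg_max + 1` bins is collapsed to a single distance by taking its expectation, which is what the conv2d with the fixed `proj_conv` buffer computes:

import torch

reg_max, num_anchors = 16, 8                                    # illustrative values
logits = torch.randn(1, 4, reg_max + 1, num_anchors)            # [B, 4 box sides, bins, anchors]

proj = torch.linspace(0, reg_max, reg_max + 1)                  # bin values 0..reg_max (the proj_conv weights)
prob = torch.softmax(logits, dim=2)                             # distribution over bins for each side
distances = (prob * proj.view(1, 1, -1, 1)).sum(dim=2)          # expectation -> [B, 4, anchors]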

__init__(num_classes, in_channels, heads_list, grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, eval_size=None, width_mult=1.0)

Initializes the NDFLHeads module.

Parameters:

Name Type Description Default
num_classes int

Number of detection classes

required
in_channels Tuple[int, int, int]

Number of channels for each feature map (See width_mult)

required
grid_cell_scale float 5.0
grid_cell_offset float 0.5
reg_max int

Number of bins in the regression head

16
eval_size Optional[Tuple[int, int]]

(rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed, since anchors will not be regenerated for each forward call.

None
width_mult float

A scaling factor applied to in_channels.

1.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
def __init__(
    self,
    num_classes: int,
    in_channels: Tuple[int, int, int],
    heads_list: Union[str, HpmStruct, DictConfig],
    grid_cell_scale: float = 5.0,
    grid_cell_offset: float = 0.5,
    reg_max: int = 16,
    eval_size: Optional[Tuple[int, int]] = None,
    width_mult: float = 1.0,
):
    """
    Initializes the NDFLHeads module.

    :param num_classes: Number of detection classes
    :param in_channels: Number of channels for each feature map (See width_mult)
    :param grid_cell_scale:
    :param grid_cell_offset:
    :param reg_max: Number of bins in the regression head
    :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
           since anchors will not be regenerated for each forward call.
    :param width_mult: A scaling factor applied to in_channels.
    """
    super(NDFLHeads, self).__init__(in_channels)
    in_channels = [max(round(c * width_mult), 1) for c in in_channels]

    self.in_channels = tuple(in_channels)
    self.num_classes = num_classes
    self.grid_cell_scale = grid_cell_scale
    self.grid_cell_offset = grid_cell_offset
    self.reg_max = reg_max
    self.eval_size = eval_size

    # Do not apply quantization to this tensor
    proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
    self.register_buffer("proj_conv", proj, persistent=False)

    self._init_weights()

    factory = det_factory.DetectionModulesFactory()
    heads_list = self._pass_args(heads_list, factory, num_classes, reg_max)

    self.num_heads = len(heads_list)
    fpn_strides: List[int] = []
    for i in range(self.num_heads):
        new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
        fpn_strides.append(new_head.stride)
        setattr(self, f"head{i + 1}", new_head)

    self.fpn_strides = tuple(fpn_strides)

YoloNASDFLHead

Bases: BaseDetectionModule, SupportsReplaceNumClasses

Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
@register_detection_module()
class YoloNASDFLHead(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        in_channels: int,
        inter_channels: int,
        width_mult: float,
        first_conv_group_size: int,
        num_classes: int,
        stride: int,
        reg_max: int,
        cls_dropout_rate: float = 0.0,
        reg_dropout_rate: float = 0.0,
    ):
        """
        Initialize the YoloNASDFLHead
        :param in_channels: Input channels
        :param inter_channels: Intermediate number of channels
        :param width_mult: Width multiplier
        :param first_conv_group_size: Group size
        :param num_classes: Number of detection classes
        :param stride: Output stride for this head
        :param reg_max: Number of bins in the regression head
        :param cls_dropout_rate: Dropout rate for the classification head
        :param reg_dropout_rate: Dropout rate for the regression head
        """
        super().__init__(in_channels)

        inter_channels = width_multiplier(inter_channels, width_mult, 8)
        if first_conv_group_size == 0:
            groups = 0
        elif first_conv_group_size == -1:
            groups = 1
        else:
            groups = inter_channels // first_conv_group_size

        self.num_classes = num_classes
        self.stem = ConvBNReLU(in_channels, inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

        first_cls_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        first_reg_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        self.cls_pred = nn.Conv2d(inter_channels, self.num_classes, 1, 1, 0)
        self.reg_pred = nn.Conv2d(inter_channels, 4 * (reg_max + 1), 1, 1, 0)

        self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
        self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

        self.grid = torch.zeros(1)
        self.stride = stride

        self.prior_prob = 1e-2
        self._initialize_biases()

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        self.cls_pred = compute_new_weights_fn(self.cls_pred, num_classes)
        self.num_classes = num_classes

    @property
    def out_channels(self):
        return None

    def forward(self, x):
        x = self.stem(x)

        cls_feat = self.cls_convs(x)
        cls_feat = self.cls_dropout_rate(cls_feat)
        cls_output = self.cls_pred(cls_feat)

        reg_feat = self.reg_convs(x)
        reg_feat = self.reg_dropout_rate(reg_feat)
        reg_output = self.reg_pred(reg_feat)

        return reg_output, cls_output

    def _initialize_biases(self):
        prior_bias = -math.log((1 - self.prior_prob) / self.prior_prob)
        torch.nn.init.constant_(self.cls_pred.bias, prior_bias)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)], indexing="ij")
        else:
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
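
The classification bias above is initialized from a prior probability so that, at the start of training, a sigmoid over the untrained logits yields roughly 1% positives. A quick standalone check of the arithmetic:

import math
import torch

prior_prob = 1e-2
prior_bias = -math.log((1 - prior_prob) / prior_prob)           # about -4.595

# A sigmoid over a logit equal to the bias recovers the prior probability.
assert abs(torch.sigmoid(torch.tensor(prior_bias)).item() - prior_prob) < 1e-6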

__init__(in_channels, inter_channels, width_mult, first_conv_group_size, num_classes, stride, reg_max, cls_dropout_rate=0.0, reg_dropout_rate=0.0)

Initialize the YoloNASDFLHead

Parameters:

Name Type Description Default
in_channels int

Input channels

required
inter_channels int

Intermediate number of channels

required
width_mult float

Width multiplier

required
first_conv_group_size int

Group size

required
num_classes int

Number of detection classes

required
stride int

Output stride for this head

required
reg_max int

Number of bins in the regression head

required
cls_dropout_rate float

Dropout rate for the classification head

0.0
reg_dropout_rate float

Dropout rate for the regression head

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py
def __init__(
    self,
    in_channels: int,
    inter_channels: int,
    width_mult: float,
    first_conv_group_size: int,
    num_classes: int,
    stride: int,
    reg_max: int,
    cls_dropout_rate: float = 0.0,
    reg_dropout_rate: float = 0.0,
):
    """
    Initialize the YoloNASDFLHead
    :param in_channels: Input channels
    :param inter_channels: Intermediate number of channels
    :param width_mult: Width multiplier
    :param first_conv_group_size: Group size
    :param num_classes: Number of detection classes
    :param stride: Output stride for this head
    :param reg_max: Number of bins in the regression head
    :param cls_dropout_rate: Dropout rate for the classification head
    :param reg_dropout_rate: Dropout rate for the regression head
    """
    super().__init__(in_channels)

    inter_channels = width_multiplier(inter_channels, width_mult, 8)
    if first_conv_group_size == 0:
        groups = 0
    elif first_conv_group_size == -1:
        groups = 1
    else:
        groups = inter_channels // first_conv_group_size

    self.num_classes = num_classes
    self.stem = ConvBNReLU(in_channels, inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

    first_cls_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    first_reg_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    self.cls_pred = nn.Conv2d(inter_channels, self.num_classes, 1, 1, 0)
    self.reg_pred = nn.Conv2d(inter_channels, 4 * (reg_max + 1), 1, 1, 0)

    self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
    self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

    self.grid = torch.zeros(1)
    self.stride = stride

    self.prior_prob = 1e-2
    self._initialize_biases()

YoloNASPANNeckWithC2

Bases: BaseDetectionModule

A PAN (path aggregation network) neck with 4 stages (2 up-sampling and 2 down-sampling stages) where the up-sampling stages include a higher-resolution skip. Returns the outputs of neck stages 2, 3 and 4.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/panneck.py
@register_detection_module("YoloNASPANNeckWithC2")
class YoloNASPANNeckWithC2(BaseDetectionModule):
    """
    A PAN (path aggregation network) neck with 4 stages (2 up-sampling and 2 down-sampling stages)
    where the up-sampling stages include a higher resolution skip
    Returns outputs of neck stage 2, stage 3, stage 4
    """

    def __init__(
        self,
        in_channels: List[int],
        neck1: Union[str, HpmStruct, DictConfig],
        neck2: Union[str, HpmStruct, DictConfig],
        neck3: Union[str, HpmStruct, DictConfig],
        neck4: Union[str, HpmStruct, DictConfig],
    ):
        """
        Initialize the PAN neck

        :param in_channels: Input channels of the 4 feature maps from the backbone
        :param neck1: First neck stage config
        :param neck2: Second neck stage config
        :param neck3: Third neck stage config
        :param neck4: Fourth neck stage config
        """
        super().__init__(in_channels)
        c2_out_channels, c3_out_channels, c4_out_channels, c5_out_channels = in_channels

        factory = det_factory.DetectionModulesFactory()
        self.neck1 = factory.get(factory.insert_module_param(neck1, "in_channels", [c5_out_channels, c4_out_channels, c3_out_channels]))
        self.neck2 = factory.get(factory.insert_module_param(neck2, "in_channels", [self.neck1.out_channels[1], c3_out_channels, c2_out_channels]))
        self.neck3 = factory.get(factory.insert_module_param(neck3, "in_channels", [self.neck2.out_channels[1], self.neck2.out_channels[0]]))
        self.neck4 = factory.get(factory.insert_module_param(neck4, "in_channels", [self.neck3.out_channels, self.neck1.out_channels[0]]))

        self._out_channels = [
            self.neck2.out_channels[1],
            self.neck3.out_channels,
            self.neck4.out_channels,
        ]

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor, Tensor]:
        c2, c3, c4, c5 = inputs

        x_n1_inter, x = self.neck1([c5, c4, c3])
        x_n2_inter, p3 = self.neck2([x, c3, c2])
        p4 = self.neck3([p3, x_n2_inter])
        p5 = self.neck4([p4, x_n1_inter])

        return p3, p4, p5

__init__(in_channels, neck1, neck2, neck3, neck4)

Initialize the PAN neck

Parameters:

Name Type Description Default
in_channels List[int]

Input channels of the 4 feature maps from the backbone

required
neck1 Union[str, HpmStruct, DictConfig]

First neck stage config

required
neck2 Union[str, HpmStruct, DictConfig]

Second neck stage config

required
neck3 Union[str, HpmStruct, DictConfig]

Third neck stage config

required
neck4 Union[str, HpmStruct, DictConfig]

Fourth neck stage config

required
Source code in src/super_gradients/training/models/detection_models/yolo_nas/panneck.py
def __init__(
    self,
    in_channels: List[int],
    neck1: Union[str, HpmStruct, DictConfig],
    neck2: Union[str, HpmStruct, DictConfig],
    neck3: Union[str, HpmStruct, DictConfig],
    neck4: Union[str, HpmStruct, DictConfig],
):
    """
    Initialize the PAN neck

    :param in_channels: Input channels of the 4 feature maps from the backbone
    :param neck1: First neck stage config
    :param neck2: Second neck stage config
    :param neck3: Third neck stage config
    :param neck4: Fourth neck stage config
    """
    super().__init__(in_channels)
    c2_out_channels, c3_out_channels, c4_out_channels, c5_out_channels = in_channels

    factory = det_factory.DetectionModulesFactory()
    self.neck1 = factory.get(factory.insert_module_param(neck1, "in_channels", [c5_out_channels, c4_out_channels, c3_out_channels]))
    self.neck2 = factory.get(factory.insert_module_param(neck2, "in_channels", [self.neck1.out_channels[1], c3_out_channels, c2_out_channels]))
    self.neck3 = factory.get(factory.insert_module_param(neck3, "in_channels", [self.neck2.out_channels[1], self.neck2.out_channels[0]]))
    self.neck4 = factory.get(factory.insert_module_param(neck4, "in_channels", [self.neck3.out_channels, self.neck1.out_channels[0]]))

    self._out_channels = [
        self.neck2.out_channels[1],
        self.neck3.out_channels,
        self.neck4.out_channels,
    ]

YoloNAS

Bases: ExportableObjectDetectionModel, SupportsInputShapeCheck, CustomizableDetector

Export to ONNX/TRT support matrix (ONNX files generated with PyTorch 2.0.1, ONNX opset_version=14):

| Batch Size | Export Engine | Format | OnnxRuntime 1.13.1 | TensorRT 8.4.2 | TensorRT 8.5.3 | TensorRT 8.6.1 |
|------------|---------------|--------|--------------------|----------------|----------------|----------------|
| 1          | ONNX          | Flat   | Yes                | Yes            | Yes            | Yes            |
| >1         | ONNX          | Flat   | Yes                | No             | No             | No             |
| 1          | ONNX          | Batch  | Yes                | No             | Yes            | Yes            |
| >1         | ONNX          | Batch  | Yes                | No             | No             | Yes            |
| 1          | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
| >1         | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
| 1          | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |
| >1         | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py
class YoloNAS(ExportableObjectDetectionModel, SupportsInputShapeCheck, CustomizableDetector):
    """

    Export to ONNX/TRT Support matrix
    ONNX files generated with PyTorch 2.0.1 for ONNX opset_version=14

    | Batch Size | Export Engine | Format | OnnxRuntime 1.13.1 | TensorRT 8.4.2 | TensorRT 8.5.3 | TensorRT 8.6.1 |
    |------------|---------------|--------|--------------------|----------------|----------------|----------------|
    | 1          | ONNX          | Flat   | Yes                | Yes            | Yes            | Yes            |
    | >1         | ONNX          | Flat   | Yes                | No             | No             | No             |
    | 1          | ONNX          | Batch  | Yes                | No             | Yes            | Yes            |
    | >1         | ONNX          | Batch  | Yes                | No             | No             | Yes            |
    | 1          | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
    | >1         | TensorRT      | Flat   | No                 | No             | Yes            | Yes            |
    | 1          | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |
    | >1         | TensorRT      | Batch  | No                 | Yes            | Yes            | Yes            |

    """

    def __init__(
        self,
        backbone: Union[str, dict, HpmStruct, DictConfig],
        heads: Union[str, dict, HpmStruct, DictConfig],
        neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
        num_classes: int = None,
        bn_eps: Optional[float] = None,
        bn_momentum: Optional[float] = None,
        inplace_act: Optional[bool] = True,
        in_channels: int = 3,
    ):
        super().__init__(backbone, heads, neck, num_classes, bn_eps, bn_momentum, inplace_act, in_channels)

    def get_post_prediction_callback(
        self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
    ) -> PPYoloEPostPredictionCallback:
        """
        Get a post prediction callback for this model.

        :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
        :param iou:                 A IoU threshold for boxes non-maximum suppression.
        :param nms_top_k:           The maximum number of detections to consider for NMS.
        :param max_predictions:     The maximum number of detections to return.
        :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                    If False, each anchor can produce only one label of the class with the highest score.
        :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                    If False NMS is performed separately for each class.
        :return:
        """
        return PPYoloEPostPredictionCallback(
            score_threshold=conf,
            nms_threshold=iou,
            nms_top_k=nms_top_k,
            max_predictions=max_predictions,
            multi_label_per_box=multi_label_per_box,
            class_agnostic_nms=class_agnostic_nms,
        )

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractObjectDetectionDecodingModule:
        return YoloNASDecodingModule(num_pre_nms_predictions)

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        if processing is None:
            raise ModelHasNoPreprocessingParamsException()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32
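
A hedged usage sketch of the helpers above; the model name and threshold values are illustrative:

from super_gradients.training import models

model = models.get("yolo_nas_s", pretrained_weights="coco")     # hypothetical checkpoint choice
post_prediction = model.get_post_prediction_callback(
    conf=0.25,
    iou=0.65,
    nms_top_k=1000,
    max_predictions=300,
    multi_label_per_box=True,
    class_agnostic_nms=False,
)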

get_post_prediction_callback(*, conf, iou, nms_top_k, max_predictions, multi_label_per_box, class_agnostic_nms)

Get a post prediction callback for this model.

Parameters:

Name Type Description Default
conf float

A minimum confidence threshold for predictions to be used in post-processing.

required
iou float

An IoU threshold for box non-maximum suppression.

required
nms_top_k int

The maximum number of detections to consider for NMS.

required
max_predictions int

The maximum number of detections to return.

required
multi_label_per_box bool

If True, each anchor can produce multiple labels of different classes. If False, each anchor can produce only one label of the class with the highest score.

required
class_agnostic_nms bool

If True, perform class-agnostic NMS (i.e. IoU of boxes of different classes is checked). If False, NMS is performed separately for each class.

required

Returns:

Type Description
PPYoloEPostPredictionCallback
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py
def get_post_prediction_callback(
    self, *, conf: float, iou: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool, class_agnostic_nms: bool
) -> PPYoloEPostPredictionCallback:
    """
    Get a post prediction callback for this model.

    :param conf:                A minimum confidence threshold for predictions to be used in post-processing.
    :param iou:                 A IoU threshold for boxes non-maximum suppression.
    :param nms_top_k:           The maximum number of detections to consider for NMS.
    :param max_predictions:     The maximum number of detections to return.
    :param multi_label_per_box: If True, each anchor can produce multiple labels of different classes.
                                If False, each anchor can produce only one label of the class with the highest score.
    :param class_agnostic_nms:  If True, perform class-agnostic NMS (i.e IoU of boxes of different classes is checked).
                                If False NMS is performed separately for each class.
    :return:
    """
    return PPYoloEPostPredictionCallback(
        score_threshold=conf,
        nms_threshold=iou,
        nms_top_k=nms_top_k,
        max_predictions=max_predictions,
        multi_label_per_box=multi_label_per_box,
        class_agnostic_nms=class_agnostic_nms,
    )

YoloNASDecodingModule

Bases: AbstractObjectDetectionDecodingModule

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py
class YoloNASDecodingModule(AbstractObjectDetectionDecodingModule):
    __constants__ = ["num_pre_nms_predictions"]

    def __init__(
        self,
        num_pre_nms_predictions: int = 1000,
    ):
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, predictions: Any) -> int:
        """

        :param inputs:
        :return:
        """
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = predictions
        else:
            pred_bboxes, pred_scores = predictions[0]

        return pred_bboxes.size(1)

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]):
        if torch.jit.is_tracing():
            pred_bboxes, pred_scores = inputs
        else:
            pred_bboxes, pred_scores = inputs[0]

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_scores.size()

        pred_cls_conf, _ = torch.max(pred_scores, dim=2)  # [B, Anchors]
        topk_candidates = torch.topk(pred_cls_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_cls_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1)
        flat_indices = torch.flatten(indices_with_offset)

        output_pred_bboxes = pred_bboxes.reshape(-1, pred_bboxes.size(2))[flat_indices, :].reshape(pred_bboxes.size(0), nms_top_k, pred_bboxes.size(2))
        output_pred_scores = pred_scores.reshape(-1, pred_scores.size(2))[flat_indices, :].reshape(pred_scores.size(0), nms_top_k, pred_scores.size(2))

        return output_pred_bboxes, output_pred_scores
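A minimal standalone sketch of the decoding module, assuming the import path mirrors the source file location shown above; random tensors stand in for real model outputs:

import torch
from super_gradients.training.models.detection_models.yolo_nas.yolo_nas_variants import YoloNASDecodingModule

decoder = YoloNASDecodingModule(num_pre_nms_predictions=100)

batch, anchors, num_classes = 2, 8400, 80
pred_bboxes = torch.rand(batch, anchors, 4)            # decoded boxes per anchor
pred_scores = torch.rand(batch, anchors, num_classes)  # class scores per anchor

# Outside of tracing, forward expects ((pred_bboxes, pred_scores), <raw outputs>)
boxes, scores = decoder(((pred_bboxes, pred_scores), ()))
print(boxes.shape, scores.shape)  # torch.Size([2, 100, 4]) torch.Size([2, 100, 80])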

infer_total_number_of_predictions(predictions)

Parameters:

Name Type Description Default
predictions Any

Model output: a (pred_bboxes, pred_scores) pair, possibly wrapped together with the raw head outputs.

required

Returns:

Type Description
int
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py, lines 36-48
@torch.jit.ignore
def infer_total_number_of_predictions(self, predictions: Any) -> int:
    """

    :param inputs:
    :return:
    """
    if torch.jit.is_tracing():
        pred_bboxes, pred_scores = predictions
    else:
        pred_bboxes, pred_scores = predictions[0]

    return pred_bboxes.size(1)

SequentialWithIntermediates

Bases: nn.Sequential

A Sequential module that can return all intermediate values as a list of Tensors

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 66-82
class SequentialWithIntermediates(nn.Sequential):
    """
    A Sequential module that can return all intermediate values as a list of Tensors
    """

    def __init__(self, output_intermediates: bool, *args):
        super(SequentialWithIntermediates, self).__init__(*args)
        self.output_intermediates = output_intermediates

    def forward(self, input: Tensor) -> List[Tensor]:
        if self.output_intermediates:
            output = [input]
            for module in self:
                output.append(module(output[-1]))
            return output
        #  For uniformity, we return a list even if we don't output intermediates
        return [super(SequentialWithIntermediates, self).forward(input)]
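Illustrative usage (assuming the import path mirrors the source file above): with output_intermediates=True the returned list contains the input followed by each submodule's output; with False it contains a single tensor.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import SequentialWithIntermediates

seq = SequentialWithIntermediates(True, nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())
outputs = seq(torch.rand(1, 3, 32, 32))
print(len(outputs))  # 3: the input plus the output of each of the two submodules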

YoloNASBottleneck

Bases: nn.Module

A bottleneck block for YoloNAS. Consists of two consecutive blocks and optional residual connection.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 23-63
class YoloNASBottleneck(nn.Module):
    """
    A bottleneck block for YoloNAS. Consists of two consecutive blocks and optional residual connection.
    """

    def __init__(
        self,
        input_channels: int,
        output_channels: int,
        block_type: Type[nn.Module],
        activation_type: Type[nn.Module],
        shortcut: bool,
        use_alpha: bool,
        drop_path_rate: float = 0.0,
    ):
        """
        Initialize the YoloNASBottleneck block

        :param input_channels: Number of input channels
        :param output_channels: Number of output channels
        :param block_type: Type of the convolutional block
        :param activation_type: Activation type for the convolutional block
        :param shortcut: If True, adds the residual connection from input to output.
        :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
        :param drop_path_rate: Drop path rate for the residual path of the block
        """
        super().__init__()

        self.cv1 = block_type(input_channels, output_channels, activation_type=activation_type)
        self.cv2 = block_type(output_channels, output_channels, activation_type=activation_type)
        self.add = shortcut and input_channels == output_channels
        self.shortcut = Residual() if self.add else None
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        if use_alpha:
            self.alpha = torch.nn.Parameter(torch.tensor([1.0]), requires_grad=True)
        else:
            self.alpha = 1.0

    def forward(self, x):
        y = self.drop_path(self.cv2(self.cv1(x)))
        return self.alpha * self.shortcut(x) + y if self.add else y

__init__(input_channels, output_channels, block_type, activation_type, shortcut, use_alpha, drop_path_rate=0.0)

Initialize the YoloNASBottleneck block

Parameters:

Name Type Description Default
input_channels int

Number of input channels

required
output_channels int

Number of output channels

required
block_type Type[nn.Module]

Type of the convolutional block

required
activation_type Type[nn.Module]

Activation type for the convolutional block

required
shortcut bool

If True, adds the residual connection from input to output.

required
use_alpha bool

If True, adds the learnable alpha parameter (multiplier for the residual connection).

required
drop_path_rate float

Drop path rate for the residual path of the block

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 28-59
def __init__(
    self,
    input_channels: int,
    output_channels: int,
    block_type: Type[nn.Module],
    activation_type: Type[nn.Module],
    shortcut: bool,
    use_alpha: bool,
    drop_path_rate: float = 0.0,
):
    """
    Initialize the YoloNASBottleneck block

    :param input_channels: Number of input channels
    :param output_channels: Number of output channels
    :param block_type: Type of the convolutional block
    :param activation_type: Activation type for the convolutional block
    :param shortcut: If True, adds the residual connection from input to output.
    :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
    :param drop_path_rate: Drop path rate for the residual path of the block
    """
    super().__init__()

    self.cv1 = block_type(input_channels, output_channels, activation_type=activation_type)
    self.cv2 = block_type(output_channels, output_channels, activation_type=activation_type)
    self.add = shortcut and input_channels == output_channels
    self.shortcut = Residual() if self.add else None
    self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
    if use_alpha:
        self.alpha = torch.nn.Parameter(torch.tensor([1.0]), requires_grad=True)
    else:
        self.alpha = 1.0
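Illustrative sketch: block_type may be any callable accepting (in_channels, out_channels, activation_type=...); the conv_block helper below is a hypothetical stand-in, not a SuperGradients class.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASBottleneck

def conv_block(in_channels, out_channels, activation_type):
    # Hypothetical stand-in block used only for this example.
    return nn.Sequential(nn.Conv2d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm2d(out_channels), activation_type())

bottleneck = YoloNASBottleneck(64, 64, block_type=conv_block, activation_type=nn.ReLU, shortcut=True, use_alpha=True)
print(bottleneck(torch.rand(1, 64, 16, 16)).shape)  # torch.Size([1, 64, 16, 16]); the residual is active since in == out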

YoloNASCSPLayer

Bases: nn.Module

Cross-stage layer module for YoloNAS.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 85-150
class YoloNASCSPLayer(nn.Module):
    """
    Cross-stage layer module for YoloNAS.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int,
        block_type: Type[nn.Module],
        activation_type: Type[nn.Module],
        shortcut: bool = True,
        use_alpha: bool = True,
        expansion: float = 0.5,
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
    ):
        """

        :param in_channels: Number of input channels.
        :param out_channels:  Number of output channels.
        :param num_bottlenecks: Number of bottleneck blocks.
        :param block_type: Bottleneck block type.
        :param activation_type: Activation type for all blocks.
        :param shortcut: If True, adds the residual connection from input to output.
        :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
        :param expansion: If hidden_channels is None, hidden_channels is set to in_channels * expansion.
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
        :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                                Must have the length equal to the num_bottlenecks or None.
        :param dropout_rate: Dropout probability before the last convolution in this layer.
        """
        if drop_path_rates is None:
            drop_path_rates = [0.0] * num_bottlenecks
        else:
            drop_path_rates = tuple(drop_path_rates)
        if len(drop_path_rates) != num_bottlenecks:
            raise ValueError(
                f"Argument drop_path_rates ({drop_path_rates}, len {len(drop_path_rates)} "
                f"must have the length equal to the num_bottlenecks ({num_bottlenecks})."
            )

        super(YoloNASCSPLayer, self).__init__()
        if hidden_channels is None:
            hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
        self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
        self.conv3 = Conv(hidden_channels * (2 + concat_intermediates * num_bottlenecks), out_channels, 1, stride=1, activation_type=activation_type)
        module_list = [
            YoloNASBottleneck(hidden_channels, hidden_channels, block_type, activation_type, shortcut, use_alpha, drop_path_rate=drop_path_rates[i])
            for i in range(num_bottlenecks)
        ]
        self.bottlenecks = SequentialWithIntermediates(concat_intermediates, *module_list)
        self.dropout = nn.Dropout2d(dropout_rate, inplace=True) if dropout_rate > 0.0 else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = torch.cat((*x_1, x_2), dim=1)
        x = self.dropout(x)
        return self.conv3(x)

__init__(in_channels, out_channels, num_bottlenecks, block_type, activation_type, shortcut=True, use_alpha=True, expansion=0.5, hidden_channels=None, concat_intermediates=False, drop_path_rates=None, dropout_rate=0.0)

Parameters:

Name Type Description Default
in_channels int

Number of input channels.

required
out_channels int

Number of output channels.

required
num_bottlenecks int

Number of bottleneck blocks.

required
block_type Type[nn.Module]

Bottleneck block type.

required
activation_type Type[nn.Module]

Activation type for all blocks.

required
shortcut bool

If True, adds the residual connection from input to output.

True
use_alpha bool

If True, adds the learnable alpha parameter (multiplier for the residual connection).

True
expansion float

If hidden_channels is None, hidden_channels is set to in_channels * expansion.

0.5
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks.

None
concat_intermediates bool

If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.

False
drop_path_rates Union[Iterable[float], None]

List of drop path probabilities for each bottleneck block. Must have the length equal to the num_bottlenecks or None.

None
dropout_rate float

Dropout probability before the last convolution in this layer.

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 90-142
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    num_bottlenecks: int,
    block_type: Type[nn.Module],
    activation_type: Type[nn.Module],
    shortcut: bool = True,
    use_alpha: bool = True,
    expansion: float = 0.5,
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
):
    """

    :param in_channels: Number of input channels.
    :param out_channels:  Number of output channels.
    :param num_bottlenecks: Number of bottleneck blocks.
    :param block_type: Bottleneck block type.
    :param activation_type: Activation type for all blocks.
    :param shortcut: If True, adds the residual connection from input to output.
    :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
    :param expansion: If hidden_channels is None, hidden_channels is set to in_channels * expansion.
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
    :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
    :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                            Must have the length equal to the num_bottlenecks or None.
    :param dropout_rate: Dropout probability before the last convolution in this layer.
    """
    if drop_path_rates is None:
        drop_path_rates = [0.0] * num_bottlenecks
    else:
        drop_path_rates = tuple(drop_path_rates)
    if len(drop_path_rates) != num_bottlenecks:
        raise ValueError(
            f"Argument drop_path_rates ({drop_path_rates}, len {len(drop_path_rates)} "
            f"must have the length equal to the num_bottlenecks ({num_bottlenecks})."
        )

    super(YoloNASCSPLayer, self).__init__()
    if hidden_channels is None:
        hidden_channels = int(out_channels * expansion)
    self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
    self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
    self.conv3 = Conv(hidden_channels * (2 + concat_intermediates * num_bottlenecks), out_channels, 1, stride=1, activation_type=activation_type)
    module_list = [
        YoloNASBottleneck(hidden_channels, hidden_channels, block_type, activation_type, shortcut, use_alpha, drop_path_rate=drop_path_rates[i])
        for i in range(num_bottlenecks)
    ]
    self.bottlenecks = SequentialWithIntermediates(concat_intermediates, *module_list)
    self.dropout = nn.Dropout2d(dropout_rate, inplace=True) if dropout_rate > 0.0 else nn.Identity()
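Illustrative channel-count sketch (conv_block is again a hypothetical stand-in): with out_channels=128 and the default expansion=0.5, hidden_channels is 64, so with concat_intermediates=True and two bottlenecks the final 1x1 convolution receives 64 * (2 + 2) = 256 channels.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASCSPLayer

def conv_block(in_channels, out_channels, activation_type):
    # Hypothetical stand-in for the bottleneck block type.
    return nn.Sequential(nn.Conv2d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm2d(out_channels), activation_type())

layer = YoloNASCSPLayer(64, 128, num_bottlenecks=2, block_type=conv_block, activation_type=nn.ReLU, concat_intermediates=True)
print(layer(torch.rand(1, 64, 32, 32)).shape)  # torch.Size([1, 128, 32, 32])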

YoloNASDownStage

Bases: BaseDetectionModule

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 335-395
@register_detection_module()
class YoloNASDownStage(BaseDetectionModule):
    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        width_mult: float,
        num_blocks: int,
        depth_mult: float,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
    ):
        """
        Initializes a YoloNASDownStage.

        :param in_channels: Pair of input channel counts: (channels of the downsampled path, channels of the skip connection).
        :param out_channels: Number of output channels.
        :param width_mult: Multiplier for the number of channels in the stage.
        :param num_blocks: Number of blocks in the stage.
        :param depth_mult: Multiplier for the number of blocks in the stage.
        :param activation_type: Type of activation to use inside the blocks.
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
        :param drop_path_rates: List of drop path probabilities for each bottleneck block. Must have a length equal to num_blocks or be None.
        :param dropout_rate: Dropout probability before the last convolution in this layer.
        """

        super().__init__(in_channels)

        in_channels, skip_in_channels = in_channels
        out_channels = width_multiplier(out_channels, width_mult, 8)
        num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

        self.conv = Conv(in_channels, out_channels // 2, 3, 2, activation_type)
        after_concat_channels = out_channels // 2 + skip_in_channels
        self.blocks = YoloNASCSPLayer(
            in_channels=after_concat_channels,
            out_channels=out_channels,
            num_bottlenecks=num_blocks,
            block_type=partial(Conv, kernel=3, stride=1),
            activation_type=activation_type,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
            drop_path_rates=drop_path_rates,
            dropout_rate=dropout_rate,
        )

        self._out_channels = out_channels

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs):
        x, skip_x = inputs
        x = self.conv(x)
        x = torch.cat([x, skip_x], 1)
        x = self.blocks(x)
        return x

__init__(in_channels, out_channels, width_mult, num_blocks, depth_mult, activation_type, hidden_channels=None, concat_intermediates=False, drop_path_rates=None, dropout_rate=0.0)

Initializes a YoloNASDownStage.

Parameters:

Name Type Description Default
in_channels List[int]

Pair of input channel counts: (channels of the downsampled path, channels of the skip connection).

required
out_channels int

Number of output channels.

required
width_mult float

Multiplier for the number of channels in the stage.

required
num_blocks int

Number of blocks in the stage.

required
depth_mult float

Multiplier for the number of blocks in the stage.

required
activation_type Type[nn.Module]

Type of activation to use inside the blocks.

required
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks.

None
concat_intermediates bool

If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.

False
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 337-384
@resolve_param("activation_type", ActivationsTypeFactory())
def __init__(
    self,
    in_channels: List[int],
    out_channels: int,
    width_mult: float,
    num_blocks: int,
    depth_mult: float,
    activation_type: Type[nn.Module],
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
):
    """
    Initializes a YoloNASDownStage.

    :param in_channels: Pair of input channel counts: (channels of the downsampled path, channels of the skip connection).
    :param out_channels: Number of output channels.
    :param width_mult: Multiplier for the number of channels in the stage.
    :param num_blocks: Number of blocks in the stage.
    :param depth_mult: Multiplier for the number of blocks in the stage.
    :param activation_type: Type of activation to use inside the blocks.
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
    :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
    :param drop_path_rates: List of drop path probabilities for each bottleneck block. Must have a length equal to num_blocks or be None.
    :param dropout_rate: Dropout probability before the last convolution in this layer.
    """

    super().__init__(in_channels)

    in_channels, skip_in_channels = in_channels
    out_channels = width_multiplier(out_channels, width_mult, 8)
    num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

    self.conv = Conv(in_channels, out_channels // 2, 3, 2, activation_type)
    after_concat_channels = out_channels // 2 + skip_in_channels
    self.blocks = YoloNASCSPLayer(
        in_channels=after_concat_channels,
        out_channels=out_channels,
        num_bottlenecks=num_blocks,
        block_type=partial(Conv, kernel=3, stride=1),
        activation_type=activation_type,
        hidden_channels=hidden_channels,
        concat_intermediates=concat_intermediates,
        drop_path_rates=drop_path_rates,
        dropout_rate=dropout_rate,
    )

    self._out_channels = out_channels
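Illustrative sketch of the forward contract, which expects a pair of feature maps; the activation class is passed directly here (string names resolvable by the activations factory should also be accepted):

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASDownStage

stage = YoloNASDownStage(in_channels=[96, 64], out_channels=128, width_mult=1.0, num_blocks=2, depth_mult=1.0, activation_type=nn.ReLU)

x = torch.rand(1, 96, 40, 40)     # path to be downsampled
skip = torch.rand(1, 64, 20, 20)  # skip connection already at the target resolution
print(stage((x, skip)).shape)     # torch.Size([1, 128, 20, 20])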

YoloNASStage

Bases: BaseDetectionModule

A single stage module for YoloNAS. It consists of a downsample block (QARepVGGBlock) followed by YoloNASCSPLayer.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 183-235
@register_detection_module()
class YoloNASStage(BaseDetectionModule):
    """
    A single stage module for YoloNAS. It consists of a downsample block (QARepVGGBlock) followed by YoloNASCSPLayer.
    """

    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks: int,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
        stride: int = 2,
    ):
        """
        Initialize the YoloNASStage module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param num_blocks: Number of bottleneck blocks in the YoloNASCSPLayer
        :param activation_type: Activation type for all blocks
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate values from the YoloNASCSPLayer.
        :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                                Must have the length equal to the num_blocks or None.
        :param dropout_rate: Dropout probability before the last convolution in this layer.
        """
        super().__init__(in_channels)
        self._out_channels = out_channels
        self.downsample = QARepVGGBlock(in_channels, out_channels, stride=stride, activation_type=activation_type, use_residual_connection=False)
        self.blocks = YoloNASCSPLayer(
            out_channels,
            out_channels,
            num_blocks,
            QARepVGGBlock,
            activation_type,
            True,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
            drop_path_rates=drop_path_rates,
            dropout_rate=dropout_rate,
        )

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, x):
        return self.blocks(self.downsample(x))

__init__(in_channels, out_channels, num_blocks, activation_type, hidden_channels=None, concat_intermediates=False, drop_path_rates=None, dropout_rate=0.0, stride=2)

Initialize the YoloNASStage module

Parameters:

Name Type Description Default
in_channels int

Number of input channels

required
out_channels int

Number of output channels

required
num_blocks int

Number of bottleneck blocks in the YoloNASCSPLayer

required
activation_type Type[nn.Module]

Activation type for all blocks

required
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks.

None
concat_intermediates bool

If True, concatenates the intermediate values from the YoloNASCSPLayer.

False
drop_path_rates Union[Iterable[float], None]

List of drop path probabilities for each bottleneck block. Must have the length equal to the num_blocks or None.

None
dropout_rate float

Dropout probability before the last convolution in this layer.

0.0
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 189-228
@resolve_param("activation_type", ActivationsTypeFactory())
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    num_blocks: int,
    activation_type: Type[nn.Module],
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
    stride: int = 2,
):
    """
    Initialize the YoloNASStage module
    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    :param num_blocks: Number of bottleneck blocks in the YoloNASCSPLayer
    :param activation_type: Activation type for all blocks
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
    :param concat_intermediates: If True, concatenates the intermediate values from the YoloNASCSPLayer.
    :param drop_path_rates: List of drop path probabilities for each bottleneck block.
                            Must have the length equal to the num_blocks or None.
    :param dropout_rate: Dropout probability before the last convolution in this layer.
    """
    super().__init__(in_channels)
    self._out_channels = out_channels
    self.downsample = QARepVGGBlock(in_channels, out_channels, stride=stride, activation_type=activation_type, use_residual_connection=False)
    self.blocks = YoloNASCSPLayer(
        out_channels,
        out_channels,
        num_blocks,
        QARepVGGBlock,
        activation_type,
        True,
        hidden_channels=hidden_channels,
        concat_intermediates=concat_intermediates,
        drop_path_rates=drop_path_rates,
        dropout_rate=dropout_rate,
    )
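A minimal sketch of a single stage, i.e. a stride-2 downsample followed by the CSP layer (channel counts are arbitrary example choices):

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASStage

stage = YoloNASStage(in_channels=96, out_channels=192, num_blocks=2, activation_type=nn.ReLU)
print(stage(torch.rand(1, 96, 80, 80)).shape)  # torch.Size([1, 192, 40, 40])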

YoloNASStem

Bases: BaseDetectionModule, SupportsReplaceInputChannels

Stem module for YoloNAS. Consists of a single QARepVGGBlock with stride of two.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 153-180
@register_detection_module()
class YoloNASStem(BaseDetectionModule, SupportsReplaceInputChannels):
    """
    Stem module for YoloNAS. Consists of a single QARepVGGBlock with stride of two.
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
        """
        Initialize the YoloNASStem module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        """
        super().__init__(in_channels)
        self._out_channels = out_channels
        self.conv = QARepVGGBlock(in_channels, out_channels, stride=stride, use_residual_connection=False)

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, x: Tensor) -> Tensor:
        return self.conv(x)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.conv = QARepVGGBlock(in_channels, self._out_channels, stride=2, use_residual_connection=False)

    def get_input_channels(self) -> int:
        return self.conv.in_channels

__init__(in_channels, out_channels, stride=2)

Initialize the YoloNASStem module

Parameters:

Name Type Description Default
in_channels int

Number of input channels

required
out_channels int

Number of output channels

required
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 159-167
def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
    """
    Initialize the YoloNASStem module
    :param in_channels: Number of input channels
    :param out_channels: Number of output channels
    """
    super().__init__(in_channels)
    self._out_channels = out_channels
    self.conv = QARepVGGBlock(in_channels, out_channels, stride=stride, use_residual_connection=False)
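A minimal sketch of the stem; the single stride-2 block halves the spatial resolution:

import torch
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASStem

stem = YoloNASStem(in_channels=3, out_channels=48)
print(stem(torch.rand(1, 3, 640, 640)).shape)  # torch.Size([1, 48, 320, 320])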

YoloNASUpStage

Bases: BaseDetectionModule

Upsampling stage for YoloNAS.

Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 238-332
@register_detection_module()
class YoloNASUpStage(BaseDetectionModule):
    """
    Upsampling stage for YoloNAS.
    """

    @resolve_param("activation_type", ActivationsTypeFactory())
    @resolve_param("upsample_mode", TypeFactory.from_enum_cls(UpsampleMode))
    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        width_mult: float,
        num_blocks: int,
        depth_mult: float,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        reduce_channels: bool = False,
        drop_path_rates: Union[Iterable[float], None] = None,
        dropout_rate: float = 0.0,
        upsample_mode: UpsampleMode = UpsampleMode.CONV_TRANSPOSE,
    ):
        """
        Initialize the YoloNASUpStage module
        :param in_channels: List of input channel counts: (x, skip) for two inputs or (x, skip1, skip2) for three inputs.
        :param out_channels: Number of output channels
        :param width_mult: Multiplier for the number of channels in the stage.
        :param num_blocks: Number of bottleneck blocks
        :param depth_mult: Multiplier for the number of blocks in the stage.
        :param activation_type: Activation type for all blocks
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks
        :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
        :param reduce_channels: If True, uses 1x1 convolutions to reduce the skip connections and the concatenated features to out_channels.
        """
        super().__init__(in_channels)

        num_inputs = len(in_channels)
        if num_inputs == 2:
            in_channels, skip_in_channels = in_channels
        else:
            in_channels, skip_in_channels1, skip_in_channels2 = in_channels
            skip_in_channels = skip_in_channels1 + out_channels  # skip2 downsample results in out_channels channels
        out_channels = width_multiplier(out_channels, width_mult, 8)
        num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

        if num_inputs == 2:
            self.reduce_skip = Conv(skip_in_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
        else:
            self.reduce_skip1 = Conv(skip_in_channels1, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
            self.reduce_skip2 = Conv(skip_in_channels2, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

        self.conv = Conv(in_channels, out_channels, 1, 1, activation_type)

        self.upsample = make_upsample_module_with_explicit_channels(
            in_channels=out_channels, out_channels=out_channels, scale_factor=2, upsample_mode=upsample_mode, align_corners=True
        )
        if num_inputs == 3:
            self.downsample = Conv(out_channels if reduce_channels else skip_in_channels2, out_channels, kernel=3, stride=2, activation_type=activation_type)

        self.reduce_after_concat = Conv(num_inputs * out_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

        after_concat_channels = out_channels if reduce_channels else out_channels + skip_in_channels
        self.blocks = YoloNASCSPLayer(
            after_concat_channels,
            out_channels,
            num_blocks,
            QARepVGGBlock,
            activation_type,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
            drop_path_rates=drop_path_rates,
            dropout_rate=dropout_rate,
        )

        self._out_channels = [out_channels, out_channels]

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs):
        if len(inputs) == 2:
            x, skip_x = inputs
            skip_x = [self.reduce_skip(skip_x)]
        else:
            x, skip_x1, skip_x2 = inputs
            skip_x1, skip_x2 = self.reduce_skip1(skip_x1), self.reduce_skip2(skip_x2)
            skip_x = [skip_x1, self.downsample(skip_x2)]
        x_inter = self.conv(x)
        x = self.upsample(x_inter)
        x = torch.cat([x, *skip_x], 1)
        x = self.reduce_after_concat(x)
        x = self.blocks(x)
        return x_inter, x

__init__(in_channels, out_channels, width_mult, num_blocks, depth_mult, activation_type, hidden_channels=None, concat_intermediates=False, reduce_channels=False, drop_path_rates=None, dropout_rate=0.0, upsample_mode=UpsampleMode.CONV_TRANSPOSE)

Initialize the YoloNASUpStage module

Parameters:

Name Type Description Default
in_channels List[int]

List of input channel counts: (x, skip) for two inputs or (x, skip1, skip2) for three inputs.

required
out_channels int

Number of output channels

required
width_mult float

Multiplier for the number of channels in the stage.

required
num_blocks int

Number of bottleneck blocks

required
depth_mult float

Multiplier for the number of blocks in the stage.

required
activation_type Type[nn.Module]

Activation type for all blocks

required
hidden_channels int

If not None, sets the number of hidden channels used inside the bottleneck blocks

None
concat_intermediates bool

If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.

False
reduce_channels bool

If True, uses 1x1 convolutions to reduce the skip connections and the concatenated features to out_channels.

False
Source code in src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py, lines 244-313
@resolve_param("activation_type", ActivationsTypeFactory())
@resolve_param("upsample_mode", TypeFactory.from_enum_cls(UpsampleMode))
def __init__(
    self,
    in_channels: List[int],
    out_channels: int,
    width_mult: float,
    num_blocks: int,
    depth_mult: float,
    activation_type: Type[nn.Module],
    hidden_channels: int = None,
    concat_intermediates: bool = False,
    reduce_channels: bool = False,
    drop_path_rates: Union[Iterable[float], None] = None,
    dropout_rate: float = 0.0,
    upsample_mode: UpsampleMode = UpsampleMode.CONV_TRANSPOSE,
):
    """
    Initialize the YoloNASUpStage module
    :param in_channels: List of input channel counts: (x, skip) for two inputs or (x, skip1, skip2) for three inputs.
    :param out_channels: Number of output channels
    :param width_mult: Multiplier for the number of channels in the stage.
    :param num_blocks: Number of bottleneck blocks
    :param depth_mult: Multiplier for the number of blocks in the stage.
    :param activation_type: Activation type for all blocks
    :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks
    :param concat_intermediates: If True, concatenates the intermediate outputs of the bottleneck blocks before the final convolution.
    :param reduce_channels: If True, uses 1x1 convolutions to reduce the skip connections and the concatenated features to out_channels.
    """
    super().__init__(in_channels)

    num_inputs = len(in_channels)
    if num_inputs == 2:
        in_channels, skip_in_channels = in_channels
    else:
        in_channels, skip_in_channels1, skip_in_channels2 = in_channels
        skip_in_channels = skip_in_channels1 + out_channels  # skip2 downsample results in out_channels channels
    out_channels = width_multiplier(out_channels, width_mult, 8)
    num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

    if num_inputs == 2:
        self.reduce_skip = Conv(skip_in_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
    else:
        self.reduce_skip1 = Conv(skip_in_channels1, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
        self.reduce_skip2 = Conv(skip_in_channels2, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

    self.conv = Conv(in_channels, out_channels, 1, 1, activation_type)

    self.upsample = make_upsample_module_with_explicit_channels(
        in_channels=out_channels, out_channels=out_channels, scale_factor=2, upsample_mode=upsample_mode, align_corners=True
    )
    if num_inputs == 3:
        self.downsample = Conv(out_channels if reduce_channels else skip_in_channels2, out_channels, kernel=3, stride=2, activation_type=activation_type)

    self.reduce_after_concat = Conv(num_inputs * out_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

    after_concat_channels = out_channels if reduce_channels else out_channels + skip_in_channels
    self.blocks = YoloNASCSPLayer(
        after_concat_channels,
        out_channels,
        num_blocks,
        QARepVGGBlock,
        activation_type,
        hidden_channels=hidden_channels,
        concat_intermediates=concat_intermediates,
        drop_path_rates=drop_path_rates,
        dropout_rate=dropout_rate,
    )

    self._out_channels = [out_channels, out_channels]
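Illustrative sketch of the two-input case: the deeper map is projected with a 1x1 convolution, upsampled by 2, concatenated with the skip connection, and passed through the CSP layer; forward returns both the pre-upsample projection and the fused output.

import torch
from torch import nn
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import YoloNASUpStage

stage = YoloNASUpStage(in_channels=[192, 96], out_channels=96, width_mult=1.0, num_blocks=2, depth_mult=1.0, activation_type=nn.ReLU)

x = torch.rand(1, 192, 20, 20)    # deeper feature map to be upsampled
skip = torch.rand(1, 96, 40, 40)  # lateral feature map at the target resolution
x_inter, out = stage((x, skip))
print(x_inter.shape, out.shape)   # torch.Size([1, 96, 20, 20]) torch.Size([1, 96, 40, 40])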

KDModule

Bases: SgModule

KDModule

class implementing Knowledge Distillation logic as an SgModule

Attributes:

    student: SgModule - the student model
    teacher: torch.nn.Module - the teacher model
    run_teacher_on_eval: bool - whether to run self.teacher in eval mode regardless of self.train(mode)
    arch_params: HpmStruct - architecture hyperparameters

    Additionally, by passing teacher_input_adapter (torch.nn.Module) one can modify the teacher net to act as if
    teacher = torch.nn.Sequential(teacher_input_adapter, teacher). This is useful when the teacher net expects a
    different input format from the student (for example, different normalization).
    An equivalent argument for the student model can be passed through student_input_adapter.
Source code in src/super_gradients/training/models/kd_modules/kd_module.py, lines 16-95
@register_model(Models.KD_MODULE)
@register_kd_model(Models.KD_MODULE)
class KDModule(SgModule):
    """
    KDModule

    class implementing Knowledge Distillation logic as an SgModule

    attributes:
        student: SgModule - the student model
        teacher: torch.nn.Module - the teacher model
        run_teacher_on_eval: bool - whether to run self.teacher in eval mode regardless of self.train(mode)
        arch_params: HpmStruct - Architecture H.P.

            Additionally, by passing teacher_input_adapter (torch.nn.Module) one can modify the teacher net to act as if
            teacher = torch.nn.Sequential(teacher_input_adapter, teacher). This is useful when teacher net expects a
            different input format from the student (for example different normalization).
            Equivalent arg for the student model, can be passed through student_input_adapter.

    """

    def __init__(self, arch_params: HpmStruct, student: SgModule, teacher: torch.nn.Module, run_teacher_on_eval=False):
        super(KDModule, self).__init__()
        self.arch_params = arch_params
        self.student = student
        self.teacher = teacher
        self.teacher_input_adapter = get_param(self.arch_params, "teacher_input_adapter")
        self.student_input_adapter = get_param(self.arch_params, "student_input_adapter")
        self.run_teacher_on_eval = run_teacher_on_eval
        self._freeze_teacher()

        # WHEN CREATING A MODULE SELF.TRAIN() ISN'T CALLED AND SO THE TEACHER MUST BE MOVED TO EVAL MODE EXPLICITLY
        if self.run_teacher_on_eval:
            self.teacher.eval()

    def _freeze_teacher(self):
        for p in self.teacher.parameters():
            p.requires_grad = False

        if self.teacher_input_adapter is not None:
            for p in self.teacher_input_adapter.parameters():
                p.requires_grad = False
            self.teacher_input_adapter.eval()

    def train(self, mode=True):
        self.student.train(mode)
        if not self.run_teacher_on_eval:
            self.teacher.train(mode)

    def eval(self):
        self.student.eval()
        self.teacher.eval()

    def forward(self, x):
        if self.student_input_adapter is not None:
            student_output = self.student(self.student_input_adapter(x))
        else:
            student_output = self.student(x)

        if self.teacher_input_adapter is not None:
            teacher_output = self.teacher(self.teacher_input_adapter(x))
        else:
            teacher_output = self.teacher(x)

        return KDOutput(student_output=student_output, teacher_output=teacher_output)

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        return self.student.initialize_param_groups(lr, training_params)

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        return self.student.update_param_groups(param_groups, lr, epoch, iter, training_params, total_batch)

    def replace_head(self, **kwargs):
        self.student.replace_head(**kwargs)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.student.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.student.get_input_channels()
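A minimal wiring sketch (architectures and class count are arbitrary; in a typical workflow this module is constructed by the knowledge-distillation training pipeline rather than by hand):

import torch
from super_gradients.training import models
from super_gradients.training.models.kd_modules.kd_module import KDModule
from super_gradients.training.utils import HpmStruct

student = models.get("resnet18", num_classes=10)
teacher = models.get("resnet50", num_classes=10)

kd_net = KDModule(arch_params=HpmStruct(), student=student, teacher=teacher, run_teacher_on_eval=True)
out = kd_net(torch.rand(2, 3, 224, 224))
print(out.student_output.shape, out.teacher_output.shape)  # expected: torch.Size([2, 10]) torch.Size([2, 10])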

get(model_name, arch_params=None, num_classes=None, strict_load=StrictLoad.NO_KEY_MATCHING, checkpoint_path=None, pretrained_weights=None, load_backbone=False, download_required_code=True, checkpoint_num_classes=None, num_input_channels=None)

Parameters:

Name Type Description Default
model_name str

Defines the model's architecture from models/ALL_ARCHITECTURES

required
arch_params Optional[dict]

Architecture hyper parameters. e.g.: block, num_blocks, etc.

None
num_classes Optional[int]

Number of classes (defines the net's structure). If None is given, will try to derive from pretrained_weight's corresponding dataset.

None
strict_load Union[str, StrictLoad]

See super_gradients.common.data_types.enum.strict_load.StrictLoad class documentation for details (default=NO_KEY_MATCHING to support SG-trained checkpoints)

StrictLoad.NO_KEY_MATCHING
checkpoint_path Optional[str]

The path to the external checkpoint to be loaded. Can be absolute or relative (ie: path/to/checkpoint.pth) path or URL. If provided, will automatically attempt to load the checkpoint.

None
pretrained_weights Optional[str]

Describe the dataset of the pretrained weights (for example "imagenet").

None
load_backbone bool

Load the provided checkpoint to model.backbone instead of model.

False
download_required_code bool

If model is not found in SG and is downloaded from a remote client, overriding this parameter with False will prevent additional code from being downloaded. This affects only models from remote client.

True
checkpoint_num_classes Optional[int]

num_classes of checkpoint_path / pretrained_weights, when checkpoint_path is not None. Used when num_classes != checkpoint_num_classes. In this case, the module will be initialized with checkpoint_num_classes, then the weights will be loaded. Finally, replace_head(new_num_classes=num_classes) is called (useful when performing transfer learning from a checkpoint outside of the ones offered in the SG model zoo).

None
num_input_channels Optional[int]

Number of input channels. If None, use the default model's input channels (most likely 3).

None

NOTE: Passing both pretrained_weights and checkpoint_path is ill-defined and will raise an error.
Source code in src/super_gradients/training/models/model_factory.py, lines 191-256
@resolve_param("strict_load", TypeFactory.from_enum_cls(StrictLoad))
def get(
    model_name: str,
    arch_params: Optional[dict] = None,
    num_classes: Optional[int] = None,
    strict_load: Union[str, StrictLoad] = StrictLoad.NO_KEY_MATCHING,
    checkpoint_path: Optional[str] = None,
    pretrained_weights: Optional[str] = None,
    load_backbone: bool = False,
    download_required_code: bool = True,
    checkpoint_num_classes: Optional[int] = None,
    num_input_channels: Optional[int] = None,
) -> Union[SgModule, torch.nn.Module]:
    """
    :param model_name:              Defines the model's architecture from models/ALL_ARCHITECTURES
    :param arch_params:             Architecture hyper parameters. e.g.: block, num_blocks, etc.
    :param num_classes:             Number of classes (defines the net's structure).
                                        If None is given, will try to derive from pretrained_weight's corresponding dataset.
    :param strict_load:             See super_gradients.common.data_types.enum.strict_load.StrictLoad class documentation for details
                                        (default=NO_KEY_MATCHING to support SG-trained checkpoints)
    :param checkpoint_path:         The path to the external checkpoint to be loaded. Can be absolute or relative (ie: path/to/checkpoint.pth) path or URL.
                                        If provided, will automatically attempt to load the checkpoint.
    :param pretrained_weights:      Describe the dataset of the pretrained weights (for example "imagenet").
    :param load_backbone:           Load the provided checkpoint to model.backbone instead of model.
    :param download_required_code:  If model is not found in SG and is downloaded from a remote client, overriding this parameter with False
                                        will prevent additional code from being downloaded. This affects only models from remote client.
    :param checkpoint_num_classes:  num_classes of checkpoint_path/ pretrained_weights, when checkpoint_path is not None.
                                        Used when num_classes != checkpoint_num_class. In this case, the module will be initialized with checkpoint_num_class,
                                        then weights will be loaded.
                                        Finally, replace_head(new_num_classes=num_classes) is called (useful when performing transfer learning
                                        from a checkpoint outside of the ones offered in the SG model zoo).
    :param num_input_channels:      Number of input channels.
                                        If None, use the default model's input channels (most likely 3).

    NOTE: Passing pretrained_weights and checkpoint_path is ill-defined and will raise an error.
    """
    checkpoint_num_classes = checkpoint_num_classes or num_classes

    if checkpoint_num_classes:
        net = instantiate_model(model_name, arch_params, checkpoint_num_classes, pretrained_weights, download_required_code)
    else:
        net = instantiate_model(model_name, arch_params, num_classes, pretrained_weights, download_required_code)

    if load_backbone and not checkpoint_path:
        raise ValueError("Please set checkpoint_path when load_backbone=True")

    if checkpoint_path:
        ckpt_entries = read_ckpt_state_dict(ckpt_path=checkpoint_path).keys()
        load_processing = "processing_params" in ckpt_entries
        load_ema_as_net = "ema_net" in ckpt_entries
        _ = load_checkpoint_to_model(
            ckpt_local_path=checkpoint_path,
            load_backbone=load_backbone,
            net=net,
            strict=strict_load,
            load_weights_only=True,
            load_ema_as_net=load_ema_as_net,
            load_processing_params=load_processing,
        )
    if checkpoint_num_classes != num_classes:
        net.replace_head(new_num_classes=num_classes)

    if num_input_channels is not None and num_input_channels != net.get_input_channels():
        net.replace_input_channels(in_channels=num_input_channels)

    return net
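Typical usage sketches (model name, class count, and checkpoint path are illustrative):

from super_gradients.training import models

# ImageNet-pretrained backbone with a new 10-class head.
model = models.get("resnet18", num_classes=10, pretrained_weights="imagenet")

# Alternatively, resume from a locally trained checkpoint (hypothetical path):
# model = models.get("resnet18", num_classes=10, checkpoint_path="/path/to/ckpt_best.pth")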

get_architecture(model_name, arch_params, download_required_code=True, download_platform_weights=True)

Get the corresponding architecture class.

Parameters:

Name Type Description Default
model_name str

Define the model's architecture from models/ALL_ARCHITECTURES

required
arch_params HpmStruct

Architecture hyper parameters. e.g.: block, num_blocks, etc.

required
download_required_code bool

if model is not found in SG and is downloaded from a remote client, overriding this parameter with False will prevent additional code from being downloaded. This affects only models from remote client.

True
download_platform_weights bool

bool; when getting a model from the platform, whether to download the pretrained weights as well. In any other case this parameter is ignored (default=True).

True

Returns:

Type Description
Tuple[Type[torch.nn.Module], HpmStruct, str, bool]
- architecture_cls: Class of the model
- arch_params: Might be updated if loading from remote deci lab
- pretrained_weights_path: Path to the pretrained weights from deci lab (None for local models or when the deci client is not enabled)
- is_remote: True if loading from remote deci lab
Source code in src/super_gradients/training/models/model_factory.py, lines 31-94
def get_architecture(
    model_name: str, arch_params: HpmStruct, download_required_code: bool = True, download_platform_weights: bool = True
) -> Tuple[Type[torch.nn.Module], HpmStruct, str, bool]:
    """
    Get the corresponding architecture class.

    :param model_name:          Define the model's architecture from models/ALL_ARCHITECTURES
    :param arch_params:         Architecture hyper parameters. e.g.: block, num_blocks, etc.
    :param download_required_code: if model is not found in SG and is downloaded from a remote client, overriding this parameter with False
                                        will prevent additional code from being downloaded. This affects only models from remote client.

    :param download_platform_weights:  bool, when getting a model from the platform, whether to download the pretrained weights as well.
        In any other case this parameter will be ignored. (default=True).

    :return:
        - architecture_cls:     Class of the model
        - arch_params:          Might be updated if loading from remote deci lab
        - pretrained_weights_path:   path to the pretrained weights from deci lab (None for local models or when deci
            client is not enabled).

        - is_remote:            True if loading from remote deci lab
    """
    pretrained_weights_path = None
    is_remote = False
    if not isinstance(model_name, str):
        raise ValueError(f"Input parameter `model_name` should be a string. Got {model_name} of type {type(model_name)}.")

    architecture = get_param(ARCHITECTURES, model_name)
    if model_name not in ARCHITECTURES.keys() and architecture is None:
        if client_enabled:
            logger.info(f'The requested model "{model_name}" was not found in SuperGradients. Trying to load a model from the Platform...')
            deci_client = DeciClient()

            _arch_params = deci_client.get_model_arch_params(model_name)
            if _arch_params is None:
                raise ValueError(
                    f'The requested model "{model_name}" was not found in the Platform. See docs or all_architectures.py for supported model names.'
                )
            else:
                logger.info(f'The requested model "{model_name}" is available in the platform and will now be downloaded...')

            if download_required_code:  # Some extra code might be required to instantiate the arch params.
                deci_client.download_and_load_model_additional_code(model_name, target_path=str(Path.cwd()))
                logger.debug(f'Additional code for model "{model_name}" has been downloaded from the platform.')

            _arch_params = hydra.utils.instantiate(_arch_params)
            if download_platform_weights:
                pretrained_weights_path = deci_client.get_model_weights(model_name)
                logger.info("The model weights were downloaded from the platform.")
            else:
                pretrained_weights_path = None
            model_name = _arch_params["model_name"]
            del _arch_params["model_name"]
            _arch_params = HpmStruct(**_arch_params)
            _arch_params.override(**arch_params.to_dict())
            arch_params, is_remote = _arch_params, True
        else:
            raise UnknownTypeException(
                message=f'The requested model "{model_name}" was not found in SuperGradients. See docs or all_architectures.py for supported model names.',
                unknown_type=model_name,
                choices=list(ARCHITECTURES.keys()),
            )

    return get_param(ARCHITECTURES, model_name), arch_params, pretrained_weights_path, is_remote

get_model_name(model)

Get the name of a model loaded by SuperGradients' models.get(). If the model was not loaded using models.get(), return None.

Source code in src/super_gradients/training/models/model_factory.py, lines 186-188
def get_model_name(model: torch.nn.Module) -> Optional[str]:
    """Get the name of a model loaded by SuperGradients' `models.get()`. If the model was not loaded using `models.get()`, return None."""
    return getattr(model, "_sg_model_name", None)
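A short sketch of the behaviour described above (assuming the import path matches the source file location):

import torch
from super_gradients.training import models
from super_gradients.training.models.model_factory import get_model_name

model = models.get("resnet18", num_classes=10)
print(get_model_name(model))                  # expected: "resnet18"
print(get_model_name(torch.nn.Linear(8, 2)))  # None - the module was not loaded via models.get()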

instantiate_model(model_name, arch_params, num_classes, pretrained_weights=None, download_required_code=True)

Instantiates nn.Module according to architecture and arch_params, and handles pretrained weights and the required module manipulation (i.e head replacement).

Parameters:

Name Type Description Default
model_name str

Define the model's architecture from models/ALL_ARCHITECTURES

required
arch_params dict

Architecture hyper parameters. e.g.: block, num_blocks, etc.

required
num_classes int

Number of classes (defines the net's structure). If None is given, will try to derive from the pretrained_weights' corresponding dataset.

required
pretrained_weights str

Describe the dataset of the pretrained weights (for example "imagenet"). Add platform/ prefix if the weights are stored in the platform. Please note that in this case, num_classes is expected to be the checkpoint's number of classes, and not the number of classes that you want to use; you will need to replace the head afterward if you want to work with a different number of classes.

None
download_required_code bool

if model is not found in SG and is downloaded from a remote client, overriding this parameter with False will prevent additional code from being downloaded. This affects only models from remote client.

True

Returns:

Type Description
Union[SgModule, torch.nn.Module]

Instantiated model, i.e. torch.nn.Module, and architecture_class (None when architecture is not a str)

Source code in src/super_gradients/training/models/model_factory.py, lines 97-176
def instantiate_model(
    model_name: str, arch_params: dict, num_classes: int, pretrained_weights: str = None, download_required_code: bool = True
) -> Union[SgModule, torch.nn.Module]:
    """
    Instantiates nn.Module according to architecture and arch_params, and handles pretrained weights and the required
        module manipulation (i.e head replacement).

    :param model_name:          Define the model's architecture from models/ALL_ARCHITECTURES
    :param arch_params:         Architecture hyper parameters. e.g.: block, num_blocks, etc.
    :param num_classes:         Number of classes (defines the net's structure).
                                    If None is given, it will be derived from the dataset associated with pretrained_weights.
    :param pretrained_weights:  Name of the dataset the pretrained weights were trained on (for example "imagenet").
                                Add `platform/` prefix if the weights are stored in the platform.
                                Please note that in this case, `num_classes` is expected to be the checkpoint's number of classes, and not the number of classes
                                that you want to use - you will need to replace the head afterwards if you want to work with a different number of classes.
    :param download_required_code: If the model is not found in SG and is downloaded from a remote client, setting this parameter to False
                                will prevent additional code from being downloaded. This only affects models from the remote client.

    :return:                    Instantiated model, i.e. torch.nn.Module; architecture_class (will be None when the architecture is not a str)
    """
    if arch_params is None:
        arch_params = {}
    arch_params = core_utils.HpmStruct(**arch_params)
    download_platform_weights = isinstance(pretrained_weights, str) and pretrained_weights.startswith("platform/")
    architecture_cls, arch_params, pretrained_weights_path, is_remote = get_architecture(
        model_name, arch_params, download_required_code, download_platform_weights
    )

    if not issubclass(architecture_cls, SgModule):
        net = architecture_cls(**arch_params.to_dict(include_schema=False))
    else:
        if core_utils.get_param(arch_params, "num_classes"):
            logger.warning(
                "Passing num_classes through arch_params is deprecated and will be removed in the next version. " "Pass num_classes explicitly to models.get"
            )
            num_classes = num_classes or arch_params.num_classes

        if num_classes is not None:
            arch_params.override(num_classes=num_classes)

        if pretrained_weights is None and num_classes is None:
            raise ValueError("num_classes or pretrained_weights must be passed to determine net's structure.")

        if pretrained_weights:
            if pretrained_weights in PRETRAINED_NUM_CLASSES.keys():
                num_classes_new_head = core_utils.get_param(arch_params, "num_classes", PRETRAINED_NUM_CLASSES[pretrained_weights])
                arch_params.num_classes = PRETRAINED_NUM_CLASSES[pretrained_weights]
            elif not download_platform_weights:
                raise ValueError(
                    f'`pretrained_weights="{pretrained_weights}"` is not valid and was not found on the platform. '
                    f'Valid pretrained weights are: "{PRETRAINED_NUM_CLASSES.keys()}"'
                )

        # Most of the SG models take a single parameter named "arch_params" of type HpmStruct, but a few take
        # **kwargs instead
        if "arch_params" not in get_callable_param_names(architecture_cls):
            net = architecture_cls(**arch_params.to_dict(include_schema=False))
        else:
            net = architecture_cls(arch_params=arch_params)

        if pretrained_weights:
            # The logic is as follows - first we initialize the preprocessing params using default hard-coded params
            # If pretrained checkpoint contains preprocessing params, new params will be loaded and override the ones from
            # this step in load_pretrained_weights_local/load_pretrained_weights
            if isinstance(net, HasPredict):
                processing_params = get_pretrained_processing_params(model_name, pretrained_weights)
                net.set_dataset_processing_params(**processing_params)

            if is_remote and pretrained_weights_path:
                load_pretrained_weights_local(net, model_name, pretrained_weights_path)
            else:
                load_pretrained_weights(net, model_name, pretrained_weights)

            if pretrained_weights in PRETRAINED_NUM_CLASSES.keys() and num_classes_new_head != arch_params.num_classes:
                net.replace_head(new_num_classes=num_classes_new_head)
                arch_params.num_classes = num_classes_new_head

    _add_model_name_attribute(net, model_name)

    return net
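
instantiate_model() is normally reached through the public models.get() entry point. The hedged sketch below assumes the "resnet18" architecture and its "imagenet" pretrained weights are available:

from super_gradients.training import models

# num_classes differs from the checkpoint's ImageNet classes, so after the
# pretrained weights are loaded the head is replaced for 10 classes.
net = models.get("resnet18", num_classes=10, pretrained_weights="imagenet")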

AdaptBlock

Bases: nn.Module

Residual block with deformable convolution

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
class AdaptBlock(nn.Module):
    """
    Residual block with deformable convolution
    """

    expansion = 1

    def __init__(self, inplanes, outplanes, stride=1, downsample=None, dilation=1, deformable_groups=1):
        super(AdaptBlock, self).__init__()
        regular_matrix = torch.tensor([[-1, -1, -1, 0, 0, 0, 1, 1, 1], [-1, 0, 1, -1, 0, 1, -1, 0, 1]])
        self.register_buffer("regular_matrix", regular_matrix.float())
        self.downsample = downsample
        self.transform_matrix_conv = nn.Conv2d(inplanes, 4, 3, 1, 1, bias=True)
        self.translation_conv = nn.Conv2d(inplanes, 2, 3, 1, 1, bias=True)

        self.adapt_conv = torchvision.ops.DeformConv2d(
            inplanes, outplanes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False, groups=deformable_groups
        )

        self.bn = nn.BatchNorm2d(outplanes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = x

        N, _, H, W = x.shape
        transform_matrix = self.transform_matrix_conv(x)
        transform_matrix = transform_matrix.permute(0, 2, 3, 1).reshape((N * H * W, 2, 2))
        offset = torch.matmul(transform_matrix, self.regular_matrix)
        offset = offset - self.regular_matrix
        offset = offset.transpose(1, 2).reshape((N, H, W, 18)).permute(0, 3, 1, 2)

        translation = self.translation_conv(x)
        offset[:, 0::2, :, :] += translation[:, 0:1, :, :]
        offset[:, 1::2, :, :] += translation[:, 1:2, :, :]

        out = self.adapt_conv(x, offset)
        out = self.bn(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
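
A hedged shape check for AdaptBlock (the import path is assumed from the source path above, and torchvision's DeformConv2d must be available). With inplanes == outplanes and stride=1, no downsample module is needed, so the residual addition works directly:

import torch
from super_gradients.training.models.pose_estimation_models.dekr_hrnet import AdaptBlock

block = AdaptBlock(inplanes=32, outplanes=32)
x = torch.randn(2, 32, 16, 16)
print(block(x).shape)  # torch.Size([2, 32, 16, 16]) - spatial size is preserved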

BasicBlock

Bases: nn.Module

ResNet basic block

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
class BasicBlock(nn.Module):
    """
    ResNet basic block
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

Bottleneck

Bases: nn.Module

ResNet bottleneck block

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
class Bottleneck(nn.Module):
    """
    ResNet bottleneck block
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

DEKRPoseEstimationModel

Bases: SgModule, HasPredict

Implementation of HRNet model from DEKR paper (https://arxiv.org/abs/2104.02300).

The model takes an image of (B,C,H,W) shape and outputs two tensors (heatmap, offset) as predictions: - heatmap (B, NumJoints+1,H * upsample_factor, W * upsample_factor) - offset (B, NumJoints*2, H * upsample_factor, W * upsample_factor)

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
@register_model(Models.DEKR_CUSTOM)
class DEKRPoseEstimationModel(SgModule, HasPredict):
    """
    Implementation of HRNet model from DEKR paper (https://arxiv.org/abs/2104.02300).

    The model takes an image of (B,C,H,W) shape and outputs two tensors (heatmap, offset) as predictions:
      - heatmap (B, NumJoints+1,H * upsample_factor, W * upsample_factor)
      - offset (B, NumJoints*2, H * upsample_factor, W * upsample_factor)
    """

    def __init__(self, arch_params):
        super(DEKRPoseEstimationModel, self).__init__()

        # stem net
        in_channels = get_param(arch_params, "in_channels", 3)
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(Bottleneck, 64, 64, 4)

        # build stage
        self.spec = arch_params.SPEC
        self.stages_spec = self.spec.STAGES
        self.num_stages = self.spec.STAGES.NUM_STAGES
        num_channels_last = [256]
        for i in range(self.num_stages):
            num_channels = self.stages_spec.NUM_CHANNELS[i]
            transition_layer = self._make_transition_layer(num_channels_last, num_channels)
            setattr(self, "transition{}".format(i + 1), transition_layer)

            stage, num_channels_last = self._make_stage(self.stages_spec, i, num_channels, True)
            setattr(self, "stage{}".format(i + 2), stage)

        # build head net
        self.head_inp_channels = int(sum(self.stages_spec.NUM_CHANNELS[-1]))
        self.config_heatmap = self.spec.HEAD_HEATMAP
        self.config_offset = self.spec.HEAD_OFFSET
        self.num_joints = arch_params.num_classes
        self.num_offset = self.num_joints * 2
        self.num_joints_with_center = self.num_joints + 1
        self.offset_prekpt = self.config_offset["NUM_CHANNELS_PERKPT"]

        offset_channels = self.num_joints * self.offset_prekpt
        self.transition_heatmap = self._make_transition_for_head(self.head_inp_channels, self.config_heatmap["NUM_CHANNELS"])
        self.transition_offset = self._make_transition_for_head(self.head_inp_channels, offset_channels)
        self.head_heatmap = self._make_heatmap_head(self.config_heatmap)
        self.offset_feature_layers, self.offset_final_layer = self._make_separete_regression_head(self.config_offset)
        self.heatmap_activation = nn.Sigmoid() if self.config_heatmap["HEATMAP_APPLY_SIGMOID"] else nn.Identity()
        self.init_weights()

    def replace_head(self, new_num_classes: int):
        self.num_joints = new_num_classes
        self.num_offset = new_num_classes * 2
        self.num_joints_with_center = new_num_classes + 1

        offset_channels = self.num_joints * self.offset_prekpt
        self.head_heatmap = self._make_heatmap_head(self.config_heatmap)
        self.transition_offset = self._make_transition_for_head(self.head_inp_channels, offset_channels)
        self.offset_feature_layers, self.offset_final_layer = self._make_separete_regression_head(self.config_offset)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.conv1 = replace_conv2d_input_channels(conv=self.conv1, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.conv1.in_channels

    def _make_transition_for_head(self, inplanes: int, outplanes: int) -> nn.Module:
        transition_layer = [nn.Conv2d(inplanes, outplanes, 1, 1, 0, bias=False), nn.BatchNorm2d(outplanes), nn.ReLU(True)]
        return nn.Sequential(*transition_layer)

    def _make_heatmap_head(self, layer_config: Mapping[str, Any]) -> nn.ModuleList:
        heatmap_head_layers = []

        feature_conv = self._make_layer(
            blocks_dict[layer_config["BLOCK"]],
            layer_config["NUM_CHANNELS"],
            layer_config["NUM_CHANNELS"],
            layer_config["NUM_BLOCKS"],
            dilation=layer_config["DILATION_RATE"],
        )
        heatmap_head_layers.append(feature_conv)

        heatmap_conv = nn.Conv2d(
            in_channels=layer_config["NUM_CHANNELS"],
            out_channels=self.num_joints_with_center,
            kernel_size=self.spec.FINAL_CONV_KERNEL,
            stride=1,
            padding=1 if self.spec.FINAL_CONV_KERNEL == 3 else 0,
        )
        heatmap_head_layers.append(heatmap_conv)

        return nn.ModuleList(heatmap_head_layers)

    def _make_separete_regression_head(self, layer_config) -> Tuple[nn.ModuleList, nn.ModuleList]:
        """
        Build offset regression head for each joint
        :param layer_config:
        :return:
        """
        offset_feature_layers = []
        offset_final_layer = []

        for _ in range(self.num_joints):
            feature_conv = self._make_layer(
                blocks_dict[layer_config["BLOCK"]],
                layer_config["NUM_CHANNELS_PERKPT"],
                layer_config["NUM_CHANNELS_PERKPT"],
                layer_config["NUM_BLOCKS"],
                dilation=layer_config["DILATION_RATE"],
            )
            offset_feature_layers.append(feature_conv)

            offset_conv = nn.Conv2d(
                in_channels=layer_config["NUM_CHANNELS_PERKPT"],
                out_channels=2,
                kernel_size=self.spec.FINAL_CONV_KERNEL,
                stride=1,
                padding=1 if self.spec.FINAL_CONV_KERNEL == 3 else 0,
            )
            offset_final_layer.append(offset_conv)

        return nn.ModuleList(offset_feature_layers), nn.ModuleList(offset_final_layer)

    def _make_layer(self, block, inplanes, planes, blocks, stride=1, dilation=1):
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(inplanes, planes, stride, downsample, dilation=dilation))
        inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(inplanes, planes, dilation=dilation))

        return nn.Sequential(*layers)

    def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True),
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i + 1 - num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
                    conv3x3s.append(nn.Sequential(nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False), nn.BatchNorm2d(outchannels), nn.ReLU(inplace=True)))
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_stage(self, stages_spec, stage_index, num_inchannels, multi_scale_output=True):
        num_modules = stages_spec.NUM_MODULES[stage_index]
        num_branches = stages_spec.NUM_BRANCHES[stage_index]
        num_blocks = stages_spec.NUM_BLOCKS[stage_index]
        num_channels = stages_spec.NUM_CHANNELS[stage_index]
        block = blocks_dict[stages_spec["BLOCK"][stage_index]]
        fuse_method = stages_spec.FUSE_METHOD[stage_index]

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used by the last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(HighResolutionModule(num_branches, block, num_blocks, num_inchannels, num_channels, fuse_method, reset_multi_scale_output))
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        y_list = [x]
        for i in range(self.num_stages):
            x_list = []
            transition = getattr(self, "transition{}".format(i + 1))
            for j in range(self.stages_spec["NUM_BRANCHES"][i]):
                if transition[j]:
                    x_list.append(transition[j](y_list[-1]))
                else:
                    x_list.append(y_list[j])
            y_list = getattr(self, "stage{}".format(i + 2))(x_list)

        x0_h, x0_w = y_list[0].size(2), y_list[0].size(3)
        x = torch.cat(
            [
                y_list[0],
                F.upsample(y_list[1], size=(x0_h, x0_w), mode="bilinear"),
                F.upsample(y_list[2], size=(x0_h, x0_w), mode="bilinear"),
                F.upsample(y_list[3], size=(x0_h, x0_w), mode="bilinear"),
            ],
            1,
        )

        heatmap = self.head_heatmap[1](self.head_heatmap[0](self.transition_heatmap(x)))

        final_offset = []
        offset_feature = self.transition_offset(x)

        for j in range(self.num_joints):
            final_offset.append(
                self.offset_final_layer[j](self.offset_feature_layers[j](offset_feature[:, j * self.offset_prekpt : (j + 1) * self.offset_prekpt]))
            )

        offset = torch.cat(final_offset, dim=1)
        return self.heatmap_activation(heatmap), offset

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ["bias"]:
                        nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        for m in self.modules():
            if hasattr(m, "transform_matrix_conv"):
                nn.init.constant_(m.transform_matrix_conv.weight, 0)
                if hasattr(m, "bias"):
                    nn.init.constant_(m.transform_matrix_conv.bias, 0)
            if hasattr(m, "translation_conv"):
                nn.init.constant_(m.translation_conv.weight, 0)
                if hasattr(m, "bias"):
                    nn.init.constant_(m.translation_conv.bias, 0)

    @staticmethod
    def get_post_prediction_callback(conf: float = 0.05):
        return DEKRPoseEstimationDecodeCallback(
            min_confidence=conf,
            keypoint_threshold=0.05,
            nms_threshold=0.05,
            apply_sigmoid=True,
            max_num_people=30,
            nms_num_threshold=8,
            output_stride=4,
        )

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        edge_links: Union[np.ndarray, List[Tuple[int, int]]],
        edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        image_processor: Optional[Processing] = None,
        conf: Optional[float] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param conf:            (Optional) Below the confidence threshold, predictions are discarded
        """
        self._edge_links = edge_links or self._edge_links
        self._edge_colors = edge_colors or self._edge_colors
        self._keypoint_colors = keypoint_colors or self._keypoint_colors
        self._image_processor = image_processor or self._image_processor
        self._default_nms_conf = conf or self._default_nms_conf

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True
    ) -> PoseEstimationPipeline:
        """Instantiate the prediction pipeline of this model.

        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        """
        if None in (self._edge_links, self._image_processor, self._default_nms_conf):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        conf = conf or self._default_nms_conf

        if len(self._keypoint_colors) != self.num_joints:
            raise RuntimeError(
                "The number of colors for the keypoints ({}) does not match the number of joints ({})".format(len(self._keypoint_colors), self.num_joints)
            )
        if len(self._edge_colors) != len(self._edge_links):
            raise RuntimeError(
                "The number of colors for the joints ({}) does not match the number of joint links ({})".format(len(self._edge_colors), len(self._edge_links))
            )

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(KeypointsAutoPadding(shape_multiple=(32, 32), pad_value=0))
        else:
            image_processor = self._image_processor

        pipeline = PoseEstimationPipeline(
            model=self,
            image_processor=image_processor,
            edge_links=self._edge_links,
            edge_colors=self._edge_colors,
            keypoint_colors=self._keypoint_colors,
            post_prediction_callback=self.get_post_prediction_callback(conf=conf),
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(
        self,
        images: ImageSource,
        conf: Optional[float] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> ImagesPoseEstimationPrediction:
        """Predict an image or a list of images.

        :param images:  Images to predict.
        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True):
        """Predict using webcam.

        :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                        If None, the default value associated to the training is used.
        :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        """
        pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
        pipeline.predict_webcam()

    def train(self, mode: bool = True):
        self._get_pipeline.cache_clear()
        torch.cuda.empty_cache()
        return super().train(mode)
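
A hedged shape check mirroring the heatmap/offset description above; it assumes the "dekr_w32_no_dc" model name and its "coco_pose" pretrained weights are available:

import torch
from super_gradients.training import models

model = models.get("dekr_w32_no_dc", pretrained_weights="coco_pose").eval()
with torch.no_grad():
    heatmap, offset = model(torch.randn(1, 3, 256, 256))
print(heatmap.shape)  # NumJoints+1 channels (17 COCO joints + center), at 1/4 of the input resolution
print(offset.shape)   # NumJoints*2 channels, same spatial size as the heatmap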

predict(images, conf=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
def predict(
    self,
    images: ImageSource,
    conf: Optional[float] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> ImagesPoseEstimationPrediction:
    """Predict an image or a list of images.

    :param images:  Images to predict.
    :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                    If None, the default value associated to the training is used.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore
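
A hedged end-to-end usage sketch for predict() (the model name, pretrained weights and image path below are placeholders):

from super_gradients.training import models

model = models.get("dekr_w32_no_dc", pretrained_weights="coco_pose")
results = model.predict("path/to/image.jpg", conf=0.3, fuse_model=False)
results.show()  # visualize the predicted poses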

predict_webcam(conf=None, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
def predict_webcam(self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False, fp16: bool = True):
    """Predict using webcam.

    :param conf:    (Optional) Below the confidence threshold, predictions are discarded.
                    If None, the default value associated to the training is used.
    :param fuse_model:  If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    """
    pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing, fp16=fp16)
    pipeline.predict_webcam()

set_dataset_processing_params(edge_links, edge_colors, keypoint_colors, image_processor=None, conf=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded

None
Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    edge_links: Union[np.ndarray, List[Tuple[int, int]]],
    edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    image_processor: Optional[Processing] = None,
    conf: Optional[float] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param conf:            (Optional) Below the confidence threshold, predictions are discarded
    """
    self._edge_links = edge_links or self._edge_links
    self._edge_colors = edge_colors or self._edge_colors
    self._keypoint_colors = keypoint_colors or self._keypoint_colors
    self._image_processor = image_processor or self._image_processor
    self._default_nms_conf = conf or self._default_nms_conf

DEKRW32NODC

Bases: DEKRPoseEstimationModel

DEKR-W32 model for pose estimation without deformable convolutions.

Source code in src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py
669
670
671
672
673
674
675
676
677
678
679
680
@register_model(Models.DEKR_W32_NO_DC)
class DEKRW32NODC(DEKRPoseEstimationModel):
    """
    DEKR-W32 model for pose estimation without deformable convolutions.
    """

    def __init__(self, arch_params):
        POSE_DEKR_W32_NO_DC_ARCH_PARAMS = get_arch_params("pose_dekr_w32_no_dc_arch_params")

        merged_arch_params = HpmStruct(**copy.deepcopy(POSE_DEKR_W32_NO_DC_ARCH_PARAMS))
        merged_arch_params.override(**arch_params.to_dict())
        super().__init__(merged_arch_params)

PoseRescoringNet

Bases: SgModule

Rescoring network for pose estimation. It takes input features and predicts a single scalar score that acts as a multiplicative factor for the original score prediction. The model learns which joint configurations are reasonable/possible, so it can downweight the confidence of impossible joint configurations.

The model is a simple 3-layer MLP with ReLU activation. The input is the concatenation of the predicted poses and prior information in the form of the joint links. See RescoringNet.get_feature() for details. The output is a single scalar value.

Source code in src/super_gradients/training/models/pose_estimation_models/rescoring_net.py
@register_model(Models.POSE_RESCORING)
class PoseRescoringNet(SgModule):
    """
    Rescoring network for pose estimation. It takes input features and predicts the single scalar score
    which is the multiplication factor for original score prediction. This model learns what are the reasonable/possible
    joint configurations. So it may downweight confidence of impossible joint configurations.

    The model is a simple 3-layer MLP with ReLU activation. The input is the concatenation of the predicted poses and prior
    information in the form of the joint links. See RescoringNet.get_feature() for details.
    The output is a single scalar value.
    """

    def __init__(self, num_classes: int, hidden_channels: int, num_layers: int, edge_links: List[Tuple[int, int]]):
        super(PoseRescoringNet, self).__init__()
        in_channels = len(edge_links) * 2 + len(edge_links) + num_classes  # [joint_relate, joint_length, visibility]
        layers = []
        for _ in range(num_layers):
            layers.append(nn.Linear(in_channels, hidden_channels, bias=True))
            layers.append(nn.ReLU())
            in_channels = hidden_channels
        self.layers = nn.Sequential(*layers)
        self.final = nn.Linear(hidden_channels, 1, bias=True)
        self.edge_links = torch.tensor(edge_links).long()

    def forward(self, poses: Tensor) -> Tuple[Tensor, Tensor]:
        """

        :param poses: Predicted poses of shape [N, J, 3] or [B, N, J, 3]
        :return: Tuple of input poses and corresponding scores
        """

        x = self.get_feature(poses, self.edge_links)
        x = self.layers(x)
        y_pred = self.final(x)
        return poses, y_pred

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

    @classmethod
    def get_feature(cls, poses: Tensor, edge_links: Tensor) -> Tensor:
        """
        Compute the feature vector input to the rescoring network.

        :param poses: [N, J, 3] Predicted poses
        :param edge_links: [L,2] List of joint indices
        :return: [N, L*2+L+J] Feature vector
        """
        joint_xy = poses[..., :2]
        visibility = poses[..., 2]

        joint_1 = edge_links[:, 0]
        joint_2 = edge_links[:, 1]

        # To get the Delta x Delta y
        joint_relate = joint_xy[..., joint_1, :] - joint_xy[..., joint_2, :]  # [N, L, 2]
        joint_length = ((joint_relate**2)[..., 0] + (joint_relate**2)[..., 1]) ** (0.5)  # [N, L]

        # To use the torso distance to normalize
        normalize = (joint_length[..., 9] + joint_length[..., 11]) / 2  # [N] # NOTE: THIS IS COCO-SPECIFIC
        normalize_tiled = torch.tile(normalize, (len(joint_1), 2, 1)).permute(2, 0, 1)
        normalize_tiled = normalize_tiled.clamp_min(1)

        joint_length = joint_length / normalize_tiled[..., 0]
        joint_relate = joint_relate / normalize_tiled
        joint_relate = torch.flatten(joint_relate, start_dim=-2)  # .reshape((-1, len(joint_1) * 2))

        feature = [joint_relate, joint_length, visibility]
        feature = torch.cat(feature, dim=-1)
        return feature
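
A hedged sketch that scores a batch of 17-joint poses; the import path follows the source path above, and the skeleton below is illustrative rather than the official COCO edge_links:

import torch
from super_gradients.training.models.pose_estimation_models.rescoring_net import PoseRescoringNet

edge_links = [(i, i + 1) for i in range(16)]  # 16 illustrative links between 17 joints
net = PoseRescoringNet(num_classes=17, hidden_channels=256, num_layers=2, edge_links=edge_links)

poses = torch.rand(8, 17, 3)   # [N, J, 3] -> x, y, visibility
poses_out, scores = net(poses)
print(scores.shape)            # torch.Size([8, 1]) - one rescoring factor per pose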

forward(poses)

Parameters:

Name Type Description Default
poses Tensor

Predicted poses of shape [N, J, 3] or [B, N, J, 3]

required

Returns:

Type Description
Tuple[Tensor, Tensor]

Tuple of input poses and corresponding scores

Source code in src/super_gradients/training/models/pose_estimation_models/rescoring_net.py
def forward(self, poses: Tensor) -> Tuple[Tensor, Tensor]:
    """

    :param poses: Predicted poses of shape [N, J, 3] or [B, N, J, 3]
    :return: Tuple of input poses and corresponding scores
    """

    x = self.get_feature(poses, self.edge_links)
    x = self.layers(x)
    y_pred = self.final(x)
    return poses, y_pred

get_feature(poses, edge_links) classmethod

Compute the feature vector input to the rescoring network.

Parameters:

Name Type Description Default
poses Tensor

[N, J, 3] Predicted poses

required
edge_links Tensor

[L,2] List of joint indices

required

Returns:

Type Description
Tensor

[N, L*2+L+J] Feature vector

Source code in src/super_gradients/training/models/pose_estimation_models/rescoring_net.py
@classmethod
def get_feature(cls, poses: Tensor, edge_links: Tensor) -> Tensor:
    """
    Compute the feature vector input to the rescoring network.

    :param poses: [N, J, 3] Predicted poses
    :param edge_links: [L,2] List of joint indices
    :return: [N, L*2+L+J] Feature vector
    """
    joint_xy = poses[..., :2]
    visibility = poses[..., 2]

    joint_1 = edge_links[:, 0]
    joint_2 = edge_links[:, 1]

    # To get the Delta x Delta y
    joint_relate = joint_xy[..., joint_1, :] - joint_xy[..., joint_2, :]  # [N, L, 2]
    joint_length = ((joint_relate**2)[..., 0] + (joint_relate**2)[..., 1]) ** (0.5)  # [N, L]

    # To use the torso distance to normalize
    normalize = (joint_length[..., 9] + joint_length[..., 11]) / 2  # [N] # NOTE: THIS IS COCO-SPECIFIC
    normalize_tiled = torch.tile(normalize, (len(joint_1), 2, 1)).permute(2, 0, 1)
    normalize_tiled = normalize_tiled.clamp_min(1)

    joint_length = joint_length / normalize_tiled[..., 0]
    joint_relate = joint_relate / normalize_tiled
    joint_relate = torch.flatten(joint_relate, start_dim=-2)  # .reshape((-1, len(joint_1) * 2))

    feature = [joint_relate, joint_length, visibility]
    feature = torch.cat(feature, dim=-1)
    return feature
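
A hedged sketch of the resulting feature layout, L*2 relative offsets + L link lengths + J visibilities (illustrative skeleton, import path assumed from the source path above):

import torch
from super_gradients.training.models.pose_estimation_models.rescoring_net import PoseRescoringNet

edge_links = torch.tensor([(i, i + 1) for i in range(16)], dtype=torch.long)  # L = 16 links
poses = torch.rand(4, 17, 3)                                                  # N = 4, J = 17
feature = PoseRescoringNet.get_feature(poses, edge_links)
print(feature.shape)  # torch.Size([4, 65]) -> 16*2 + 16 + 17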

YoloNASPoseDFLHead

Bases: BaseDetectionModule, SupportsReplaceNumClasses

YoloNASPoseDFLHead is the head used in the YoloNASPose model. This class implements single-class object detection and keypoint regression on a single-scale feature map.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_dfl_head.py
@register_detection_module()
class YoloNASPoseDFLHead(BaseDetectionModule, SupportsReplaceNumClasses):
    """
    YoloNASPoseDFLHead is the head used in YoloNASPose model.
    This class implements single-class object detection and keypoints regression on a single scale feature map
    """

    def __init__(
        self,
        in_channels: int,
        bbox_inter_channels: int,
        pose_inter_channels: int,
        pose_regression_blocks: int,
        shared_stem: bool,
        pose_conf_in_class_head: bool,
        pose_block_use_repvgg: bool,
        width_mult: float,
        first_conv_group_size: int,
        num_classes: int,
        stride: int,
        reg_max: int,
        cls_dropout_rate: float = 0.0,
        reg_dropout_rate: float = 0.0,
    ):
        """
        Initialize the YoloNASDFLHead
        :param in_channels: Input channels
        :param bbox_inter_channels: Intermediate number of channels for box detection & regression
        :param pose_inter_channels: Intermediate number of channels for pose regression
        :param shared_stem: Whether to share the stem between the pose and bbox heads
        :param pose_conf_in_class_head: Whether to include the pose confidence in the classification head
        :param width_mult: Width multiplier
        :param first_conv_group_size: Group size
        :param num_classes: Number of keypoints classes for pose regression. Number of detection classes is always 1.
        :param stride: Output stride for this head
        :param reg_max: Number of bins in the regression head
        :param cls_dropout_rate: Dropout rate for the classification head
        :param reg_dropout_rate: Dropout rate for the regression head
        """
        super().__init__(in_channels)

        bbox_inter_channels = width_multiplier(bbox_inter_channels, width_mult, 8)
        pose_inter_channels = width_multiplier(pose_inter_channels, width_mult, 8)

        if first_conv_group_size == 0:
            groups = 0
        elif first_conv_group_size == -1:
            groups = 1
        else:
            groups = bbox_inter_channels // first_conv_group_size

        self.num_classes = num_classes
        self.shared_stem = shared_stem
        self.pose_conf_in_class_head = pose_conf_in_class_head

        if self.shared_stem:
            max_input = max(bbox_inter_channels, pose_inter_channels)
            self.stem = ConvBNReLU(in_channels, max_input, kernel_size=1, stride=1, padding=0, bias=False)

            if max_input != pose_inter_channels:
                self.pose_stem = nn.Conv2d(max_input, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
            else:
                self.pose_stem = nn.Identity()

            if max_input != bbox_inter_channels:
                self.bbox_stem = nn.Conv2d(max_input, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
            else:
                self.bbox_stem = nn.Identity()

        else:
            self.stem = nn.Identity()
            self.pose_stem = ConvBNReLU(in_channels, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
            self.bbox_stem = ConvBNReLU(in_channels, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

        first_cls_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        first_reg_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        if pose_block_use_repvgg:
            pose_block = partial(QARepVGGBlock, use_alpha=True)
        else:
            pose_block = partial(ConvBNReLU, kernel_size=3, stride=1, padding=1, bias=False)

        pose_convs = [pose_block(pose_inter_channels, pose_inter_channels) for _ in range(pose_regression_blocks)]
        self.pose_convs = nn.Sequential(*pose_convs)

        self.reg_pred = nn.Conv2d(bbox_inter_channels, 4 * (reg_max + 1), 1, 1, 0)

        if self.pose_conf_in_class_head:
            self.cls_pred = nn.Conv2d(bbox_inter_channels, 1 + self.num_classes, 1, 1, 0)
            self.pose_pred = nn.Conv2d(pose_inter_channels, 2 * self.num_classes, 1, 1, 0)  # each keypoint is x,y
        else:
            self.cls_pred = nn.Conv2d(bbox_inter_channels, 1, 1, 1, 0)
            self.pose_pred = nn.Conv2d(pose_inter_channels, 3 * self.num_classes, 1, 1, 0)  # each keypoint is x,y,confidence

        self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
        self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

        self.stride = stride

        self.prior_prob = 1e-2
        self._initialize_biases()

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        if self.pose_conf_in_class_head:
            self.cls_pred = compute_new_weights_fn(self.cls_pred, 1 + num_classes)
            self.pose_pred = compute_new_weights_fn(self.pose_pred, 2 * num_classes)
        else:
            self.pose_pred = compute_new_weights_fn(self.pose_pred, 3 * num_classes)
        self.num_classes = num_classes

    @property
    def out_channels(self):
        return None

    def forward(self, x) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """

        :param x: Input feature map of shape [B, Cin, H, W]
        :return: Tuple of [reg_output, cls_output, pose_regression, pose_logits]
            - reg_output:      Tensor of [B, 4 * (reg_max + 1), H, W]
            - cls_output:      Tensor of [B, 1, H, W]
            - pose_regression: Tensor of [B, num_classes, 2, H, W]
            - pose_logits:     Tensor of [B, num_classes, H, W]
        """
        x = self.stem(x)
        pose_features = self.pose_stem(x)
        bbox_features = self.bbox_stem(x)

        cls_feat = self.cls_convs(bbox_features)
        cls_feat = self.cls_dropout_rate(cls_feat)
        cls_output = self.cls_pred(cls_feat)

        reg_feat = self.reg_convs(bbox_features)
        reg_feat = self.reg_dropout_rate(reg_feat)
        reg_output = self.reg_pred(reg_feat)

        pose_feat = self.pose_convs(pose_features)
        pose_feat = self.reg_dropout_rate(pose_feat)

        pose_output = self.pose_pred(pose_feat)

        if self.pose_conf_in_class_head:
            pose_logits = cls_output[:, 1:, :, :]
            cls_output = cls_output[:, 0:1, :, :]
            pose_regression = pose_output.reshape((pose_output.size(0), self.num_classes, 2, pose_output.size(2), pose_output.size(3)))
        else:
            pose_output = pose_output.reshape((pose_output.size(0), self.num_classes, 3, pose_output.size(2), pose_output.size(3)))
            pose_logits = pose_output[:, :, 2, :, :]
            pose_regression = pose_output[:, :, 0:2, :, :]

        return reg_output, cls_output, pose_regression, pose_logits

    def _initialize_biases(self):
        prior_bias = -math.log((1 - self.prior_prob) / self.prior_prob)
        torch.nn.init.constant_(self.cls_pred.bias, prior_bias)
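
A hedged shape check for a single head; the constructor arguments below are illustrative rather than recipe defaults, and the import path is assumed from the source path above:

import torch
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose.yolo_nas_pose_dfl_head import YoloNASPoseDFLHead

head = YoloNASPoseDFLHead(
    in_channels=96,
    bbox_inter_channels=128,
    pose_inter_channels=128,
    pose_regression_blocks=2,
    shared_stem=False,
    pose_conf_in_class_head=True,
    pose_block_use_repvgg=False,
    width_mult=1.0,
    first_conv_group_size=0,
    num_classes=17,
    stride=8,
    reg_max=16,
)
reg, cls, pose_reg, pose_logits = head(torch.randn(2, 96, 40, 40))
print(reg.shape, cls.shape, pose_reg.shape, pose_logits.shape)
# -> (2, 68, 40, 40), (2, 1, 40, 40), (2, 17, 2, 40, 40), (2, 17, 40, 40)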

__init__(in_channels, bbox_inter_channels, pose_inter_channels, pose_regression_blocks, shared_stem, pose_conf_in_class_head, pose_block_use_repvgg, width_mult, first_conv_group_size, num_classes, stride, reg_max, cls_dropout_rate=0.0, reg_dropout_rate=0.0)

Initialize the YoloNASDFLHead

Parameters:

Name Type Description Default
in_channels int

Input channels

required
bbox_inter_channels int

Intermediate number of channels for box detection & regression

required
pose_inter_channels int

Intermediate number of channels for pose regression

required
shared_stem bool

Whether to share the stem between the pose and bbox heads

required
pose_conf_in_class_head bool

Whether to include the pose confidence in the classification head

required
width_mult float

Width multiplier

required
first_conv_group_size int

Group size

required
num_classes int

Number of keypoints classes for pose regression. Number of detection classes is always 1.

required
stride int

Output stride for this head

required
reg_max int

Number of bins in the regression head

required
cls_dropout_rate float

Dropout rate for the classification head

0.0
reg_dropout_rate float

Dropout rate for the regression head

0.0
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_dfl_head.py
def __init__(
    self,
    in_channels: int,
    bbox_inter_channels: int,
    pose_inter_channels: int,
    pose_regression_blocks: int,
    shared_stem: bool,
    pose_conf_in_class_head: bool,
    pose_block_use_repvgg: bool,
    width_mult: float,
    first_conv_group_size: int,
    num_classes: int,
    stride: int,
    reg_max: int,
    cls_dropout_rate: float = 0.0,
    reg_dropout_rate: float = 0.0,
):
    """
    Initialize the YoloNASDFLHead
    :param in_channels: Input channels
    :param bbox_inter_channels: Intermediate number of channels for box detection & regression
    :param pose_inter_channels: Intermediate number of channels for pose regression
    :param shared_stem: Whether to share the stem between the pose and bbox heads
    :param pose_conf_in_class_head: Whether to include the pose confidence in the classification head
    :param width_mult: Width multiplier
    :param first_conv_group_size: Group size
    :param num_classes: Number of keypoints classes for pose regression. Number of detection classes is always 1.
    :param stride: Output stride for this head
    :param reg_max: Number of bins in the regression head
    :param cls_dropout_rate: Dropout rate for the classification head
    :param reg_dropout_rate: Dropout rate for the regression head
    """
    super().__init__(in_channels)

    bbox_inter_channels = width_multiplier(bbox_inter_channels, width_mult, 8)
    pose_inter_channels = width_multiplier(pose_inter_channels, width_mult, 8)

    if first_conv_group_size == 0:
        groups = 0
    elif first_conv_group_size == -1:
        groups = 1
    else:
        groups = bbox_inter_channels // first_conv_group_size

    self.num_classes = num_classes
    self.shared_stem = shared_stem
    self.pose_conf_in_class_head = pose_conf_in_class_head

    if self.shared_stem:
        max_input = max(bbox_inter_channels, pose_inter_channels)
        self.stem = ConvBNReLU(in_channels, max_input, kernel_size=1, stride=1, padding=0, bias=False)

        if max_input != pose_inter_channels:
            self.pose_stem = nn.Conv2d(max_input, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
        else:
            self.pose_stem = nn.Identity()

        if max_input != bbox_inter_channels:
            self.bbox_stem = nn.Conv2d(max_input, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
        else:
            self.bbox_stem = nn.Identity()

    else:
        self.stem = nn.Identity()
        self.pose_stem = ConvBNReLU(in_channels, pose_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bbox_stem = ConvBNReLU(in_channels, bbox_inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

    first_cls_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    first_reg_conv = [ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
    self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(bbox_inter_channels, bbox_inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

    if pose_block_use_repvgg:
        pose_block = partial(QARepVGGBlock, use_alpha=True)
    else:
        pose_block = partial(ConvBNReLU, kernel_size=3, stride=1, padding=1, bias=False)

    pose_convs = [pose_block(pose_inter_channels, pose_inter_channels) for _ in range(pose_regression_blocks)]
    self.pose_convs = nn.Sequential(*pose_convs)

    self.reg_pred = nn.Conv2d(bbox_inter_channels, 4 * (reg_max + 1), 1, 1, 0)

    if self.pose_conf_in_class_head:
        self.cls_pred = nn.Conv2d(bbox_inter_channels, 1 + self.num_classes, 1, 1, 0)
        self.pose_pred = nn.Conv2d(pose_inter_channels, 2 * self.num_classes, 1, 1, 0)  # each keypoint is x,y
    else:
        self.cls_pred = nn.Conv2d(bbox_inter_channels, 1, 1, 1, 0)
        self.pose_pred = nn.Conv2d(pose_inter_channels, 3 * self.num_classes, 1, 1, 0)  # each keypoint is x,y,confidence

    self.cls_dropout_rate = nn.Dropout2d(cls_dropout_rate) if cls_dropout_rate > 0 else nn.Identity()
    self.reg_dropout_rate = nn.Dropout2d(reg_dropout_rate) if reg_dropout_rate > 0 else nn.Identity()

    self.stride = stride

    self.prior_prob = 1e-2
    self._initialize_biases()

forward(x)

Parameters:

Name Type Description Default
x

Input feature map of shape [B, Cin, H, W]

required

Returns:

Type Description
Tuple[Tensor, Tensor, Tensor, Tensor]

Tuple of [reg_output, cls_output, pose_regression, pose_logits]:

- reg_output: Tensor of [B, 4 * (reg_max + 1), H, W]
- cls_output: Tensor of [B, 1, H, W]
- pose_regression: Tensor of [B, num_classes, 2, H, W]
- pose_logits: Tensor of [B, num_classes, H, W]

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_dfl_head.py
def forward(self, x) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """

    :param x: Input feature map of shape [B, Cin, H, W]
    :return: Tuple of [reg_output, cls_output, pose_regression, pose_logits]
        - reg_output:      Tensor of [B, 4 * (reg_max + 1), H, W]
        - cls_output:      Tensor of [B, 1, H, W]
        - pose_regression: Tensor of [B, num_classes, 2, H, W]
        - pose_logits:     Tensor of [B, num_classes, H, W]
    """
    x = self.stem(x)
    pose_features = self.pose_stem(x)
    bbox_features = self.bbox_stem(x)

    cls_feat = self.cls_convs(bbox_features)
    cls_feat = self.cls_dropout_rate(cls_feat)
    cls_output = self.cls_pred(cls_feat)

    reg_feat = self.reg_convs(bbox_features)
    reg_feat = self.reg_dropout_rate(reg_feat)
    reg_output = self.reg_pred(reg_feat)

    pose_feat = self.pose_convs(pose_features)
    pose_feat = self.reg_dropout_rate(pose_feat)

    pose_output = self.pose_pred(pose_feat)

    if self.pose_conf_in_class_head:
        pose_logits = cls_output[:, 1:, :, :]
        cls_output = cls_output[:, 0:1, :, :]
        pose_regression = pose_output.reshape((pose_output.size(0), self.num_classes, 2, pose_output.size(2), pose_output.size(3)))
    else:
        pose_output = pose_output.reshape((pose_output.size(0), self.num_classes, 3, pose_output.size(2), pose_output.size(3)))
        pose_logits = pose_output[:, :, 2, :, :]
        pose_regression = pose_output[:, :, 0:2, :, :]

    return reg_output, cls_output, pose_regression, pose_logits
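
The branching on pose_conf_in_class_head above decides where the per-keypoint confidence comes from. Below is a minimal, hedged sketch (plain tensors with made-up shapes, not the library code path) of the second branch, in which every keypoint carries its own confidence channel:

import torch

B, num_keypoints, H, W = 2, 17, 20, 20
# pose_pred produces 3 channels per keypoint: x offset, y offset, confidence logit
pose_output = torch.randn(B, 3 * num_keypoints, H, W)

# Reshape to [B, J, 3, H, W] and split into coordinates and logits,
# mirroring the else-branch of forward() above
pose_output = pose_output.reshape(B, num_keypoints, 3, H, W)
pose_regression = pose_output[:, :, 0:2, :, :]  # [B, J, 2, H, W]
pose_logits = pose_output[:, :, 2, :, :]        # [B, J, H, W]

assert pose_regression.shape == (B, num_keypoints, 2, H, W)
assert pose_logits.shape == (B, num_keypoints, H, W)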

YoloNASPoseNDFLHeads

Bases: BaseDetectionModule, SupportsReplaceNumClasses

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_ndfl_heads.py
@register_detection_module()
class YoloNASPoseNDFLHeads(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        heads_list: List[Union[HpmStruct, DictConfig]],
        grid_cell_scale: float = 5.0,
        grid_cell_offset: float = 0.5,
        reg_max: int = 16,
        inference_mode: bool = False,
        eval_size: Optional[Tuple[int, int]] = None,
        width_mult: float = 1.0,
        pose_offset_multiplier: float = 1.0,
        compensate_grid_cell_offset: bool = True,
    ):
        """
        Initializes the NDFLHeads module.

        :param num_classes: Number of detection classes
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param grid_cell_scale: A scaling factor applied to the grid cell coordinates.
               This scaling factor is used to define anchor boxes (see generate_anchors_for_grid_cell).
        :param grid_cell_offset: A fixed offset that is added to the grid cell coordinates.
               This offset represents a 'center' of the cell and is 0.5 by default.
        :param reg_max: Number of bins in the regression head
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to in_channels.
        :param pose_offset_multiplier: A scaling factor applied to the pose regression offset. This multiplier is
               meant to reduce absolute magnitude of weights in pose regression layers.
               Default value is 1.0.
        :param compensate_grid_cell_offset: (bool) Controls whether to subtract anchor cell offset from the pose regression.
               If True, predicted pose coordinates decoded as (offsets + anchors - grid_cell_offset) * stride.
               If False, predicted pose coordinates decoded as (offsets + anchors) * stride.
               Default value is True.

        """
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]
        super().__init__(in_channels)

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size
        self.pose_offset_multiplier = pose_offset_multiplier
        self.compensate_grid_cell_offset = compensate_grid_cell_offset
        self.inference_mode = inference_mode

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

        factory = det_factory.DetectionModulesFactory()
        heads_list = self._insert_heads_list_params(heads_list, factory, num_classes, reg_max)

        self.num_heads = len(heads_list)
        fpn_strides: List[int] = []
        for i in range(self.num_heads):
            new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
            fpn_strides.append(new_head.stride)
            setattr(self, f"head{i + 1}", new_head)

        self.fpn_strides = tuple(fpn_strides)

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        for i in range(self.num_heads):
            head = getattr(self, f"head{i + 1}")
            head.replace_num_classes(num_classes, compute_new_weights_fn)

        self.num_classes = num_classes

    @staticmethod
    def _insert_heads_list_params(
        heads_list: List[Union[HpmStruct, DictConfig]], factory: det_factory.DetectionModulesFactory, num_classes: int, reg_max: int
    ) -> List[Union[HpmStruct, DictConfig]]:
        """
        Injects num_classes and reg_max parameters into the heads_list.

        :param heads_list:  Input heads list
        :param factory:     DetectionModulesFactory
        :param num_classes: Number of classes
        :param reg_max:     Number of bins in the regression head
        :return:            Heads list with injected parameters
        """
        for i in range(len(heads_list)):
            heads_list[i] = factory.insert_module_param(heads_list[i], "num_classes", num_classes)
            heads_list[i] = factory.insert_module_param(heads_list[i], "reg_max", reg_max)
        return heads_list

    @torch.jit.ignore
    def _init_weights(self):
        if self.eval_size:
            device = infer_model_device(self)
            dtype = infer_model_dtype(self)

            anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device)
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    def forward(self, feats: Tuple[Tensor, ...]) -> Union[YoloNasPoseDecodedPredictions, Tuple[YoloNasPoseDecodedPredictions, YoloNasPoseRawOutputs]]:
        """
        Runs the forward pass for all the underlying heads and concatenates the predictions into a single result.
        :param feats: List of feature maps from the neck of different strides
        :return: Return value depends on the mode:
        If tracing, a tuple of 4 tensors (decoded predictions) is returned:
        - pred_bboxes [B, Num Anchors, 4] - Predicted boxes in XYXY format
        - pred_scores [B, Num Anchors, 1] - Predicted scores for each box
        - pred_pose_coords [B, Num Anchors, Num Keypoints, 2] - Predicted poses in XY format
        - pred_pose_scores [B, Num Anchors, Num Keypoints] - Predicted scores for each keypoint

        In training/eval mode, a tuple of 2 elements is returned:
        - decoded predictions - they are the same as in tracing mode
        - raw outputs - a tuple of 8 elements in total, which is needed for training the model.
        """

        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []
        pose_regression_list = []
        pose_logits_list = []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            reg_distri, cls_logit, pose_regression, pose_logits = getattr(self, f"head{i + 1}")(feat)
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, -1, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

            pose_regression_list.append(torch.permute(pose_regression.flatten(3), [0, 3, 1, 2]))  # [B, J, 2, H, W] -> [B, H * W, J, 2]
            pose_logits_list.append(torch.permute(pose_logits.flatten(2), [0, 2, 1]))  # [B, J, H, W] -> [B, H * W, J]

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
            cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        pose_regression_list = torch.cat(pose_regression_list, dim=1)  # [B, Anchors, J, 2]
        pose_logits_list = torch.cat(pose_logits_list, dim=1)  # [B, Anchors, J]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        # Decode keypoints
        if self.pose_offset_multiplier != 1.0:
            pose_regression_list *= self.pose_offset_multiplier

        if self.compensate_grid_cell_offset:
            pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2) - self.grid_cell_offset
        else:
            pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2)

        pose_regression_list *= stride_tensor.unsqueeze(0).unsqueeze(2)

        pred_pose_coords = pose_regression_list.detach().clone()  # [B, Anchors, C, 2]
        pred_pose_scores = pose_logits_list.detach().clone().sigmoid()  # [B, Anchors, C]

        decoded_predictions = pred_bboxes, pred_scores, pred_pose_coords, pred_pose_scores

        if torch.jit.is_tracing() or self.inference_mode:
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, pose_regression_list, pose_logits_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    @property
    def out_channels(self):
        return None

    def _generate_anchors(self, feats=None, dtype=None, device=None):
        # only used at eval / inference time
        anchor_points = []
        stride_tensor = []

        dtype = dtype or feats[0].dtype
        device = device or feats[0].device

        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)
            shift_x = torch.arange(end=w) + self.grid_cell_offset
            shift_y = torch.arange(end=h) + self.grid_cell_offset
            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype))
        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)

        if device is not None:
            anchor_points = anchor_points.to(device)
            stride_tensor = stride_tensor.to(device)
        return anchor_points, stride_tensor
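
To make the anchor layout concrete, here is a small standalone sketch of what _generate_anchors computes for a single stride level (the feature-map size is a toy value chosen for illustration):

import torch

stride, grid_cell_offset = 8, 0.5
h, w = 4, 4  # feature-map size at this stride (toy value)

shift_x = torch.arange(end=w) + grid_cell_offset
shift_y = torch.arange(end=h) + grid_cell_offset
shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")

anchor_points = torch.stack([shift_x, shift_y], dim=-1).reshape(-1, 2)  # [h * w, 2] cell centers
stride_tensor = torch.full([h * w, 1], stride, dtype=anchor_points.dtype)

# Each anchor point is the center of a grid cell in feature-map coordinates;
# multiplying by the stride maps it back to input-image coordinates.
print(anchor_points[:3] * stride_tensor[:3])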

__init__(num_classes, in_channels, heads_list, grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, inference_mode=False, eval_size=None, width_mult=1.0, pose_offset_multiplier=1.0, compensate_grid_cell_offset=True)

Initializes the NDFLHeads module.

Parameters:

Name Type Description Default
num_classes int

Number of detection classes

required
in_channels Tuple[int, int, int]

Number of channels for each feature map (See width_mult)

required
grid_cell_scale float

A scaling factor applied to the grid cell coordinates. This scaling factor is used to define anchor boxes (see generate_anchors_for_grid_cell).

5.0
grid_cell_offset float

A fixed offset that is added to the grid cell coordinates. This offset represents a 'center' of the cell and is 0.5 by default.

0.5
reg_max int

Number of bins in the regression head

16
eval_size Optional[Tuple[int, int]]

(rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed, since anchors will not be regenerated for each forward call.

None
width_mult float

A scaling factor applied to in_channels.

1.0
pose_offset_multiplier float

A scaling factor applied to the pose regression offset. This multiplier is meant to reduce absolute magnitude of weights in pose regression layers. Default value is 1.0.

1.0
compensate_grid_cell_offset bool

(bool) Controls whether to subtract anchor cell offset from the pose regression. If True, predicted pose coordinates decoded as (offsets + anchors - grid_cell_offset) * stride. If False, predicted pose coordinates decoded as (offsets + anchors) * stride. Default value is True.

True
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_ndfl_heads.py
def __init__(
    self,
    num_classes: int,
    in_channels: Tuple[int, int, int],
    heads_list: List[Union[HpmStruct, DictConfig]],
    grid_cell_scale: float = 5.0,
    grid_cell_offset: float = 0.5,
    reg_max: int = 16,
    inference_mode: bool = False,
    eval_size: Optional[Tuple[int, int]] = None,
    width_mult: float = 1.0,
    pose_offset_multiplier: float = 1.0,
    compensate_grid_cell_offset: bool = True,
):
    """
    Initializes the NDFLHeads module.

    :param num_classes: Number of detection classes
    :param in_channels: Number of channels for each feature map (See width_mult)
    :param grid_cell_scale: A scaling factor applied to the grid cell coordinates.
           This scaling factor is used to define anchor boxes (see generate_anchors_for_grid_cell).
    :param grid_cell_offset: A fixed offset that is added to the grid cell coordinates.
           This offset represents a 'center' of the cell and is 0.5 by default.
    :param reg_max: Number of bins in the regression head
    :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
           since anchors will not be regenerated for each forward call.
    :param width_mult: A scaling factor applied to in_channels.
    :param pose_offset_multiplier: A scaling factor applied to the pose regression offset. This multiplier is
           meant to reduce absolute magnitude of weights in pose regression layers.
           Default value is 1.0.
    :param compensate_grid_cell_offset: (bool) Controls whether to subtract anchor cell offset from the pose regression.
           If True, predicted pose coordinates decoded as (offsets + anchors - grid_cell_offset) * stride.
           If False, predicted pose coordinates decoded as (offsets + anchors) * stride.
           Default value is True.

    """
    in_channels = [max(round(c * width_mult), 1) for c in in_channels]
    super().__init__(in_channels)

    self.in_channels = tuple(in_channels)
    self.num_classes = num_classes
    self.grid_cell_scale = grid_cell_scale
    self.grid_cell_offset = grid_cell_offset
    self.reg_max = reg_max
    self.eval_size = eval_size
    self.pose_offset_multiplier = pose_offset_multiplier
    self.compensate_grid_cell_offset = compensate_grid_cell_offset
    self.inference_mode = inference_mode

    # Do not apply quantization to this tensor
    proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
    self.register_buffer("proj_conv", proj, persistent=False)

    self._init_weights()

    factory = det_factory.DetectionModulesFactory()
    heads_list = self._insert_heads_list_params(heads_list, factory, num_classes, reg_max)

    self.num_heads = len(heads_list)
    fpn_strides: List[int] = []
    for i in range(self.num_heads):
        new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
        fpn_strides.append(new_head.stride)
        setattr(self, f"head{i + 1}", new_head)

    self.fpn_strides = tuple(fpn_strides)
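
The pose_offset_multiplier and compensate_grid_cell_offset parameters only change how raw keypoint offsets are decoded into image coordinates. A minimal sketch of that decoding step with toy tensors (the real code path is in forward() below):

import torch

grid_cell_offset, pose_offset_multiplier = 0.5, 1.0
anchors = torch.tensor([[0.5, 0.5], [1.5, 0.5]])  # [Anchors, 2] grid-cell centers
strides = torch.tensor([[8.0], [8.0]])            # [Anchors, 1]
offsets = torch.randn(1, 2, 17, 2)                # [B, Anchors, J, 2] raw head output

offsets = offsets * pose_offset_multiplier
# compensate_grid_cell_offset=True: (offsets + anchors - grid_cell_offset) * stride
decoded = (offsets + anchors.unsqueeze(0).unsqueeze(2) - grid_cell_offset) * strides.unsqueeze(0).unsqueeze(2)
print(decoded.shape)  # [1, 2, 17, 2] keypoint coordinates in input-image pixels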

forward(feats)

Runs the forward pass for all the underlying heads and concatenates the predictions into a single result.

Parameters:

Name Type Description Default
feats Tuple[Tensor, ...]

List of feature maps from the neck of different strides

required

Returns:

Type Description
Union[YoloNasPoseDecodedPredictions, Tuple[YoloNasPoseDecodedPredictions, YoloNasPoseRawOutputs]]

Return value depends on the mode.

If tracing, a tuple of 4 tensors (decoded predictions) is returned:
- pred_bboxes [B, Num Anchors, 4] - Predicted boxes in XYXY format
- pred_scores [B, Num Anchors, 1] - Predicted scores for each box
- pred_pose_coords [B, Num Anchors, Num Keypoints, 2] - Predicted poses in XY format
- pred_pose_scores [B, Num Anchors, Num Keypoints] - Predicted scores for each keypoint

In training/eval mode, a tuple of 2 elements is returned:
- decoded predictions - they are the same as in tracing mode
- raw outputs - a tuple of 8 elements in total, which is needed for training the model

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_ndfl_heads.py
def forward(self, feats: Tuple[Tensor, ...]) -> Union[YoloNasPoseDecodedPredictions, Tuple[YoloNasPoseDecodedPredictions, YoloNasPoseRawOutputs]]:
    """
    Runs the forward pass for all the underlying heads and concatenates the predictions into a single result.
    :param feats: List of feature maps from the neck of different strides
    :return: Return value depends on the mode:
    If tracing, a tuple of 4 tensors (decoded predictions) is returned:
    - pred_bboxes [B, Num Anchors, 4] - Predicted boxes in XYXY format
    - pred_scores [B, Num Anchors, 1] - Predicted scores for each box
    - pred_pose_coords [B, Num Anchors, Num Keypoints, 2] - Predicted poses in XY format
    - pred_pose_scores [B, Num Anchors, Num Keypoints] - Predicted scores for each keypoint

    In training/eval mode, a tuple of 2 elements is returned:
    - decoded predictions - they are the same as in tracing mode
    - raw outputs - a tuple of 8 elements in total, which is needed for training the model.
    """

    cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []
    pose_regression_list = []
    pose_logits_list = []

    for i, feat in enumerate(feats):
        b, _, h, w = feat.shape
        height_mul_width = h * w
        reg_distri, cls_logit, pose_regression, pose_logits = getattr(self, f"head{i + 1}")(feat)
        reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

        reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
        reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

        # cls and reg
        cls_score_list.append(cls_logit.reshape([b, -1, height_mul_width]))
        reg_dist_reduced_list.append(reg_dist_reduced)

        pose_regression_list.append(torch.permute(pose_regression.flatten(3), [0, 3, 1, 2]))  # [B, J, 2, H, W] -> [B, H * W, J, 2]
        pose_logits_list.append(torch.permute(pose_logits.flatten(2), [0, 2, 1]))  # [B, J, H, W] -> [B, H * W, J]

    cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

    reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
    reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

    pose_regression_list = torch.cat(pose_regression_list, dim=1)  # [B, Anchors, J, 2]
    pose_logits_list = torch.cat(pose_logits_list, dim=1)  # [B, Anchors, J]

    # Decode bboxes
    # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
    if self.eval_size:
        anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
    else:
        anchor_points_inference, stride_tensor = self._generate_anchors(feats)

    pred_scores = cls_score_list.sigmoid()
    pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

    # Decode keypoints
    if self.pose_offset_multiplier != 1.0:
        pose_regression_list *= self.pose_offset_multiplier

    if self.compensate_grid_cell_offset:
        pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2) - self.grid_cell_offset
    else:
        pose_regression_list += anchor_points_inference.unsqueeze(0).unsqueeze(2)

    pose_regression_list *= stride_tensor.unsqueeze(0).unsqueeze(2)

    pred_pose_coords = pose_regression_list.detach().clone()  # [B, Anchors, C, 2]
    pred_pose_scores = pose_logits_list.detach().clone().sigmoid()  # [B, Anchors, C]

    decoded_predictions = pred_bboxes, pred_scores, pred_pose_coords, pred_pose_scores

    if torch.jit.is_tracing() or self.inference_mode:
        return decoded_predictions

    anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

    raw_predictions = cls_score_list, reg_distri_list, pose_regression_list, pose_logits_list, anchors, anchor_points, num_anchors_list, stride_tensor
    return decoded_predictions, raw_predictions
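
The reg_distri branch above uses an integral (DFL-style) box representation: for each box side the head predicts a distribution over reg_max + 1 bins, and the expected bin index is used as the distance. A hedged, standalone sketch of that reduction, equivalent in spirit to the softmax + proj_conv step in forward() but written with plain tensor ops:

import torch

reg_max, num_cells = 16, 5
# One distribution of logits per box side (left, top, right, bottom)
reg_distri = torch.randn(1, 4, reg_max + 1, num_cells)  # [B, 4, reg_max + 1, H*W]

probs = torch.softmax(reg_distri, dim=2)                 # probability per bin
bins = torch.arange(reg_max + 1, dtype=probs.dtype)      # 0, 1, ..., reg_max
distances = (probs * bins.view(1, 1, -1, 1)).sum(dim=2)  # expected distance per side, [B, 4, H*W]

print(distances.shape)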

YoloNASPosePostPredictionCallback

Bases: AbstractPoseEstimationPostPredictionCallback

A post-prediction callback for YoloNASPose model. Performs confidence thresholding, Top-K and NMS steps.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_post_prediction_callback.py
class YoloNASPosePostPredictionCallback(AbstractPoseEstimationPostPredictionCallback):
    """
    A post-prediction callback for YoloNASPose model.
    Performs confidence thresholding, Top-K and NMS steps.
    """

    def __init__(
        self,
        pose_confidence_threshold: float,
        nms_iou_threshold: float,
        pre_nms_max_predictions: int,
        post_nms_max_predictions: int,
    ):
        """
        :param pose_confidence_threshold: Pose detection confidence threshold
        :param nms_iou_threshold:         IoU threshold for NMS step.
        :param pre_nms_max_predictions:   Number of predictions participating in NMS step
        :param post_nms_max_predictions:  Maximum number of boxes to return after NMS step
        """
        if post_nms_max_predictions > pre_nms_max_predictions:
            raise ValueError("post_nms_max_predictions must be less than pre_nms_max_predictions")

        super().__init__()
        self.pose_confidence_threshold = pose_confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.pre_nms_max_predictions = pre_nms_max_predictions
        self.post_nms_max_predictions = post_nms_max_predictions

    @torch.no_grad()
    def __call__(self, outputs: Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], ...]) -> List[PoseEstimationPredictions]:
        """
        Take YoloNASPose's predictions and decode them into usable pose predictions.

        :param outputs: Output of the model's forward() method
        :return:        List of decoded predictions for each image in the batch.
        """
        # First is model predictions, second element of tuple is logits for loss computation
        predictions = outputs[0]

        decoded_predictions: List[PoseEstimationPredictions] = []
        for pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores in zip(*predictions):
            # pred_bboxes [Anchors, 4] in XYXY format
            # pred_scores [Anchors, 1] confidence scores [0..1]
            # pred_pose_coords [Anchors, Num Keypoints, 2] in (x,y) format
            # pred_pose_scores [Anchors, Num Keypoints] confidence scores [0..1]

            pred_bboxes_conf = pred_bboxes_conf.squeeze(-1)  # [Anchors]
            conf_mask = pred_bboxes_conf >= self.pose_confidence_threshold  # [Anchors]

            pred_bboxes_conf = pred_bboxes_conf[conf_mask].float()
            pred_bboxes_xyxy = pred_bboxes_xyxy[conf_mask].float()
            pred_pose_coords = pred_pose_coords[conf_mask].float()
            pred_pose_scores = pred_pose_scores[conf_mask].float()

            # Filter all predictions by self.nms_top_k
            if pred_bboxes_conf.size(0) > self.pre_nms_max_predictions:
                topk_candidates = torch.topk(pred_bboxes_conf, k=self.pre_nms_max_predictions, largest=True, sorted=True)
                pred_bboxes_conf = pred_bboxes_conf[topk_candidates.indices]
                pred_bboxes_xyxy = pred_bboxes_xyxy[topk_candidates.indices]
                pred_pose_coords = pred_pose_coords[topk_candidates.indices]
                pred_pose_scores = pred_pose_scores[topk_candidates.indices]

            # NMS
            idx_to_keep = torchvision.ops.boxes.nms(boxes=pred_bboxes_xyxy, scores=pred_bboxes_conf, iou_threshold=self.nms_iou_threshold)

            final_bboxes = pred_bboxes_xyxy[idx_to_keep]  # [Instances, 4]
            final_scores = pred_bboxes_conf[idx_to_keep]  # [Instances]

            final_poses = torch.cat(
                [
                    pred_pose_coords[idx_to_keep],
                    pred_pose_scores[idx_to_keep].unsqueeze(-1),
                ],
                dim=-1,
            )  # [Instances, Num Keypoints, 3]

            decoded_predictions.append(
                PoseEstimationPredictions(
                    poses=final_poses[: self.post_nms_max_predictions],
                    scores=final_scores[: self.post_nms_max_predictions],
                    bboxes_xyxy=final_bboxes[: self.post_nms_max_predictions],
                )
            )

        return decoded_predictions
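
A hedged usage sketch: constructing the callback and applying it to the (decoded, raw) output tuple produced by a YoloNASPose forward pass. The tensors below are random placeholders with illustrative shapes; in practice the outputs come from the model itself:

import torch
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose.yolo_nas_pose_post_prediction_callback import (
    YoloNASPosePostPredictionCallback,
)

callback = YoloNASPosePostPredictionCallback(
    pose_confidence_threshold=0.5,
    nms_iou_threshold=0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=100,
)

# Fake "decoded predictions" for one image with 10 candidate poses and 17 keypoints
pred_bboxes = torch.rand(1, 10, 4) * 100
pred_bboxes[..., 2:] += pred_bboxes[..., :2]  # ensure x2 >= x1 and y2 >= y1
pred_scores = torch.rand(1, 10, 1)
pred_pose_coords = torch.rand(1, 10, 17, 2) * 100
pred_pose_scores = torch.rand(1, 10, 17)

outputs = ((pred_bboxes, pred_scores, pred_pose_coords, pred_pose_scores), None)
predictions = callback(outputs)        # List[PoseEstimationPredictions], one entry per image
print(predictions[0].poses.shape)      # [num_instances, 17, 3]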

__call__(outputs)

Take YoloNASPose's predictions and decode them into usable pose predictions.

Parameters:

Name Type Description Default
outputs Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], ...]

Output of the model's forward() method

required

Returns:

Type Description
List[PoseEstimationPredictions]

List of decoded predictions for each image in the batch.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_post_prediction_callback.py
@torch.no_grad()
def __call__(self, outputs: Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], ...]) -> List[PoseEstimationPredictions]:
    """
    Take YoloNASPose's predictions and decode them into usable pose predictions.

    :param outputs: Output of the model's forward() method
    :return:        List of decoded predictions for each image in the batch.
    """
    # First is model predictions, second element of tuple is logits for loss computation
    predictions = outputs[0]

    decoded_predictions: List[PoseEstimationPredictions] = []
    for pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores in zip(*predictions):
        # pred_bboxes [Anchors, 4] in XYXY format
        # pred_scores [Anchors, 1] confidence scores [0..1]
        # pred_pose_coords [Anchors, Num Keypoints, 2] in (x,y) format
        # pred_pose_scores [Anchors, Num Keypoints] confidence scores [0..1]

        pred_bboxes_conf = pred_bboxes_conf.squeeze(-1)  # [Anchors]
        conf_mask = pred_bboxes_conf >= self.pose_confidence_threshold  # [Anchors]

        pred_bboxes_conf = pred_bboxes_conf[conf_mask].float()
        pred_bboxes_xyxy = pred_bboxes_xyxy[conf_mask].float()
        pred_pose_coords = pred_pose_coords[conf_mask].float()
        pred_pose_scores = pred_pose_scores[conf_mask].float()

        # Filter all predictions by self.nms_top_k
        if pred_bboxes_conf.size(0) > self.pre_nms_max_predictions:
            topk_candidates = torch.topk(pred_bboxes_conf, k=self.pre_nms_max_predictions, largest=True, sorted=True)
            pred_bboxes_conf = pred_bboxes_conf[topk_candidates.indices]
            pred_bboxes_xyxy = pred_bboxes_xyxy[topk_candidates.indices]
            pred_pose_coords = pred_pose_coords[topk_candidates.indices]
            pred_pose_scores = pred_pose_scores[topk_candidates.indices]

        # NMS
        idx_to_keep = torchvision.ops.boxes.nms(boxes=pred_bboxes_xyxy, scores=pred_bboxes_conf, iou_threshold=self.nms_iou_threshold)

        final_bboxes = pred_bboxes_xyxy[idx_to_keep]  # [Instances, 4]
        final_scores = pred_bboxes_conf[idx_to_keep]  # [Instances]

        final_poses = torch.cat(
            [
                pred_pose_coords[idx_to_keep],
                pred_pose_scores[idx_to_keep].unsqueeze(-1),
            ],
            dim=-1,
        )  # [Instances, Num Keypoints, 3]

        decoded_predictions.append(
            PoseEstimationPredictions(
                poses=final_poses[: self.post_nms_max_predictions],
                scores=final_scores[: self.post_nms_max_predictions],
                bboxes_xyxy=final_bboxes[: self.post_nms_max_predictions],
            )
        )

    return decoded_predictions

__init__(pose_confidence_threshold, nms_iou_threshold, pre_nms_max_predictions, post_nms_max_predictions)

Parameters:

Name Type Description Default
pose_confidence_threshold float

Pose detection confidence threshold

required
nms_iou_threshold float

IoU threshold for NMS step.

required
pre_nms_max_predictions int

Number of predictions participating in NMS step

required
post_nms_max_predictions int

Maximum number of boxes to return after NMS step

required
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_post_prediction_callback.py
def __init__(
    self,
    pose_confidence_threshold: float,
    nms_iou_threshold: float,
    pre_nms_max_predictions: int,
    post_nms_max_predictions: int,
):
    """
    :param pose_confidence_threshold: Pose detection confidence threshold
    :param nms_iou_threshold:         IoU threshold for NMS step.
    :param pre_nms_max_predictions:   Number of predictions participating in NMS step
    :param post_nms_max_predictions:  Maximum number of boxes to return after NMS step
    """
    if post_nms_max_predictions > pre_nms_max_predictions:
        raise ValueError("post_nms_max_predictions must be less than pre_nms_max_predictions")

    super().__init__()
    self.pose_confidence_threshold = pose_confidence_threshold
    self.nms_iou_threshold = nms_iou_threshold
    self.pre_nms_max_predictions = pre_nms_max_predictions
    self.post_nms_max_predictions = post_nms_max_predictions

YoloNASPose

Bases: CustomizableDetector, ExportablePoseEstimationModel, SupportsInputShapeCheck

YoloNASPose model

Exported model support matrix

Batch Size Format OnnxRuntime 1.13.1 TensorRT 8.4.2 TensorRT 8.5.3 TensorRT 8.6.1
1 Flat Yes Yes Yes Yes
>1 Flat Yes Yes Yes Yes
1 Batch Yes No No Yes
>1 Batch Yes No No Yes

ONNX files generated with PyTorch 2.0.1 for ONNX opset_version=14
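
A minimal sketch of how the support matrix above is typically exercised, assuming the standard SuperGradients entry points (models.get and the export() method provided by ExportablePoseEstimationModel); any export arguments beyond the output path are intentionally omitted here and should be taken from the export documentation:

from super_gradients.training import models

# Any YoloNASPose variant works the same way; the checkpoint name is one of the published COCO pose weights
model = models.get("yolo_nas_pose_l", pretrained_weights="coco_pose")

# Export to ONNX; the resulting file can then be run with OnnxRuntime or TensorRT
# according to the batch-size / output-format matrix above.
export_result = model.export("yolo_nas_pose_l.onnx")
print(export_result)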

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
class YoloNASPose(CustomizableDetector, ExportablePoseEstimationModel, SupportsInputShapeCheck):
    """
    YoloNASPose model

    Exported model support matrix

    | Batch Size | Format | OnnxRuntime 1.13.1 | TensorRT 8.4.2 | TensorRT 8.5.3 | TensorRT 8.6.1 |
    |------------|--------|--------------------|----------------|----------------|----------------|
    | 1          | Flat   | Yes                | Yes            | Yes            | Yes            |
    | >1         | Flat   | Yes                | Yes            | Yes            | Yes            |
    | 1          | Batch  | Yes                | No             | No             | Yes            |
    | >1         | Batch  | Yes                | No             | No             | Yes            |

    ONNX files generated with PyTorch 2.0.1 for ONNX opset_version=14
    """

    def __init__(
        self,
        backbone: Union[str, dict, HpmStruct, DictConfig],
        heads: Union[str, dict, HpmStruct, DictConfig],
        neck: Optional[Union[str, dict, HpmStruct, DictConfig]] = None,
        num_classes: int = None,
        bn_eps: Optional[float] = None,
        bn_momentum: Optional[float] = None,
        inplace_act: Optional[bool] = True,
        in_channels: int = 3,
    ):
        super().__init__(
            backbone=backbone,
            heads=heads,
            neck=neck,
            num_classes=num_classes,
            bn_eps=bn_eps,
            bn_momentum=bn_momentum,
            inplace_act=inplace_act,
            in_channels=in_channels,
        )
        self._edge_links = None
        self._edge_colors = None
        self._keypoint_colors = None
        self._image_processor = None
        self._default_nms_conf = None
        self._default_nms_iou = None
        self._default_pre_nms_max_predictions = None
        self._default_post_nms_max_predictions = None

    def get_decoding_module(self, num_pre_nms_predictions: int, **kwargs) -> AbstractPoseEstimationDecodingModule:
        return YoloNASPoseDecodingModule(num_pre_nms_predictions)

    def predict(
        self,
        images: ImageSource,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        pre_nms_max_predictions: Optional[int] = None,
        post_nms_max_predictions: Optional[int] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> PoseEstimationPrediction:
        """Predict an image or a list of images.

        :param images:     Images to predict.
        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
        :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                           If None, the default value associated with the training is used.
        :param batch_size: Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            pre_nms_max_predictions=pre_nms_max_predictions,
            post_nms_max_predictions=post_nms_max_predictions,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            fp16=fp16,
        )
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        pre_nms_max_predictions: Optional[int] = None,
        post_nms_max_predictions: Optional[int] = None,
        batch_size: int = 32,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ):
        """Predict using webcam.

        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
        :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                           If None, the default value associated with the training is used.
        :param batch_size: Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.

        """
        pipeline = self._get_pipeline(
            iou=iou,
            conf=conf,
            pre_nms_max_predictions=pre_nms_max_predictions,
            post_nms_max_predictions=post_nms_max_predictions,
            fuse_model=fuse_model,
            skip_image_resizing=skip_image_resizing,
            fp16=fp16,
        )
        pipeline.predict_webcam()

    @lru_cache(maxsize=1)
    def _get_pipeline(
        self,
        iou: Optional[float] = None,
        conf: Optional[float] = None,
        pre_nms_max_predictions: Optional[int] = None,
        post_nms_max_predictions: Optional[int] = None,
        fuse_model: bool = True,
        skip_image_resizing: bool = False,
        fp16: bool = True,
    ) -> PoseEstimationPipeline:
        """Instantiate the prediction pipeline of this model.

        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
        :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                           If None, the default value associated with the training is used.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param skip_image_resizing: If True, the image processor will not resize the images.
        :param fp16:       If True, use mixed precision for inference.
        """
        if None in (self._image_processor, self._default_nms_iou, self._default_nms_conf, self._edge_links):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        iou = iou or self._default_nms_iou
        conf = conf or self._default_nms_conf
        pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions
        post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions

        # Ensure that the image size is divisible by 32.
        if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing:
            image_processor = self._image_processor.get_equivalent_compose_without_resizing(
                auto_padding=KeypointsAutoPadding(shape_multiple=(32, 32), pad_value=0)
            )
        else:
            image_processor = self._image_processor

        pipeline = PoseEstimationPipeline(
            model=self,
            image_processor=image_processor,
            post_prediction_callback=self.get_post_prediction_callback(
                iou=iou,
                conf=conf,
                pre_nms_max_predictions=pre_nms_max_predictions,
                post_nms_max_predictions=post_nms_max_predictions,
            ),
            fuse_model=fuse_model,
            edge_links=self._edge_links,
            edge_colors=self._edge_colors,
            keypoint_colors=self._keypoint_colors,
            fp16=fp16,
        )
        return pipeline

    @classmethod
    def get_post_prediction_callback(
        cls, conf: float, iou: float, pre_nms_max_predictions=1000, post_nms_max_predictions=300
    ) -> YoloNASPosePostPredictionCallback:
        return YoloNASPosePostPredictionCallback(
            pose_confidence_threshold=conf,
            nms_iou_threshold=iou,
            pre_nms_max_predictions=pre_nms_max_predictions,
            post_nms_max_predictions=post_nms_max_predictions,
        )

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        edge_links: Union[np.ndarray, List[Tuple[int, int]]],
        edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
        image_processor: Optional[Processing] = None,
        conf: Optional[float] = None,
        iou: Optional[float] = 0.7,
        pre_nms_max_predictions=300,
        post_nms_max_predictions=100,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        :param conf:            (Optional) Below the confidence threshold, predictions are discarded
        """
        self._edge_links = edge_links or self._edge_links
        self._edge_colors = edge_colors or self._edge_colors
        self._keypoint_colors = keypoint_colors or self._keypoint_colors
        self._image_processor = image_processor or self._image_processor
        self._default_nms_conf = conf or self._default_nms_conf
        self._default_nms_iou = iou or self._default_nms_iou
        self._default_pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions
        self._default_post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions

    def get_input_shape_steps(self) -> Tuple[int, int]:
        """
        Returns the step (multiple) that the input height and width must be divisible by.
        For this model the step is 32x32, which corresponds to the largest stride in the encoder part of the model.
        """
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        """
        Returns the minimum input shape size that the model can accept.
        For this model the default is 32x32, which corresponds to the largest stride in the encoder part of the model.
        """
        return 32, 32

get_input_shape_steps()

Returns the step (multiple) that the input height and width must be divisible by. For this model the step is 32x32, which corresponds to the largest stride in the encoder part of the model.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def get_input_shape_steps(self) -> Tuple[int, int]:
    """
    Returns the step (multiple) that the input height and width must be divisible by.
    For this model the step is 32x32, which corresponds to the largest stride in the encoder part of the model.
    """
    return 32, 32

get_minimum_input_shape_size()

Returns the minimum input shape size that the model can accept. For this model the default is 32x32, which corresponds to the largest stride in the encoder part of the model.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def get_minimum_input_shape_size(self) -> Tuple[int, int]:
    """
    Returns the minimum input shape size that the model can accept.
    For this model the default is 32x32, which corresponds to the largest stride in the encoder part of the model.
    """
    return 32, 32

predict(images, iou=None, conf=None, pre_nms_max_predictions=None, post_nms_max_predictions=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def predict(
    self,
    images: ImageSource,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    pre_nms_max_predictions: Optional[int] = None,
    post_nms_max_predictions: Optional[int] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
) -> PoseEstimationPrediction:
    """Predict an image or a list of images.

    :param images:     Images to predict.
    :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
    :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                       If None, the default value associated with the training is used.
    :param batch_size: Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        pre_nms_max_predictions=pre_nms_max_predictions,
        post_nms_max_predictions=post_nms_max_predictions,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        fp16=fp16,
    )
    return pipeline(images, batch_size=batch_size)  # type: ignore
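
A short usage sketch for predict(), assuming a model obtained through models.get with pretrained weights (so the dataset processing parameters are already attached); the image path is a placeholder:

from super_gradients.training import models

model = models.get("yolo_nas_pose_l", pretrained_weights="coco_pose")

# conf / iou are optional; when omitted, the defaults stored by set_dataset_processing_params() are used
prediction = model.predict("path/to/image.jpg", conf=0.5, iou=0.7)
prediction.show()  # visualization helpers such as show()/save() are available on SuperGradients prediction objects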

predict_webcam(iou=None, conf=None, pre_nms_max_predictions=None, post_nms_max_predictions=None, batch_size=32, fuse_model=True, skip_image_resizing=False, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
iou Optional[float]

(Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated with the training is used.

None
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
skip_image_resizing bool

If True, the image processor will not resize the images.

False
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def predict_webcam(
    self,
    iou: Optional[float] = None,
    conf: Optional[float] = None,
    pre_nms_max_predictions: Optional[int] = None,
    post_nms_max_predictions: Optional[int] = None,
    batch_size: int = 32,
    fuse_model: bool = True,
    skip_image_resizing: bool = False,
    fp16: bool = True,
):
    """Predict using webcam.

    :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated with the training is used.
    :param conf:       (Optional) Below the confidence threshold, predictions are discarded.
                       If None, the default value associated with the training is used.
    :param batch_size: Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param skip_image_resizing: If True, the image processor will not resize the images.
    :param fp16:       If True, use mixed precision for inference.

    """
    pipeline = self._get_pipeline(
        iou=iou,
        conf=conf,
        pre_nms_max_predictions=pre_nms_max_predictions,
        post_nms_max_predictions=post_nms_max_predictions,
        fuse_model=fuse_model,
        skip_image_resizing=skip_image_resizing,
        fp16=fp16,
    )
    pipeline.predict_webcam()

set_dataset_processing_params(edge_links, edge_colors, keypoint_colors, image_processor=None, conf=None, iou=0.7, pre_nms_max_predictions=300, post_nms_max_predictions=100)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
conf Optional[float]

(Optional) Below the confidence threshold, predictions are discarded

None
Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    edge_links: Union[np.ndarray, List[Tuple[int, int]]],
    edge_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]],
    image_processor: Optional[Processing] = None,
    conf: Optional[float] = None,
    iou: Optional[float] = 0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=100,
) -> None:
    """Set the processing parameters for the dataset.

    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    :param conf:            (Optional) Below the confidence threshold, predictions are discarded
    """
    self._edge_links = edge_links or self._edge_links
    self._edge_colors = edge_colors or self._edge_colors
    self._keypoint_colors = keypoint_colors or self._keypoint_colors
    self._image_processor = image_processor or self._image_processor
    self._default_nms_conf = conf or self._default_nms_conf
    self._default_nms_iou = iou or self._default_nms_iou
    self._default_pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions
    self._default_post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions
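
A hedged example of wiring in custom dataset parameters for a hypothetical three-keypoint model (nose, left eye, right eye); the skeleton, colors, and thresholds below are illustrative only, and a real setup would also pass the training-time image_processor:

from super_gradients.training import models

# Hypothetical custom pose model with 3 keypoints (num_classes = number of keypoints)
model = models.get("yolo_nas_pose_l", num_classes=3)

model.set_dataset_processing_params(
    edge_links=[(0, 1), (0, 2)],                              # skeleton edges between keypoint indices
    edge_colors=[(0, 255, 0), (0, 0, 255)],                   # one RGB color per edge
    keypoint_colors=[(255, 0, 0), (0, 255, 0), (0, 0, 255)],  # one RGB color per keypoint
    image_processor=None,                                     # pass the training-time Processing object here in practice
    conf=0.5,
    iou=0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=100,
)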

YoloNASPoseDecodingModule

Bases: AbstractPoseEstimationDecodingModule

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
class YoloNASPoseDecodingModule(AbstractPoseEstimationDecodingModule):
    __constants__ = ["num_pre_nms_predictions"]

    def __init__(
        self,
        num_pre_nms_predictions: int = 1000,
    ):
        super().__init__()
        self.num_pre_nms_predictions = num_pre_nms_predictions

    @torch.jit.ignore
    def infer_total_number_of_predictions(self, inputs: Any) -> int:
        """

        :param inputs: YoloNASPose model outputs
        :return: Total number of predictions per image (the size of the anchors dimension of the model output)
        """
        if torch.jit.is_tracing():
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
        else:
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

        return pred_bboxes_xyxy.size(1)

    def get_num_pre_nms_predictions(self) -> int:
        return self.num_pre_nms_predictions

    def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]):
        """
        Decode YoloNASPose model outputs into bounding boxes, confidence scores and pose coordinates and scores

        :param inputs: YoloNASPose model outputs
        :return: Tuple of (pred_bboxes, pred_scores, pred_joints)
        - pred_bboxes: [Batch, num_pre_nms_predictions, 4] Bounding boxes associated with the pose, in XYXY format
        - pred_scores: [Batch, num_pre_nms_predictions, 1] Confidence scores [0..1] for entire pose
        - pred_joints: [Batch, num_pre_nms_predictions, Num Joints, 3] Joints in (x,y,confidence) format
        """
        if torch.jit.is_tracing():
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
        else:
            pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

        nms_top_k = self.num_pre_nms_predictions
        batch_size, num_anchors, _ = pred_bboxes_conf.size()

        topk_candidates = torch.topk(pred_bboxes_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

        offsets = num_anchors * torch.arange(batch_size, device=pred_bboxes_conf.device)
        indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1, 1)
        flat_indices = torch.flatten(indices_with_offset)

        pred_poses_and_scores = torch.cat([pred_pose_coords, pred_pose_scores.unsqueeze(3)], dim=3)

        output_pred_bboxes = pred_bboxes_xyxy.reshape(-1, pred_bboxes_xyxy.size(2))[flat_indices, :].reshape(
            pred_bboxes_xyxy.size(0), nms_top_k, pred_bboxes_xyxy.size(2)
        )
        output_pred_scores = pred_bboxes_conf.reshape(-1, pred_bboxes_conf.size(2))[flat_indices, :].reshape(
            pred_bboxes_conf.size(0), nms_top_k, pred_bboxes_conf.size(2)
        )
        output_pred_joints = pred_poses_and_scores.reshape(-1, pred_poses_and_scores.size(2), 3)[flat_indices, :, :].reshape(
            pred_poses_and_scores.size(0), nms_top_k, pred_poses_and_scores.size(2), pred_poses_and_scores.size(3)
        )

        return output_pred_bboxes, output_pred_scores, output_pred_joints
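
The decoding is essentially a batched top-k gather: the top num_pre_nms_predictions boxes by confidence are selected, and the matching poses are gathered with the same flattened indices. Below is a minimal usage sketch on randomly generated, YoloNASPose-shaped outputs; the batch/anchor/joint sizes and the import path (taken from the source location above) are assumptions for illustration.

import torch
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose.yolo_nas_pose_variants import YoloNASPoseDecodingModule

batch_size, num_anchors, num_joints = 2, 8400, 17  # assumed sizes, for illustration only
pred_bboxes_xyxy = torch.rand(batch_size, num_anchors, 4)
pred_bboxes_conf = torch.rand(batch_size, num_anchors, 1)
pred_pose_coords = torch.rand(batch_size, num_anchors, num_joints, 2)
pred_pose_scores = torch.rand(batch_size, num_anchors, num_joints)

decoding = YoloNASPoseDecodingModule(num_pre_nms_predictions=1000)
# Outside of tracing, the module reads the raw predictions from the first element of the outputs tuple
outputs = ((pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores), ())
pred_bboxes, pred_scores, pred_joints = decoding(outputs)

print(pred_bboxes.shape)  # torch.Size([2, 1000, 4])
print(pred_scores.shape)  # torch.Size([2, 1000, 1])
print(pred_joints.shape)  # torch.Size([2, 1000, 17, 3])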

forward(inputs)

Decode YoloNASPose model outputs into bounding boxes, confidence scores and pose coordinates and scores

Parameters:

Name Type Description Default
inputs Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]

YoloNASPose model outputs

required

Returns:

Type Description

Tuple of (pred_bboxes, pred_scores, pred_joints) - pred_bboxes: [Batch, num_pre_nms_predictions, 4] Bounding boxes associated with each pose, in XYXY format - pred_scores: [Batch, num_pre_nms_predictions, 1] Confidence scores [0..1] for entire pose - pred_joints: [Batch, num_pre_nms_predictions, Num Joints, 3] Joints in (x,y,confidence) format

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
def forward(self, inputs: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]):
    """
    Decode YoloNASPose model outputs into bounding boxes, confidence scores and pose coordinates and scores

    :param inputs: YoloNASPose model outputs
    :return: Tuple of (pred_bboxes, pred_scores, pred_joints)
    - pred_bboxes: [Batch, num_pre_nms_predictions, 4] Bounding boxes associated with each pose, in XYXY format
    - pred_scores: [Batch, num_pre_nms_predictions, 1] Confidence scores [0..1] for entire pose
    - pred_joints: [Batch, num_pre_nms_predictions, Num Joints, 3] Joints in (x,y,confidence) format
    """
    if torch.jit.is_tracing():
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
    else:
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

    nms_top_k = self.num_pre_nms_predictions
    batch_size, num_anchors, _ = pred_bboxes_conf.size()

    topk_candidates = torch.topk(pred_bboxes_conf, dim=1, k=nms_top_k, largest=True, sorted=True)

    offsets = num_anchors * torch.arange(batch_size, device=pred_bboxes_conf.device)
    indices_with_offset = topk_candidates.indices + offsets.reshape(batch_size, 1, 1)
    flat_indices = torch.flatten(indices_with_offset)

    pred_poses_and_scores = torch.cat([pred_pose_coords, pred_pose_scores.unsqueeze(3)], dim=3)

    output_pred_bboxes = pred_bboxes_xyxy.reshape(-1, pred_bboxes_xyxy.size(2))[flat_indices, :].reshape(
        pred_bboxes_xyxy.size(0), nms_top_k, pred_bboxes_xyxy.size(2)
    )
    output_pred_scores = pred_bboxes_conf.reshape(-1, pred_bboxes_conf.size(2))[flat_indices, :].reshape(
        pred_bboxes_conf.size(0), nms_top_k, pred_bboxes_conf.size(2)
    )
    output_pred_joints = pred_poses_and_scores.reshape(-1, pred_poses_and_scores.size(2), 3)[flat_indices, :, :].reshape(
        pred_poses_and_scores.size(0), nms_top_k, pred_poses_and_scores.size(2), pred_poses_and_scores.size(3)
    )

    return output_pred_bboxes, output_pred_scores, output_pred_joints

infer_total_number_of_predictions(inputs)

Parameters:

Name Type Description Default
inputs Any

YoloNASPose model outputs

required

Returns:

Type Description
int

Total number of predictions per image.

Source code in src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py
@torch.jit.ignore
def infer_total_number_of_predictions(self, inputs: Any) -> int:
    """

    :param inputs: YoloNASPose model outputs
    :return:
    """
    if torch.jit.is_tracing():
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs
    else:
        pred_bboxes_xyxy, pred_bboxes_conf, pred_pose_coords, pred_pose_scores = inputs[0]

    return pred_bboxes_xyxy.size(1)

SegmentationHead

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/common.py
class SegmentationHead(nn.Module):
    def __init__(self, in_channels: int, mid_channels: int, num_classes: int, dropout: float):
        super(SegmentationHead, self).__init__()
        self.seg_head = nn.Sequential(
            ConvBNReLU(in_channels, mid_channels, kernel_size=3, padding=1, stride=1, bias=False),
            nn.Dropout(dropout),
            nn.Conv2d(mid_channels, num_classes, kernel_size=1, bias=False),
        )

    def forward(self, x):
        return self.seg_head(x)

    def replace_num_classes(self, num_classes: int):
        """
        This method replaces the last classification Conv layer to output a different number of classes.
        Note that the weights of the new layer are randomly initialized.
        """
        old_cls_conv = self.seg_head[-1]
        self.seg_head[-1] = nn.Conv2d(old_cls_conv.in_channels, num_classes, kernel_size=1, bias=False)

replace_num_classes(num_classes)

This method replaces the last classification Conv layer to output a different number of classes. Note that the weights of the new layer are randomly initialized.

Source code in src/super_gradients/training/models/segmentation_models/common.py
def replace_num_classes(self, num_classes: int):
    """
    This method replaces the last classification Conv layer to output a different number of classes.
    Note that the weights of the new layer are randomly initialized.
    """
    old_cls_conv = self.seg_head[-1]
    self.seg_head[-1] = nn.Conv2d(old_cls_conv.in_channels, num_classes, kernel_size=1, bias=False)
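
To illustrate the replacement pattern, here is a minimal, self-contained sketch that mimics the head above with plain torch modules and swaps only the last 1x1 classification conv. The ToyHead class is hypothetical and only stands in for SegmentationHead (which additionally uses ConvBNReLU and dropout).

import torch
import torch.nn as nn


class ToyHead(nn.Module):
    """Hypothetical stand-in for SegmentationHead: a conv block followed by a 1x1 classification conv."""

    def __init__(self, in_channels: int, mid_channels: int, num_classes: int):
        super().__init__()
        self.seg_head = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, num_classes, kernel_size=1, bias=False),
        )

    def replace_num_classes(self, num_classes: int):
        # Same idea as SegmentationHead.replace_num_classes: swap only the last 1x1 conv,
        # keeping its in_channels while its weights are re-initialized randomly.
        old_cls_conv = self.seg_head[-1]
        self.seg_head[-1] = nn.Conv2d(old_cls_conv.in_channels, num_classes, kernel_size=1, bias=False)


head = ToyHead(in_channels=64, mid_channels=32, num_classes=21)
head.replace_num_classes(num_classes=3)
print(head.seg_head(torch.rand(1, 64, 16, 16)).shape)  # torch.Size([1, 3, 16, 16])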

ASPP

Bases: AbstractContextModule

ASPP bottleneck block. Splits the input into len(dilation_list) + 1 heads: one 1x1 conv plus one dilated 3x3 conv per dilation rate. The heads are concatenated, and each head outputs input channels / (len(dilation_list) + 1) channels, so the total number of output channels matches the number of input channels.

Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
class ASPP(AbstractContextModule):
    """
    ASPP bottleneck block. Splits the input into len(dilation_list) + 1 heads (one 1x1 conv plus one dilated 3x3 conv
    per dilation rate). The heads are concatenated, and each head outputs
    input_channels / (len(dilation_list) + 1) channels so that the total output width matches the input width.
    """

    def __init__(self, in_channels: int, dilation_list: List[int], in_out_ratio: float = 1.0, use_bias: bool = False, **kwargs):
        """
        :param dilation_list: list of dilation rates, the num of dilation branches should be set so that there is a
            whole division of the input channels, see assertion below.
        :param in_out_ratio: output / input num of channels ratio.
        :param use_bias: legacy parameter to support PascalVOC frontier checkpoints that were trained by mistake with
            extra redundant biases before batchnorm operators. should be set to `False` for new training processes.
        """
        super().__init__()
        num_dilation_branches = len(dilation_list) + 1
        inter_ratio = num_dilation_branches / in_out_ratio
        assert in_channels % inter_ratio == 0
        inter_channels = int(in_channels / inter_ratio)

        self.dilated_conv_list = nn.ModuleList(
            [
                ConvBNReLU(in_channels, inter_channels, kernel_size=1, dilation=1, bias=use_bias),
                *[ConvBNReLU(in_channels, inter_channels, kernel_size=3, dilation=d, padding=d, bias=use_bias) for d in dilation_list],
            ]
        )

        self.out_channels = inter_channels * num_dilation_branches

    def output_channels(self):
        return self.out_channels

    def forward(self, x):
        x = torch.cat([dilated_conv(x) for dilated_conv in self.dilated_conv_list], dim=1)
        return x

__init__(in_channels, dilation_list, in_out_ratio=1.0, use_bias=False, **kwargs)

Parameters:

Name Type Description Default
dilation_list List[int]

list of dilation rates; the number of dilation branches should be chosen so that the input channels are divided evenly (see the assertion in the source code).

required
in_out_ratio float

output / input num of channels ratio.

1.0
use_bias bool

legacy parameter to support PascalVOC frontier checkpoints that were trained by mistake with extra redundant biases before batchnorm operators. should be set to False for new training processes.

False
Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
def __init__(self, in_channels: int, dilation_list: List[int], in_out_ratio: float = 1.0, use_bias: bool = False, **kwargs):
    """
    :param dilation_list: list of dilation rates, the num of dilation branches should be set so that there is a
        whole division of the input channels, see assertion below.
    :param in_out_ratio: output / input num of channels ratio.
    :param use_bias: legacy parameter to support PascalVOC frontier checkpoints that were trained by mistake with
        extra redundant biases before batchnorm operators. should be set to `False` for new training processes.
    """
    super().__init__()
    num_dilation_branches = len(dilation_list) + 1
    inter_ratio = num_dilation_branches / in_out_ratio
    assert in_channels % inter_ratio == 0
    inter_channels = int(in_channels / inter_ratio)

    self.dilated_conv_list = nn.ModuleList(
        [
            ConvBNReLU(in_channels, inter_channels, kernel_size=1, dilation=1, bias=use_bias),
            *[ConvBNReLU(in_channels, inter_channels, kernel_size=3, dilation=d, padding=d, bias=use_bias) for d in dilation_list],
        ]
    )

    self.out_channels = inter_channels * num_dilation_branches

SPPM

Bases: AbstractContextModule

Simple Pyramid Pooling context Module.

Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
class SPPM(AbstractContextModule):
    """
    Simple Pyramid Pooling context Module.
    """

    def __init__(
        self,
        in_channels: int,
        inter_channels: int,
        out_channels: int,
        pool_sizes: List[Union[int, Tuple[int, int]]],
        upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
        align_corners: bool = False,
    ):
        """
        :param inter_channels: num channels in each pooling branch.
        :param out_channels: The number of output channels after pyramid pooling module.
        :param pool_sizes: spatial output sizes of the pooled feature maps.
        """
        super().__init__()
        self.branches = nn.ModuleList(
            [
                nn.Sequential(
                    nn.AdaptiveAvgPool2d(pool_size),
                    ConvBNReLU(in_channels, inter_channels, kernel_size=1, bias=False),
                )
                for pool_size in pool_sizes
            ]
        )
        self.conv_out = ConvBNReLU(inter_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.out_channels = out_channels
        self.upsample_mode = upsample_mode
        self.align_corners = align_corners
        self.pool_sizes = pool_sizes

    def forward(self, x):
        out = None
        input_shape = x.shape[2:]
        for branch in self.branches:
            y = branch(x)
            y = F.interpolate(y, size=input_shape, mode=self.upsample_mode, align_corners=self.align_corners)
            out = y if out is None else out + y
        out = self.conv_out(out)
        return out

    def output_channels(self):
        return self.out_channels

    def prep_model_for_conversion(self, input_size: Union[tuple, list], stride_ratio: int = 32, **kwargs):
        """
        Replace Global average pooling with fixed kernels Average pooling, since dynamic kernel sizes are not supported
        when compiling to ONNX: `Unsupported: ONNX export of operator adaptive_avg_pool2d, input size not accessible.`
        """
        input_size = [x / stride_ratio for x in input_size[-2:]]
        for branch in self.branches:
            global_pool: nn.AdaptiveAvgPool2d = branch[0]
            # If not a global average pooling skip this. The module might be already converted to average pooling
            # modules.
            if not isinstance(global_pool, nn.AdaptiveAvgPool2d):
                continue
            out_size = global_pool.output_size
            out_size = out_size if isinstance(out_size, (tuple, list)) else (out_size, out_size)
            kernel_size = [int(i / o) for i, o in zip(input_size, out_size)]
            branch[0] = nn.AvgPool2d(kernel_size=kernel_size, stride=kernel_size)
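
A stripped-down sketch of the same forward pattern written with plain torch (widths and pool sizes are assumptions): each branch pools to a fixed size, projects with a 1x1 conv, is resized back to the input resolution, and the branch outputs are summed before a final 3x3 conv. The real SPPM uses ConvBNReLU blocks instead of bare convolutions.

import torch
import torch.nn as nn
import torch.nn.functional as F

in_channels, inter_channels, out_channels = 128, 64, 128  # assumed widths
pool_sizes = [1, 2, 4]                                    # assumed pooled output sizes

branches = nn.ModuleList(
    nn.Sequential(nn.AdaptiveAvgPool2d(p), nn.Conv2d(in_channels, inter_channels, kernel_size=1, bias=False))
    for p in pool_sizes
)
conv_out = nn.Conv2d(inter_channels, out_channels, kernel_size=3, padding=1, bias=False)

x = torch.rand(1, in_channels, 16, 16)
out = None
for branch in branches:
    y = F.interpolate(branch(x), size=x.shape[2:], mode="bilinear", align_corners=False)
    out = y if out is None else out + y  # branch outputs are summed, not concatenated
out = conv_out(out)
print(out.shape)  # torch.Size([1, 128, 16, 16])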

__init__(in_channels, inter_channels, out_channels, pool_sizes, upsample_mode=UpsampleMode.BILINEAR, align_corners=False)

Parameters:

Name Type Description Default
inter_channels int

num channels in each pooling branch.

required
out_channels int

The number of output channels after pyramid pooling module.

required
pool_sizes List[Union[int, Tuple[int, int]]]

spatial output sizes of the pooled feature maps.

required
Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
def __init__(
    self,
    in_channels: int,
    inter_channels: int,
    out_channels: int,
    pool_sizes: List[Union[int, Tuple[int, int]]],
    upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
    align_corners: bool = False,
):
    """
    :param inter_channels: num channels in each pooling branch.
    :param out_channels: The number of output channels after pyramid pooling module.
    :param pool_sizes: spatial output sizes of the pooled feature maps.
    """
    super().__init__()
    self.branches = nn.ModuleList(
        [
            nn.Sequential(
                nn.AdaptiveAvgPool2d(pool_size),
                ConvBNReLU(in_channels, inter_channels, kernel_size=1, bias=False),
            )
            for pool_size in pool_sizes
        ]
    )
    self.conv_out = ConvBNReLU(inter_channels, out_channels, kernel_size=3, padding=1, bias=False)
    self.out_channels = out_channels
    self.upsample_mode = upsample_mode
    self.align_corners = align_corners
    self.pool_sizes = pool_sizes

prep_model_for_conversion(input_size, stride_ratio=32, **kwargs)

Replace global average pooling with fixed-kernel average pooling, since dynamic kernel sizes are not supported when compiling to ONNX: Unsupported: ONNX export of operator adaptive_avg_pool2d, input size not accessible.

Source code in src/super_gradients/training/models/segmentation_models/context_modules.py
def prep_model_for_conversion(self, input_size: Union[tuple, list], stride_ratio: int = 32, **kwargs):
    """
    Replace Global average pooling with fixed kernels Average pooling, since dynamic kernel sizes are not supported
    when compiling to ONNX: `Unsupported: ONNX export of operator adaptive_avg_pool2d, input size not accessible.`
    """
    input_size = [x / stride_ratio for x in input_size[-2:]]
    for branch in self.branches:
        global_pool: nn.AdaptiveAvgPool2d = branch[0]
        # If not a global average pooling skip this. The module might be already converted to average pooling
        # modules.
        if not isinstance(global_pool, nn.AdaptiveAvgPool2d):
            continue
        out_size = global_pool.output_size
        out_size = out_size if isinstance(out_size, (tuple, list)) else (out_size, out_size)
        kernel_size = [int(i / o) for i, o in zip(input_size, out_size)]
        branch[0] = nn.AvgPool2d(kernel_size=kernel_size, stride=kernel_size)
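
To make the kernel-size arithmetic concrete, here is a small sketch for an assumed 512x512 network input, stride_ratio=32 and pool_sizes=[1, 2, 4] (illustrative values, not library defaults):

# Kernel sizes used when replacing AdaptiveAvgPool2d with fixed AvgPool2d (illustrative values)
input_size = (512, 512)  # assumed network input resolution
stride_ratio = 32        # SPPM sees a feature map 32x smaller than the input
pool_sizes = [1, 2, 4]   # assumed adaptive pooling output sizes

feature_size = [s / stride_ratio for s in input_size[-2:]]  # 16 x 16 feature map
for pool_size in pool_sizes:
    out_size = pool_size if isinstance(pool_size, (tuple, list)) else (pool_size, pool_size)
    kernel_size = [int(i / o) for i, o in zip(feature_size, out_size)]
    print(pool_size, kernel_size)
# 1 [16, 16]  -> global pooling becomes a fixed 16x16 average pooling
# 2 [8, 8]
# 4 [4, 4]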

DAPPMBranch

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DAPPMBranch(nn.Module):
    def __init__(self, kernel_size: int, stride: int, in_planes: int, branch_planes: int, inter_mode: str = "bilinear"):
        """
        A DAPPM branch
        :param kernel_size: the kernel size for the average pooling
                when stride=0 this parameter is omitted and AdaptiveAvgPool2d over all the input is performed
        :param stride: stride for the average pooling
                when stride=0: an AdaptiveAvgPool2d over all the input is performed (output is 1x1)
                when stride=1: no average pooling is performed
                when stride>1: average pooling is performed (scaling the input down and up again)
        :param in_planes: number of input channels
        :param branch_planes: width after the first convolution
        :param inter_mode: interpolation mode for upscaling
        """

        super().__init__()
        down_list = []
        if stride == 0:
            # when stride is 0 average pool all the input to 1x1
            down_list.append(nn.AdaptiveAvgPool2d((1, 1)))
        elif stride == 1:
            # when stride is 1 no average pooling is used
            pass
        else:
            down_list.append(nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=stride))

        down_list.append(nn.BatchNorm2d(in_planes))
        down_list.append(nn.ReLU(inplace=True))
        down_list.append(nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False))

        self.down_scale = nn.Sequential(*down_list)
        self.up_scale = UpscaleOnline(inter_mode)

        if stride != 1:
            self.process = nn.Sequential(
                nn.BatchNorm2d(branch_planes),
                nn.ReLU(inplace=True),
                nn.Conv2d(branch_planes, branch_planes, kernel_size=3, padding=1, bias=False),
            )

    def forward(self, x):
        """
        All branches of the DAPPM but the first one receive the output of the previous branch as a second input
        :param x: in branch 0 - the original input of the DAPPM. in other branches - a list containing the original
        input and the output of the previous branch.
        """

        if isinstance(x, list):
            output_of_prev_branch = x[1]
            x = x[0]
        else:
            output_of_prev_branch = None

        in_width = x.shape[-1]
        in_height = x.shape[-2]
        out = self.down_scale(x)
        out = self.up_scale(out, output_height=in_height, output_width=in_width)

        if output_of_prev_branch is not None:
            out = self.process(out + output_of_prev_branch)

        return out

__init__(kernel_size, stride, in_planes, branch_planes, inter_mode='bilinear')

A DAPPM branch

Parameters:

Name Type Description Default
kernel_size int

the kernel size for the average pooling when stride=0 this parameter is omitted and AdaptiveAvgPool2d over all the input is performed

required
stride int

stride for the average pooling when stride=0: an AdaptiveAvgPool2d over all the input is performed (output is 1x1) when stride=1: no average pooling is performed when stride>1: average pooling is performed (scaling the input down and up again)

required
in_planes int

number of input channels

required
branch_planes int

width after the first convolution

required
inter_mode str

interpolation mode for upscaling

'bilinear'
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(self, kernel_size: int, stride: int, in_planes: int, branch_planes: int, inter_mode: str = "bilinear"):
    """
    A DAPPM branch
    :param kernel_size: the kernel size for the average pooling
            when stride=0 this parameter is omitted and AdaptiveAvgPool2d over all the input is performed
    :param stride: stride for the average pooling
            when stride=0: an AdaptiveAvgPool2d over all the input is performed (output is 1x1)
            when stride=1: no average pooling is performed
            when stride>1: average pooling is performed (scaling the input down and up again)
    :param in_planes: number of input channels
    :param branch_planes: width after the first convolution
    :param inter_mode: interpolation mode for upscaling
    """

    super().__init__()
    down_list = []
    if stride == 0:
        # when stride is 0 average pool all the input to 1x1
        down_list.append(nn.AdaptiveAvgPool2d((1, 1)))
    elif stride == 1:
        # when stride is 1 no average pooling is used
        pass
    else:
        down_list.append(nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=stride))

    down_list.append(nn.BatchNorm2d(in_planes))
    down_list.append(nn.ReLU(inplace=True))
    down_list.append(nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False))

    self.down_scale = nn.Sequential(*down_list)
    self.up_scale = UpscaleOnline(inter_mode)

    if stride != 1:
        self.process = nn.Sequential(
            nn.BatchNorm2d(branch_planes),
            nn.ReLU(inplace=True),
            nn.Conv2d(branch_planes, branch_planes, kernel_size=3, padding=1, bias=False),
        )
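
The following sketch shows the down-scale / up-scale pattern a stride>1 branch applies, written with plain torch modules rather than the library's ConvBNReLU/UpscaleOnline helpers; the widths, kernel size and input shape are assumptions for illustration.

import torch
import torch.nn as nn
import torch.nn.functional as F

in_planes, branch_planes = 128, 96  # assumed widths
kernel_size, stride = 5, 2          # a stride>1 branch: pool down, then project to branch_planes

x = torch.rand(1, in_planes, 32, 32)

# Down-scale: average pooling (note padding=stride, as in DAPPMBranch), then BN + ReLU + 1x1 conv
down_scale = nn.Sequential(
    nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=stride),
    nn.BatchNorm2d(in_planes),
    nn.ReLU(inplace=True),
    nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False),
)
out = down_scale(x)
print(out.shape)  # torch.Size([1, 96, 16, 16]) - spatially reduced, branch_planes channels

# Up-scale back online to the original spatial size, as UpscaleOnline does
out = F.interpolate(out, size=x.shape[-2:], mode="bilinear")
print(out.shape)  # torch.Size([1, 96, 32, 32])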

forward(x)

All branches of the DAPPM but the first one receive the output of the previous branch as a second input

Parameters:

Name Type Description Default
x

In branch 0: the original input of the DAPPM. In other branches: a list containing the original input and the output of the previous branch.

required
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def forward(self, x):
    """
    All branches of the DAPPM but the first one receive the output of the previous branch as a second input
    :param x: in branch 0 - the original input of the DAPPM. in other branches - a list containing the original
    input and the output of the previous branch.
    """

    if isinstance(x, list):
        output_of_prev_branch = x[1]
        x = x[0]
    else:
        output_of_prev_branch = None

    in_width = x.shape[-1]
    in_height = x.shape[-2]
    out = self.down_scale(x)
    out = self.up_scale(out, output_height=in_height, output_width=in_width)

    if output_of_prev_branch is not None:
        out = self.process(out + output_of_prev_branch)

    return out

DDRBackBoneBase

Bases: nn.Module, SupportsReplaceInputChannels, ABC

A base class defining functions that must be supported by DDRBackBones

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DDRBackBoneBase(nn.Module, SupportsReplaceInputChannels, ABC):
    """A base class defining functions that must be supported by DDRBackBones"""

    def validate_backbone_attributes(self):
        expected_attributes = ["stem", "layer1", "layer2", "layer3", "layer4", "input_channels"]
        for attribute in expected_attributes:
            assert hasattr(self, attribute), f"Invalid backbone - attribute '{attribute}' is missing"

    def get_backbone_output_number_of_channels(self):
        """Return a dictionary of the shapes of each output of the backbone to determine the in_channels of the
        skip and compress layers"""
        output_shapes = {}
        x = torch.randn(1, self.input_channels, 320, 320)
        x = self.stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        output_shapes["layer2"] = x.shape[1]
        for layer in self.layer3:
            x = layer(x)
        output_shapes["layer3"] = x.shape[1]
        x = self.layer4(x)
        output_shapes["layer4"] = x.shape[1]
        return output_shapes

get_backbone_output_number_of_channels()

Return a dictionary with the number of output channels of each backbone stage, used to determine the in_channels of the skip and compress layers

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def get_backbone_output_number_of_channels(self):
    """Return a dictionary of the shapes of each output of the backbone to determine the in_channels of the
    skip and compress layers"""
    output_shapes = {}
    x = torch.randn(1, self.input_channels, 320, 320)
    x = self.stem(x)
    x = self.layer1(x)
    x = self.layer2(x)
    output_shapes["layer2"] = x.shape[1]
    for layer in self.layer3:
        x = layer(x)
    output_shapes["layer3"] = x.shape[1]
    x = self.layer4(x)
    output_shapes["layer4"] = x.shape[1]
    return output_shapes
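
The probing trick in isolation: run a dummy tensor through the stages and record the channel count after each one. The toy layers below are hypothetical and only demonstrate the mechanism.

import torch
import torch.nn as nn

# Hypothetical toy backbone stages, just to show the channel probing
stem = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1)
layer1 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
layer2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
layer3 = nn.ModuleList([nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)])
layer4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

output_shapes = {}
x = torch.randn(1, 3, 320, 320)  # dummy probe input, same size as in the method above
x = stem(x)
x = layer1(x)
x = layer2(x)
output_shapes["layer2"] = x.shape[1]
for layer in layer3:
    x = layer(x)
output_shapes["layer3"] = x.shape[1]
x = layer4(x)
output_shapes["layer4"] = x.shape[1]

print(output_shapes)  # {'layer2': 128, 'layer3': 256, 'layer4': 512}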

DDRNet

Bases: SegmentationModule, ExportableSegmentationModel

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DDRNet(SegmentationModule, ExportableSegmentationModel):
    def __init__(
        self,
        backbone: DDRBackBoneBase.__class__,
        additional_layers: list,
        upscale_module: nn.Module,
        num_classes: int,
        highres_planes: int,
        spp_width: int,
        head_width: int,
        use_aux_heads: bool = False,
        ssp_inter_mode: str = "bilinear",
        segmentation_inter_mode: str = "bilinear",
        skip_block: nn.Module.__class__ = None,
        layer5_block: nn.Module.__class__ = Bottleneck,
        layer5_bottleneck_expansion: int = 2,
        classification_mode=False,
        spp_kernel_sizes: list = [1, 5, 9, 17, 0],
        spp_strides: list = [1, 2, 4, 8, 0],
        layer3_repeats: int = 1,
    ):
        """

        :param backbone: the low resolution branch of DDR, expected to have specific attributes in the class
        :param additional_layers: list of num blocks for the highres stage and layer5
        :param upscale_module: upscale to use in the backbone (DAPPM and Segmentation head are using bilinear interpolation)
        :param num_classes: number of classes
        :param highres_planes: number of channels in the high resolution net
        :param use_aux_heads: add a second segmentation head (fed from after compress3 + upscale). this head can be used
        during training (see paper https://arxiv.org/pdf/2101.06085.pdf for details)
        :param ssp_inter_mode: the interpolation used in the SPP block
        :param segmentation_inter_mode: the interpolation used in the segmentation head
        :param skip_block: allows specifying a different block (from 'block') for the skip layer
        :param layer5_block: type of block to use in layer5 and layer5_skip
        :param layer5_bottleneck_expansion: determines the expansion rate for Bottleneck block
        :param spp_kernel_sizes: list of kernel sizes for the spp module pooling
        :param spp_strides: list of strides for the spp module pooling
        :param layer3_repeats: number of times to repeat the 3rd stage of ddr model, including the paths interchange
         modules.
        """

        super().__init__(use_aux_heads=use_aux_heads)
        self.use_aux_heads = use_aux_heads
        self.upscale = upscale_module
        self.ssp_inter_mode = ssp_inter_mode
        self.segmentation_inter_mode = segmentation_inter_mode
        self.relu = nn.ReLU(inplace=False)
        self.classification_mode = classification_mode
        self.layer3_repeats = layer3_repeats
        self.num_classes = num_classes

        assert not (use_aux_heads and classification_mode), "auxiliary head cannot be used in classification mode"

        assert isinstance(backbone, DDRBackBoneBase), "The backbone must inherit from AbstractDDRBackBone"
        self._backbone: DDRBackBoneBase = backbone
        self._backbone.validate_backbone_attributes()
        out_chan_backbone = self._backbone.get_backbone_output_number_of_channels()

        # Repeat the layer3 stage (and its fusion modules) layer3_repeats times
        self.compression3, self.down3, self.layer3_skip = nn.ModuleList(), nn.ModuleList(), nn.ModuleList()
        for i in range(layer3_repeats):
            self.compression3.append(ConvBN(in_channels=out_chan_backbone["layer3"], out_channels=highres_planes, kernel_size=1, bias=False))
            self.down3.append(ConvBN(in_channels=highres_planes, out_channels=out_chan_backbone["layer3"], kernel_size=3, stride=2, padding=1, bias=False))
            self.layer3_skip.append(
                _make_layer(
                    in_planes=out_chan_backbone["layer2"] if i == 0 else highres_planes,
                    planes=highres_planes,
                    block=skip_block,
                    num_blocks=additional_layers[1],
                )
            )

        self.compression4 = ConvBN(in_channels=out_chan_backbone["layer4"], out_channels=highres_planes, kernel_size=1, bias=False)

        self.down4 = nn.Sequential(
            ConvBN(in_channels=highres_planes, out_channels=highres_planes * 2, kernel_size=3, stride=2, padding=1, bias=False, add_relu=True),
            ConvBN(in_channels=highres_planes * 2, out_channels=out_chan_backbone["layer4"], kernel_size=3, stride=2, padding=1, bias=False),
        )
        self.layer4_skip = _make_layer(block=skip_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[2])
        self.layer5_skip = _make_layer(
            block=layer5_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[3], expansion=layer5_bottleneck_expansion
        )

        # when training the backbones on Imagenet:
        #  - layer 5 has stride 1
        #  - a new high_to_low_fusion is added with two 3x3 convs with stride 2 (and double the width)
        #  - a classification head is placed instead of the segmentation head
        if self.classification_mode:
            self.layer5 = _make_layer(
                block=layer5_block,
                in_planes=out_chan_backbone["layer4"],
                planes=out_chan_backbone["layer4"],
                num_blocks=additional_layers[0],
                expansion=layer5_bottleneck_expansion,
            )

            highres_planes_out = highres_planes * layer5_bottleneck_expansion
            self.high_to_low_fusion = nn.Sequential(
                ConvBN(in_channels=highres_planes_out, out_channels=highres_planes_out * 2, kernel_size=3, stride=2, padding=1, add_relu=True),
                ConvBN(
                    in_channels=highres_planes_out * 2,
                    out_channels=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    add_relu=True,
                ),
            )

            self.average_pool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(in_features=out_chan_backbone["layer4"] * layer5_bottleneck_expansion, out_features=num_classes)

        else:
            self.layer5 = _make_layer(
                block=layer5_block,
                in_planes=out_chan_backbone["layer4"],
                planes=out_chan_backbone["layer4"],
                num_blocks=additional_layers[0],
                stride=2,
                expansion=layer5_bottleneck_expansion,
            )

            self.spp = DAPPM(
                in_planes=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
                branch_planes=spp_width,
                out_planes=highres_planes * layer5_bottleneck_expansion,
                inter_mode=self.ssp_inter_mode,
                kernel_sizes=spp_kernel_sizes,
                strides=spp_strides,
            )

            self.final_layer = SegmentHead(highres_planes * layer5_bottleneck_expansion, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

            if self.use_aux_heads:
                self.seghead_extra = SegmentHead(highres_planes, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

        self.highres_planes = highres_planes
        self.layer5_bottleneck_expansion = layer5_bottleneck_expansion
        self.head_width = head_width
        self.init_params()

    @property
    def backbone(self):
        """
        Create a fake backbone module to load backbone pre-trained weights.
        """
        return nn.Sequential(
            OrderedDict(
                [
                    ("_backbone", self._backbone),
                    ("compression3", self.compression3),
                    ("compression4", self.compression4),
                    ("down3", self.down3),
                    ("down4", self.down4),
                    ("layer3_skip", self.layer3_skip),
                    ("layer4_skip", self.layer4_skip),
                    ("layer4_skip", self.layer4_skip),
                    ("layer5_skip", self.layer5_skip),
                ]
            )
        )

    def forward(self, x):
        width_output = x.shape[-1] // 8
        height_output = x.shape[-2] // 8

        x = self._backbone.stem(x)
        x = self._backbone.layer1(x)
        x = self._backbone.layer2(self.relu(x))

        # Repeat layer 3
        x_skip = x
        for i in range(self.layer3_repeats):
            out_layer3 = self._backbone.layer3[i](self.relu(x))
            out_layer3_skip = self.layer3_skip[i](self.relu(x_skip))

            x = out_layer3 + self.down3[i](self.relu(out_layer3_skip))
            x_skip = out_layer3_skip + self.upscale(self.compression3[i](self.relu(out_layer3)), height_output, width_output)

        # save for auxiliary head
        if self.use_aux_heads:
            temp = x_skip

        out_layer4 = self._backbone.layer4(self.relu(x))
        out_layer4_skip = self.layer4_skip(self.relu(x_skip))

        x = out_layer4 + self.down4(self.relu(out_layer4_skip))
        x_skip = out_layer4_skip + self.upscale(self.compression4(self.relu(out_layer4)), height_output, width_output)

        out_layer5_skip = self.layer5_skip(self.relu(x_skip))

        if self.classification_mode:
            x_skip = self.high_to_low_fusion(self.relu(out_layer5_skip))
            x = self.layer5(self.relu(x))
            x = self.average_pool(x + x_skip)
            x = self.fc(x.squeeze())
            return x
        else:
            x = self.upscale(self.spp(self.layer5(self.relu(x))), height_output, width_output)

            x = self.final_layer(x + out_layer5_skip)

            if self.use_aux_heads:
                x_extra = self.seghead_extra(temp)
                return x, x_extra
            else:
                return x

    def replace_head(self, new_num_classes=None, new_head=None, new_aux_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_aux_head is not None:
            self.seghead_extra = new_aux_head
        if new_head is not None:
            self.final_layer = new_head
            self.num_classes = None
        else:
            self.final_layer = SegmentHead(
                self.highres_planes * self.layer5_bottleneck_expansion, self.head_width, new_num_classes, 8, inter_mode=self.segmentation_inter_mode
            )
            if self.use_aux_heads:
                self.seghead_extra = SegmentHead(self.highres_planes, self.head_width, new_num_classes, 8, inter_mode=self.segmentation_inter_mode)
            self.num_classes = new_num_classes

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        lr_dict = {"final_layer": lr, "default": 0}
        if self.use_aux_heads:
            lr_dict["seghead_extra"] = lr
        return lr_dict

    def _remove_auxiliary_heads(self):
        if hasattr(self, "seghead_extra"):
            del self.seghead_extra

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
            - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]
        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate backbone params from the rest.
        :return: iterators of groups named_parameters.
        """
        backbone_names = [n for n, p in self.backbone.named_parameters()]
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if name in backbone_names:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self._backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self._backbone.get_input_channels()

backbone property

Create a fake backbone module to load backbone pre-trained weights.

__init__(backbone, additional_layers, upscale_module, num_classes, highres_planes, spp_width, head_width, use_aux_heads=False, ssp_inter_mode='bilinear', segmentation_inter_mode='bilinear', skip_block=None, layer5_block=Bottleneck, layer5_bottleneck_expansion=2, classification_mode=False, spp_kernel_sizes=[1, 5, 9, 17, 0], spp_strides=[1, 2, 4, 8, 0], layer3_repeats=1)

Parameters:

Name Type Description Default
backbone DDRBackBoneBase.__class__

the low resolution branch of DDR, expected to have specific attributes in the class

required
additional_layers list

list of num blocks for the highres stage and layer5

required
upscale_module nn.Module

upscale to use in the backbone (DAPPM and Segmentation head are using bilinear interpolation)

required
num_classes int

number of classes

required
highres_planes int

number of channels in the high resolution net

required
use_aux_heads bool

add a second segmentation head (fed from after compress3 + upscale). This head can be used during training (see the paper https://arxiv.org/pdf/2101.06085.pdf for details)

False
ssp_inter_mode str

the interpolation used in the SPP block

'bilinear'
segmentation_inter_mode str

the interpolation used in the segmentation head

'bilinear'
skip_block nn.Module.__class__

allows specifying a different block (from 'block') for the skip layer

None
layer5_block nn.Module.__class__

type of block to use in layer5 and layer5_skip

Bottleneck
layer5_bottleneck_expansion int

determines the expansion rate for Bottleneck block

2
spp_kernel_sizes list

list of kernel sizes for the spp module pooling

[1, 5, 9, 17, 0]
spp_strides list

list of strides for the spp module pooling

[1, 2, 4, 8, 0]
layer3_repeats int

number of times to repeat the 3rd stage of ddr model, including the paths interchange modules.

1
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(
    self,
    backbone: DDRBackBoneBase.__class__,
    additional_layers: list,
    upscale_module: nn.Module,
    num_classes: int,
    highres_planes: int,
    spp_width: int,
    head_width: int,
    use_aux_heads: bool = False,
    ssp_inter_mode: str = "bilinear",
    segmentation_inter_mode: str = "bilinear",
    skip_block: nn.Module.__class__ = None,
    layer5_block: nn.Module.__class__ = Bottleneck,
    layer5_bottleneck_expansion: int = 2,
    classification_mode=False,
    spp_kernel_sizes: list = [1, 5, 9, 17, 0],
    spp_strides: list = [1, 2, 4, 8, 0],
    layer3_repeats: int = 1,
):
    """

    :param backbone: the low resolution branch of DDR, expected to have specific attributes in the class
    :param additional_layers: list of num blocks for the highres stage and layer5
    :param upscale_module: upscale to use in the backbone (DAPPM and Segmentation head are using bilinear interpolation)
    :param num_classes: number of classes
    :param highres_planes: number of channels in the high resolution net
    :param use_aux_heads: add a second segmentation head (fed from after compress3 + upscale). this head can be used
    during training (see paper https://arxiv.org/pdf/2101.06085.pdf for details)
    :param ssp_inter_mode: the interpolation used in the SPP block
    :param segmentation_inter_mode: the interpolation used in the segmentation head
    :param skip_block: allows specifying a different block (from 'block') for the skip layer
    :param layer5_block: type of block to use in layer5 and layer5_skip
    :param layer5_bottleneck_expansion: determines the expansion rate for Bottleneck block
    :param spp_kernel_sizes: list of kernel sizes for the spp module pooling
    :param spp_strides: list of strides for the spp module pooling
    :param layer3_repeats: number of times to repeat the 3rd stage of ddr model, including the paths interchange
     modules.
    """

    super().__init__(use_aux_heads=use_aux_heads)
    self.use_aux_heads = use_aux_heads
    self.upscale = upscale_module
    self.ssp_inter_mode = ssp_inter_mode
    self.segmentation_inter_mode = segmentation_inter_mode
    self.relu = nn.ReLU(inplace=False)
    self.classification_mode = classification_mode
    self.layer3_repeats = layer3_repeats
    self.num_classes = num_classes

    assert not (use_aux_heads and classification_mode), "auxiliary head cannot be used in classification mode"

    assert isinstance(backbone, DDRBackBoneBase), "The backbone must inherit from AbstractDDRBackBone"
    self._backbone: DDRBackBoneBase = backbone
    self._backbone.validate_backbone_attributes()
    out_chan_backbone = self._backbone.get_backbone_output_number_of_channels()

    # Repeat the layer3 stage (and its fusion modules) layer3_repeats times
    self.compression3, self.down3, self.layer3_skip = nn.ModuleList(), nn.ModuleList(), nn.ModuleList()
    for i in range(layer3_repeats):
        self.compression3.append(ConvBN(in_channels=out_chan_backbone["layer3"], out_channels=highres_planes, kernel_size=1, bias=False))
        self.down3.append(ConvBN(in_channels=highres_planes, out_channels=out_chan_backbone["layer3"], kernel_size=3, stride=2, padding=1, bias=False))
        self.layer3_skip.append(
            _make_layer(
                in_planes=out_chan_backbone["layer2"] if i == 0 else highres_planes,
                planes=highres_planes,
                block=skip_block,
                num_blocks=additional_layers[1],
            )
        )

    self.compression4 = ConvBN(in_channels=out_chan_backbone["layer4"], out_channels=highres_planes, kernel_size=1, bias=False)

    self.down4 = nn.Sequential(
        ConvBN(in_channels=highres_planes, out_channels=highres_planes * 2, kernel_size=3, stride=2, padding=1, bias=False, add_relu=True),
        ConvBN(in_channels=highres_planes * 2, out_channels=out_chan_backbone["layer4"], kernel_size=3, stride=2, padding=1, bias=False),
    )
    self.layer4_skip = _make_layer(block=skip_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[2])
    self.layer5_skip = _make_layer(
        block=layer5_block, in_planes=highres_planes, planes=highres_planes, num_blocks=additional_layers[3], expansion=layer5_bottleneck_expansion
    )

    # when training the backbones on Imagenet:
    #  - layer 5 has stride 1
    #  - a new high_to_low_fusion is added with two 3x3 convs with stride 2 (and double the width)
    #  - a classification head is placed instead of the segmentation head
    if self.classification_mode:
        self.layer5 = _make_layer(
            block=layer5_block,
            in_planes=out_chan_backbone["layer4"],
            planes=out_chan_backbone["layer4"],
            num_blocks=additional_layers[0],
            expansion=layer5_bottleneck_expansion,
        )

        highres_planes_out = highres_planes * layer5_bottleneck_expansion
        self.high_to_low_fusion = nn.Sequential(
            ConvBN(in_channels=highres_planes_out, out_channels=highres_planes_out * 2, kernel_size=3, stride=2, padding=1, add_relu=True),
            ConvBN(
                in_channels=highres_planes_out * 2,
                out_channels=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
                kernel_size=3,
                stride=2,
                padding=1,
                add_relu=True,
            ),
        )

        self.average_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(in_features=out_chan_backbone["layer4"] * layer5_bottleneck_expansion, out_features=num_classes)

    else:
        self.layer5 = _make_layer(
            block=layer5_block,
            in_planes=out_chan_backbone["layer4"],
            planes=out_chan_backbone["layer4"],
            num_blocks=additional_layers[0],
            stride=2,
            expansion=layer5_bottleneck_expansion,
        )

        self.spp = DAPPM(
            in_planes=out_chan_backbone["layer4"] * layer5_bottleneck_expansion,
            branch_planes=spp_width,
            out_planes=highres_planes * layer5_bottleneck_expansion,
            inter_mode=self.ssp_inter_mode,
            kernel_sizes=spp_kernel_sizes,
            strides=spp_strides,
        )

        self.final_layer = SegmentHead(highres_planes * layer5_bottleneck_expansion, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

        if self.use_aux_heads:
            self.seghead_extra = SegmentHead(highres_planes, head_width, num_classes, 8, inter_mode=self.segmentation_inter_mode)

    self.highres_planes = highres_planes
    self.layer5_bottleneck_expansion = layer5_bottleneck_expansion
    self.head_width = head_width
    self.init_params()

initialize_param_groups(lr, training_params)

Custom param groups for training: - Different lr for backbone and the rest, if multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
        - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]
    return param_groups
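
A small sketch of the resulting learning rates for an assumed base lr and multiply_head_lr value (the numbers are illustrative only):

# Illustrative numbers: how multiply_head_lr splits the learning rates between the two groups
lr = 0.01
multiply_head_lr = 10  # assumed value passed via training_params

param_groups = [
    {"name": "no_multiply_params", "lr": lr},                     # backbone parameters
    {"name": "multiply_lr_params", "lr": lr * multiply_head_lr},  # everything outside the backbone
]
for group in param_groups:
    print(group["name"], group["lr"])
# no_multiply_params 0.01
# multiply_lr_params 0.1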

DDRNetCustom

Bases: DDRNet

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class DDRNetCustom(DDRNet):
    def __init__(self, arch_params: HpmStruct):
        """Parse arch_params and translate the parameters to build the original DDRNet architecture"""
        if get_param(arch_params, "aux_heads") is not None:
            message = "arch_params.aux_heads is deprecated in 3.1.1 and will be removed in 3.2.0."
            if get_param(arch_params, "use_aux_heads") is not None:
                message += "\n using arch_params.use_aux_heads instead."

            else:
                message += "\n use arch_params.use_aux_heads instead."
            warnings.warn(message, DeprecationWarning)
            use_aux_heads = get_param(arch_params, "aux_heads")
        else:
            use_aux_heads = get_param(arch_params, "use_aux_heads")
        super().__init__(
            backbone=arch_params.backbone,
            additional_layers=arch_params.additional_layers,
            upscale_module=arch_params.upscale_module,
            num_classes=arch_params.num_classes,
            highres_planes=arch_params.highres_planes,
            spp_width=arch_params.spp_planes,
            head_width=arch_params.head_planes,
            use_aux_heads=use_aux_heads,
            ssp_inter_mode=arch_params.ssp_inter_mode,
            segmentation_inter_mode=arch_params.segmentation_inter_mode,
            skip_block=arch_params.skip_block,
            layer5_block=arch_params.layer5_block,
            layer5_bottleneck_expansion=arch_params.layer5_bottleneck_expansion,
            classification_mode=arch_params.classification_mode,
            spp_kernel_sizes=arch_params.spp_kernel_sizes,
            spp_strides=arch_params.spp_strides,
            layer3_repeats=arch_params.layer3_repeats,
        )

__init__(arch_params)

Parse arch_params and translate the parameters to build the original DDRNet architecture

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(self, arch_params: HpmStruct):
    """Parse arch_params and translate the parameters to build the original DDRNet architecture"""
    if get_param(arch_params, "aux_heads") is not None:
        message = "arch_params.aux_heads is deprecated in 3.1.1 and will be removed in 3.2.0."
        if get_param(arch_params, "use_aux_heads") is not None:
            message += "\n using arch_params.use_aux_heads instead."

        else:
            message += "\n use arch_params.use_aux_heads instead."
        warnings.warn(message, DeprecationWarning)
        use_aux_heads = get_param(arch_params, "aux_heads")
    else:
        use_aux_heads = get_param(arch_params, "use_aux_heads")
    super().__init__(
        backbone=arch_params.backbone,
        additional_layers=arch_params.additional_layers,
        upscale_module=arch_params.upscale_module,
        num_classes=arch_params.num_classes,
        highres_planes=arch_params.highres_planes,
        spp_width=arch_params.spp_planes,
        head_width=arch_params.head_planes,
        use_aux_heads=use_aux_heads,
        ssp_inter_mode=arch_params.ssp_inter_mode,
        segmentation_inter_mode=arch_params.segmentation_inter_mode,
        skip_block=arch_params.skip_block,
        layer5_block=arch_params.layer5_block,
        layer5_bottleneck_expansion=arch_params.layer5_bottleneck_expansion,
        classification_mode=arch_params.classification_mode,
        spp_kernel_sizes=arch_params.spp_kernel_sizes,
        spp_strides=arch_params.spp_strides,
        layer3_repeats=arch_params.layer3_repeats,
    )

RegnetDDRBackBone

Bases: DDRBackBoneBase

Translation of Regnet to fit DDR model

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class RegnetDDRBackBone(DDRBackBoneBase):
    """
    Translation of Regnet to fit DDR model
    """

    def __init__(self, regnet_module: nn.Module.__class__):
        super().__init__()
        self.input_channels = regnet_module.net.stem.conv.in_channels
        self.stem = regnet_module.net.stem
        self.layer1 = regnet_module.net.stage_0
        self.layer2 = regnet_module.net.stage_1
        self.layer3 = nn.ModuleList([regnet_module.net.stage_2])
        self.layer4 = regnet_module.net.stage_3

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        if isinstance(self.stem, SupportsReplaceInputChannels):
            self.stem.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        else:
            raise NotImplementedError(f"`{self.stem.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        if isinstance(self.stem, SupportsReplaceInputChannels):
            return self.stem.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.stem.__class__.__name__}` does not support `replace_input_channels`")

SegmentHead

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class SegmentHead(nn.Module):
    def __init__(self, in_planes: int, inter_planes: int, out_planes: int, scale_factor: int, inter_mode: str = "bilinear"):
        """
        Last stage of the segmentation network.
        Reduces the number of output planes (usually to num_classes) while increasing the size by scale_factor
        :param in_planes: width of input
        :param inter_planes: width of internal conv. must be a multiple of scale_factor^2 when inter_mode=pixel_shuffle
        :param out_planes: output width
        :param scale_factor: scaling factor
        :param inter_mode: one of nearest, linear, bilinear, bicubic, trilinear, area or pixel_shuffle.
        when set to pixel_shuffle, an nn.PixelShuffle will be used for scaling
        """
        super().__init__()

        if inter_mode == "pixel_shuffle":
            assert inter_planes % (scale_factor**2) == 0, "when using pixel_shuffle, inter_planes must be a multiple of scale_factor^2"

        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inter_planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inter_planes)
        self.relu = nn.ReLU(inplace=True)

        if inter_mode == "pixel_shuffle":
            self.conv2 = nn.Conv2d(inter_planes, inter_planes, kernel_size=1, padding=0, bias=True)
            self.upscale = nn.PixelShuffle(scale_factor)
        else:
            self.conv2 = nn.Conv2d(inter_planes, out_planes, kernel_size=1, padding=0, bias=True)
            self.upscale = nn.Upsample(scale_factor=scale_factor, mode=inter_mode)

        self.scale_factor = scale_factor

    def forward(self, x):
        x = self.conv1(self.relu(self.bn1(x)))
        out = self.conv2(self.relu(self.bn2(x)))
        out = self.upscale(out)

        return out

__init__(in_planes, inter_planes, out_planes, scale_factor, inter_mode='bilinear')

Last stage of the segmentation network. Reduces the number of output planes (usually to num_classes) while increasing the size by scale_factor

Parameters:

Name Type Description Default
in_planes int

width of input

required
inter_planes int

width of internal conv. must be a multiple of scale_factor^2 when inter_mode=pixel_shuffle

required
out_planes int

output width

required
scale_factor int

scaling factor

required
inter_mode str

one of nearest, linear, bilinear, bicubic, trilinear, area or pixel_shuffle. when set to pixel_shuffle, an nn.PixelShuffle will be used for scaling

'bilinear'
Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
def __init__(self, in_planes: int, inter_planes: int, out_planes: int, scale_factor: int, inter_mode: str = "bilinear"):
    """
    Last stage of the segmentation network.
    Reduces the number of output planes (usually to num_classes) while increasing the size by scale_factor
    :param in_planes: width of input
    :param inter_planes: width of internal conv. must be a multiple of scale_factor^2 when inter_mode=pixel_shuffle
    :param out_planes: output width
    :param scale_factor: scaling factor
    :param inter_mode: one of nearest, linear, bilinear, bicubic, trilinear, area or pixel_shuffle.
    when set to pixel_shuffle, an nn.PixelShuffle will be used for scaling
    """
    super().__init__()

    if inter_mode == "pixel_shuffle":
        assert inter_planes % (scale_factor**2) == 0, "when using pixel_shuffle, inter_planes must be a multiple of scale_factor^2"

    self.bn1 = nn.BatchNorm2d(in_planes)
    self.conv1 = nn.Conv2d(in_planes, inter_planes, kernel_size=3, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(inter_planes)
    self.relu = nn.ReLU(inplace=True)

    if inter_mode == "pixel_shuffle":
        self.conv2 = nn.Conv2d(inter_planes, inter_planes, kernel_size=1, padding=0, bias=True)
        self.upscale = nn.PixelShuffle(scale_factor)
    else:
        self.conv2 = nn.Conv2d(inter_planes, out_planes, kernel_size=1, padding=0, bias=True)
        self.upscale = nn.Upsample(scale_factor=scale_factor, mode=inter_mode)

    self.scale_factor = scale_factor

UpscaleOnline

Bases: nn.Module

In some cases, the required scale/size for the upsampling is known only when the input is received. This class supports such cases; only the interpolation mode is set in advance.

Source code in src/super_gradients/training/models/segmentation_models/ddrnet.py
class UpscaleOnline(nn.Module):
    """
    In some cases the required scale/size for the scaling is known only when the input is received.
    This class supports such cases; only the interpolation mode is set in advance.
    """

    def __init__(self, mode="bilinear"):
        super().__init__()
        self.mode = mode

    def forward(self, x, output_height: int, output_width: int):
        return F.interpolate(x, size=[output_height, output_width], mode=self.mode)
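
A short usage sketch (editorial addition, import path assumed from the source path above): the target size is passed per call rather than fixed at construction time.

import torch
from super_gradients.training.models.segmentation_models.ddrnet import UpscaleOnline

up = UpscaleOnline(mode="bilinear")
x = torch.randn(1, 32, 12, 20)
y = up(x, output_height=96, output_width=160)  # output size decided at call time
print(y.shape)  # torch.Size([1, 32, 96, 160])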

DDRNet39Backbone

Bases: DDRNet39

A somewhat frankenstein version of the DDRNet39 model that tries to be a feature extractor module.

Source code in src/super_gradients/training/models/segmentation_models/ddrnet_backbones.py
@register_detection_module()
class DDRNet39Backbone(DDRNet39):
    """
    A somewhat frankenstein version of the DDRNet39 model that tries to be a feature extractor module.
    """

    def __init__(self, arch_params: HpmStruct):
        super().__init__(arch_params)

        # Delete everything that is not needed for feature extraction
        del self.final_layer
        if self.use_aux_heads:
            self.use_aux_heads = False
            del self.aux_head

        if self.classification_mode:
            del self.fc
            del self.average_pool
            del self.high_to_low_fusion
            del self.layer5

        self._out_channels = (self.highres_planes * self.layer5_bottleneck_expansion,)

    def forward(self, x):
        width_output = x.shape[-1] // 8
        height_output = x.shape[-2] // 8

        x = self._backbone.stem(x)
        x = self._backbone.layer1(x)
        x = self._backbone.layer2(self.relu(x))

        # Repeat layer 3
        x_skip = x
        for i in range(self.layer3_repeats):
            out_layer3 = self._backbone.layer3[i](self.relu(x))
            out_layer3_skip = self.layer3_skip[i](self.relu(x_skip))

            x = out_layer3 + self.down3[i](self.relu(out_layer3_skip))
            x_skip = out_layer3_skip + self.upscale(self.compression3[i](self.relu(out_layer3)), height_output, width_output)

        out_layer4 = self._backbone.layer4(self.relu(x))
        out_layer4_skip = self.layer4_skip(self.relu(x_skip))

        x = out_layer4 + self.down4(self.relu(out_layer4_skip))
        x_skip = out_layer4_skip + self.upscale(self.compression4(self.relu(out_layer4)), height_output, width_output)

        out_layer5_skip = self.layer5_skip(self.relu(x_skip))

        x = self.upscale(self.spp(self.layer5(self.relu(x))), height_output, width_output)

        return x + out_layer5_skip

    @property
    def out_channels(self) -> Tuple[int]:
        return self._out_channels

LadderBottleneck

Bases: nn.Module

ResNet Bottleneck

Source code in src/super_gradients/training/models/segmentation_models/laddernet.py
class LadderBottleneck(nn.Module):
    """ResNet Bottleneck"""

    # pylint: disable=unused-argument
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1, norm_layer=None):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False)
        self.bn2 = norm_layer(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm_layer(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def _sum_each(self, x, y):
        assert len(x) == len(y)
        z = []
        for i in range(len(x)):
            z.append(x[i] + y[i])
        return z

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

LadderResNet

Bases: nn.Module

Dilated pre-trained ResNet model, which produces stride-8 feature maps at conv5.

Parameters

block : Block
    Class for the residual block. Options are BasicBlockV1, BottleneckV1.
layers : list of int
    Numbers of layers in each block.
classes : int, default 1000
    Number of classification classes.
dilated : bool, default False
    Applying dilation strategy to pretrained ResNet yielding a stride-8 model, typically used in Semantic Segmentation.
norm_layer : object
    Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`; for Synchronized Cross-GPU BatchNormalization).

Reference:

- He, Kaiming, et al. "Deep residual learning for image recognition."
    Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.

- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
Source code in src/super_gradients/training/models/segmentation_models/laddernet.py
class LadderResNet(nn.Module):
    """Dilated Pre-trained ResNet Model, which preduces the stride of 8 featuremaps at conv5.

    Parameters
    ----------
    block : Block
        Class for the residual block. Options are BasicBlockV1, BottleneckV1.
    layers : list of int
        Numbers of layers in each block
    classes : int, default 1000
        Number of classification classes.
    dilated : bool, default False
        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
        typically used in Semantic Segmentation.
    norm_layer : object
        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
        for Synchronized Cross-GPU BatchNormalization).

    Reference:

        - He, Kaiming, et al. "Deep residual learning for image recognition."
            Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.

        - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """

    # pylint: disable=unused-variable
    # def __init__(self, block, layers, num_classes=1000, dilated=False, norm_layer=SyncBatchNorm): # FIXME - ORIGINAL CODE
    def __init__(self, block, layers, num_classes=1000, dilated=False, norm_layer=nn.BatchNorm2d):  # FIXME - TIME MEASUREMENT CODE
        self.inplanes = 64
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
        if dilated:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
        else:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2, norm_layer=norm_layer)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=2, norm_layer=norm_layer)
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            import math

            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, norm_layer):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                norm_layer(planes * block.expansion),
            )

        layers = []
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, dilation=1, downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
        elif dilation == 4:
            layers.append(block(self.inplanes, planes, stride, dilation=2, downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))

        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation, norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
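
A hedged sanity-check sketch (editorial addition; the import path and layer layout are assumptions): running the stem and stages by hand shows that dilated=True keeps a stride of 8 at the last stage, as the docstring states.

import torch
from super_gradients.training.models.segmentation_models.laddernet import LadderBottleneck, LadderResNet

# Hypothetical ResNet-50-style layout.
net = LadderResNet(LadderBottleneck, layers=[3, 4, 6, 3], dilated=True)
net.eval()
with torch.no_grad():
    x = torch.randn(1, 3, 224, 224)
    x = net.maxpool(net.relu(net.bn1(net.conv1(x))))       # stem: stride 4
    x = net.layer4(net.layer3(net.layer2(net.layer1(x))))  # dilated stages keep stride 8
print(x.shape)  # torch.Size([1, 2048, 28, 28]) -- 224 / 8 == 28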

conv3x3(in_planes, out_planes, stride=1)

3x3 convolution with padding

Source code in src/super_gradients/training/models/segmentation_models/laddernet.py
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True)

PPLiteSegBase

Bases: SegmentationModule

The PP_LiteSeg implementation based on PaddlePaddle. The original article refers to "Juncai Peng, Yi Liu, Shiyu Tang, Yuying Hao, Lutao Chu, Guowei Chen, Zewu Wu, Zeyu Chen, Zhiliang Yu, Yuning Du, Qingqing Dang,Baohua Lai, Qiwen Liu, Xiaoguang Hu, Dianhai Yu, Yanjun Ma. PP-LiteSeg: A Superior Real-Time Semantic Segmentation Model. https://arxiv.org/abs/2204.02681".

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class PPLiteSegBase(SegmentationModule):
    """
    The PP_LiteSeg implementation based on PaddlePaddle.
    The original article refers to "Juncai Peng, Yi Liu, Shiyu Tang, Yuying Hao, Lutao Chu,
    Guowei Chen, Zewu Wu, Zeyu Chen, Zhiliang Yu, Yuning Du, Qingqing Dang,Baohua Lai,
    Qiwen Liu, Xiaoguang Hu, Dianhai Yu, Yanjun Ma. PP-LiteSeg: A Superior Real-Time Semantic
    Segmentation Model. https://arxiv.org/abs/2204.02681".
    """

    def __init__(
        self,
        num_classes,
        backbone: AbstractSTDCBackbone,
        projection_channels_list: List[int],
        sppm_inter_channels: int,
        sppm_out_channels: int,
        sppm_pool_sizes: List[int],
        sppm_upsample_mode: Union[UpsampleMode, str],
        align_corners: bool,
        decoder_up_factors: List[int],
        decoder_channels: List[int],
        decoder_upsample_mode: Union[UpsampleMode, str],
        head_scale_factor: int,
        head_upsample_mode: Union[UpsampleMode, str],
        head_mid_channels: int,
        dropout: float,
        use_aux_heads: bool,
        aux_hidden_channels: List[int],
        aux_scale_factors: List[int],
    ):
        """
        :param backbone: Backbone nn.Module should implement the abstract class `AbstractSTDCBackbone`.
        :param projection_channels_list: channels list to project encoder features before fusing with the decoder
            stream.
        :param sppm_inter_channels: num channels in each sppm pooling branch.
        :param sppm_out_channels: The number of output channels after sppm module.
        :param sppm_pool_sizes: spatial output sizes of the pooled feature maps.
        :param sppm_upsample_mode: Upsample mode to original size after pooling.
        :param decoder_up_factors: list upsample factor per decoder stage.
        :param decoder_channels: list of num_channels per decoder stage.
        :param decoder_upsample_mode: upsample mode in decoder stages, see UpsampleMode for valid options.
        :param head_scale_factor: scale factor for the final segmentation head logits.
        :param head_upsample_mode: upsample mode to final prediction sizes, see UpsampleMode for valid options.
        :param head_mid_channels: num of hidden channels in segmentation head.
        :param use_aux_heads: set True when training, output extra Auxiliary feature maps from the encoder module.
        :param aux_hidden_channels: List of hidden channels in auxiliary segmentation heads.
        :param aux_scale_factors: list of upsample factors for final auxiliary heads logits.
        """
        super().__init__(use_aux_heads=use_aux_heads)

        # Init Encoder
        backbone_out_channels = backbone.get_backbone_output_number_of_channels()
        assert len(backbone_out_channels) == len(projection_channels_list), (
            f"The length of backbone outputs ({backbone_out_channels}) should match the length of projection channels" f"({len(projection_channels_list)})."
        )
        context = SPPM(
            in_channels=backbone_out_channels[-1],
            inter_channels=sppm_inter_channels,
            out_channels=sppm_out_channels,
            pool_sizes=sppm_pool_sizes,
            upsample_mode=sppm_upsample_mode,
            align_corners=align_corners,
        )
        self.encoder = PPLiteSegEncoder(backbone=backbone, context_module=context, projection_channels_list=projection_channels_list)
        encoder_channels = self.encoder.get_output_number_of_channels()

        # Init Decoder
        self.decoder = PPLiteSegDecoder(
            encoder_channels=encoder_channels,
            up_factors=decoder_up_factors,
            out_channels=decoder_channels,
            upsample_mode=decoder_upsample_mode,
            align_corners=align_corners,
        )

        # Init Segmentation classification heads
        self.seg_head = nn.Sequential(
            SegmentationHead(in_channels=decoder_channels[-1], mid_channels=head_mid_channels, num_classes=num_classes, dropout=dropout),
            make_upsample_module(scale_factor=head_scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
        )
        # Auxiliary heads
        if self.use_aux_heads:
            encoder_out_channels = projection_channels_list
            self.aux_heads = nn.ModuleList(
                [
                    nn.Sequential(
                        SegmentationHead(backbone_ch, hidden_ch, num_classes, dropout=dropout),
                        make_upsample_module(scale_factor=scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
                    )
                    for backbone_ch, hidden_ch, scale_factor in zip(encoder_out_channels, aux_hidden_channels, aux_scale_factors)
                ]
            )
        self.init_params()
        self.num_classes = num_classes

    def _remove_auxiliary_heads(self):
        if hasattr(self, "aux_heads"):
            del self.aux_heads

    @property
    def backbone(self) -> nn.Module:
        """
        Support SG load backbone when training.
        """
        return self.encoder.backbone

    def forward(self, x):
        feats = self.encoder(x)
        if self.use_aux_heads:
            enc_feats = feats[:-1]
        x = self.decoder(feats)
        x = self.seg_head(x)
        if not self.use_aux_heads:
            return x
        aux_feats = [aux_head(feat) for feat, aux_head in zip(enc_feats, self.aux_heads)]
        return tuple([x] + aux_feats)

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
            - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]
        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate backbone params from the rest.
        :return: iterators of groups named_parameters.
        """
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if "encoder.backbone" in name:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def prep_model_for_conversion(self, input_size: Union[tuple, list], stride_ratio: int = 32, **kwargs):
        if not torch_version_is_greater_or_equal(1, 11):
            raise RuntimeError("PPLiteSeg model ONNX export requires torch => 1.11, torch installed: " + str(torch.__version__))
        super().prep_model_for_conversion(input_size, **kwargs)
        if isinstance(self.encoder.context_module, SPPM):
            self.encoder.context_module.prep_model_for_conversion(input_size=input_size, stride_ratio=stride_ratio)

    def replace_head(self, new_num_classes: int, **kwargs):
        for module in self.modules():
            if isinstance(module, SegmentationHead):
                module.replace_num_classes(new_num_classes)
        self.num_classes = new_num_classes

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        lr_dict = {"seg_head": lr, "default": 0}
        if self.use_aux_heads:
            lr_dict["aux_heads"] = lr
        return lr_dict

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.encoder.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.encoder.get_input_channels()
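
A minimal end-to-end sketch (editorial addition): the pre-registered PP-LiteSeg model name (Models.PP_LITE_T_SEG75) and the default arch params are assumptions, not taken from this page.

import torch
from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.PP_LITE_T_SEG75, num_classes=19)  # hypothetical variant and class count
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 1024))
# A single logits tensor is returned when use_aux_heads is False;
# a tuple (main_logits, *aux_logits) is returned when the model was built with use_aux_heads=True.
print(out.shape if isinstance(out, torch.Tensor) else [o.shape for o in out])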

backbone: nn.Module property

Support SG load backbone when training.

__init__(num_classes, backbone, projection_channels_list, sppm_inter_channels, sppm_out_channels, sppm_pool_sizes, sppm_upsample_mode, align_corners, decoder_up_factors, decoder_channels, decoder_upsample_mode, head_scale_factor, head_upsample_mode, head_mid_channels, dropout, use_aux_heads, aux_hidden_channels, aux_scale_factors)

Parameters:

Name Type Description Default
backbone AbstractSTDCBackbone

Backbone nn.Module should implement the abstract class AbstractSTDCBackbone.

required
projection_channels_list List[int]

channels list to project encoder features before fusing with the decoder stream.

required
sppm_inter_channels int

num channels in each sppm pooling branch.

required
sppm_out_channels int

The number of output channels after sppm module.

required
sppm_pool_sizes List[int]

spatial output sizes of the pooled feature maps.

required
sppm_upsample_mode Union[UpsampleMode, str]

Upsample mode to original size after pooling.

required
decoder_up_factors List[int]

list upsample factor per decoder stage.

required
decoder_channels List[int]

list of num_channels per decoder stage.

required
decoder_upsample_mode Union[UpsampleMode, str]

upsample mode in decoder stages, see UpsampleMode for valid options.

required
head_scale_factor int

scale factor for the final segmentation head logits.

required
head_upsample_mode Union[UpsampleMode, str]

upsample mode to final prediction sizes, see UpsampleMode for valid options.

required
head_mid_channels int

num of hidden channels in segmentation head.

required
use_aux_heads bool

set True when training, output extra Auxiliary feature maps from the encoder module.

required
aux_hidden_channels List[int]

List of hidden channels in auxiliary segmentation heads.

required
aux_scale_factors List[int]

list of upsample factors for final auxiliary heads logits.

required
Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def __init__(
    self,
    num_classes,
    backbone: AbstractSTDCBackbone,
    projection_channels_list: List[int],
    sppm_inter_channels: int,
    sppm_out_channels: int,
    sppm_pool_sizes: List[int],
    sppm_upsample_mode: Union[UpsampleMode, str],
    align_corners: bool,
    decoder_up_factors: List[int],
    decoder_channels: List[int],
    decoder_upsample_mode: Union[UpsampleMode, str],
    head_scale_factor: int,
    head_upsample_mode: Union[UpsampleMode, str],
    head_mid_channels: int,
    dropout: float,
    use_aux_heads: bool,
    aux_hidden_channels: List[int],
    aux_scale_factors: List[int],
):
    """
    :param backbone: Backbone nn.Module should implement the abstract class `AbstractSTDCBackbone`.
    :param projection_channels_list: channels list to project encoder features before fusing with the decoder
        stream.
    :param sppm_inter_channels: num channels in each sppm pooling branch.
    :param sppm_out_channels: The number of output channels after sppm module.
    :param sppm_pool_sizes: spatial output sizes of the pooled feature maps.
    :param sppm_upsample_mode: Upsample mode to original size after pooling.
    :param decoder_up_factors: list upsample factor per decoder stage.
    :param decoder_channels: list of num_channels per decoder stage.
    :param decoder_upsample_mode: upsample mode in decoder stages, see UpsampleMode for valid options.
    :param head_scale_factor: scale factor for the final segmentation head logits.
    :param head_upsample_mode: upsample mode to final prediction sizes, see UpsampleMode for valid options.
    :param head_mid_channels: num of hidden channels in segmentation head.
    :param use_aux_heads: set True when training, output extra Auxiliary feature maps from the encoder module.
    :param aux_hidden_channels: List of hidden channels in auxiliary segmentation heads.
    :param aux_scale_factors: list of upsample factors for final auxiliary heads logits.
    """
    super().__init__(use_aux_heads=use_aux_heads)

    # Init Encoder
    backbone_out_channels = backbone.get_backbone_output_number_of_channels()
    assert len(backbone_out_channels) == len(projection_channels_list), (
        f"The length of backbone outputs ({backbone_out_channels}) should match the length of projection channels" f"({len(projection_channels_list)})."
    )
    context = SPPM(
        in_channels=backbone_out_channels[-1],
        inter_channels=sppm_inter_channels,
        out_channels=sppm_out_channels,
        pool_sizes=sppm_pool_sizes,
        upsample_mode=sppm_upsample_mode,
        align_corners=align_corners,
    )
    self.encoder = PPLiteSegEncoder(backbone=backbone, context_module=context, projection_channels_list=projection_channels_list)
    encoder_channels = self.encoder.get_output_number_of_channels()

    # Init Decoder
    self.decoder = PPLiteSegDecoder(
        encoder_channels=encoder_channels,
        up_factors=decoder_up_factors,
        out_channels=decoder_channels,
        upsample_mode=decoder_upsample_mode,
        align_corners=align_corners,
    )

    # Init Segmentation classification heads
    self.seg_head = nn.Sequential(
        SegmentationHead(in_channels=decoder_channels[-1], mid_channels=head_mid_channels, num_classes=num_classes, dropout=dropout),
        make_upsample_module(scale_factor=head_scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
    )
    # Auxiliary heads
    if self.use_aux_heads:
        encoder_out_channels = projection_channels_list
        self.aux_heads = nn.ModuleList(
            [
                nn.Sequential(
                    SegmentationHead(backbone_ch, hidden_ch, num_classes, dropout=dropout),
                    make_upsample_module(scale_factor=scale_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
                )
                for backbone_ch, hidden_ch, scale_factor in zip(encoder_out_channels, aux_hidden_channels, aux_scale_factors)
            ]
        )
    self.init_params()
    self.num_classes = num_classes

initialize_param_groups(lr, training_params)

Custom param groups for training: a different lr for the backbone and the rest, if the multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
        - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]
    return param_groups

PPLiteSegDecoder

Bases: nn.Module

PPLiteSegDecoder using UAFM blocks to fuse feature maps.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class PPLiteSegDecoder(nn.Module):
    """
    PPLiteSegDecoder using UAFM blocks to fuse feature maps.
    """

    def __init__(self, encoder_channels: List[int], up_factors: List[int], out_channels: List[int], upsample_mode, align_corners: bool):
        super().__init__()
        # Make a copy of channels list, to prevent out of scope changes.
        encoder_channels = encoder_channels.copy()
        encoder_channels.reverse()
        in_channels = encoder_channels.pop(0)

        # TODO - assert argument length
        self.up_stages = nn.ModuleList()
        for skip_ch, up_factor, out_ch in zip(encoder_channels, up_factors, out_channels):
            self.up_stages.append(
                UAFM(
                    in_channels=in_channels,
                    skip_channels=skip_ch,
                    out_channels=out_ch,
                    up_factor=up_factor,
                    upsample_mode=upsample_mode,
                    align_corners=align_corners,
                )
            )
            in_channels = out_ch

    def forward(self, feats: List[torch.Tensor]):
        feats.reverse()
        x = feats.pop(0)
        for up_stage, skip in zip(self.up_stages, feats):
            x = up_stage(x, skip)
        return x

PPLiteSegEncoder

Bases: nn.Module, SupportsReplaceInputChannels

Encoder for PPLiteSeg, which includes a backbone followed by a context module.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class PPLiteSegEncoder(nn.Module, SupportsReplaceInputChannels):
    """
    Encoder for PPLiteSeg, include backbone followed by a context module.
    """

    def __init__(self, backbone: AbstractSTDCBackbone, projection_channels_list: List[int], context_module: nn.Module):
        super().__init__()
        self.backbone = backbone
        self.context_module = context_module
        feats_channels = backbone.get_backbone_output_number_of_channels()
        self.proj_convs = nn.ModuleList(
            [ConvBNReLU(feat_ch, proj_ch, kernel_size=3, padding=1, bias=False) for feat_ch, proj_ch in zip(feats_channels, projection_channels_list)]
        )
        self.projection_channels_list = projection_channels_list

    def get_output_number_of_channels(self) -> List[int]:
        channels_list = self.projection_channels_list
        if hasattr(self.context_module, "out_channels"):
            channels_list.append(self.context_module.out_channels)
        return channels_list

    def forward(self, x):
        feats = self.backbone(x)
        y = self.context_module(feats[-1])
        feats = [conv(f) for conv, f in zip(self.proj_convs, feats)]
        return feats + [y]

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        if isinstance(self.backbone, SupportsReplaceInputChannels):
            return self.backbone.get_input_channels()
        else:
            raise NotImplementedError(f"`{self.backbone.__class__.__name__}` does not support `get_input_channels`")

UAFM

Bases: nn.Module

Unified Attention Fusion Module, which uses mean and max values across the spatial dimensions.

Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
class UAFM(nn.Module):
    """
    Unified Attention Fusion Module, which uses mean and max values across the spatial dimensions.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
        align_corners: bool = False,
    ):
        """
        :param in_channels: num_channels of input feature map.
        :param skip_channels: num_channels of skip connection feature map.
        :param out_channels: num out channels after features fusion.
        :param up_factor: upsample scale factor of the input feature map.
        :param upsample_mode: see UpsampleMode for valid options.
        """
        super().__init__()
        self.conv_atten = nn.Sequential(
            ConvBNReLU(4, 2, kernel_size=3, padding=1, bias=False), ConvBNReLU(2, 1, kernel_size=3, padding=1, bias=False, use_activation=False)
        )

        self.proj_skip = nn.Identity() if skip_channels == in_channels else ConvBNReLU(skip_channels, in_channels, kernel_size=3, padding=1, bias=False)
        self.up_x = nn.Identity() if up_factor == 1 else make_upsample_module(scale_factor=up_factor, upsample_mode=upsample_mode, align_corners=align_corners)
        self.conv_out = ConvBNReLU(in_channels, out_channels, kernel_size=3, padding=1, bias=False)

    def forward(self, x, skip):
        """
        :param x: input feature map to upsample before fusion.
        :param skip: skip connection feature map.
        """
        x = self.up_x(x)
        skip = self.proj_skip(skip)

        atten = torch.cat([*self._avg_max_spatial_reduce(x, use_concat=False), *self._avg_max_spatial_reduce(skip, use_concat=False)], dim=1)
        atten = self.conv_atten(atten)
        atten = torch.sigmoid(atten)

        out = x * atten + skip * (1 - atten)
        out = self.conv_out(out)
        return out

    @staticmethod
    def _avg_max_spatial_reduce(x, use_concat: bool = False):
        reduced = [torch.mean(x, dim=1, keepdim=True), torch.max(x, dim=1, keepdim=True)[0]]
        if use_concat:
            reduced = torch.cat(reduced, dim=1)
        return reduced
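
A plain-torch sketch (editorial addition, not the library module) of the gating idea: channel-wise mean and max maps from each branch form a 4-channel attention input, and a learned sigmoid gate blends the two branches.

import torch

x = torch.randn(2, 64, 32, 32)     # upsampled input branch
skip = torch.randn(2, 64, 32, 32)  # skip branch, projected to the same width

def avg_max_reduce(t):
    # mean and max over the channel dimension -> two single-channel spatial maps
    return [t.mean(dim=1, keepdim=True), t.max(dim=1, keepdim=True)[0]]

atten_in = torch.cat(avg_max_reduce(x) + avg_max_reduce(skip), dim=1)
print(atten_in.shape)  # torch.Size([2, 4, 32, 32]) -- matches ConvBNReLU(4, 2, ...) above

atten = torch.sigmoid(torch.randn(2, 1, 32, 32))  # stand-in for the conv_atten output
fused = x * atten + skip * (1 - atten)            # convex combination of the two branches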

__init__(in_channels, skip_channels, out_channels, up_factor, upsample_mode=UpsampleMode.BILINEAR, align_corners=False)

Parameters:

Name Type Description Default
in_channels int

num_channels of input feature map.

required
skip_channels int

num_channels of skip connection feature map.

required
out_channels int

num out channels after features fusion.

required
up_factor int

upsample scale factor of the input feature map.

required
upsample_mode Union[UpsampleMode, str]

see UpsampleMode for valid options.

UpsampleMode.BILINEAR
Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def __init__(
    self,
    in_channels: int,
    skip_channels: int,
    out_channels: int,
    up_factor: int,
    upsample_mode: Union[UpsampleMode, str] = UpsampleMode.BILINEAR,
    align_corners: bool = False,
):
    """
    :param in_channels: num_channels of input feature map.
    :param skip_channels: num_channels of skip connection feature map.
    :param out_channels: num out channels after features fusion.
    :param up_factor: upsample scale factor of the input feature map.
    :param upsample_mode: see UpsampleMode for valid options.
    """
    super().__init__()
    self.conv_atten = nn.Sequential(
        ConvBNReLU(4, 2, kernel_size=3, padding=1, bias=False), ConvBNReLU(2, 1, kernel_size=3, padding=1, bias=False, use_activation=False)
    )

    self.proj_skip = nn.Identity() if skip_channels == in_channels else ConvBNReLU(skip_channels, in_channels, kernel_size=3, padding=1, bias=False)
    self.up_x = nn.Identity() if up_factor == 1 else make_upsample_module(scale_factor=up_factor, upsample_mode=upsample_mode, align_corners=align_corners)
    self.conv_out = ConvBNReLU(in_channels, out_channels, kernel_size=3, padding=1, bias=False)

forward(x, skip)

Parameters:

Name Type Description Default
x

input feature map to upsample before fusion.

required
skip

skip connection feature map.

required
Source code in src/super_gradients/training/models/segmentation_models/ppliteseg.py
def forward(self, x, skip):
    """
    :param x: input feature map to upsample before fusion.
    :param skip: skip connection feature map.
    """
    x = self.up_x(x)
    skip = self.proj_skip(skip)

    atten = torch.cat([*self._avg_max_spatial_reduce(x, use_concat=False), *self._avg_max_spatial_reduce(skip, use_concat=False)], dim=1)
    atten = self.conv_atten(atten)
    atten = torch.sigmoid(atten)

    out = x * atten + skip * (1 - atten)
    out = self.conv_out(out)
    return out

Implementation of the paper "Rethink Dilated Convolution for Real-time Semantic Segmentation", https://arxiv.org/pdf/2111.09957.pdf. Based on the original implementation https://github.com/RolandGao/RegSeg, cloned 23/12/2021, commit c07a833.

AdaptiveShortcutBlock

Bases: nn.Module

Adaptive shortcut makes the following adaptations, if needed: it applies pooling if stride > 1, and a 1x1 conv if in/out channels differ or if pooling was applied. If stride is 1 and in/out channels are the same, the shortcut is just an identity.

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class AdaptiveShortcutBlock(nn.Module):
    """
    Adaptive shortcut makes the following adaptations, if needed:
    Applying pooling if stride > 1
    Applying 1x1 conv if in/out channels are different or if pooling was applied
    If stride is 1 and in/out channels are the same, then the shortcut is just an identity
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int):
        super().__init__()
        shortcut_layers = [nn.Identity()]
        if stride != 1:
            shortcut_layers[0] = nn.AvgPool2d(stride, stride, ceil_mode=True)  # override the identity layer
        if in_channels != out_channels or stride != 1:
            shortcut_layers.append(ConvBNReLU(in_channels, out_channels, kernel_size=1, bias=False, use_activation=False))
        self.shortcut = nn.Sequential(*shortcut_layers)

    def forward(self, x):
        return self.shortcut(x)
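
A short usage sketch (editorial addition, import path assumed from the source path above) covering the identity and downsampling cases.

import torch
from super_gradients.training.models.segmentation_models.regseg import AdaptiveShortcutBlock

# Identity case: same channels, stride 1.
print(AdaptiveShortcutBlock(64, 64, stride=1)(torch.randn(1, 64, 32, 32)).shape)   # [1, 64, 32, 32]
# Downsampling case: AvgPool2d followed by a 1x1 projection.
print(AdaptiveShortcutBlock(64, 128, stride=2)(torch.randn(1, 64, 32, 32)).shape)  # [1, 128, 16, 16]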

DBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class DBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, dilations: List[int], group_width: int, stride: int, se_ratio: int = 4):
        """
        :param dilations:           a list specifying the required dilations.
                                    the input will be split into len(dilations) groups,
                                    group [i] will be convolved with grouped dilated (dilations[i]) convolution
        :param group_width:         the group width for the dilated convolution(s)
        :param se_ratio:            the ratio of the squeeze-and-excitation block w.r.t in_channels (as in the paper)
                                    for example: a value of 4 translates to in_channels // 4
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.dilations = dilations
        self.group_width = group_width
        self.stride = stride
        self.se_ratio = se_ratio
        self.shortcut = AdaptiveShortcutBlock(in_channels, out_channels, stride)
        groups = out_channels // group_width

        if len(dilations) == 1:  # minor optimization: no need to split if we only have 1 dilation group
            dilation = dilations[0]
            dilated_conv = nn.Conv2d(out_channels, out_channels, 3, stride=stride, groups=groups, padding=dilation, dilation=dilation, bias=False)
        else:
            dilated_conv = SplitDilatedGroupConvBlock(out_channels, dilations, group_width_per_split=group_width, stride=stride, bias=False)

        self.d_block_path = nn.Sequential(
            ConvBNReLU(in_channels, out_channels, kernel_size=1, bias=False),
            dilated_conv,
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            # the ratio of se block applied to `in_channels` as in the original paper
            SqueezeAndExcitationBlock(out_channels, in_channels // se_ratio),
            ConvBNReLU(out_channels, out_channels, 1, use_activation=False, bias=False),
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x1 = self.shortcut(x)
        x2 = self.d_block_path(x)
        out = self.relu(x1 + x2)
        return out

    def __str__(self):
        return (
            f"{self.__class__.__name__}_in{self.in_channels}_out{self.out_channels}" f"_d{self.dilations}_gw{self.group_width}_s{self.stride}_se{self.se_ratio}"
        )
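
A hedged usage sketch (editorial addition; import path and widths are assumptions): a strided D-block with two dilation groups halves the spatial size, and both the shortcut and the main path are downsampled before the residual sum.

import torch
from super_gradients.training.models.segmentation_models.regseg import DBlock

block = DBlock(in_channels=64, out_channels=128, dilations=[1, 2], group_width=16, stride=2, se_ratio=4)
out = block(torch.randn(1, 64, 64, 64))
print(out.shape)  # torch.Size([1, 128, 32, 32])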

__init__(in_channels, out_channels, dilations, group_width, stride, se_ratio=4)

Parameters:

Name Type Description Default
dilations List[int]

a list specifying the required dilations. the input will be split into len(dilations) groups, group [i] will be convolved with grouped dilated (dilations[i]) convolution

required
group_width int

the group width for the dilated convolution(s)

required
se_ratio int

the ratio of the squeeze-and-excitation block w.r.t in_channels (as in the paper) for example: a value of 4 translates to in_channels // 4

4
Source code in src/super_gradients/training/models/segmentation_models/regseg.py
def __init__(self, in_channels: int, out_channels: int, dilations: List[int], group_width: int, stride: int, se_ratio: int = 4):
    """
    :param dilations:           a list specifying the required dilations.
                                the input will be split into len(dilations) groups,
                                group [i] will be convolved with grouped dilated (dilations[i]) convolution
    :param group_width:         the group width for the dilated convolution(s)
    :param se_ratio:            the ratio of the squeeze-and-excitation block w.r.t in_channels (as in the paper)
                                for example: a value of 4 translates to in_channels // 4
    """
    super().__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.dilations = dilations
    self.group_width = group_width
    self.stride = stride
    self.se_ratio = se_ratio
    self.shortcut = AdaptiveShortcutBlock(in_channels, out_channels, stride)
    groups = out_channels // group_width

    if len(dilations) == 1:  # minor optimization: no need to split if we only have 1 dilation group
        dilation = dilations[0]
        dilated_conv = nn.Conv2d(out_channels, out_channels, 3, stride=stride, groups=groups, padding=dilation, dilation=dilation, bias=False)
    else:
        dilated_conv = SplitDilatedGroupConvBlock(out_channels, dilations, group_width_per_split=group_width, stride=stride, bias=False)

    self.d_block_path = nn.Sequential(
        ConvBNReLU(in_channels, out_channels, kernel_size=1, bias=False),
        dilated_conv,
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
        # the ratio of se block applied to `in_channels` as in the original paper
        SqueezeAndExcitationBlock(out_channels, in_channels // se_ratio),
        ConvBNReLU(out_channels, out_channels, 1, use_activation=False, bias=False),
    )
    self.relu = nn.ReLU(inplace=True)

RegSegDecoder

Bases: nn.Module

This implementation follows the paper. There is no repeating 'pattern' in this decoder, so it is specific to 3 stages.

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class RegSegDecoder(nn.Module):
    """
    This implementation follows the paper. No 'pattern' in this decoder, so it is specific to 3 stages
    """

    def __init__(self, backbone_output_channels: List[int], decoder_config: dict):
        super().__init__()
        projection_out_channels = decoder_config["projection_out_channels"]

        assert len(backbone_output_channels) == len(projection_out_channels) == 3, "This decoder is specific for 3 stages"

        self.projections = nn.ModuleList(
            [ConvBNReLU(in_channels, out_channels, 1, bias=False) for in_channels, out_channels in zip(backbone_output_channels, projection_out_channels)]
        )
        self.upsample = nn.Upsample(scale_factor=2, mode=decoder_config["interpolation"], align_corners=True)
        mid_channels = projection_out_channels[1]
        self.conv_bn_relu = ConvBNReLU(in_channels=mid_channels, out_channels=mid_channels // 2, kernel_size=3, padding=1, bias=False)
        self.out_channels = mid_channels // 2 + projection_out_channels[0]  # original implementation: concat

    def forward(self, x_stages):
        proj2 = self.projections[2](x_stages[2])
        proj2 = self.upsample(proj2)
        proj1 = self.projections[1](x_stages[1])
        proj1 = proj1 + proj2
        proj1 = self.conv_bn_relu(proj1)
        proj1 = self.upsample(proj1)
        proj0 = self.projections[0](x_stages[0])
        proj0 = torch.cat((proj1, proj0), dim=1)
        return proj0

SplitDilatedGroupConvBlock

Bases: nn.Module

Splits the input into "dilation groups", then applies a grouped convolution with a different dilation for each group.

Source code in src/super_gradients/training/models/segmentation_models/regseg.py
class SplitDilatedGroupConvBlock(nn.Module):
    """
    Splits the input to "dilation groups", following grouped convolution with different dilation for each group
    """

    def __init__(self, in_channels: int, split_dilations: List[int], group_width_per_split: int, stride: int, bias: bool):
        """
        :param split_dilations:         a list specifying the required dilations.
                                        the input will be split into len(dilations) groups,
                                        group [i] will be convolved with grouped dilated (dilations[i]) convolution
        :param group_width_per_split:   the group width for the *inner* dilated convolution
        """
        super().__init__()
        self.num_splits = len(split_dilations)
        assert in_channels % self.num_splits == 0, f"Cannot split {in_channels} to {self.num_splits} groups with equal size."
        group_channels = in_channels // self.num_splits
        assert group_channels % group_width_per_split == 0, (
            f"Cannot split {group_channels} channels ({in_channels} / {self.num_splits} splits)" f" to groups with {group_width_per_split} channels per group."
        )
        inner_groups = group_channels // group_width_per_split
        self.convs = nn.ModuleList(
            nn.Conv2d(group_channels, group_channels, 3, padding=d, dilation=d, stride=stride, bias=bias, groups=inner_groups) for d in split_dilations
        )
        self._splits = [in_channels // self.num_splits] * self.num_splits

    def forward(self, x):
        x = torch.split(x, self._splits, dim=1)
        return torch.cat([self.convs[i](x[i]) for i in range(self.num_splits)], dim=1)
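
A short sketch (editorial addition; import path and numbers are assumptions): 64 channels split into two dilation groups of 32 channels each, with an inner group width of 16; padding == dilation keeps the spatial size.

import torch
from super_gradients.training.models.segmentation_models.regseg import SplitDilatedGroupConvBlock

block = SplitDilatedGroupConvBlock(in_channels=64, split_dilations=[1, 2], group_width_per_split=16, stride=1, bias=False)
out = block(torch.randn(1, 64, 40, 40))
print(out.shape)  # torch.Size([1, 64, 40, 40])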

__init__(in_channels, split_dilations, group_width_per_split, stride, bias)

Parameters:

Name Type Description Default
split_dilations List[int]

a list specifying the required dilations. the input will be split into len(dilations) groups, group [i] will be convolved with grouped dilated (dilations[i]) convolution

required
group_width_per_split int

the group width for the inner dilated convolution

required
Source code in src/super_gradients/training/models/segmentation_models/regseg.py
def __init__(self, in_channels: int, split_dilations: List[int], group_width_per_split: int, stride: int, bias: bool):
    """
    :param split_dilations:         a list specifying the required dilations.
                                    the input will be split into len(dilations) groups,
                                    group [i] will be convolved with grouped dilated (dilations[i]) convolution
    :param group_width_per_split:   the group width for the *inner* dilated convolution
    """
    super().__init__()
    self.num_splits = len(split_dilations)
    assert in_channels % self.num_splits == 0, f"Cannot split {in_channels} to {self.num_splits} groups with equal size."
    group_channels = in_channels // self.num_splits
    assert group_channels % group_width_per_split == 0, (
        f"Cannot split {group_channels} channels ({in_channels} / {self.num_splits} splits)" f" to groups with {group_width_per_split} channels per group."
    )
    inner_groups = group_channels // group_width_per_split
    self.convs = nn.ModuleList(
        nn.Conv2d(group_channels, group_channels, 3, padding=d, dilation=d, stride=stride, bias=bias, groups=inner_groups) for d in split_dilations
    )
    self._splits = [in_channels // self.num_splits] * self.num_splits

EfficientSelfAttention

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py
class EfficientSelfAttention(nn.Module):
    def __init__(self, dim: int, head: int, sr_ratio: int):
        """
        Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf)
        :param dim: embedding dimension
        :param head: number of attention heads
        :param sr_ratio: the reduction ratio of the efficient self-attention
        """

        super().__init__()

        self.head = head
        self.sr_ratio = sr_ratio
        self.scale = (dim // head) ** -0.5
        self.q = nn.Linear(dim, dim)
        self.kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)

        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio)
            self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        b, n, c = x.shape
        q = self.q(x).reshape(b, n, self.head, c // self.head).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            x = x.permute(0, 2, 1).reshape(b, c, h, w)
            x = self.sr(x).reshape(b, c, -1).permute(0, 2, 1)
            x = self.norm(x)

        k, v = self.kv(x).reshape(b, -1, 2, self.head, c // self.head).permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        x = (attn @ v).transpose(1, 2).reshape(b, n, c)
        x = self.proj(x)
        return x
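
A shape-level sketch (editorial addition; import path and sizes are assumptions): queries are computed for all h*w tokens, while keys and values come from a sequence reduced by sr_ratio in each spatial dimension.

import torch
from super_gradients.training.models.segmentation_models.segformer import EfficientSelfAttention

attn = EfficientSelfAttention(dim=64, head=2, sr_ratio=4)
x = torch.randn(2, 16 * 16, 64)  # (batch, h*w tokens, dim)
out = attn(x, h=16, w=16)
print(out.shape)  # torch.Size([2, 256, 64])
# Keys/values are derived from only (16 / 4) * (16 / 4) = 16 tokens, which is the
# reduction that keeps the attention cheap at large spatial resolutions.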

__init__(dim, head, sr_ratio)

Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

Name Type Description Default
dim int

embedding dimension

required
head int

number of attention heads

required
sr_ratio int

the reduction ratio of the efficient self-attention

required
Source code in src/super_gradients/training/models/segmentation_models/segformer.py
def __init__(self, dim: int, head: int, sr_ratio: int):
    """
    Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf)
    :param dim: embedding dimension
    :param head: number of attention heads
    :param sr_ratio: the reduction ratio of the efficient self-attention
    """

    super().__init__()

    self.head = head
    self.sr_ratio = sr_ratio
    self.scale = (dim // head) ** -0.5
    self.q = nn.Linear(dim, dim)
    self.kv = nn.Linear(dim, dim * 2)
    self.proj = nn.Linear(dim, dim)

    if sr_ratio > 1:
        self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio)
        self.norm = nn.LayerNorm(dim)

EncoderBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py
class EncoderBlock(nn.Module):
    def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float):
        """
        A single encoder block (https://arxiv.org/pdf/2105.15203.pdf)
        :param dim: embedding dimension
        :param head: number of attention heads
        :param sr_ratio: the reduction ratio of the efficient self-attention
        :param dpr: drop-path ratio
        """

        super().__init__()

        self.attn = EfficientSelfAttention(dim, head, sr_ratio)

        self.drop_path = DropPath(dpr) if dpr > 0.0 else nn.Identity()

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.mlp = MixFFN(in_dim=dim, inter_dim=dim * 4)

    def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        x = x + self.drop_path(self.attn(self.norm1(x), h, w))
        x = x + self.drop_path(self.mlp(self.norm2(x), h, w))

        return x

__init__(dim, head, sr_ratio, dpr)

A single encoder block (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

Name Type Description Default
dim int

embedding dimension

required
head int

number of attention heads

required
sr_ratio int

the reduction ratio of the efficient self-attention

required
dpr float

drop-path ratio

required
Source code in src/super_gradients/training/models/segmentation_models/segformer.py
def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float):
    """
    A single encoder block (https://arxiv.org/pdf/2105.15203.pdf)
    :param dim: embedding dimension
    :param head: number of attention heads
    :param sr_ratio: the reduction ratio of the efficient self-attention
    :param dpr: drop-path ratio
    """

    super().__init__()

    self.attn = EfficientSelfAttention(dim, head, sr_ratio)

    self.drop_path = DropPath(dpr) if dpr > 0.0 else nn.Identity()

    self.norm1 = nn.LayerNorm(dim)
    self.norm2 = nn.LayerNorm(dim)

    self.mlp = MixFFN(in_dim=dim, inter_dim=dim * 4)

MLP

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py
class MLP(nn.Module):
    def __init__(self, dim: int, embed_dim: int):
        """
        A single Linear layer, with shape pre-processing
        :param dim: input dimension
        :param embed_dim: output dimension
        """

        super().__init__()

        self.proj = nn.Linear(dim, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)

        return x

__init__(dim, embed_dim)

A single Linear layer, with shape pre-processing

Parameters:

dim (int): input dimension. Required.
embed_dim (int): output dimension. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 270-279
def __init__(self, dim: int, embed_dim: int):
    """
    A single Linear layer, with shape pre-processing
    :param dim: input dimension
    :param embed_dim: output dimension
    """

    super().__init__()

    self.proj = nn.Linear(dim, embed_dim)
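
The forward pass (shown in the class listing above) flattens a (B, C, H, W) feature map into (B, H*W, C) tokens and projects each token to embed_dim. A torch-only sketch with illustrative sizes:

import torch
import torch.nn as nn

dim, embed_dim = 160, 256
proj = nn.Linear(dim, embed_dim)                 # same layer as self.proj above
feat = torch.randn(2, dim, 16, 16)
tokens = feat.flatten(2).transpose(1, 2)         # (2, 256, 160)
print(proj(tokens).shape)                        # torch.Size([2, 256, 256])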

MiTBackBone

Bases: nn.Module, SupportsReplaceInputChannels

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 163-264
class MiTBackBone(nn.Module, SupportsReplaceInputChannels):
    def __init__(
        self,
        embed_dims: List[int],
        encoder_layers: List[int],
        eff_self_att_reduction_ratio: List[int],
        eff_self_att_heads: List[int],
        overlap_patch_size: List[int],
        overlap_patch_stride: List[int],
        overlap_patch_pad: List[int],
        in_channels: int,
    ):
        """
        Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf)
        :param embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
        :param encoder_layers: the number of encoder layers in each encoder stage
        :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
        :param eff_self_att_heads: number of efficient self-attention heads in each stage
        :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
        :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
        :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
        :param in_channels:  number of input channels
        """

        super().__init__()

        if not (
            len(embed_dims)
            == len(encoder_layers)
            == len(eff_self_att_reduction_ratio)
            == len(eff_self_att_heads)
            == len(overlap_patch_size)
            == len(overlap_patch_stride)
            == len(overlap_patch_pad)
        ):
            raise ValueError("All backbone hyper-parameters should be lists of the same length")

        # Patch embeddings
        self.patch_embed = []
        for stage_num in range(len(embed_dims)):
            self.patch_embed.append(
                PatchEmbedding(
                    in_channels=in_channels if stage_num == 0 else embed_dims[stage_num - 1],
                    out_channels=embed_dims[stage_num],
                    patch_size=overlap_patch_size[stage_num],
                    stride=overlap_patch_stride[stage_num],
                    padding=overlap_patch_pad[stage_num],
                )
            )
            self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num])

        drop_path_rate = 0.1
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))]

        self.blocks = []
        self.norms = []

        layer_idx = 0
        for stage_num in range(len(embed_dims)):
            self.blocks.append(
                nn.ModuleList(
                    [
                        EncoderBlock(
                            dim=embed_dims[stage_num],
                            head=eff_self_att_heads[stage_num],
                            sr_ratio=eff_self_att_reduction_ratio[stage_num],
                            dpr=dpr[layer_idx + i],
                        )
                        for i in range(encoder_layers[stage_num])
                    ]
                )
            )
            self.norms.append(nn.LayerNorm(embed_dims[stage_num]))

            self.add_module(f"block{stage_num + 1}", self.blocks[stage_num])
            self.add_module(f"norm{stage_num + 1}", self.norms[stage_num])

            layer_idx += encoder_layers[stage_num]

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        b_size = x.shape[0]

        features = []
        for stage_num in range(len(self.patch_embed)):
            x, h, w = self.patch_embed[stage_num](x)

            for enc_block in self.blocks[stage_num]:
                x = enc_block(x, h, w)
            x = self.norms[stage_num](x)
            x = x.reshape(b_size, h, w, -1).permute(0, 3, 1, 2)

            features.append(x)

        return features

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        first_patch: PatchEmbedding = self.patch_embed[0]
        first_patch.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        first_patch: PatchEmbedding = self.patch_embed[0]
        return first_patch.get_input_channels()

__init__(embed_dims, encoder_layers, eff_self_att_reduction_ratio, eff_self_att_heads, overlap_patch_size, overlap_patch_stride, overlap_patch_pad, in_channels)

Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

embed_dims (List[int]): the patch embedding dimensions (number of output channels in each encoder stage). Required.
encoder_layers (List[int]): the number of encoder layers in each encoder stage. Required.
eff_self_att_reduction_ratio (List[int]): the reduction ratios of the efficient self-attention in each stage. Required.
eff_self_att_heads (List[int]): number of efficient self-attention heads in each stage. Required.
overlap_patch_size (List[int]): the patch size of the overlapping patch embedding in each stage. Required.
overlap_patch_stride (List[int]): the patch stride of the overlapping patch embedding in each stage. Required.
overlap_patch_pad (List[int]): the patch padding of the overlapping patch embedding in each stage. Required.
in_channels (int): number of input channels. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 164-240
def __init__(
    self,
    embed_dims: List[int],
    encoder_layers: List[int],
    eff_self_att_reduction_ratio: List[int],
    eff_self_att_heads: List[int],
    overlap_patch_size: List[int],
    overlap_patch_stride: List[int],
    overlap_patch_pad: List[int],
    in_channels: int,
):
    """
    Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf)
    :param embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
    :param encoder_layers: the number of encoder layers in each encoder stage
    :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
    :param eff_self_att_heads: number of efficient self-attention heads in each stage
    :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
    :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
    :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
    :param in_channels:  number of input channels
    """

    super().__init__()

    if not (
        len(embed_dims)
        == len(encoder_layers)
        == len(eff_self_att_reduction_ratio)
        == len(eff_self_att_heads)
        == len(overlap_patch_size)
        == len(overlap_patch_stride)
        == len(overlap_patch_pad)
    ):
        raise ValueError("All backbone hyper-parameters should be lists of the same length")

    # Patch embeddings
    self.patch_embed = []
    for stage_num in range(len(embed_dims)):
        self.patch_embed.append(
            PatchEmbedding(
                in_channels=in_channels if stage_num == 0 else embed_dims[stage_num - 1],
                out_channels=embed_dims[stage_num],
                patch_size=overlap_patch_size[stage_num],
                stride=overlap_patch_stride[stage_num],
                padding=overlap_patch_pad[stage_num],
            )
        )
        self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num])

    drop_path_rate = 0.1
    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))]

    self.blocks = []
    self.norms = []

    layer_idx = 0
    for stage_num in range(len(embed_dims)):
        self.blocks.append(
            nn.ModuleList(
                [
                    EncoderBlock(
                        dim=embed_dims[stage_num],
                        head=eff_self_att_heads[stage_num],
                        sr_ratio=eff_self_att_reduction_ratio[stage_num],
                        dpr=dpr[layer_idx + i],
                    )
                    for i in range(encoder_layers[stage_num])
                ]
            )
        )
        self.norms.append(nn.LayerNorm(embed_dims[stage_num]))

        self.add_module(f"block{stage_num + 1}", self.blocks[stage_num])
        self.add_module(f"norm{stage_num + 1}", self.norms[stage_num])

        layer_idx += encoder_layers[stage_num]
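
A construction sketch, assuming the class is importable from the module path shown above; the hyper-parameter values follow the common MiT-B0 configuration and are illustrative only (one entry per encoder stage):

import torch
from super_gradients.training.models.segmentation_models.segformer import MiTBackBone

backbone = MiTBackBone(
    embed_dims=[32, 64, 160, 256],
    encoder_layers=[2, 2, 2, 2],
    eff_self_att_reduction_ratio=[8, 4, 2, 1],
    eff_self_att_heads=[1, 2, 5, 8],
    overlap_patch_size=[7, 3, 3, 3],
    overlap_patch_stride=[4, 2, 2, 2],
    overlap_patch_pad=[3, 1, 1, 1],
    in_channels=3,
)

# Four multi-scale feature maps at strides 4, 8, 16 and 32:
for f in backbone(torch.randn(2, 3, 512, 512)):
    print(tuple(f.shape))
# (2, 32, 128, 128), (2, 64, 64, 64), (2, 160, 32, 32), (2, 256, 16, 16)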

MixFFN

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 108-132
class MixFFN(nn.Module):
    def __init__(self, in_dim: int, inter_dim: int):
        """
        MixFFN block (https://arxiv.org/pdf/2105.15203.pdf)
        :param in_dim: input dimension
        :param inter_dim: intermediate dimension
        """

        super().__init__()

        self.fc1 = nn.Linear(in_dim, inter_dim)
        self.dwconv = nn.Conv2d(in_channels=inter_dim, out_channels=inter_dim, kernel_size=3, stride=1, padding=1, groups=inter_dim)
        self.fc2 = nn.Linear(inter_dim, in_dim)

    def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        x = self.fc1(x)

        b, _, c = x.shape
        x = x.transpose(1, 2).view(b, c, h, w)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)

        x = self.fc2(F.gelu(x))

        return x

__init__(in_dim, inter_dim)

MixFFN block (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

in_dim (int): input dimension. Required.
inter_dim (int): intermediate dimension. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 109-120
def __init__(self, in_dim: int, inter_dim: int):
    """
    MixFFN block (https://arxiv.org/pdf/2105.15203.pdf)
    :param in_dim: input dimension
    :param inter_dim: intermediate dimension
    """

    super().__init__()

    self.fc1 = nn.Linear(in_dim, inter_dim)
    self.dwconv = nn.Conv2d(in_channels=inter_dim, out_channels=inter_dim, kernel_size=3, stride=1, padding=1, groups=inter_dim)
    self.fc2 = nn.Linear(inter_dim, in_dim)
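
The notable detail of Mix-FFN is the depthwise 3x3 convolution between the two linear layers: tokens are reshaped back to a spatial grid so neighbouring positions get mixed, which is what lets SegFormer drop explicit positional encodings. A torch-only sketch of that round trip (illustrative sizes):

import torch
import torch.nn as nn

b, h, w, inter_dim = 1, 32, 32, 256
dwconv = nn.Conv2d(inter_dim, inter_dim, kernel_size=3, padding=1, groups=inter_dim)

tokens = torch.randn(b, h * w, inter_dim)
grid = tokens.transpose(1, 2).view(b, inter_dim, h, w)   # tokens -> (B, C, H, W)
mixed = dwconv(grid).flatten(2).transpose(1, 2)          # back to (B, H*W, C)
print(mixed.shape)                                       # torch.Size([1, 1024, 256])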

PatchEmbedding

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 31-62
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int):
        """
        Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf)
        :param in_channels: number of input channels
        :param out_channels: number of output channels (embedding dimension)
        :param patch_size: patch size (k for size (k, k))
        :param stride: patch stride (k for size (k, k))
        :param padding:  patch padding (k for size (k, k))
        """

        super().__init__()

        self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding)
        self.norm = nn.LayerNorm(out_channels)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
        x = self.proj(x)
        _, _, h, w = x.shape

        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, h, w

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.modules.weight_replacement_utils import replace_conv2d_input_channels

        self.proj = replace_conv2d_input_channels(conv=self.proj, in_channels=in_channels, fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.proj.in_channels

__init__(in_channels, out_channels, patch_size, stride, padding)

Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

in_channels (int): number of input channels. Required.
out_channels (int): number of output channels (embedding dimension). Required.
patch_size (int): patch size (k for size (k, k)). Required.
stride (int): patch stride (k for size (k, k)). Required.
padding (int): patch padding (k for size (k, k)). Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 32-45
def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int):
    """
    Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf)
    :param in_channels: number of input channels
    :param out_channels: number of output channels (embedding dimension)
    :param patch_size: patch size (k for size (k, k))
    :param stride: patch stride (k for size (k, k))
    :param padding:  patch padding (k for size (k, k))
    """

    super().__init__()

    self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding)
    self.norm = nn.LayerNorm(out_channels)
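
Overlapped patch merging is just a strided convolution followed by token flattening. A torch-only sketch with typical first-stage settings (patch_size=7, stride=4, padding=3), used here purely as an illustration:

import torch
import torch.nn as nn

proj = nn.Conv2d(3, 32, kernel_size=7, stride=4, padding=3)   # same layer as self.proj above
x = proj(torch.randn(1, 3, 512, 512))                         # (1, 32, 128, 128)
tokens = x.flatten(2).transpose(1, 2)                         # (1, 16384, 32), then LayerNorm
print(tokens.shape)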

SegFormer

Bases: SegmentationModule

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 325-474
class SegFormer(SegmentationModule):
    def __init__(
        self,
        num_classes: int,
        encoder_embed_dims: List[int],
        encoder_layers: List[int],
        eff_self_att_reduction_ratio: List[int],
        eff_self_att_heads: List[int],
        decoder_embed_dim: int,
        overlap_patch_size: List[int],
        overlap_patch_stride: List[int],
        overlap_patch_pad: List[int],
        in_channels: int = 3,
        sliding_window_crop_size: Tuple[int, int] = (1024, 1024),
        sliding_window_stride: Tuple[int, int] = (768, 768),
    ):
        """
        :param num_classes: number of classes
        :param encoder_embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
        :param encoder_layers: the number of encoder layers in each encoder stage
        :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
        :param eff_self_att_heads: number of efficient self-attention heads in each stage
        :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
        :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
        :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
        :param in_channels:  number of input channels
        :param sliding_window_crop_size:  (height, width) the crop size to take from the image for forward with sliding window
        :param sliding_window_stride:  (height, width) the stride size between crops for forward with sliding window

        """

        super().__init__(use_aux_heads=False)

        self.encoder_embed_dims = encoder_embed_dims

        self.decoder_embed_dim = decoder_embed_dim

        self._backbone = MiTBackBone(
            embed_dims=encoder_embed_dims,
            encoder_layers=encoder_layers,
            eff_self_att_reduction_ratio=eff_self_att_reduction_ratio,
            eff_self_att_heads=eff_self_att_heads,
            overlap_patch_size=overlap_patch_size,
            overlap_patch_stride=overlap_patch_stride,
            overlap_patch_pad=overlap_patch_pad,
            in_channels=in_channels,
        )

        self.decode_head = SegFormerHead(encoder_dims=encoder_embed_dims, embed_dim=decoder_embed_dim, num_classes=num_classes)

        self.init_params()

        self.num_classes = num_classes

        self.use_sliding_window_validation = False
        self.sliding_window_crop_size = tuple(sliding_window_crop_size)
        self.sliding_window_stride = tuple(sliding_window_stride)

    def init_params(self):

        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def enable_sliding_window_validation(self):
        self.use_sliding_window_validation = True

    def disable_sliding_window_validation(self):
        self.use_sliding_window_validation = False

    @property
    def backbone(self):
        return self._backbone

    def _remove_auxiliary_heads(self):
        pass

    def replace_head(self, new_num_classes: int):
        self.decode_head = SegFormerHead(encoder_dims=self.encoder_embed_dims, embed_dim=self.decoder_embed_dim, num_classes=new_num_classes)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {}

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        features = self._backbone(x)
        out = self.decode_head(features)
        out = F.interpolate(out, size=x.shape[2:], mode="bilinear", align_corners=False)
        return out

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.use_sliding_window_validation:
            return forward_with_sliding_window_wrapper(
                forward=self._forward,
                img=x,
                sliding_window_stride=self.sliding_window_stride,
                sliding_window_crop_size=self.sliding_window_crop_size,
                num_classes=self.num_classes,
            )
        else:
            return self._forward(x)

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
        - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]
        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate backbone params from the rest.
        :return: iterators of groups named_parameters.
        """
        backbone_names = [n for n, p in self.backbone.named_parameters()]
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if any([backbone_name in name for backbone_name in backbone_names]):
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self._backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self._backbone.get_input_channels()

__init__(num_classes, encoder_embed_dims, encoder_layers, eff_self_att_reduction_ratio, eff_self_att_heads, decoder_embed_dim, overlap_patch_size, overlap_patch_stride, overlap_patch_pad, in_channels=3, sliding_window_crop_size=(1024, 1024), sliding_window_stride=(768, 768))

Parameters:

num_classes (int): number of classes. Required.
encoder_embed_dims (List[int]): the patch embedding dimensions (number of output channels in each encoder stage). Required.
encoder_layers (List[int]): the number of encoder layers in each encoder stage. Required.
eff_self_att_reduction_ratio (List[int]): the reduction ratios of the efficient self-attention in each stage. Required.
eff_self_att_heads (List[int]): number of efficient self-attention heads in each stage. Required.
overlap_patch_size (List[int]): the patch size of the overlapping patch embedding in each stage. Required.
overlap_patch_stride (List[int]): the patch stride of the overlapping patch embedding in each stage. Required.
overlap_patch_pad (List[int]): the patch padding of the overlapping patch embedding in each stage. Required.
in_channels (int): number of input channels. Default: 3.
sliding_window_crop_size (Tuple[int, int]): (height, width) the crop size to take from the image for forward with sliding window. Default: (1024, 1024).
sliding_window_stride (Tuple[int, int]): (height, width) the stride size between crops for forward with sliding window. Default: (768, 768).
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 326-381
def __init__(
    self,
    num_classes: int,
    encoder_embed_dims: List[int],
    encoder_layers: List[int],
    eff_self_att_reduction_ratio: List[int],
    eff_self_att_heads: List[int],
    decoder_embed_dim: int,
    overlap_patch_size: List[int],
    overlap_patch_stride: List[int],
    overlap_patch_pad: List[int],
    in_channels: int = 3,
    sliding_window_crop_size: Tuple[int, int] = (1024, 1024),
    sliding_window_stride: Tuple[int, int] = (768, 768),
):
    """
    :param num_classes: number of classes
    :param encoder_embed_dims: the patch embedding dimensions (number of output channels in each encoder stage)
    :param encoder_layers: the number of encoder layers in each encoder stage
    :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage
    :param eff_self_att_heads: number of efficient self-attention heads in each stage
    :param overlap_patch_size:  the patch size of the overlapping patch embedding in each stage
    :param overlap_patch_stride:  the patch stride of the overlapping patch embedding in each stage
    :param overlap_patch_pad:  the patch padding of the overlapping patch embedding in each stage
    :param in_channels:  number of input channels
    :param sliding_window_crop_size:  (height, width) the crop size to take from the image for forward with sliding window
    :param sliding_window_stride:  (height, width) the stride size between crops for forward with sliding window

    """

    super().__init__(use_aux_heads=False)

    self.encoder_embed_dims = encoder_embed_dims

    self.decoder_embed_dim = decoder_embed_dim

    self._backbone = MiTBackBone(
        embed_dims=encoder_embed_dims,
        encoder_layers=encoder_layers,
        eff_self_att_reduction_ratio=eff_self_att_reduction_ratio,
        eff_self_att_heads=eff_self_att_heads,
        overlap_patch_size=overlap_patch_size,
        overlap_patch_stride=overlap_patch_stride,
        overlap_patch_pad=overlap_patch_pad,
        in_channels=in_channels,
    )

    self.decode_head = SegFormerHead(encoder_dims=encoder_embed_dims, embed_dim=decoder_embed_dim, num_classes=num_classes)

    self.init_params()

    self.num_classes = num_classes

    self.use_sliding_window_validation = False
    self.sliding_window_crop_size = tuple(sliding_window_crop_size)
    self.sliding_window_stride = tuple(sliding_window_stride)
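
A construction sketch, assuming the class is importable from the module path shown above; the hyper-parameters mirror the B0-style values used elsewhere on this page and are illustrative only:

import torch
from super_gradients.training.models.segmentation_models.segformer import SegFormer

model = SegFormer(
    num_classes=19,
    encoder_embed_dims=[32, 64, 160, 256],
    encoder_layers=[2, 2, 2, 2],
    eff_self_att_reduction_ratio=[8, 4, 2, 1],
    eff_self_att_heads=[1, 2, 5, 8],
    decoder_embed_dim=256,
    overlap_patch_size=[7, 3, 3, 3],
    overlap_patch_stride=[4, 2, 2, 2],
    overlap_patch_pad=[3, 1, 1, 1],
)

logits = model(torch.randn(1, 3, 512, 512))
print(logits.shape)                      # torch.Size([1, 19, 512, 512]), upsampled to the input size

# For large validation images, predictions can instead be stitched from overlapping crops
# of sliding_window_crop_size taken every sliding_window_stride pixels:
model.enable_sliding_window_validation()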

initialize_param_groups(lr, training_params)

Custom param groups for training:
- Different lr for backbone and the rest, if multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 435-446
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
    - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]
    return param_groups
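
Continuing the construction sketch above, the snippet below shows how multiply_head_lr splits the parameters into two groups; the HpmStruct import path is an assumption to verify against your installed version:

from super_gradients.training.utils import HpmStruct

# `model` is the SegFormer instance from the construction sketch above.
training_params = HpmStruct(multiply_head_lr=10.0)
param_groups = model.initialize_param_groups(lr=1e-4, training_params=training_params)
for group in param_groups:
    print(group["name"], group["lr"])
# no_multiply_params 0.0001   (backbone)
# multiply_lr_params 0.001    (decode head, 10x the base lr)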

SegFormerB0

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 536-546
@register_model(Models.SEGFORMER_B0)
class SegFormerB0(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B0 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B0 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 538-546
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B0 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)
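
In practice the B0-B5 variants are usually built through the model factory rather than instantiated directly. A minimal sketch, assuming the standard models.get entry point and the Models name registry:

from super_gradients.common.object_names import Models
from super_gradients.training import models

# Builds SegFormer-B0 from its default arch_params recipe, overriding only the class count.
model = models.get(Models.SEGFORMER_B0, num_classes=19)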

SegFormerB1

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 549-559
@register_model(Models.SEGFORMER_B1)
class SegFormerB1(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B1 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B1 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 551-559
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B1 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB2

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 562-572
@register_model(Models.SEGFORMER_B2)
class SegFormerB2(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B2 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B2 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 564-572
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B2 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB3

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 575-585
@register_model(Models.SEGFORMER_B3)
class SegFormerB3(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B3 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B3 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 577-585
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B3 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB4

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 588-598
@register_model(Models.SEGFORMER_B4)
class SegFormerB4(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B4 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B4 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 590-598
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B4 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerB5

Bases: SegFormerCustom

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 601-611
@register_model(Models.SEGFORMER_B5)
class SegFormerB5(SegFormerCustom):
    def __init__(self, arch_params: HpmStruct):
        """
        SegFormer B5 architecture
        :param arch_params: architecture parameters
        """

        _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS)
        _arch_params.override(**arch_params.to_dict())
        super().__init__(_arch_params)

__init__(arch_params)

SegFormer B5 architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 603-611
def __init__(self, arch_params: HpmStruct):
    """
    SegFormer B5 architecture
    :param arch_params: architecture parameters
    """

    _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS)
    _arch_params.override(**arch_params.to_dict())
    super().__init__(_arch_params)

SegFormerCustom

Bases: SegFormer

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 477-497
class SegFormerCustom(SegFormer):
    def __init__(self, arch_params: HpmStruct):
        """
        Parse arch_params and translate the parameters to build the SegFormer architecture
        :param arch_params: architecture parameters
        """

        super().__init__(
            num_classes=arch_params.num_classes,
            encoder_embed_dims=arch_params.encoder_embed_dims,
            encoder_layers=arch_params.encoder_layers,
            eff_self_att_reduction_ratio=arch_params.eff_self_att_reduction_ratio,
            eff_self_att_heads=arch_params.eff_self_att_heads,
            decoder_embed_dim=arch_params.decoder_embed_dim,
            overlap_patch_size=arch_params.overlap_patch_size,
            overlap_patch_stride=arch_params.overlap_patch_stride,
            overlap_patch_pad=arch_params.overlap_patch_pad,
            in_channels=arch_params.in_channels,
            sliding_window_crop_size=arch_params.sliding_window_crop_size,
            sliding_window_stride=arch_params.sliding_window_stride,
        )

__init__(arch_params)

Parse arch_params and translate the parameters to build the SegFormer architecture

Parameters:

arch_params (HpmStruct): architecture parameters. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 478-497
def __init__(self, arch_params: HpmStruct):
    """
    Parse arch_params and translate the parameters to build the SegFormer architecture
    :param arch_params: architecture parameters
    """

    super().__init__(
        num_classes=arch_params.num_classes,
        encoder_embed_dims=arch_params.encoder_embed_dims,
        encoder_layers=arch_params.encoder_layers,
        eff_self_att_reduction_ratio=arch_params.eff_self_att_reduction_ratio,
        eff_self_att_heads=arch_params.eff_self_att_heads,
        decoder_embed_dim=arch_params.decoder_embed_dim,
        overlap_patch_size=arch_params.overlap_patch_size,
        overlap_patch_stride=arch_params.overlap_patch_stride,
        overlap_patch_pad=arch_params.overlap_patch_pad,
        in_channels=arch_params.in_channels,
        sliding_window_crop_size=arch_params.sliding_window_crop_size,
        sliding_window_stride=arch_params.sliding_window_stride,
    )

SegFormerHead

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 288-321
class SegFormerHead(nn.Module):
    def __init__(self, encoder_dims: List[int], embed_dim: int, num_classes: int):
        """
        SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf)
        :param encoder_dims: list of encoder embedding dimensions
        :param embed_dim: unified embedding dimension
        :param num_classes: number of predicted classes
        """
        super().__init__()

        self.linear_layers = []
        for idx, dim in enumerate(encoder_dims):
            self.linear_layers.append(MLP(dim, embed_dim))
            self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx])

        self.linear_fuse = ConvBNReLU(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim, kernel_size=1, bias=False, inplace=True)
        self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1)

        self.dropout = nn.Dropout2d(0.1)

    def forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        b, _, h, w = features[0].shape

        out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])]

        for i, feature in enumerate(features[1:]):
            out = self.linear_layers[i + 1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:])
            out = F.interpolate(out, size=(h, w), mode="bilinear", align_corners=False)
            out_lst.append(out)

        out = self.linear_fuse(torch.cat(out_lst[::-1], dim=1))
        out = self.linear_pred(self.dropout(out))

        return out

__init__(encoder_dims, embed_dim, num_classes)

SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf)

Parameters:

encoder_dims (List[int]): list of encoder embedding dimensions. Required.
embed_dim (int): unified embedding dimension. Required.
num_classes (int): number of predicted classes. Required.
Source code in src/super_gradients/training/models/segmentation_models/segformer.py, lines 289-306
def __init__(self, encoder_dims: List[int], embed_dim: int, num_classes: int):
    """
    SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf)
    :param encoder_dims: list of encoder embedding dimensions
    :param embed_dim: unified embedding dimension
    :param num_classes: number of predicted classes
    """
    super().__init__()

    self.linear_layers = []
    for idx, dim in enumerate(encoder_dims):
        self.linear_layers.append(MLP(dim, embed_dim))
        self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx])

    self.linear_fuse = ConvBNReLU(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim, kernel_size=1, bias=False, inplace=True)
    self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1)

    self.dropout = nn.Dropout2d(0.1)
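
A shape sketch for the head, assuming the class is importable from the module path shown above; the four input feature maps mimic a B0-style backbone output and are illustrative only:

import torch
from super_gradients.training.models.segmentation_models.segformer import SegFormerHead

head = SegFormerHead(encoder_dims=[32, 64, 160, 256], embed_dim=256, num_classes=19)

# Each map is projected to embed_dim, upsampled to the highest resolution, fused and classified.
feats = [torch.randn(1, c, s, s) for c, s in zip([32, 64, 160, 256], [128, 64, 32, 16])]
print(head(feats).shape)                 # torch.Size([1, 19, 128, 128])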

SegmentationModule

Bases: SgModule, ABC, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel

Base SegmentationModule class

Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 16-147
class SegmentationModule(SgModule, ABC, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel):
    """
    Base SegmentationModule class
    """

    def __init__(self, use_aux_heads: bool):
        super().__init__()
        self._use_aux_heads = use_aux_heads

        # Processing params
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None

    @property
    def use_aux_heads(self):
        return self._use_aux_heads

    @use_aux_heads.setter
    def use_aux_heads(self, use_aux: bool):
        """
        public setter for self._use_aux_heads, called every time an assignment to self.use_aux_heads is applied.
        if use_aux is False, `_remove_auxiliary_heads` is called to delete auxiliary and detail heads.
        if use_aux is True, and self._use_aux_heads was already set to False a ValueError is raised, recreating
            aux and detail heads outside init method is not allowed, and the module should be recreated.
        """
        if use_aux is True and self._use_aux_heads is False:
            raise ValueError(
                "Cant turn use_aux_heads from False to True. Try initiating the module again with"
                " `use_aux_heads=True` or initiating the auxiliary heads modules manually."
            )
        if not use_aux:
            self._remove_auxiliary_heads()
        self._use_aux_heads = use_aux

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        # set to false and delete auxiliary and detail heads modules.
        self.use_aux_heads = False

    @abstractmethod
    def _remove_auxiliary_heads(self):
        raise NotImplementedError()

    @property
    @abstractmethod
    def backbone(self) -> nn.Module:
        """
        For SgTrainer load_backbone compatibility.
        """
        raise NotImplementedError()

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(self, fuse_model: bool = True, fp16: bool = True) -> SegmentationPipeline:
        """Instantiate the segmentation pipeline of this model.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        pipeline = SegmentationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
            fp16=fp16,
        )
        return pipeline

    def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
        """Predict an image or a list of images.
        :param images:  Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:                        If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
        """Predict using webcam.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        pipeline.predict_webcam()

    def get_input_shape_steps(self) -> Tuple[int, int]:
        """
        Returns the minimum input shape size that the model can accept.
        For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
        """
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        """
        Returns the minimum input shape size that the model can accept.
        For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
        """
        return 32, 32

    def get_processing_params(self):
        return self._image_processor

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module

backbone: nn.Module abstractmethod property

For SgTrainer load_backbone compatibility.

get_input_shape_steps()

Returns the minimum input shape size that the model can accept. For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model

Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 127-132
def get_input_shape_steps(self) -> Tuple[int, int]:
    """
    Returns the minimum input shape size that the model can accept.
    For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
    """
    return 32, 32

get_minimum_input_shape_size()

Returns the minimum input shape size that the model can accept. For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model

Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 134-139
def get_minimum_input_shape_size(self) -> Tuple[int, int]:
    """
    Returns the minimum input shape size that the model can accept.
    For segmentation models the default is 32x32, which corresponds to the largest stride in the encoder part of the model
    """
    return 32, 32

predict(images, batch_size=32, fuse_model=True, fp16=True)

Predict an image or a list of images.

Parameters:

images (ImageSource): Images to predict. Required.
batch_size (int): Maximum number of images to process at the same time. Default: 32.
fuse_model (bool): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. Default: True.
fp16 (bool): If True, use mixed precision for inference. Default: True.
Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 109-117
def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
    """Predict an image or a list of images.
    :param images:  Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:                        If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore
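
A minimal usage sketch, assuming the model already carries its class names and image processor (either restored from a trained checkpoint or set via set_dataset_processing_params); the image paths are placeholders:

predictions = model.predict(["/path/to/img_1.jpg", "/path/to/img_2.jpg"], batch_size=8, fp16=False)
predictions.show()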

predict_webcam(fuse_model=True, fp16=True)

Predict using webcam.

Parameters:

fuse_model (bool): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. Default: True.
fp16 (bool): If True, use mixed precision for inference. Default: True.
Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 119-125
def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
    """Predict using webcam.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    pipeline.predict_webcam()

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

class_names (Optional[List[str]]): (Optional) Names of the dataset the model was trained on. Default: None.
image_processor (Optional[Processing]): (Optional) Image processing objects to reproduce the dataset preprocessing used for training. Default: None.
Source code in src/super_gradients/training/models/segmentation_models/segmentation_module.py, lines 76-88
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor

Shelfnet

Paper: https://arxiv.org/abs/1811.11254
Based on: https://github.com/juntang-zhuang/ShelfNet

DecoderHW

Bases: DecoderBase

DecoderHW - The Decoder for the Heavy-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 184-214
class DecoderHW(DecoderBase):
    """
    DecoderHW - The Decoder for the Heavy-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(
                nn.ConvTranspose2d(
                    planes * 2 ** (layers - 1 - i), planes * 2 ** max(0, layers - i - 2), kernel_size=3, stride=2, padding=1, output_padding=1, bias=True
                )
            )
            self.up_dense_list.append(block(planes * 2 ** max(0, layers - i - 2), planes * 2 ** max(0, layers - i - 2)))

    def forward(self, x):
        # BOTTOM BRANCH
        out = self.bottom(x[-1])
        bottom = out

        # UP BRANCH
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out) + x[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

DecoderLW

Bases: DecoderBase

DecoderLW - The Decoder for the Light-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 217-245
class DecoderLW(DecoderBase):
    """
    DecoderLW - The Decoder for the Light-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(AttentionRefinementModule(planes * 2 ** (layers - 1 - i), planes * 2 ** max(0, layers - i - 2)))
            self.up_dense_list.append(ConvBNReLU(in_chan=planes * 2 ** max(0, layers - i - 2), out_chan=planes * 2 ** max(0, layers - i - 2), ks=3, stride=1))

    def forward(self, x):
        # BOTTOM BRANCH
        out = self.bottom(x[-1])
        bottom = out

        # UP BRANCH
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out)
            out_interpolate = F.interpolate(out, (out.size(2) * 2, out.size(3) * 2), mode="nearest")
            out = out_interpolate + x[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

LadderBlockHW

Bases: LadderBlockBase

LadderBlockHW - LadderBlock for the Heavy-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 306-349
class LadderBlockHW(LadderBlockBase):
    """
    LadderBlockHW - LadderBlock for the Heavy-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(
                nn.ConvTranspose2d(
                    planes * 2 ** (layers - i - 1), planes * 2 ** max(0, layers - i - 2), kernel_size=3, stride=2, padding=1, output_padding=1, bias=True
                )
            )

            self.up_dense_list.append(block(planes * 2 ** max(0, layers - i - 2), planes * 2 ** max(0, layers - i - 2)))

    def forward(self, x):
        out = self.inconv(x[-1])

        down_out = []
        # down branch
        for i in range(0, self.layers - 1):
            out = out + x[-i - 1]
            out = self.down_module_list[i](out)
            down_out.append(out)

            out = self.down_conv_list[i](out)
            out = F.relu(out)

        # bottom branch
        out = self.bottom(out)
        bottom = out

        # up branch
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out) + down_out[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

LadderBlockLW

Bases: LadderBlockBase

LadderBlockLW - LadderBlock for the Light-Weight ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py, lines 352-391
class LadderBlockLW(LadderBlockBase):
    """
    LadderBlockLW - LadderBlock for the Light-Weight ShelfNet Architecture
    """

    def __init__(self, planes, layers, block=ShelfBlock, *args, **kwargs):
        super().__init__(planes=planes, layers=layers, block=block, *args, **kwargs)

        for i in range(0, layers - 1):
            self.up_conv_list.append(AttentionRefinementModule(planes * 2 ** (layers - 1 - i), planes * 2 ** max(0, layers - i - 2)))
            self.up_dense_list.append(ConvBNReLU(in_chan=planes * 2 ** max(0, layers - i - 2), out_chan=planes * 2 ** max(0, layers - i - 2), ks=3, stride=1))

    def forward(self, x):
        out = self.inconv(x[-1])

        down_out = []
        # DOWN BRANCH
        for i in range(0, self.layers - 1):
            out = out + x[-i - 1]
            out = self.down_module_list[i](out)
            down_out.append(out)

            out = self.down_conv_list[i](out)
            out = F.relu(out)

        # BOTTOM BRANCH
        out = self.bottom(out)
        bottom = out

        # UP BRANCH
        up_out = []
        up_out.append(bottom)

        for j in range(0, self.layers - 1):
            out = self.up_conv_list[j](out)
            out = F.interpolate(out, (out.size(2) * 2, out.size(3) * 2), mode="nearest") + down_out[self.layers - j - 2]
            out = self.up_dense_list[j](out)
            up_out.append(out)

        return up_out

ShelfBlock

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfBlock(nn.Module):
    def __init__(self, in_planes: int, planes: int, stride: int = 1, dropout: float = 0.25):
        """
        S-Block implementation from the ShelfNet paper
            :param in_planes:   input planes
            :param planes:      output planes
            :param stride:      convolution stride
            :param dropout:     dropout percentage
        """
        super().__init__()
        if in_planes != planes:
            self.conv0 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=1, padding=1, bias=True)
            self.relu0 = nn.ReLU(inplace=True)

        self.in_planes = in_planes
        self.planes = planes

        self.conv1 = nn.Conv2d(self.planes, self.planes, kernel_size=3, stride=stride, padding=1, bias=True)
        self.bn1 = nn.BatchNorm2d(self.planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout2d(p=dropout)
        self.bn2 = nn.BatchNorm2d(self.planes)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        if self.in_planes != self.planes:
            x = self.conv0(x)
            x = self.relu0(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.dropout(out)
        out = self.conv1(out)
        out = self.bn2(out)
        out = out + x

        return self.relu2(out)

__init__(in_planes, planes, stride=1, dropout=0.25)

S-Block implementation from the ShelfNet paper

:param in_planes:   input planes
:param planes:      output planes
:param stride:      convolution stride
:param dropout:     dropout percentage

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def __init__(self, in_planes: int, planes: int, stride: int = 1, dropout: float = 0.25):
    """
    S-Block implementation from the ShelfNet paper
        :param in_planes:   input planes
        :param planes:      output planes
        :param stride:      convolution stride
        :param dropout:     dropout percentage
    """
    super().__init__()
    if in_planes != planes:
        self.conv0 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=1, padding=1, bias=True)
        self.relu0 = nn.ReLU(inplace=True)

    self.in_planes = in_planes
    self.planes = planes

    self.conv1 = nn.Conv2d(self.planes, self.planes, kernel_size=3, stride=stride, padding=1, bias=True)
    self.bn1 = nn.BatchNorm2d(self.planes)
    self.relu1 = nn.ReLU(inplace=True)
    self.dropout = nn.Dropout2d(p=dropout)
    self.bn2 = nn.BatchNorm2d(self.planes)
    self.relu2 = nn.ReLU(inplace=True)
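
A minimal, hedged usage sketch (values and import path are illustrative assumptions, the path follows the source location above): with stride=1 the block keeps the spatial size and maps in_planes to planes through the projection conv0 before the residual path. Note that conv1 is applied twice in forward, i.e. the two convolution steps intentionally share weights.

import torch
from super_gradients.training.models.segmentation_models.shelfnet import ShelfBlock  # path per the source location above (assumption)

block = ShelfBlock(in_planes=64, planes=128, stride=1, dropout=0.25)
x = torch.randn(1, 64, 32, 32)
y = block(x)
print(y.shape)   # torch.Size([1, 128, 32, 32])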

ShelfNetBase

Bases: ShelfNetModuleBase

ShelfNetBase - ShelfNet Base Generic Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetBase(ShelfNetModuleBase):
    """
    ShelfNetBase - ShelfNet Base Generic Architecture
    """

    def __init__(
        self,
        backbone: ShelfResNetBackBone,
        planes: int,
        layers: int,
        num_classes: int = 21,
        image_size: int = 512,
        net_output_mid_channels_num: int = 64,
        arch_params: HpmStruct = None,
    ):
        self.num_classes = arch_params.num_classes if (arch_params and hasattr(arch_params, "num_classes")) else num_classes
        self.image_size = arch_params.image_size if (arch_params and hasattr(arch_params, "image_size")) else image_size

        super().__init__()
        self.net_output_mid_channels_num = net_output_mid_channels_num
        self.backbone = backbone(self.num_classes)
        self.layers = layers
        self.planes = planes

        # INITIALIZE WITH AUXILARY HEAD OUTPUTS ONN -> TURN IT OFF TO RUN A FORWARD PASS WITHOUT THE AUXILARY HEADS
        self.auxilary_head_outputs = True

        # DECODER AND LADDER SHOULD BE IMPLEMENTED BY THE INHERITING CLASS
        self.decoder = None
        self.ladder = None

        # BUILD THE CONV_OUT LIST BASED ON THE AMOUNT OF LAYERS IN THE SHELFNET
        self.conv_out_list = torch.nn.ModuleList()

    def forward(self, x):
        raise NotImplementedError

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        """
        update_optimizer_for_param_groups - Updates the specific parameters with different LR
        """
        # LEARNING RATE FOR THE BACKBONE IS lr
        param_groups[0]["lr"] = lr
        for i in range(1, len(param_groups)):
            # LEARNING RATE FOR OTHER SHELFNET PARAMS IS lr * 10
            param_groups[i]["lr"] = lr * 10

        return param_groups

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()

update_param_groups(param_groups, lr, epoch, iter, training_params, total_batch)

update_optimizer_for_param_groups - Updates the specific parameters with different LR

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
    """
    update_optimizer_for_param_groups - Updates the specific parameters with different LR
    """
    # LEARNING RATE FOR THE BACKBONE IS lr
    param_groups[0]["lr"] = lr
    for i in range(1, len(param_groups)):
        # LEARNING RATE FOR OTHER SHELFNET PARAMS IS lr * 10
        param_groups[i]["lr"] = lr * 10

    return param_groups
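
As a hedged illustration of the learning-rate policy encoded above (group names and values are assumptions for the example, not library defaults): the first parameter group is the backbone and keeps the base lr, while every other ShelfNet group is scheduled at 10x that value.

# Hedged sketch of the 10x LR split applied by update_param_groups.
lr = 0.1
param_groups = [
    {"name": "backbone", "lr": None},
    {"name": "decoder_ladder_heads", "lr": None},
]
param_groups[0]["lr"] = lr              # backbone trains at the base LR
for group in param_groups[1:]:
    group["lr"] = lr * 10               # remaining ShelfNet groups train at 10x the base LR
print(param_groups)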

ShelfNetHW

Bases: ShelfNetBase

ShelfNetHW - Heavy-Weight Version of ShelfNet

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetHW(ShelfNetBase):
    """
    ShelfNetHW - Heavy-Weight Version of ShelfNet
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ladder = LadderBlockHW(planes=self.net_output_mid_channels_num, layers=self.layers)
        self.decoder = DecoderHW(planes=self.net_output_mid_channels_num, layers=self.layers)
        self.se_layer = nn.Linear(self.net_output_mid_channels_num * 2**3, self.num_classes)
        self.aux_head = FCNHead(1024, self.num_classes)
        self.final = nn.Conv2d(self.net_output_mid_channels_num, self.num_classes, 1)

        # THE MID CHANNELS NUMBER OF THE NET OUTPUT BLOCK
        net_out_planes = self.planes
        mid_channels_num = self.net_output_mid_channels_num

        # INITIALIZE THE conv_out_list
        for i in range(self.layers):
            self.conv_out_list.append(ConvBNReLU(in_chan=net_out_planes, out_chan=mid_channels_num, ks=1, padding=0))

            mid_channels_num *= 2
            net_out_planes *= 2

    def forward(self, x):
        image_size = x.size()[2:]

        backbone_features_list = list(self.backbone(x))
        conv_bn_relu_results_list = []

        for feature, conv_bn_relu in zip(backbone_features_list, self.conv_out_list):
            out = conv_bn_relu(feature)
            conv_bn_relu_results_list.append(out)

        decoder_out_list = self.decoder(conv_bn_relu_results_list)
        ladder_out_list = self.ladder(decoder_out_list)

        preds = [self.final(ladder_out_list[-1])]

        # SE_LOSS ENCODING
        enc = F.max_pool2d(ladder_out_list[0], kernel_size=ladder_out_list[0].size()[2:])
        enc = torch.squeeze(enc, -1)
        enc = torch.squeeze(enc, -1)
        se = self.se_layer(enc)
        preds.append(se)

        # UP SAMPLING THE TOP LAYER FOR PREDICTION
        preds[0] = F.interpolate(preds[0], image_size, mode="bilinear", align_corners=True)

        # AUXILARY HEAD OUTPUT (ONLY RELEVANT FOR LOSS CALCULATION) - USE self.auxilary_head_outputs=FALSE FOR INFERENCE
        if self.auxilary_head_outputs or self.training:
            aux_out = self.aux_head(backbone_features_list[2])
            aux_out = F.interpolate(aux_out, image_size, mode="bilinear", align_corners=True)
            preds.append(aux_out)

            return tuple(preds)
        else:
            return preds[0]

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        initialize_optimizer_for_model_param_groups - Initializes the weights of the optimizer
                                                      Initializes the Backbone, the Output and the Auxilary Head
                                                      differently
            :param optimizer_cls:   The nn.optim (optimizer class) to initialize
            :param lr:              lr to set for the optimizer
            :param training_params:
            :return: list of dictionaries with named params and optimizer attributes
        """
        # OPTIMIZER PARAMETER GROUPS
        params_list = []

        # OPTIMIZE BACKBONE USING DIFFERENT LR
        params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

        # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
        params_list.append(
            {
                "named_params": list(self.ladder.named_parameters())
                + list(self.decoder.named_parameters())
                + list(self.se_layer.named_parameters())
                + list(self.conv_out_list.named_parameters())
                + list(self.final.named_parameters())
                + list(self.aux_head.named_parameters()),
                "lr": lr * 10,
            }
        )

        return params_list

initialize_param_groups(lr, training_params)

initialize_optimizer_for_model_param_groups - Initializes the optimizer parameter groups,
treating the Backbone, the Output and the Auxiliary Head differently.

:param optimizer_cls:   The nn.optim (optimizer class) to initialize
:param lr:              lr to set for the optimizer
:param training_params:
:return: list of dictionaries with named params and optimizer attributes

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    initialize_optimizer_for_model_param_groups - Initializes the weights of the optimizer
                                                  Initializes the Backbone, the Output and the Auxilary Head
                                                  differently
        :param optimizer_cls:   The nn.optim (optimizer class) to initialize
        :param lr:              lr to set for the optimizer
        :param training_params:
        :return: list of dictionaries with named params and optimizer attributes
    """
    # OPTIMIZER PARAMETER GROUPS
    params_list = []

    # OPTIMIZE BACKBONE USING DIFFERENT LR
    params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

    # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
    params_list.append(
        {
            "named_params": list(self.ladder.named_parameters())
            + list(self.decoder.named_parameters())
            + list(self.se_layer.named_parameters())
            + list(self.conv_out_list.named_parameters())
            + list(self.final.named_parameters())
            + list(self.aux_head.named_parameters()),
            "lr": lr * 10,
        }
    )

    return params_list

ShelfNetLW

Bases: ShelfNetBase

ShelfNetLW - Light-Weight Implementation for ShelfNet

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetLW(ShelfNetBase):
    """
    ShelfNetLW - Light-Weight Implementation for ShelfNet
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.net_output_list = nn.ModuleList()
        self.ladder = LadderBlockLW(planes=self.planes, layers=self.layers)
        self.decoder = DecoderLW(planes=self.planes, layers=self.layers)

    def forward(self, x):
        H, W = x.size()[2:]

        # SHELFNET LW ARCHITECTURE USES ONLY LAST 3 PARTIAL OUTPUTs OF THE BACKBONE'S 4 OUTPUT LAYERS
        backbone_features_tuple = self.backbone(x)[1:]

        if isinstance(self, ShelfNet18_LW):
            # FOR SHELFNET18 USE 1x1 CONVS AFTER THE BACKBONE'S FORWARD PASS TO MANIPULATE THE CHANNELS FOR THE DECODER
            conv_bn_relu_results_list = []

            for feature, conv_bn_relu in zip(backbone_features_tuple, self.conv_out_list):
                out = conv_bn_relu(feature)
                conv_bn_relu_results_list.append(out)

        else:
            # FOR SHELFNET34 THE CHANNELS ARE ALREADY ALIGNED
            conv_bn_relu_results_list = list(backbone_features_tuple)

        decoder_out_list = self.decoder(conv_bn_relu_results_list)
        ladder_out_list = self.ladder(decoder_out_list)

        # GET THE LAST ELEMENTS OF THE LADDER_BLOCK BASED ON THE AMOUNT OF SHELVES IN THE ARCHITECTURE AND REVERSE LIST
        feat_cp_list = list(reversed(ladder_out_list[(-1 * self.layers) :]))

        feat_out = self.net_output_list[0](feat_cp_list[0])
        feat_out = F.interpolate(feat_out, (H, W), mode="bilinear", align_corners=True)

        if self.auxilary_head_outputs or self.training:
            features_out_list = [feat_out]
            for conv_output_layer, feat_cp in zip(self.net_output_list[1:], feat_cp_list[1:]):
                feat_out_res = conv_output_layer(feat_cp)
                feat_out_res = F.interpolate(feat_out_res, (H, W), mode="bilinear", align_corners=True)
                features_out_list.append(feat_out_res)

            return tuple(features_out_list)

        else:
            # THIS DOES NOT CALCULATE THE AUXILARY HEADS THAT ARE CRITICAL FOR THE LOSS (USED MAINLY FOR INFERENCE)
            return feat_out

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, with 10x learning rate
                                                      for all but the backbone

            :param lr:              lr to set for the backbone
            :param training_params:
            :return: list of dictionaries with named params and optimizer attributes
        """
        # OPTIMIZER PARAMETER GROUPS
        params_list = []

        # OPTIMIZE BACKBONE USING DIFFERENT LR
        params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

        # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
        params_list.append(
            {
                "named_params": list(self.ladder.named_parameters()) + list(self.decoder.named_parameters()) + list(self.conv_out_list.named_parameters()),
                "lr": lr * 10,
            }
        )

        return params_list

initialize_param_groups(lr, training_params)

initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, with 10x learning rate for all but the backbone

:param lr:              lr to set for the backbone
:param training_params:
:return: list of dictionaries with named params and optimizer attributes
Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, with 10x learning rate
                                                  for all but the backbone

        :param lr:              lr to set for the backbone
        :param training_params:
        :return: list of dictionaries with named params and optimizer attributes
    """
    # OPTIMIZER PARAMETER GROUPS
    params_list = []

    # OPTIMIZE BACKBONE USING DIFFERENT LR
    params_list.append({"named_params": self.backbone.named_parameters(), "lr": lr})

    # OPTIMIZE MAIN SHELFNET ARCHITECTURE LAYERS
    params_list.append(
        {
            "named_params": list(self.ladder.named_parameters()) + list(self.decoder.named_parameters()) + list(self.conv_out_list.named_parameters()),
            "lr": lr * 10,
        }
    )

    return params_list

ShelfNetModuleBase

Bases: SgModule

ShelfNetModuleBase - Base class for the different Modules of the ShelfNet Architecture

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfNetModuleBase(SgModule):
    """
    ShelfNetModuleBase - Base class for the different Modules of the ShelfNet Architecture
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        raise NotImplementedError

    def get_params(self):
        wd_params, nowd_params = [], []
        for name, module in self.named_modules():
            if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params += list(module.parameters())
        return wd_params, nowd_params
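
The get_params split above is typically used to exempt biases and BatchNorm parameters from weight decay. A hedged usage sketch follows; the model variable is a placeholder for any ShelfNetModuleBase subclass, and the optimizer settings are illustrative assumptions.

import torch

wd_params, nowd_params = model.get_params()        # `model`: placeholder for any ShelfNetModuleBase subclass
optimizer = torch.optim.SGD(
    [
        {"params": wd_params, "weight_decay": 5e-4},   # conv / linear weights get weight decay
        {"params": nowd_params, "weight_decay": 0.0},  # biases and BatchNorm params do not
    ],
    lr=0.01,
    momentum=0.9,
)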

ShelfResNetBackBone

Bases: ResNet

ShelfResNetBackBone - Inherits from the original ResNet class and modifies the forward pass to create a backbone for the ShelfNet architecture, returning multi-scale feature maps

Source code in src/super_gradients/training/models/segmentation_models/shelfnet.py
class ShelfResNetBackBone(ResNet):
    """
    ShelfResNetBackBone - A class that Inherits from the original ResNet class and manipulates the forward pass,
                          to create a backbone for the ShelfNet architecture
    """

    def __init__(self, block, num_blocks, num_classes=10, width_mult=1):
        super().__init__(block=block, num_blocks=num_blocks, num_classes=num_classes, width_mult=width_mult, backbone_mode=True)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.maxpool(out)
        feat4 = self.layer1(out)  # 1/4
        feat8 = self.layer2(feat4)  # 1/8
        feat16 = self.layer3(feat8)  # 1/16
        feat32 = self.layer4(feat16)  # 1/32
        return feat4, feat8, feat16, feat32
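
As indicated by the inline comments, the backbone returns feature maps at strides 4, 8, 16 and 32. A small hedged sketch of the resulting spatial sizes for an example 512x512 input (the arithmetic below is illustrative, not a guarantee for every configuration):

# Spatial sizes of (feat4, feat8, feat16, feat32) for a 512x512 input.
height = width = 512                      # example input size (assumption)
for stride in (4, 8, 16, 32):
    print(f"stride {stride}: {height // stride} x {width // stride}")
# stride 4: 128 x 128
# stride 8: 64 x 64
# stride 16: 32 x 32
# stride 32: 16 x 16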

Implementation of paper: "Rethinking BiSeNet For Real-time Semantic Segmentation", https://arxiv.org/abs/2104.13188
Based on original implementation: https://github.com/MichaelFan01/STDC-Seg, cloned 23/08/2021, commit 59ff37f

AbstractSTDCBackbone

Bases: nn.Module, SupportsReplaceInputChannels, ABC

All backbones for STDC segmentation models must implement this class.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class AbstractSTDCBackbone(nn.Module, SupportsReplaceInputChannels, ABC):
    """
    All backbones for STDC segmentation models must implement this class.
    """

    def validate_backbone(self):
        if len(self.get_backbone_output_number_of_channels()) != 3:
            raise ValueError(f"Backbone for STDC segmentation must output 3 feature maps," f" found: {len(self.get_backbone_output_number_of_channels())}.")

    @abstractmethod
    def get_backbone_output_number_of_channels(self) -> List[int]:
        """
        :return: list on stages num channels.
        """
        raise NotImplementedError()

get_backbone_output_number_of_channels() abstractmethod

Returns:

Type Description
List[int]

List of the number of output channels for each backbone stage.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
@abstractmethod
def get_backbone_output_number_of_channels(self) -> List[int]:
    """
    :return: list on stages num channels.
    """
    raise NotImplementedError()
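
A minimal, hedged sketch (not part of the library) of a custom backbone implementing this interface: forward must return exactly three feature maps and get_backbone_output_number_of_channels must report their widths, otherwise validate_backbone raises a ValueError. The class name, layer choices and trivial input-channel methods below are illustrative assumptions; the import path follows the source location above.

import torch.nn as nn
from super_gradients.training.models.segmentation_models.stdc import AbstractSTDCBackbone  # path per the source location above (assumption)


class TinySTDCBackbone(AbstractSTDCBackbone):        # hypothetical example backbone
    def __init__(self):
        super().__init__()
        # three downsampling stages producing feature maps at strides 8, 16 and 32
        self.stage8 = nn.Conv2d(3, 64, kernel_size=3, stride=8, padding=1)
        self.stage16 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.stage32 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        feat8 = self.stage8(x)
        feat16 = self.stage16(feat8)
        feat32 = self.stage32(feat16)
        return feat8, feat16, feat32

    def get_backbone_output_number_of_channels(self):
        return [64, 128, 256]

    # Kept trivial for the sketch; required by the SupportsReplaceInputChannels base listed above.
    def get_input_channels(self) -> int:
        return self.stage8.in_channels

    def replace_input_channels(self, in_channels, compute_new_weights_fn=None):
        raise NotImplementedError("out of scope for this sketch")


backbone = TinySTDCBackbone()
backbone.validate_backbone()                          # passes: exactly 3 output channel widths are reported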

AttentionRefinementModule

Bases: nn.Module

AttentionRefinementModule to apply on the last two backbone stages.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class AttentionRefinementModule(nn.Module):
    """
    AttentionRefinementModule to apply on the last two backbone stages.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super(AttentionRefinementModule, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.conv_first = ConvBNReLU(in_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.attention_block = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), ConvBNReLU(out_channels, out_channels, kernel_size=1, bias=False, use_activation=False), nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_first(x)
        y = self.attention_block(x)
        return torch.mul(x, y)
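
A hedged usage sketch (channel and spatial sizes are illustrative assumptions; the import path follows the source location above): the module first projects to out_channels with a 3x3 convolution, then re-weights the result channel-wise with a sigmoid-gated global descriptor, so the spatial size is preserved.

import torch
from super_gradients.training.models.segmentation_models.stdc import AttentionRefinementModule  # assumed import path

arm = AttentionRefinementModule(in_channels=512, out_channels=128)
feat = torch.randn(1, 512, 16, 16)
refined = arm(feat)
print(refined.shape)    # torch.Size([1, 128, 16, 16])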

ContextEmbedding

Bases: nn.Module

ContextEmbedding module that uses global average pooling down to 1x1 to extract context information, and then upsamples back to the original input size.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class ContextEmbedding(nn.Module):
    """
    ContextEmbedding module that use global average pooling to 1x1 to extract context information, and then upsample
    to original input size.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super(ContextEmbedding, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.context_embedding = nn.Sequential(nn.AdaptiveAvgPool2d(1), ConvBNReLU(in_channels, out_channels, kernel_size=1, stride=1, bias=False))
        self.fixed_size = False

    def forward(self, x):
        out_height, out_width = x.size()[2:]
        x = self.context_embedding(x)
        return F.interpolate(x, size=(out_height, out_width), mode="nearest")

    def to_fixed_size(self, upsample_size: Union[list, tuple]):
        if self.fixed_size:
            return
        self.fixed_size = True

        self.context_embedding.add_module("upsample", nn.Upsample(scale_factor=upsample_size, mode="nearest"))

        self.forward = self.context_embedding.forward
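
A hedged usage sketch (sizes and import path are assumptions, the path follows the source location above): the global context is pooled to 1x1, embedded with a 1x1 ConvBNReLU, and broadcast back to the input resolution by nearest-neighbour upsampling, so the output keeps the input's spatial size but with out_channels channels.

import torch
from super_gradients.training.models.segmentation_models.stdc import ContextEmbedding  # assumed import path

ce = ContextEmbedding(in_channels=1024, out_channels=128)
feat32 = torch.randn(1, 1024, 16, 16)
context = ce(feat32)
print(context.shape)    # torch.Size([1, 128, 16, 16])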

ContextPath

Bases: nn.Module

ContextPath in STDC outputs both the spatial path and the context path. This module includes an STDCBackbone and outputs the stage3 feature map with down_ratio = 8 as the spatial feature map, together with a context feature map obtained by upsampling and fusing the context embedding, stage5 and stage4 after the ARM modules; the context feature map has the same resolution as the spatial feature map, down_ratio = 8.

Parameters:

Name Type Description Default
backbone AbstractSTDCBackbone

Backbone of type AbstractSTDCBackbone that returns info about the backbone output channels.

required
fuse_channels int

num channels of the fused context path.

required
use_aux_heads bool

Set True when training to output extra auxiliary feature maps of the two last stages of the backbone.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class ContextPath(nn.Module):
    """
    ContextPath in STDC output both the Spatial path and Context path. This module include a STDCBackbone and output
    the stage3 feature map with down_ratio = 8 as the spatial feature map, and context feature map which is a result of
    upsampling and fusion of context embedding, stage5 and stage4 after Arm modules, Which is also with same resolution
    of the spatial feature map, down_ration = 8.
    :param backbone: Backbone of type AbstractSTDCBackbone that return info about backbone output channels.
    :param fuse_channels: num channels of the fused context path.
    :param use_aux_heads: set True when training, output extra Auxiliary feature maps of the two last stages of the
     backbone.
    """

    def __init__(self, backbone: AbstractSTDCBackbone, fuse_channels: int, use_aux_heads: bool):
        super(ContextPath, self).__init__()

        self.fuse_channels = fuse_channels
        self.use_aux_heads = use_aux_heads

        self.backbone = backbone
        # get num of channels for two last stages
        channels16, channels32 = self.backbone.get_backbone_output_number_of_channels()[-2:]

        self.context_embedding = ContextEmbedding(channels32, fuse_channels)

        self.arm32 = AttentionRefinementModule(channels32, fuse_channels)
        self.upsample32 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode="nearest"), ConvBNReLU(fuse_channels, fuse_channels, kernel_size=3, padding=1, stride=1, bias=False)
        )

        self.arm16 = AttentionRefinementModule(channels16, fuse_channels)
        self.upsample16 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode="nearest"), ConvBNReLU(fuse_channels, fuse_channels, kernel_size=3, padding=1, stride=1, bias=False)
        )

    def forward(self, x):
        feat8, feat16, feat32 = self.backbone(x)

        ce_feats = self.context_embedding(feat32)
        feat32_arm = self.arm32(feat32)
        feat32_arm = feat32_arm + ce_feats

        feat32_up = self.upsample32(feat32_arm)

        feat16_arm = self.arm16(feat16)
        feat16_arm = feat16_arm + feat32_up
        feat16_up = self.upsample16(feat16_arm)

        if self.use_aux_heads:
            return feat8, feat16_up, feat16, feat32
        return feat8, feat16_up

    def prep_for_conversion(self, input_size):
        if input_size[-2] % 32 != 0 or input_size[-1] % 32 != 0:
            raise ValueError(f"Expected image dimensions to be divisible by 32, got {input_size[-2]}x{input_size[-1]}")

        context_embedding_up_size = (input_size[-2] // 32, input_size[-1] // 32)
        self.context_embedding.to_fixed_size(context_embedding_up_size)

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()
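
prep_for_conversion requires input dimensions divisible by 32, because the context embedding is frozen to the 1/32 feature-map size. A hedged sketch of that arithmetic (the input size is an illustrative assumption):

# The fixed upsample size handed to ContextEmbedding.to_fixed_size is the 1/32 resolution.
input_size = (1, 3, 512, 768)                                    # example NCHW shape (assumption)
assert input_size[-2] % 32 == 0 and input_size[-1] % 32 == 0     # otherwise prep_for_conversion raises a ValueError
context_embedding_up_size = (input_size[-2] // 32, input_size[-1] // 32)
print(context_embedding_up_size)    # (16, 24)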

CustomSTDCSegmentation

Bases: STDCSegmentationBase

Fully customized STDC Segmentation factory module.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
@register_model(Models.STDC_CUSTOM)
@register_model("custom_stdc")  # deprecated naming convention. will be dropped in v4
class CustomSTDCSegmentation(STDCSegmentationBase):
    """
    Fully customized STDC Segmentation factory module.
    """

    def __init__(self, arch_params: HpmStruct):
        super().__init__(
            backbone=get_param(arch_params, "backbone"),
            num_classes=get_param(arch_params, "num_classes"),
            context_fuse_channels=get_param(arch_params, "context_fuse_channels", 128),
            ffm_channels=get_param(arch_params, "ffm_channels", 256),
            aux_head_channels=get_param(arch_params, "aux_head_channels", 64),
            detail_head_channels=get_param(arch_params, "detail_head_channels", 64),
            use_aux_heads=get_param(arch_params, "use_aux_heads", True),
            dropout=get_param(arch_params, "dropout", 0.2),
        )
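
A hedged construction sketch. The parameter names follow the get_param calls above; the backbone variable is a placeholder for any AbstractSTDCBackbone implementation, and the HpmStruct import path and chosen values are assumptions for illustration, not verified defaults beyond those shown in the code.

from super_gradients.training.utils import HpmStruct   # assumed import path
from super_gradients.training.models.segmentation_models.stdc import CustomSTDCSegmentation  # assumed import path

arch_params = HpmStruct(
    backbone=backbone,            # any AbstractSTDCBackbone instance (placeholder)
    num_classes=19,               # example class count (assumption)
    context_fuse_channels=128,
    ffm_channels=256,
    aux_head_channels=64,
    detail_head_channels=64,
    use_aux_heads=True,
    dropout=0.2,
)
model = CustomSTDCSegmentation(arch_params)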

FeatureFusionModule

Bases: nn.Module

Fuses features from the higher-resolution spatial feature map with features from the lower-resolution, semantically richer context feature map.

Parameters:

Name Type Description Default
spatial_channels int

num channels of input from spatial path.

required
context_channels int

num channels of input from context path.

required
out_channels int

num channels of feature fusion module.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class FeatureFusionModule(nn.Module):
    """
    Fuse features from higher resolution aka, spatial feature map with features from lower resolution with high
     semantic information aka, context feature map.
    :param spatial_channels: num channels of input from spatial path.
    :param context_channels: num channels of input from context path.
    :param out_channels: num channels of feature fusion module.
    """

    def __init__(self, spatial_channels: int, context_channels: int, out_channels: int):
        super(FeatureFusionModule, self).__init__()
        self.spatial_channels = spatial_channels
        self.context_channels = context_channels
        self.out_channels = out_channels

        self.pw_conv = ConvBNReLU(spatial_channels + context_channels, out_channels, kernel_size=1, stride=1, bias=False)
        # TODO - used without bias in convolutions by mistake, try to reproduce with bias=True
        self.attention_block = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            ConvBNReLU(in_channels=out_channels, out_channels=out_channels // 4, kernel_size=1, use_normalization=False, bias=False),
            nn.Conv2d(in_channels=out_channels // 4, out_channels=out_channels, kernel_size=1, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, spatial_feats, context_feats):
        feat = torch.cat([spatial_feats, context_feats], dim=1)
        feat = self.pw_conv(feat)
        atten = self.attention_block(feat)
        feat_atten = torch.mul(feat, atten)
        feat_out = feat_atten + feat
        return feat_out
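
A hedged usage sketch (channel counts, sizes and import path are illustrative assumptions): the spatial and context inputs must share the same spatial resolution, since they are concatenated along the channel dimension before the 1x1 fusion convolution and the channel-attention re-weighting.

import torch
from super_gradients.training.models.segmentation_models.stdc import FeatureFusionModule  # assumed import path

ffm = FeatureFusionModule(spatial_channels=256, context_channels=128, out_channels=256)
spatial_feats = torch.randn(1, 256, 64, 64)   # stride-8 spatial path features
context_feats = torch.randn(1, 128, 64, 64)   # stride-8 context path features
fused = ffm(spatial_feats, context_feats)
print(fused.shape)    # torch.Size([1, 256, 64, 64])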

STDCBackbone

Bases: AbstractSTDCBackbone

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCBackbone(AbstractSTDCBackbone):
    def __init__(
        self,
        block_types: list,
        ch_widths: list,
        num_blocks: list,
        stdc_steps: int = 4,
        stdc_downsample_mode: str = "avg_pool",
        in_channels: int = 3,
        out_down_ratios: Union[tuple, list] = (32,),
    ):
        """
        :param block_types: list of block type for each stage, supported `conv` for ConvBNRelu with 3x3 kernel.
        :param ch_widths: list of output num of channels for each stage.
        :param num_blocks: list of the number of repeating blocks in each stage.
        :param stdc_steps: num of convs steps in each block.
        :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
         `dw_conv` for depthwise-convolution.
        :param in_channels: num channels of the input image.
        :param out_down_ratios: down ratio of output feature maps required from the backbone,
            default (32,) for classification.
        """
        super(STDCBackbone, self).__init__()
        if not (len(block_types) == len(ch_widths) == len(num_blocks)):
            raise ValueError(
                f"STDC architecture configuration, block_types, ch_widths, num_blocks, must be defined for the same number"
                f" of stages, found: {len(block_types)} for block_type, {len(ch_widths)} for ch_widths, "
                f"{len(num_blocks)} for num_blocks"
            )

        self.out_widths = []
        self.stages = nn.ModuleDict()
        self.out_stage_keys = []
        down_ratio = 2
        for block_type, width, blocks in zip(block_types, ch_widths, num_blocks):
            block_name = f"block_s{down_ratio}"
            self.stages[block_name] = self._make_stage(
                in_channels=in_channels,
                out_channels=width,
                block_type=block_type,
                num_blocks=blocks,
                stdc_steps=stdc_steps,
                stdc_downsample_mode=stdc_downsample_mode,
            )
            if down_ratio in out_down_ratios:
                self.out_stage_keys.append(block_name)
                self.out_widths.append(width)
            in_channels = width
            down_ratio *= 2

    def _make_stage(self, in_channels: int, out_channels: int, block_type: str, num_blocks: int, stdc_downsample_mode: str, stdc_steps: int = 4):
        """
        :param in_channels: input channels of stage.
        :param out_channels: output channels of stage.
        :param block_type: stage building block, supported `conv` for 3x3 ConvBNRelu, or `stdc` for STDCBlock.
        :param num_blocks: num of blocks in each stage.
        :param stdc_steps: number of conv3x3 steps in each STDC block, referred as `num blocks` in paper.
        :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
         `dw_conv` for depthwise-convolution.
        :return: nn.Module
        """
        if block_type == "conv":
            block = ConvBNReLU
            kwargs = {"kernel_size": 3, "padding": 1, "bias": False}
        elif block_type == "stdc":
            block = STDCBlock
            kwargs = {"steps": stdc_steps, "stdc_downsample_mode": stdc_downsample_mode}
        else:
            raise ValueError(f"Block type not supported: {block_type}, excepted: `conv` or `stdc`")

        # first block to apply stride 2.
        blocks = nn.ModuleList([block(in_channels, out_channels, stride=2, **kwargs)])
        # build rest of blocks
        for i in range(num_blocks - 1):
            blocks.append(block(out_channels, out_channels, stride=1, **kwargs))

        return nn.Sequential(*blocks)

    def forward(self, x):
        outputs = []
        for stage_name, stage in self.stages.items():
            x = stage(x)
            if stage_name in self.out_stage_keys:
                outputs.append(x)
        return tuple(outputs)

    def get_backbone_output_number_of_channels(self) -> List[int]:
        return self.out_widths

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        from super_gradients.module_interfaces import SupportsReplaceInputChannels

        first_stage: nn.Sequential = next(iter(self.stages.values()))  # noqa
        first_block = first_stage[0]

        if isinstance(first_block, SupportsReplaceInputChannels):
            first_block.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        else:
            raise NotImplementedError(f"`{first_block.__class__.__name__}` does not support `replace_input_channels`")

    def get_input_channels(self) -> int:
        first_stage: nn.Sequential = next(iter(self.stages.values()))  # noqa
        first_block = first_stage[0]
        if isinstance(first_block, SupportsReplaceInputChannels):
            return first_block.get_input_channels()
        else:
            raise NotImplementedError(f"`{first_block.__class__.__name__}` does not support `get_input_channels`")

__init__(block_types, ch_widths, num_blocks, stdc_steps=4, stdc_downsample_mode='avg_pool', in_channels=3, out_down_ratios=(32,))

Parameters:

Name Type Description Default
block_types list

list of block type for each stage, supported conv for ConvBNRelu with 3x3 kernel.

required
ch_widths list

list of output num of channels for each stage.

required
num_blocks list

list of the number of repeating blocks in each stage.

required
stdc_steps int

num of convs steps in each block.

4
stdc_downsample_mode str

downsample mode in stdc block, supported avg_pool for average-pooling and dw_conv for depthwise-convolution.

'avg_pool'
in_channels int

num channels of the input image.

3
out_down_ratios Union[tuple, list]

down ratio of output feature maps required from the backbone, default (32,) for classification.

(32,)
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
def __init__(
    self,
    block_types: list,
    ch_widths: list,
    num_blocks: list,
    stdc_steps: int = 4,
    stdc_downsample_mode: str = "avg_pool",
    in_channels: int = 3,
    out_down_ratios: Union[tuple, list] = (32,),
):
    """
    :param block_types: list of block type for each stage, supported `conv` for ConvBNRelu with 3x3 kernel.
    :param ch_widths: list of output num of channels for each stage.
    :param num_blocks: list of the number of repeating blocks in each stage.
    :param stdc_steps: num of convs steps in each block.
    :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
     `dw_conv` for depthwise-convolution.
    :param in_channels: num channels of the input image.
    :param out_down_ratios: down ratio of output feature maps required from the backbone,
        default (32,) for classification.
    """
    super(STDCBackbone, self).__init__()
    if not (len(block_types) == len(ch_widths) == len(num_blocks)):
        raise ValueError(
            f"STDC architecture configuration, block_types, ch_widths, num_blocks, must be defined for the same number"
            f" of stages, found: {len(block_types)} for block_type, {len(ch_widths)} for ch_widths, "
            f"{len(num_blocks)} for num_blocks"
        )

    self.out_widths = []
    self.stages = nn.ModuleDict()
    self.out_stage_keys = []
    down_ratio = 2
    for block_type, width, blocks in zip(block_types, ch_widths, num_blocks):
        block_name = f"block_s{down_ratio}"
        self.stages[block_name] = self._make_stage(
            in_channels=in_channels,
            out_channels=width,
            block_type=block_type,
            num_blocks=blocks,
            stdc_steps=stdc_steps,
            stdc_downsample_mode=stdc_downsample_mode,
        )
        if down_ratio in out_down_ratios:
            self.out_stage_keys.append(block_name)
            self.out_widths.append(width)
        in_channels = width
        down_ratio *= 2
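
A hedged construction sketch. The widths, block types and repeat counts below roughly mimic an STDC1-style configuration, but they are assumptions for illustration rather than verified recipe values; only the constructor arguments themselves come from the code above, and the import path follows the source location.

import torch
from super_gradients.training.models.segmentation_models.stdc import STDCBackbone  # assumed import path

backbone = STDCBackbone(
    block_types=["conv", "conv", "stdc", "stdc", "stdc"],   # per-stage block types (assumed config)
    ch_widths=[32, 64, 256, 512, 1024],                     # per-stage output channels (assumed config)
    num_blocks=[1, 1, 2, 2, 2],                             # per-stage repeats (assumed config)
    stdc_steps=4,
    stdc_downsample_mode="avg_pool",
    in_channels=3,
    out_down_ratios=(8, 16, 32),                            # request the stride-8/16/32 maps for segmentation
)
feat8, feat16, feat32 = backbone(torch.randn(1, 3, 512, 512))
print(feat8.shape, feat16.shape, feat32.shape)
# torch.Size([1, 256, 64, 64]) torch.Size([1, 512, 32, 32]) torch.Size([1, 1024, 16, 16])
print(backbone.get_backbone_output_number_of_channels())    # [256, 512, 1024]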

STDCBlock

Bases: nn.Module

STDC building block, known as the Short Term Dense Concatenate module. In the STDC module, the kernel size of the first block is 1, and the rest are simply set to 3.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCBlock(nn.Module):
    """
    STDC building block, known as Short Term Dense Concatenate module.
    In STDC module, the kernel size of first block is 1, and the rest of them are simply set as 3.
    """

    def __init__(self, in_channels: int, out_channels: int, steps: int, stdc_downsample_mode: str, stride: int):
        """
        :param steps: The total number of convs in this module, 1 conv 1x1 and (steps - 1) conv3x3.
        :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
         `dw_conv` for depthwise-convolution.
        """
        super().__init__()
        if steps not in [2, 3, 4]:
            raise ValueError(f"only 2, 3, 4 steps number are supported, found: {steps}")

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.steps = steps
        self.stdc_downsample_mode = stdc_downsample_mode
        self.stride = stride
        self.conv_list = nn.ModuleList()
        # build first step conv 1x1.
        self.conv_list.append(ConvBNReLU(in_channels, out_channels // 2, kernel_size=1, bias=False))
        # build skip connection after first convolution.
        if stride == 1:
            self.skip_step1 = Residual()
        elif stdc_downsample_mode == "avg_pool":
            self.skip_step1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
        elif stdc_downsample_mode == "dw_conv":
            self.skip_step1 = ConvBNReLU(
                out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, bias=False, groups=out_channels // 2, use_activation=False
            )
        else:
            raise ValueError(f"stdc_downsample mode is not supported: found {stdc_downsample_mode}," f" must be in [avg_pool, dw_conv]")

        in_channels = out_channels // 2
        mid_channels = in_channels
        # build rest conv3x3 layers.
        for idx in range(1, steps):
            if idx < steps - 1:
                mid_channels //= 2
            conv = ConvBNReLU(in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False)
            self.conv_list.append(conv)
            in_channels = mid_channels

        # add dw conv before second step for down sample if stride = 2.
        if stride == 2:
            self.conv_list[1] = nn.Sequential(
                ConvBNReLU(
                    out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, groups=out_channels // 2, use_activation=False, bias=False
                ),
                self.conv_list[1],
            )

    def forward(self, x):
        out_list = []
        # run first conv
        x = self.conv_list[0](x)
        out_list.append(self.skip_step1(x))

        for conv in self.conv_list[1:]:
            x = conv(x)
            out_list.append(x)

        out = torch.cat(out_list, dim=1)
        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        first_conv: ConvBNReLU = self.conv_list[0]  # noqa
        first_conv.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)
        self.in_channels = self.get_input_channels()

    def get_input_channels(self) -> int:
        first_conv: ConvBNReLU = self.conv_list[0]  # noqa
        return first_conv.get_input_channels()

__init__(in_channels, out_channels, steps, stdc_downsample_mode, stride)

Parameters:

Name Type Description Default
steps int

The total number of convs in this module, 1 conv 1x1 and (steps - 1) conv3x3.

required
stdc_downsample_mode str

downsample mode in stdc block, supported avg_pool for average-pooling and dw_conv for depthwise-convolution.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
def __init__(self, in_channels: int, out_channels: int, steps: int, stdc_downsample_mode: str, stride: int):
    """
    :param steps: The total number of convs in this module, 1 conv 1x1 and (steps - 1) conv3x3.
    :param stdc_downsample_mode: downsample mode in stdc block, supported `avg_pool` for average-pooling and
     `dw_conv` for depthwise-convolution.
    """
    super().__init__()
    if steps not in [2, 3, 4]:
        raise ValueError(f"only 2, 3, 4 steps number are supported, found: {steps}")

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.steps = steps
    self.stdc_downsample_mode = stdc_downsample_mode
    self.stride = stride
    self.conv_list = nn.ModuleList()
    # build first step conv 1x1.
    self.conv_list.append(ConvBNReLU(in_channels, out_channels // 2, kernel_size=1, bias=False))
    # build skip connection after first convolution.
    if stride == 1:
        self.skip_step1 = Residual()
    elif stdc_downsample_mode == "avg_pool":
        self.skip_step1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
    elif stdc_downsample_mode == "dw_conv":
        self.skip_step1 = ConvBNReLU(
            out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, bias=False, groups=out_channels // 2, use_activation=False
        )
    else:
        raise ValueError(f"stdc_downsample mode is not supported: found {stdc_downsample_mode}," f" must be in [avg_pool, dw_conv]")

    in_channels = out_channels // 2
    mid_channels = in_channels
    # build rest conv3x3 layers.
    for idx in range(1, steps):
        if idx < steps - 1:
            mid_channels //= 2
        conv = ConvBNReLU(in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.conv_list.append(conv)
        in_channels = mid_channels

    # add dw conv before second step for down sample if stride = 2.
    if stride == 2:
        self.conv_list[1] = nn.Sequential(
            ConvBNReLU(
                out_channels // 2, out_channels // 2, kernel_size=3, stride=2, padding=1, groups=out_channels // 2, use_activation=False, bias=False
            ),
            self.conv_list[1],
        )
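
To make the concatenation arithmetic explicit, here is a hedged sketch (values and import path are illustrative assumptions) showing that the per-step channel widths of an STDC block always sum back to out_channels, which is why the module can both downsample and widen in a single pass.

import torch
from super_gradients.training.models.segmentation_models.stdc import STDCBlock  # assumed import path

block = STDCBlock(in_channels=64, out_channels=256, steps=4, stdc_downsample_mode="avg_pool", stride=2)
x = torch.randn(1, 64, 64, 64)
y = block(x)
print(y.shape)    # torch.Size([1, 256, 32, 32])
# Channel budget with steps=4: skip of the 1x1 conv output (128) + 64 + 32 + 32 = 256 = out_channels.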

STDCClassificationBase

Bases: SgModule

Base module for classification models based on STDC backbones

Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCClassificationBase(SgModule):
    """
    Base module for classification model based on STDCs backbones
    """

    def __init__(self, backbone: STDCBackbone, num_classes: int, dropout: float):
        super(STDCClassificationBase, self).__init__()
        self.backbone = backbone
        last_channels = self.backbone.out_widths[-1]
        head_channels = max(1024, last_channels)

        self.conv_last = ConvBNReLU(last_channels, head_channels, 1, 1, bias=False)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(head_channels, head_channels, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(head_channels, num_classes, bias=False)
        self.init_params()

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        out = self.backbone(x)[-1]
        # original implementation, why to use power?
        out = self.conv_last(out).pow(2)
        out = self.gap(out).flatten(1)
        out = self.fc(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.linear(out)
        return out

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.backbone.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.backbone.get_input_channels()
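
A hedged usage sketch combining the two classes above (widths mimic an STDC1-style setup and are assumptions, not verified recipe values; import paths follow the source location above): for classification the backbone only needs the deepest, stride-32 feature map.

import torch
from super_gradients.training.models.segmentation_models.stdc import STDCBackbone, STDCClassificationBase  # assumed import path

backbone = STDCBackbone(
    block_types=["conv", "conv", "stdc", "stdc", "stdc"],
    ch_widths=[32, 64, 256, 512, 1024],
    num_blocks=[1, 1, 2, 2, 2],
    out_down_ratios=(32,),          # classification only consumes the stride-32 map
)
classifier = STDCClassificationBase(backbone=backbone, num_classes=1000, dropout=0.2)
logits = classifier(torch.randn(2, 3, 224, 224))
print(logits.shape)    # torch.Size([2, 1000])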

STDCSegmentationBase

Bases: SgModule, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel

Base STDC Segmentation Module.

Parameters:

Name Type Description Default
backbone AbstractSTDCBackbone

Backbone of type AbstractSTDCBackbone that returns info about the backbone output channels.

required
num_classes int

num of dataset classes, excluding the ignore label.

required
context_fuse_channels int

num of output channels in ContextPath ARM feature fusion.

required
ffm_channels int

num of output channels of Feature Fusion Module.

required
aux_head_channels int

Num of hidden channels in Auxiliary segmentation heads.

required
detail_head_channels int

Num of hidden channels in Detail segmentation heads.

required
use_aux_heads bool

Set True when training to attach the Auxiliary and Detail heads. For compilation / inference mode, set False.

required
dropout float

segmentation heads dropout.

required
Source code in src/super_gradients/training/models/segmentation_models/stdc.py
class STDCSegmentationBase(SgModule, HasPredict, SupportsInputShapeCheck, ExportableSegmentationModel):
    """
    Base STDC Segmentation Module.
    :param backbone: Backbone of type AbstractSTDCBackbone that return info about backbone output channels.
    :param num_classes: num of dataset classes, exclude ignore label.
    :param context_fuse_channels: num of output channels in ContextPath ARM feature fusion.
    :param ffm_channels: num of output channels of Feature Fusion Module.
    :param aux_head_channels: Num of hidden channels in Auxiliary segmentation heads.
    :param detail_head_channels: Num of hidden channels in Detail segmentation heads.
    :param use_aux_heads: set True when training, attach Auxiliary and Detail heads. For compilation / inference mode
        set False.
    :param dropout: segmentation heads dropout.
    """

    @resolve_param("backbone", BaseFactory({"STDCBackbone": STDCBackbone}))
    def __init__(
        self,
        backbone: AbstractSTDCBackbone,
        num_classes: int,
        context_fuse_channels: int,
        ffm_channels: int,
        aux_head_channels: int,
        detail_head_channels: int,
        use_aux_heads: bool,
        dropout: float,
    ):
        super(STDCSegmentationBase, self).__init__()
        backbone.validate_backbone()
        self._use_aux_heads = use_aux_heads
        self.num_classes = num_classes
        self.cp = ContextPath(backbone, context_fuse_channels, use_aux_heads=use_aux_heads)

        stage3_s8_channels, stage4_s16_channels, stage5_s32_channels = backbone.get_backbone_output_number_of_channels()

        self.ffm = FeatureFusionModule(spatial_channels=stage3_s8_channels, context_channels=context_fuse_channels, out_channels=ffm_channels)
        # Main segmentation head
        self.segmentation_head = nn.Sequential(
            SegmentationHead(ffm_channels, ffm_channels, num_classes, dropout=dropout), nn.Upsample(scale_factor=8, mode="bilinear", align_corners=True)
        )

        if self._use_aux_heads:
            # Auxiliary heads
            self.aux_head_s16 = nn.Sequential(
                SegmentationHead(stage4_s16_channels, aux_head_channels, num_classes, dropout=dropout),
                nn.Upsample(scale_factor=16, mode="bilinear", align_corners=True),
            )
            self.aux_head_s32 = nn.Sequential(
                SegmentationHead(stage5_s32_channels, aux_head_channels, num_classes, dropout=dropout),
                nn.Upsample(scale_factor=32, mode="bilinear", align_corners=True),
            )
            # Detail head
            self.detail_head8 = nn.Sequential(
                SegmentationHead(stage3_s8_channels, detail_head_channels, 1, dropout=dropout), nn.Upsample(scale_factor=8, mode="bilinear", align_corners=True)
            )

        self.init_params()
        # Processing params
        self._class_names: Optional[List[str]] = None
        self._image_processor: Optional[Processing] = None

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare model for conversion, force use_aux_heads mode False and delete auxiliary and detail heads. Replace
        ContextEmbeddingOnline which cause compilation issues and not supported in some compilations,
        to ContextEmbeddingFixedSize.
        """
        # set to false and delete auxiliary and detail heads modules.
        self.use_aux_heads = False

        self.cp.prep_for_conversion(input_size)

    def _remove_auxiliary_and_detail_heads(self):
        attributes_to_delete = ["aux_head_s16", "aux_head_s32", "detail_head8"]
        for attr in attributes_to_delete:
            if hasattr(self, attr):
                delattr(self, attr)

    @property
    def use_aux_heads(self):
        return self._use_aux_heads

    @use_aux_heads.setter
    def use_aux_heads(self, use_aux: bool):
        """
        private setter for self._use_aux_heads, called every time an assignment to self._use_aux_heads is applied.
        if use_aux is False, `_remove_auxiliary_and_detail_heads` is called to delete auxiliary and detail heads.
        if use_aux is True, and self._use_aux_heads was already set to False a ValueError is raised, recreating
            aux and detail heads outside init method is not allowed, and the module should be recreated.
        """
        if use_aux is True and self._use_aux_heads is False:
            raise ValueError("Cant turn use_aux_heads from False to True, you should initiate the module again with" " `use_aux_heads=True`")
        if not use_aux:
            self._remove_auxiliary_and_detail_heads()
        self.cp.use_aux_heads = use_aux
        self._use_aux_heads = use_aux

    @property
    def backbone(self):
        """
        For Trainer load_backbone compatibility.
        """
        return self.cp.backbone

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        cp_outs = self.cp(x)
        feat8, feat_cp8 = cp_outs[0], cp_outs[1]
        # fuse stage 3 with result of context path after ARM modules.
        feat_out = self.ffm(spatial_feats=feat8, context_feats=feat_cp8)
        feat_out = self.segmentation_head(feat_out)

        if not self.use_aux_heads:
            return feat_out
        feat16, feat32 = cp_outs[2], cp_outs[3]
        detail_out8 = self.detail_head8(feat8)

        aux_out_s16 = self.aux_head_s16(feat16)
        aux_out_s32 = self.aux_head_s32(feat32)

        return feat_out, aux_out_s32, aux_out_s16, detail_out8

    def replace_head(self, new_num_classes: int, **kwargs):
        ffm_channels = self.ffm.attention_block[-2].out_channels
        dropout = self.segmentation_head[0].seg_head[1].p

        # Output layer replacement - the first module in each sequence is the SegmentationHead module.
        self.segmentation_head[0] = SegmentationHead(ffm_channels, ffm_channels, new_num_classes, dropout=dropout)
        self.num_classes = new_num_classes
        if self.use_aux_heads:
            stage3_s8_channels, stage4_s16_channels, stage5_s32_channels = self.backbone.get_backbone_output_number_of_channels()
            aux_head_channels = self.aux_head_s16[0].seg_head[-1].in_channels
            detail_head_channels = self.detail_head8[0].seg_head[-1].in_channels

            self.aux_head_s16[0] = SegmentationHead(stage4_s16_channels, aux_head_channels, new_num_classes, dropout=dropout)

            self.aux_head_s32[0] = SegmentationHead(stage5_s32_channels, aux_head_channels, new_num_classes, dropout=dropout)
            # Detail head
            self.detail_head8[0] = SegmentationHead(stage3_s8_channels, detail_head_channels, 1, dropout=dropout)

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        lr_dict = {"segmentation_head": lr, "default": 0}
        if self.use_aux_heads:
            lr_dict["aux_head"] = lr
            lr_dict["detail_head"] = lr
        return lr_dict

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for STDC training:
            - Different lr for context path and heads, if `multiply_head_lr` key is in `training_params`.
            - Add extra Detail loss params to optimizer.
        """

        extra_train_params = training_params.loss.get_train_named_params() if hasattr(training_params.loss, "get_train_named_params") else None
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]

        if extra_train_params is not None:
            param_groups.append({"named_params": extra_train_params, "lr": lr, "weight_decay": 0.0, "name": "detail_params"})

        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        """
        Separate ContextPath params from the rest.
        :return: iterators of groups named_parameters.
        """
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if "cp." in name:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def replace_input_channels(self, in_channels: int, compute_new_weights_fn: Optional[Callable[[nn.Module, int], nn.Module]] = None):
        self.cp.replace_input_channels(in_channels=in_channels, compute_new_weights_fn=compute_new_weights_fn)

    def get_input_channels(self) -> int:
        return self.cp.get_input_channels()

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(1)
    def _get_pipeline(self, fuse_model: bool = True, fp16: bool = True) -> SegmentationPipeline:
        """Instantiate the segmentation pipeline of this model.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        pipeline = SegmentationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
        )
        return pipeline

    def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
        """Predict an image or a list of images.
        :param images:  Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
        """Predict using webcam.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        :param fp16:       If True, use mixed precision for inference.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
        pipeline.predict_webcam()

    def get_input_shape_steps(self) -> Tuple[int, int]:
        return 32, 32

    def get_minimum_input_shape_size(self) -> Tuple[int, int]:
        return 32, 32

    def get_processing_params(self):
        return self._image_processor

    def get_preprocessing_callback(self, **kwargs):
        processing = self.get_processing_params()
        preprocessing_module = processing.get_equivalent_photometric_module()
        return preprocessing_module
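
The following is a minimal usage sketch (not taken from the library docs) showing how the head replacement and forward logic above fit together. It assumes a SuperGradients installation where the STDC1_SEG50 variant and Cityscapes pretrained weights are available through models.get; the target class count of 5 is arbitrary.

import torch
from super_gradients.common.object_names import Models
from super_gradients.training import models

# Assumption: STDC1_SEG50 with Cityscapes weights is available in this installation.
model = models.get(Models.STDC1_SEG50, pretrained_weights="cityscapes")
model.replace_head(new_num_classes=5)  # swaps the SegmentationHead modules in place
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 1024))  # spatial dims should be divisible by 32

# When use_aux_heads is True the forward returns (main, aux_s32, aux_s16, detail_s8).
main_out = out[0] if isinstance(out, tuple) else out
print(main_out.shape)  # expected: torch.Size([1, 5, 512, 1024])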

backbone property

For Trainer load_backbone compatibility.

initialize_param_groups(lr, training_params)

Custom param groups for STDC training:

- Different lr for context path and heads, if multiply_head_lr key is in training_params.
- Add extra Detail loss params to optimizer.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 601-620)
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for STDC training:
        - Different lr for context path and heads, if `multiply_head_lr` key is in `training_params`.
        - Add extra Detail loss params to optimizer.
    """

    extra_train_params = training_params.loss.get_train_named_params() if hasattr(training_params.loss, "get_train_named_params") else None
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]

    if extra_train_params is not None:
        param_groups.append({"named_params": extra_train_params, "lr": lr, "weight_decay": 0.0, "name": "detail_params"})

    return param_groups
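
As a rough illustration (hypothetical values; the HpmStruct import path is assumed), the sketch below builds the param groups with a 10x learning-rate multiplier on the heads, using a loss that exposes no extra trainable parameters. `model` is an STDC segmentation model instance as in the earlier sketch.

import torch.nn as nn
from super_gradients.training.utils import HpmStruct  # assumed import path

training_params = HpmStruct(loss=nn.CrossEntropyLoss(), multiply_head_lr=10)
param_groups = model.initialize_param_groups(lr=0.01, training_params=training_params)
for group in param_groups:
    print(group["name"], group["lr"])
# expected: no_multiply_params 0.01, multiply_lr_params 0.1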

predict(images, batch_size=32, fuse_model=True, fp16=True)

Predict an image or a list of images.

Parameters:

Name Type Description Default
images ImageSource

Images to predict.

required
batch_size int

Maximum number of images to process at the same time.

32
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 681-689)
def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, fp16: bool = True) -> ImagesSegmentationPrediction:
    """Predict an image or a list of images.
    :param images:  Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    return pipeline(images, batch_size=batch_size)  # type: ignore
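
A minimal sketch of calling predict on a single local image (the file path is hypothetical). It assumes the model was loaded with pretrained weights, so class names and preprocessing are already configured; otherwise set_dataset_processing_params must be called first.

predictions = model.predict("path/to/street_scene.jpg", batch_size=1, fuse_model=False)
predictions.show()  # ImagesSegmentationPrediction results can be visualized or saved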

predict_webcam(fuse_model=True, fp16=True)

Predict using webcam.

Parameters:

Name Type Description Default
fuse_model bool

If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

True
fp16 bool

If True, use mixed precision for inference.

True
Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 691-697)
def predict_webcam(self, fuse_model: bool = True, fp16: bool = True):
    """Predict using webcam.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    :param fp16:       If True, use mixed precision for inference.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model, fp16=fp16)
    pipeline.predict_webcam()

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model for conversion: force use_aux_heads to False and delete the auxiliary and detail heads. Replace ContextEmbeddingOnline, which causes compilation issues and is not supported in some compilation targets, with ContextEmbeddingFixedSize.

Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 502-511)
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model for conversion: force use_aux_heads to False and delete the auxiliary and detail heads.
    Replace ContextEmbeddingOnline, which causes compilation issues and is not supported in some compilation
    targets, with ContextEmbeddingFixedSize.
    """
    # set to false and delete auxiliary and detail heads modules.
    self.use_aux_heads = False

    self.cp.prep_for_conversion(input_size)
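
A sketch of a typical export flow under the assumption that input_size is the (height, width) the model will be exported for (512x1024 is a hypothetical choice): prepare the model, then export with plain torch.onnx.

import torch

model.prep_model_for_conversion(input_size=(512, 1024))  # drops aux/detail heads, fixes the context embedding size
dummy_input = torch.randn(1, 3, 512, 1024)
torch.onnx.export(model, dummy_input, "stdc_seg.onnx", opset_version=13)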

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

Name Type Description Default
class_names Optional[List[str]]

(Optional) Names of the dataset the model was trained on.

None
image_processor Optional[Processing]

(Optional) Image processing objects to reproduce the dataset preprocessing used for training.

None
Source code in src/super_gradients/training/models/segmentation_models/stdc.py (lines 649-661)
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor
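
A minimal sketch of wiring custom class names while reusing the preprocessing of an already-configured model; the class names and the `pretrained_model` variable are hypothetical.

# Reuse the image processor of a model that was loaded with pretrained weights.
model.set_dataset_processing_params(
    class_names=["background", "crack", "spall"],
    image_processor=pretrained_model.get_processing_params(),
)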

UNet

Bases: UNetCustom

Implementation of "U-Net: Convolutional Networks for Biomedical Image Segmentation", https://arxiv.org/pdf/1505.04597.pdf. The upsample operation uses bilinear interpolation, which is reported to give better results.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 224-234)
@register_model(Models.UNET)
class UNet(UNetCustom):
    """
    implementation of:
     "U-Net: Convolutional Networks for Biomedical Image Segmentation", https://arxiv.org/pdf/1505.04597.pdf
    The upsample operation is done by using bilinear interpolation which is reported to show better results.
    """

    def __init__(self, arch_params: HpmStruct):
        arch_params = HpmStruct(**models.get_arch_params("unet_arch_params.yaml", arch_params.to_dict()))
        super().__init__(arch_params)
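
A sketch of instantiating UNet directly, overriding only the number of classes on top of the defaults loaded from unet_arch_params.yaml. Whether num_classes is the only key you need to override depends on that recipe; the HpmStruct import path is assumed.

from super_gradients.training.utils import HpmStruct  # assumed import path

# Remaining constructor arguments are taken from the default unet_arch_params recipe.
unet = UNet(arch_params=HpmStruct(num_classes=2))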

UNetBase

Bases: SegmentationModule

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 20-202)
class UNetBase(SegmentationModule):
    @resolve_param("context_module", ContextModulesFactory())
    def __init__(
        self,
        num_classes: int,
        use_aux_heads: bool,
        final_upsample_factor: int,
        head_hidden_channels: Optional[int],
        head_upsample_mode: Union[UpsampleMode, str],
        align_corners: bool,
        backbone_params: dict,
        context_module: AbstractContextModule,
        decoder_params: dict,
        aux_heads_params: dict,
        dropout: float,
    ):
        """
        :param num_classes: num classes to predict.
        :param use_aux_heads: Whether to use auxiliary heads.
        :param final_upsample_factor: Final upsample scale factor after the segmentation head.
        :param head_hidden_channels: num channels before the last classification layer. see `mid_channels` in
            `SegmentationHead` class.
        :param head_upsample_mode: UpsampleMode of segmentation and auxiliary heads.
        :param align_corners: align_corners arg of segmentation and auxiliary heads.
        :param backbone_params: params to build a `UNetBackboneBase`, include the following keys:
            - strides_list: List[int], list of stride per stage.
            - width_list: List[int], list of num channels per stage.
            - num_blocks_list: List[int], list of num blocks per stage.
            - block_types_list: List[Union[DownBlockType, int]], list of block types per stage.
            - is_out_feature_list: List[bool], list of flags whether stage features should be an output.
            - in_channels: int, num channels of the input to the backbone module.
            - block_params: dict, argument to be passed to the block types constructors. i.e for `RegnetXStage`
                block_params should include bottleneck_ratio, group_width and se_ratio.
        :param decoder_params: params to build a `Decoder`, include the following keys:
            - up_block_repeat_list: List[int], num of blocks per decoder stage, the `block` implementation depends on
                the up-block type.
            - skip_expansion: float, skip expansion ratio value, before fusing the skip features from the encoder with
                the decoder features, a projection convolution is applied upon the encoder features to project the
                num_channels by skip_expansion.
            - decoder_scale: float, num_channels width ratio between encoder stages and decoder stages.
            - up_blocks: List[Type[AbstractUpFuseBlock]], list of AbstractUpFuseBlock types.
            - is_skip_list: List[bool], List of flags whether to use feature-map from encoder stage as skip connection
                or not.
        :param aux_heads_params: params to initiate auxiliary heads, include the following keys:
            - use_aux_list: List[bool], whether to append to auxiliary head per encoder stage.
            - aux_heads_factor: List[int], Upsample factor per encoder stage.
            - aux_hidden_channels: List[int], Hidden num channels before last classification layer, per encoder stage.
            - aux_out_channels: List[int], Output channels (can be referred to as num_classes) of the auxiliary head per encoder
                stage.
        :param dropout: dropout probability of segmentation and auxiliary heads.
        """
        super().__init__(use_aux_heads=use_aux_heads)
        self.num_classes = num_classes
        # Init Backbone
        backbone = UNetBackboneBase(**backbone_params)
        # Init Encoder
        self.encoder = Encoder(backbone, context_module)
        # Init Decoder
        self.decoder = Decoder(skip_channels_list=self.encoder.get_output_number_of_channels(), **decoder_params)
        # Init Segmentation Head
        self.seg_head = nn.Sequential(
            SegmentationHead(
                in_channels=self.decoder.up_channels_list[-1],
                mid_channels=head_hidden_channels or self.decoder.up_channels_list[-1],
                num_classes=self.num_classes,
                dropout=dropout,
            ),
            nn.Identity()
            if final_upsample_factor == 1
            else make_upsample_module(scale_factor=final_upsample_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
        )
        # Init Aux Heads
        if self.use_aux_heads:
            # Aux heads are applied if both conditions are true, use_aux_list is set as True and the correspondent
            # backbone features are outputted and set as True in backbone is_out_feature_list.
            aux_heads_params["use_aux_list"] = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"])]
            self.aux_heads = self.init_aux_heads(
                in_channels_list=self.encoder.get_all_number_of_channels(),
                upsample_mode=head_upsample_mode,
                align_corners=align_corners,
                dropout=dropout,
                **aux_heads_params,
            )
            self.use_aux_feats = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"]) if b]
        self.init_params()

    @staticmethod
    def init_aux_heads(
        in_channels_list: List[int],
        use_aux_list: List[bool],
        aux_heads_factor: List[int],
        aux_hidden_channels: List[int],
        aux_out_channels: List[int],
        dropout: float,
        upsample_mode: Union[str, UpsampleMode],
        align_corners: Optional[bool] = None,
    ):
        """
        :param use_aux_list: whether to append to auxiliary head per encoder stage.
        :param in_channels_list: list of input channels to the auxiliary segmentation heads.
        :param aux_heads_factor: list of upsample scale factors to apply at the end of the auxiliary segmentation heads.
        :param aux_hidden_channels: list of segmentation heads hidden channels.
        :param aux_out_channels: list of segmentation heads out channels, usually set as num_classes or 1 for detail
            edge heads.
        :param dropout: dropout probability factor.
        :param upsample_mode: see UpsampleMode for supported options.
        :return: nn.ModuleList
        """
        heads = nn.ModuleList(
            [
                nn.Sequential(
                    SegmentationHead(ch, hid_ch, out_ch, dropout=dropout),
                    make_upsample_module(scale_factor=scale, upsample_mode=upsample_mode, align_corners=align_corners),
                )
                for ch, scale, hid_ch, out_ch, use_aux in zip(in_channels_list, aux_heads_factor, aux_hidden_channels, aux_out_channels, use_aux_list)
                if use_aux
            ]
        )
        return heads

    def forward(self, x):
        encoder_feats = self.encoder(x)
        x = self.decoder(encoder_feats)
        x = self.seg_head(x)
        if not self.use_aux_heads:
            return x
        encoder_feats = [f for i, f in enumerate(encoder_feats) if self.use_aux_feats[i]]
        aux_feats = [aux_head(feat) for feat, aux_head in zip(encoder_feats[-len(self.aux_heads) :], self.aux_heads)]
        aux_feats.reverse()
        return tuple([x] + aux_feats)

    def _remove_auxiliary_heads(self):
        if hasattr(self, "aux_heads"):
            del self.aux_heads

    @property
    def backbone(self):
        return self.encoder

    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """
        Custom param groups for training:
            - Different lr for head and rest, if `multiply_head_lr` key is in `training_params`.
        """
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

        multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
        param_groups = [
            {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
            {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
        ]

        return param_groups

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)
        for param_group in param_groups:
            param_group["lr"] = lr
            if param_group["name"] == "multiply_lr_params":
                param_group["lr"] *= multiply_head_lr
        return param_groups

    def _separate_lr_multiply_params(self):
        multiply_lr_params, no_multiply_params = {}, {}
        for name, param in self.named_parameters():
            if "backbone." in name:
                no_multiply_params[name] = param
            else:
                multiply_lr_params[name] = param
        return multiply_lr_params.items(), no_multiply_params.items()

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, append_sigmoid: bool = False, append_softmax: bool = False, **kwargs):
        super().prep_model_for_conversion(input_size=input_size, **kwargs)
        fuse_repvgg_blocks_residual_branches(self)
        if append_sigmoid:
            self.seg_head.add_module("sigmoid", nn.Sigmoid())
        if append_softmax:
            self.seg_head.add_module("softmax", nn.Softmax(dim=1))

    def replace_head(self, new_num_classes: int, **kwargs):
        for module in self.modules():
            if isinstance(module, SegmentationHead):
                module.replace_num_classes(new_num_classes)
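
The gating of the auxiliary heads above can be summarized with plain Python: a head is only created for stages where use_aux_list and the backbone's is_out_feature_list are both True. The lists below are hypothetical.

use_aux_list = [False, True, True, True]
is_out_feature_list = [False, False, True, True]

effective_use_aux = [a and b for a, b in zip(use_aux_list, is_out_feature_list)]
print(effective_use_aux)  # [False, False, True, True] -> aux heads only on the last two stages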

__init__(num_classes, use_aux_heads, final_upsample_factor, head_hidden_channels, head_upsample_mode, align_corners, backbone_params, context_module, decoder_params, aux_heads_params, dropout)

Parameters:

Name Type Description Default
num_classes int

num classes to predict.

required
use_aux_heads bool

Whether to use auxiliary heads.

required
final_upsample_factor int

Final upsample scale factor after the segmentation head.

required
head_hidden_channels Optional[int]

num channels before the last classification layer. see mid_channels in SegmentationHead class.

required
head_upsample_mode Union[UpsampleMode, str]

UpsampleMode of segmentation and auxiliary heads.

required
align_corners bool

align_corners arg of segmentation and auxiliary heads.

required
backbone_params dict

params to build a UNetBackboneBase, include the following keys:

- strides_list: List[int], list of stride per stage.
- width_list: List[int], list of num channels per stage.
- num_blocks_list: List[int], list of num blocks per stage.
- block_types_list: List[Union[DownBlockType, int]], list of block types per stage.
- is_out_feature_list: List[bool], list of flags whether stage features should be an output.
- in_channels: int, num channels of the input to the backbone module.
- block_params: dict, arguments to be passed to the block type constructors, e.g. for RegnetXStage block_params should include bottleneck_ratio, group_width and se_ratio.

required
decoder_params dict

params to build a Decoder, include the following keys:

- up_block_repeat_list: List[int], num of blocks per decoder stage, the block implementation depends on the up-block type.
- skip_expansion: float, skip expansion ratio value; before fusing the skip features from the encoder with the decoder features, a projection convolution is applied upon the encoder features to project the num_channels by skip_expansion.
- decoder_scale: float, num_channels width ratio between encoder stages and decoder stages.
- up_blocks: List[Type[AbstractUpFuseBlock]], list of AbstractUpFuseBlock types.
- is_skip_list: List[bool], list of flags whether to use the feature-map from an encoder stage as a skip connection or not.

required
aux_heads_params dict

params to initiate auxiliary heads, include the following keys:

- use_aux_list: List[bool], whether to append an auxiliary head per encoder stage.
- aux_heads_factor: List[int], upsample factor per encoder stage.
- aux_hidden_channels: List[int], hidden num channels before the last classification layer, per encoder stage.
- aux_out_channels: List[int], output channels (can be referred to as num_classes) of the auxiliary head per encoder stage.

required
dropout float

dropout probability of segmentation and auxiliary heads.

required
Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 21-104)
@resolve_param("context_module", ContextModulesFactory())
def __init__(
    self,
    num_classes: int,
    use_aux_heads: bool,
    final_upsample_factor: int,
    head_hidden_channels: Optional[int],
    head_upsample_mode: Union[UpsampleMode, str],
    align_corners: bool,
    backbone_params: dict,
    context_module: AbstractContextModule,
    decoder_params: dict,
    aux_heads_params: dict,
    dropout: float,
):
    """
    :param num_classes: num classes to predict.
    :param use_aux_heads: Whether to use auxiliary heads.
    :param final_upsample_factor: Final upsample scale factor after the segmentation head.
    :param head_hidden_channels: num channels before the last classification layer. see `mid_channels` in
        `SegmentationHead` class.
    :param head_upsample_mode: UpsampleMode of segmentation and auxiliary heads.
    :param align_corners: align_corners arg of segmentation and auxiliary heads.
    :param backbone_params: params to build a `UNetBackboneBase`, include the following keys:
        - strides_list: List[int], list of stride per stage.
        - width_list: List[int], list of num channels per stage.
        - num_blocks_list: List[int], list of num blocks per stage.
        - block_types_list: List[Union[DownBlockType, int]], list of block types per stage.
        - is_out_feature_list: List[bool], list of flags whether stage features should be an output.
        - in_channels: int, num channels of the input to the backbone module.
        - block_params: dict, argument to be passed to the block types constructors. i.e for `RegnetXStage`
            block_params should include bottleneck_ratio, group_width and se_ratio.
    :param decoder_params: params to build a `Decoder`, include the following keys:
        - up_block_repeat_list: List[int], num of blocks per decoder stage, the `block` implementation depends on
            the up-block type.
        - skip_expansion: float, skip expansion ratio value, before fusing the skip features from the encoder with
            the decoder features, a projection convolution is applied upon the encoder features to project the
            num_channels by skip_expansion.
        - decoder_scale: float, num_channels width ratio between encoder stages and decoder stages.
        - up_blocks: List[Type[AbstractUpFuseBlock]], list of AbstractUpFuseBlock types.
        - is_skip_list: List[bool], List of flags whether to use feature-map from encoder stage as skip connection
            or not.
    :param aux_heads_params: params to initiate auxiliary heads, include the following keys:
        - use_aux_list: List[bool], whether to append to auxiliary head per encoder stage.
        - aux_heads_factor: List[int], Upsample factor per encoder stage.
        - aux_hidden_channels: List[int], Hidden num channels before last classification layer, per encoder stage.
        - aux_out_channels: List[int], Output channels (can be referred to as num_classes) of the auxiliary head per encoder
            stage.
    :param dropout: dropout probability of segmentation and auxiliary heads.
    """
    super().__init__(use_aux_heads=use_aux_heads)
    self.num_classes = num_classes
    # Init Backbone
    backbone = UNetBackboneBase(**backbone_params)
    # Init Encoder
    self.encoder = Encoder(backbone, context_module)
    # Init Decoder
    self.decoder = Decoder(skip_channels_list=self.encoder.get_output_number_of_channels(), **decoder_params)
    # Init Segmentation Head
    self.seg_head = nn.Sequential(
        SegmentationHead(
            in_channels=self.decoder.up_channels_list[-1],
            mid_channels=head_hidden_channels or self.decoder.up_channels_list[-1],
            num_classes=self.num_classes,
            dropout=dropout,
        ),
        nn.Identity()
        if final_upsample_factor == 1
        else make_upsample_module(scale_factor=final_upsample_factor, upsample_mode=head_upsample_mode, align_corners=align_corners),
    )
    # Init Aux Heads
    if self.use_aux_heads:
        # Aux heads are applied if both conditions are true, use_aux_list is set as True and the correspondent
        # backbone features are outputted and set as True in backbone is_out_feature_list.
        aux_heads_params["use_aux_list"] = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"])]
        self.aux_heads = self.init_aux_heads(
            in_channels_list=self.encoder.get_all_number_of_channels(),
            upsample_mode=head_upsample_mode,
            align_corners=align_corners,
            dropout=dropout,
            **aux_heads_params,
        )
        self.use_aux_feats = [a and b for a, b in zip(aux_heads_params["use_aux_list"], backbone_params["is_out_feature_list"]) if b]
    self.init_params()

init_aux_heads(in_channels_list, use_aux_list, aux_heads_factor, aux_hidden_channels, aux_out_channels, dropout, upsample_mode, align_corners=None) staticmethod

Parameters:

Name Type Description Default
use_aux_list List[bool]

whether to append to auxiliary head per encoder stage.

required
in_channels_list List[int]

list of input channels to the auxiliary segmentation heads.

required
aux_heads_factor List[int]

list of upsample scale factors to apply at the end of the auxiliary segmentation heads.

required
aux_hidden_channels List[int]

list of segmentation heads hidden channels.

required
aux_out_channels List[int]

list of segmentation heads out channels, usually set as num_classes or 1 for detail edge heads.

required
dropout float

dropout probability factor.

required
upsample_mode Union[str, UpsampleMode]

see UpsampleMode for supported options.

required

Returns:

Type Description

nn.ModuleList

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 106-138)
@staticmethod
def init_aux_heads(
    in_channels_list: List[int],
    use_aux_list: List[bool],
    aux_heads_factor: List[int],
    aux_hidden_channels: List[int],
    aux_out_channels: List[int],
    dropout: float,
    upsample_mode: Union[str, UpsampleMode],
    align_corners: Optional[bool] = None,
):
    """
    :param use_aux_list: whether to append to auxiliary head per encoder stage.
    :param in_channels_list: list of input channels to the auxiliary segmentation heads.
    :param aux_heads_factor: list of upsample scale factors to apply at the end of the auxiliary segmentation heads.
    :param aux_hidden_channels: list of segmentation heads hidden channels.
    :param aux_out_channels: list of segmentation heads out channels, usually set as num_classes or 1 for detail
        edge heads.
    :param dropout: dropout probability factor.
    :param upsample_mode: see UpsampleMode for supported options.
    :return: nn.ModuleList
    """
    heads = nn.ModuleList(
        [
            nn.Sequential(
                SegmentationHead(ch, hid_ch, out_ch, dropout=dropout),
                make_upsample_module(scale_factor=scale, upsample_mode=upsample_mode, align_corners=align_corners),
            )
            for ch, scale, hid_ch, out_ch, use_aux in zip(in_channels_list, aux_heads_factor, aux_hidden_channels, aux_out_channels, use_aux_list)
            if use_aux
        ]
    )
    return heads

initialize_param_groups(lr, training_params)

Custom param groups for training:

- Different lr for head and rest, if multiply_head_lr key is in training_params.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet.py (lines 159-172)
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """
    Custom param groups for training:
        - Different lr for head and rest, if `multiply_head_lr` key is in `training_params`.
    """
    multiply_head_lr = get_param(training_params, "multiply_head_lr", 1)

    multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params()
    param_groups = [
        {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"},
        {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"},
    ]

    return param_groups

AbstractUpFuseBlock

Bases: nn.Module, ABC

Abstract class for upsample and fuse UNet decoder building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 16-56)
class AbstractUpFuseBlock(nn.Module, ABC):
    """
    Abstract class for upsample and fuse UNet decoder building block.
    """

    def __init__(self, in_channels: int, skip_channels: int, out_channels: int, **kwargs):
        """
        :param in_channels: num_channels of the feature map to be upsample.
        :param skip_channels: num_channels of the skip feature map from higher resolution.
        :param out_channels: num_channels of the output features.
        """
        super().__init__()

    @abstractmethod
    def forward(self, x, skip):
        raise NotImplementedError()

    @staticmethod
    def validate_upsample_mode(
        in_channels: int, up_factor: int, upsample_mode: Union[UpsampleMode, str], fallback_mode: Optional[Union[UpsampleMode, str]] = None
    ) -> Tuple[Union[UpsampleMode, str], int]:
        """
        Validate whether the upsample_mode is supported, and returns the upsample path output channels.
        :return: tuple of upsample_mode and out_channels of the upsample module
        """
        out_channels = in_channels
        upsample_mode = upsample_mode.value if isinstance(upsample_mode, UpsampleMode) else upsample_mode
        if upsample_mode in [UpsampleMode.PIXEL_SHUFFLE.value, UpsampleMode.NN_PIXEL_SHUFFLE.value]:
            # Check if in_channels is divisible by (up_factor ** 2) for pixel shuffle, else fallback to fallback_mode.
            _in_ch = in_channels / (up_factor**2)
            if _in_ch % 1 == 0:
                out_channels = int(_in_ch)
            elif fallback_mode is not None:
                upsample_mode = fallback_mode
            else:
                raise ValueError(
                    f"Upsample mode: {upsample_mode} can't be used, due to in_channels: {in_channels} "
                    f"is not divisible by (up_factor ** 2) for up_factor: {up_factor}.\n"
                    f"Consider setting a `fallback_mode`."
                )
        return upsample_mode, out_channels

__init__(in_channels, skip_channels, out_channels, **kwargs)

Parameters:

Name Type Description Default
in_channels int

num_channels of the feature map to be upsample.

required
skip_channels int

num_channels of the skip feature map from higher resolution.

required
out_channels int

num_channels of the output features.

required
Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 21-27)
def __init__(self, in_channels: int, skip_channels: int, out_channels: int, **kwargs):
    """
    :param in_channels: num_channels of the feature map to be upsample.
    :param skip_channels: num_channels of the skip feature map from higher resolution.
    :param out_channels: num_channels of the output features.
    """
    super().__init__()

validate_upsample_mode(in_channels, up_factor, upsample_mode, fallback_mode=None) staticmethod

Validate whether the upsample_mode is supported, and returns the upsample path output channels.

Returns:

Type Description
Tuple[Union[UpsampleMode, str], int]

tuple of upsample_mode and out_channels of the upsample module

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 33-56)
@staticmethod
def validate_upsample_mode(
    in_channels: int, up_factor: int, upsample_mode: Union[UpsampleMode, str], fallback_mode: Optional[Union[UpsampleMode, str]] = None
) -> Tuple[Union[UpsampleMode, str], int]:
    """
    Validate whether the upsample_mode is supported, and returns the upsample path output channels.
    :return: tuple of upsample_mode and out_channels of the upsample module
    """
    out_channels = in_channels
    upsample_mode = upsample_mode.value if isinstance(upsample_mode, UpsampleMode) else upsample_mode
    if upsample_mode in [UpsampleMode.PIXEL_SHUFFLE.value, UpsampleMode.NN_PIXEL_SHUFFLE.value]:
        # Check if in_channels is divisible by (up_factor ** 2) for pixel shuffle, else fallback to fallback_mode.
        _in_ch = in_channels / (up_factor**2)
        if _in_ch % 1 == 0:
            out_channels = int(_in_ch)
        elif fallback_mode is not None:
            upsample_mode = fallback_mode
        else:
            raise ValueError(
                f"Upsample mode: {upsample_mode} can't be used, due to in_channels: {in_channels} "
                f"is not divisible by (up_factor ** 2) for up_factor: {up_factor}.\n"
                f"Consider setting a `fallback_mode`."
            )
    return upsample_mode, out_channels
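
A worked example of the pixel-shuffle channel check (pure arithmetic, no framework calls): with 64 input channels and an up-factor of 2 the mode is valid and the upsample path outputs 16 channels, while 50 input channels would trigger the fallback (or the ValueError if no fallback_mode is given).

in_channels, up_factor = 64, 2
out_channels = in_channels / up_factor**2
print(out_channels % 1 == 0, int(out_channels))  # True 16 -> pixel shuffle is applicable

in_channels = 50
print((in_channels / up_factor**2) % 1 == 0)  # False -> fall back to `fallback_mode`, or raise if it is None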

Decoder

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 163-239)
class Decoder(nn.Module):
    @resolve_param("up_block_types", ListFactory(TypeFactory(UP_FUSE_BLOCKS)))
    def __init__(
        self,
        skip_channels_list: List[int],
        up_block_repeat_list: List[int],
        skip_expansion: float,
        decoder_scale: float,
        up_block_types: List[Type[AbstractUpFuseBlock]],
        is_skip_list: List[bool],
        min_decoder_channels: int = 1,
        **up_block_kwargs,
    ):
        """

        :param skip_channels_list: num_channels list of skip feature maps from the encoder.
        :param up_block_repeat_list: `num_repeats` arg list to be passed to the UpFuseBlocks.
        :param skip_expansion: skip expansion ratio value, before fusing the skip features from the encoder with the
            decoder features, a projection convolution is applied upon the encoder features to project the num_channels
            by skip_expansion as follows: `num_channels = skip_channels * skip_expansion
        :param decoder_scale: num_channels width ratio between encoder stages and decoder stages.
        :param min_decoder_channels: The minimum num_channels of decoder stages. Useful i.e if we want to keep the width
            above the num of classes. The num_channels of a decoder stage is determined as follows:
                `decoder_channels = max(encoder_channels * decoder_scale, min_decoder_channels)`
        :param up_block_types: list of AbstractUpFuseBlock.
        :param is_skip_list: List of flags whether to use feature-map from encoder stage as skip connection or not. Used
            to not apply projection convolutions if a certain encoder feature is not aggregate with the decoder.
        :param up_block_kwargs: init parameters for fuse blocks.
        """
        super().__init__()
        # num_channels list after encoder features projections.
        self.up_channels_list = [max(int(ch * decoder_scale), min_decoder_channels) for ch in skip_channels_list]
        # Reverse order to up-bottom order, i.e [stage4_ch, stage3_ch, ... , stage1_ch]
        self.up_channels_list.reverse()
        # Remove last stage num_channels, as it is the input to the decoder.
        self.up_channels_list.pop(0)

        is_skip_list.reverse()
        is_skip_list += [False]

        self.projection_blocks, skip_channels_list = self._make_skip_projection(skip_channels_list, skip_expansion, is_skip_list, min_decoder_channels)
        skip_channels_list = skip_channels_list.copy()
        skip_channels_list.reverse()

        self.up_stages = nn.ModuleList()
        in_channels = skip_channels_list.pop(0)
        skip_channels_list.append(None)
        for i in range(len(up_block_types)):
            self.up_stages.append(
                up_block_types[i](in_channels, skip_channels_list[i], self.up_channels_list[i], num_repeats=up_block_repeat_list[i], **up_block_kwargs)
            )
            in_channels = self.up_channels_list[i]

    def _make_skip_projection(self, skip_channels_list: list, skip_expansion: float, is_skip_list: list, min_decoder_channels: int):
        if skip_expansion == 1.0:
            return nn.ModuleList([CrossModelSkipConnection()] * len(skip_channels_list)), skip_channels_list

        projection_channels = [max(int(ch * skip_expansion), min_decoder_channels) for ch in skip_channels_list]
        blocks = nn.ModuleList()
        for i in range(len(skip_channels_list)):
            if not is_skip_list[i]:
                blocks.append(nn.Identity())
                projection_channels[i] = skip_channels_list[i]
            else:
                blocks.append(ConvBNReLU(skip_channels_list[i], projection_channels[i], kernel_size=1, bias=False, use_activation=False))

        return blocks, projection_channels

    def forward(self, feats: List[torch.Tensor]):
        feats = [adapt_conv(feat) for feat, adapt_conv in zip(feats, self.projection_blocks)]
        # Reverse order to up-bottom order, i.e [stage4_ch, stage3_ch, ... , stage1_ch]
        feats.reverse()
        # Remove last stage feature map, as it is the input to the decoder and not a skip connection.
        x = feats.pop(0)
        for up_stage, skip in zip(self.up_stages, feats):
            x = up_stage(x, skip)
        return x
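
The decoder width bookkeeping above can be traced with a small numeric example; the encoder widths, decoder_scale and min_decoder_channels below are hypothetical.

skip_channels_list = [64, 128, 256, 512]
decoder_scale, min_decoder_channels = 0.5, 32

up_channels_list = [max(int(ch * decoder_scale), min_decoder_channels) for ch in skip_channels_list]
up_channels_list.reverse()   # up-bottom order: [256, 128, 64, 32]
up_channels_list.pop(0)      # the deepest stage feeds the decoder, so it is not an output width
print(up_channels_list)      # [128, 64, 32]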

__init__(skip_channels_list, up_block_repeat_list, skip_expansion, decoder_scale, up_block_types, is_skip_list, min_decoder_channels=1, **up_block_kwargs)

Parameters:

Name Type Description Default
skip_channels_list List[int]

num_channels list of skip feature maps from the encoder.

required
up_block_repeat_list List[int]

num_repeats arg list to be passed to the UpFuseBlocks.

required
skip_expansion float

skip expansion ratio value; before fusing the skip features from the encoder with the decoder features, a projection convolution is applied upon the encoder features to project the num_channels by skip_expansion as follows: `num_channels = skip_channels * skip_expansion`

required
decoder_scale float

num_channels width ratio between encoder stages and decoder stages.

required
min_decoder_channels int

The minimum num_channels of decoder stages. Useful e.g. if we want to keep the width above the number of classes. The num_channels of a decoder stage is determined as follows: decoder_channels = max(encoder_channels * decoder_scale, min_decoder_channels)

1
up_block_types List[Type[AbstractUpFuseBlock]]

list of AbstractUpFuseBlock.

required
is_skip_list List[bool]

List of flags whether to use the feature-map from an encoder stage as a skip connection or not. Used to avoid applying projection convolutions if a certain encoder feature is not aggregated with the decoder.

required
up_block_kwargs

init parameters for fuse blocks.

{}
Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 164-214)
@resolve_param("up_block_types", ListFactory(TypeFactory(UP_FUSE_BLOCKS)))
def __init__(
    self,
    skip_channels_list: List[int],
    up_block_repeat_list: List[int],
    skip_expansion: float,
    decoder_scale: float,
    up_block_types: List[Type[AbstractUpFuseBlock]],
    is_skip_list: List[bool],
    min_decoder_channels: int = 1,
    **up_block_kwargs,
):
    """

    :param skip_channels_list: num_channels list of skip feature maps from the encoder.
    :param up_block_repeat_list: `num_repeats` arg list to be passed to the UpFuseBlocks.
    :param skip_expansion: skip expansion ratio value, before fusing the skip features from the encoder with the
        decoder features, a projection convolution is applied upon the encoder features to project the num_channels
        by skip_expansion as follows: `num_channels = skip_channels * skip_expansion
    :param decoder_scale: num_channels width ratio between encoder stages and decoder stages.
    :param min_decoder_channels: The minimum num_channels of decoder stages. Useful i.e if we want to keep the width
        above the num of classes. The num_channels of a decoder stage is determined as follows:
            `decoder_channels = max(encoder_channels * decoder_scale, min_decoder_channels)`
    :param up_block_types: list of AbstractUpFuseBlock.
    :param is_skip_list: List of flags whether to use feature-map from encoder stage as skip connection or not. Used
        to not apply projection convolutions if a certain encoder feature is not aggregate with the decoder.
    :param up_block_kwargs: init parameters for fuse blocks.
    """
    super().__init__()
    # num_channels list after encoder features projections.
    self.up_channels_list = [max(int(ch * decoder_scale), min_decoder_channels) for ch in skip_channels_list]
    # Reverse order to up-bottom order, i.e [stage4_ch, stage3_ch, ... , stage1_ch]
    self.up_channels_list.reverse()
    # Remove last stage num_channels, as it is the input to the decoder.
    self.up_channels_list.pop(0)

    is_skip_list.reverse()
    is_skip_list += [False]

    self.projection_blocks, skip_channels_list = self._make_skip_projection(skip_channels_list, skip_expansion, is_skip_list, min_decoder_channels)
    skip_channels_list = skip_channels_list.copy()
    skip_channels_list.reverse()

    self.up_stages = nn.ModuleList()
    in_channels = skip_channels_list.pop(0)
    skip_channels_list.append(None)
    for i in range(len(up_block_types)):
        self.up_stages.append(
            up_block_types[i](in_channels, skip_channels_list[i], self.up_channels_list[i], num_repeats=up_block_repeat_list[i], **up_block_kwargs)
        )
        in_channels = self.up_channels_list[i]

UpCatBlock

Bases: AbstractUpFuseBlock

Fuse features with concatenation followed by convolutions.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 91-122)
@register_unet_up_block()
class UpCatBlock(AbstractUpFuseBlock):
    """
    Fuse features with concatenation followed by convolutions.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        mode: Union[UpsampleMode, str],
        num_repeats: int,
        fallback_mode: Optional[Union[UpsampleMode, str]] = None,
        **kwargs,
    ):
        super().__init__(in_channels=in_channels, skip_channels=skip_channels, out_channels=out_channels)

        mode, up_out_channels = self.validate_upsample_mode(in_channels, up_factor=up_factor, upsample_mode=mode, fallback_mode=fallback_mode)

        self.up_path = make_upsample_module(scale_factor=up_factor, upsample_mode=mode, align_corners=False)

        self.last_convs = nn.Sequential(
            ConvBNReLU(up_out_channels + skip_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.Sequential(*[ConvBNReLU(out_channels, out_channels, kernel_size=3, padding=1, bias=False) for _ in range(num_repeats - 1)]),
        )

    def forward(self, x, skip):
        x = self.up_path(x)
        x = torch.cat([x, skip], dim=1)
        return self.last_convs(x)
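
A minimal shape sketch for UpCatBlock. The import path follows the "Source code in" header above, and the "bilinear" mode string is assumed to be a supported upsample mode; the channel and resolution values are arbitrary.

import torch
from super_gradients.training.models.segmentation_models.unet.unet_decoder import UpCatBlock

block = UpCatBlock(in_channels=128, skip_channels=64, out_channels=64, up_factor=2, mode="bilinear", num_repeats=2)
x = torch.randn(1, 128, 16, 16)    # low-resolution decoder features
skip = torch.randn(1, 64, 32, 32)  # higher-resolution encoder skip features
print(block(x, skip).shape)        # expected: torch.Size([1, 64, 32, 32])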

UpFactorBlock

Bases: AbstractUpFuseBlock

Ignore skip features; simply apply upsampling and ConvBNReLU layers.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 59-88)
@register_unet_up_block()
class UpFactorBlock(AbstractUpFuseBlock):
    """
    Ignore skip features; simply apply upsampling and ConvBNReLU layers.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        mode: Union[UpsampleMode, str],
        num_repeats: int,
        fallback_mode: Optional[Union[UpsampleMode, str]] = None,
        **kwargs,
    ):
        super().__init__(in_channels=in_channels, skip_channels=0, out_channels=out_channels)

        mode, up_out_channels = self.validate_upsample_mode(in_channels, up_factor=up_factor, upsample_mode=mode, fallback_mode=fallback_mode)
        self.up_path = make_upsample_module(scale_factor=up_factor, upsample_mode=mode, align_corners=False)

        self.last_convs = nn.Sequential(
            ConvBNReLU(up_out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.Sequential(*[ConvBNReLU(out_channels, out_channels, kernel_size=3, padding=1, bias=False) for _ in range(num_repeats - 1)]),
        )

    def forward(self, x, skip):
        x = self.up_path(x)
        return self.last_convs(x)

UpSumBlock

Bases: AbstractUpFuseBlock

Fuse features with summation followed by convolutions.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_decoder.py (lines 125-160)
@register_unet_up_block()
class UpSumBlock(AbstractUpFuseBlock):
    """
    Fuse features with summation followed by convolutions.
    """

    def __init__(
        self,
        in_channels: int,
        skip_channels: int,
        out_channels: int,
        up_factor: int,
        mode: Union[UpsampleMode, str],
        num_repeats: int,
        fallback_mode: Optional[Union[UpsampleMode, str]] = None,
        **kwargs,
    ):
        super().__init__(in_channels=in_channels, skip_channels=skip_channels, out_channels=out_channels)
        mode, up_out_channels = self.validate_upsample_mode(in_channels, up_factor=up_factor, upsample_mode=mode, fallback_mode=fallback_mode)

        self.up_path = make_upsample_module(scale_factor=up_factor, upsample_mode=mode, align_corners=False)

        self.proj_conv = (
            Residual() if skip_channels == up_out_channels else ConvBNReLU(skip_channels, up_out_channels, kernel_size=1, bias=False, use_activation=False)
        )

        self.last_convs = nn.Sequential(
            ConvBNReLU(up_out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.Sequential(*[ConvBNReLU(out_channels, out_channels, kernel_size=3, padding=1, bias=False) for _ in range(num_repeats - 1)]),
        )

    def forward(self, x, skip):
        skip = self.proj_conv(skip)
        x = self.up_path(x)
        x = x + skip
        return self.last_convs(x)

AbstractUNetBackbone

Bases: nn.Module, ABC

All backbones for UNet segmentation models must implement this class.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 29-52)
class AbstractUNetBackbone(nn.Module, ABC):
    """
    All backbones for UNet segmentation models must implement this class.
    """

    @abstractmethod
    def get_backbone_output_number_of_channels(self) -> List[int]:
        """
        :return: list of stages num channels.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_all_number_of_channels(self) -> List[int]:
        """
        :return: list of stages num channels.
        """
        raise NotImplementedError()

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """
        :return: list of skip features from different resolutions to be fused by the decoder.
        """
        raise NotImplementedError()

forward(x)

Returns:

Type Description
List[torch.Tensor]

list of skip features from different resolutions to be fused by the decoder.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 48-52)
def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
    """
    :return: list of skip features from different resolutions to be fused by the decoder.
    """
    raise NotImplementedError()

get_all_number_of_channels() abstractmethod

Returns:

Type Description
List[int]

list of stages num channels.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 41-46)
@abstractmethod
def get_all_number_of_channels(self) -> List[int]:
    """
    :return: list of stages num channels.
    """
    raise NotImplementedError()

get_backbone_output_number_of_channels() abstractmethod

Returns:

Type Description
List[int]

list of stages num channels.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py (lines 34-39)
@abstractmethod
def get_backbone_output_number_of_channels(self) -> List[int]:
    """
    :return: list of stages num channels.
    """
    raise NotImplementedError()

BackboneStage

Bases: nn.Module, ABC

BackboneStage abstract class to define a stage in UnetBackbone. Each stage is built from blocks, whose number is defined by num_blocks.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
class BackboneStage(nn.Module, ABC):
    """
    BackboneStage abstract class to define a stage in UnetBackbone. Each stage is built from blocks, whose number is
    defined by `num_blocks`.
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, **kwargs):
        super().__init__()
        self.blocks = self.build_stage(in_channels, out_channels, stride=stride, num_blocks=num_blocks, **kwargs)

    @abstractmethod
    def build_stage(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, **kwargs) -> nn.Sequential:
        raise NotImplementedError()

    def forward(self, x):
        return self.blocks(x)

ConvBaseStage

Bases: BackboneStage, ABC

Base implementation for stages built from a single conv block type, such as the Conv, QARepVGG, and RepVGG stages. Optionally supports different downsample strategies: anti_alias with the AntiAliasDownsample module and max_pool with the nn.MaxPool2d module.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
class ConvBaseStage(BackboneStage, ABC):
    """
    Base implementation for stages built from a single conv block type, such as the Conv, QARepVGG, and RepVGG stages.
    Optionally supports different downsample strategies: `anti_alias` with the `AntiAliasDownsample` module and
    `max_pool` with the `nn.MaxPool2d` module.
    """

    def build_stage(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        num_blocks: int,
        anti_alias: Optional[bool] = None,
        downsample_mode: Optional[Union[str, DownSampleMode]] = None,
        **kwargs,
    ):
        blocks = []
        # Init down-sample module
        if anti_alias is not None:  # captures `False` and `True`
            logger.warning("`anti_alias` argument is deprecated and will be removed in future versions. Please set `downsample_mode='anti_alias'` instead.")
        if anti_alias:
            if downsample_mode is not None:
                raise ValueError(f"Only one argument should set as downsample_mode found: anti_alias: `True`," f" and downsample_mode: {downsample_mode}.")
            downsample_mode = DownSampleMode.ANTI_ALIAS

        if downsample_mode is not None and stride == 2:
            blocks.append(make_downsample_module(in_channels, stride=stride, downsample_mode=downsample_mode))
            stride = 1

        # Conv blocks built by the concrete subclass via build_conv_block()
        blocks.extend(
            [
                self.build_conv_block(in_channels, out_channels, stride=stride),
                *[self.build_conv_block(out_channels, out_channels, stride=1) for _ in range(num_blocks - 1)],
            ]
        )
        return nn.Sequential(*blocks)

    @abstractmethod
    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        raise NotImplementedError()

ConvStage

Bases: ConvBaseStage

Conv stage with ConvBNReLU as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class ConvStage(ConvBaseStage):
    """
    Conv stage with ConvBNReLU as building block.
    """

    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        return ConvBNReLU(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
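
A short, hedged usage sketch of a conv stage built through build_stage above; the values are illustrative, and passing downsample_mode as a plain string is an assumption about how make_downsample_module resolves it.

import torch

# Hypothetical usage; whether downsample_mode accepts the plain string
# "anti_alias" or requires the DownSampleMode enum depends on how
# make_downsample_module resolves it, so treat the argument as a sketch.
stage = ConvStage(in_channels=32, out_channels=64, stride=2, num_blocks=3,
                  downsample_mode="anti_alias")
y = stage(torch.randn(1, 32, 64, 64))
print(y.shape)  # expected: torch.Size([1, 64, 32, 32])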

Encoder

Bases: nn.Module

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
class Encoder(nn.Module):
    def __init__(self, backbone: AbstractUNetBackbone, context_module: Optional[nn.Module]):
        super().__init__()
        self.backbone = backbone
        self.context_module = nn.Identity() if context_module is None else context_module

    def forward(self, x):
        feats = self.backbone(x)
        feats[-1] = self.context_module(feats[-1])
        return feats

    def get_output_number_of_channels(self) -> List[int]:
        """
        Return the list of encoder output channels: the backbone output channels, where the last entry is replaced by
        the context module output channels when the context module returns a different number of channels.
        """
        channels_list = self.backbone.get_backbone_output_number_of_channels()
        if hasattr(self.context_module, "out_channels") and self.context_module.out_channels is not None:
            channels_list[-1] = self.context_module.out_channels
        return channels_list

    def get_all_number_of_channels(self) -> List[int]:
        channels_list = self.backbone.get_all_number_of_channels()
        if hasattr(self.context_module, "output_channels"):
            channels_list[-1] = self.context_module.output_channels()
        return channels_list
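
The sketch below wires the hypothetical TinyBackbone from the earlier example into an Encoder without a context module; it is illustrative only.

import torch

# Hypothetical composition reusing the TinyBackbone sketched earlier; passing
# context_module=None makes the Encoder wrap it with nn.Identity.
encoder = Encoder(backbone=TinyBackbone(), context_module=None)
feats = encoder(torch.randn(1, 3, 256, 256))
print([f.shape[1] for f in feats])              # per-stage channel counts, e.g. [16, 32, 64]
print(encoder.get_output_number_of_channels())  # [16, 32, 64] for this toy backbone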

get_output_number_of_channels()

Return the list of encoder output channels: the backbone output channels, where the last entry is replaced by the context module output channels when the context module returns a different number of channels.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
def get_output_number_of_channels(self) -> List[int]:
    """
    Return the list of encoder output channels: the backbone output channels, where the last entry is replaced by
    the context module output channels when the context module returns a different number of channels.
    """
    channels_list = self.backbone.get_backbone_output_number_of_channels()
    if hasattr(self.context_module, "out_channels") and self.context_module.out_channels is not None:
        channels_list[-1] = self.context_module.out_channels
    return channels_list

QARepVGGStage

Bases: ConvBaseStage

QARepVGG stage with QARepVGGBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class QARepVGGStage(ConvBaseStage):
    """
    QARepVGG stage with QARepVGGBlock as building block.
    """

    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        return QARepVGGBlock(in_channels, out_channels, stride=stride, use_residual_connection=(out_channels == in_channels and stride == 1))

RegnetXStage

Bases: BackboneStage

RegNetX stage with XBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class RegnetXStage(BackboneStage):
    """
    RegNetX stage with XBlock as building block.
    """

    def build_stage(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        num_blocks: int,
        bottleneck_ratio: float,
        group_width: int,
        se_ratio: float,
        droppath_prob: float,
        **kwargs,
    ):
        group_width = self._get_divisable_group_width(out_channels, bottleneck_ratio, group_width)
        return nn.Sequential(
            XBlock(in_channels, out_channels, bottleneck_ratio, group_width, stride, se_ratio, droppath_prob),
            *[XBlock(out_channels, out_channels, bottleneck_ratio, group_width, 1, se_ratio, droppath_prob) for _ in range(num_blocks - 1)],
        )

    @staticmethod
    def _get_divisable_group_width(channels: int, bottleneck_ratio: float, group_width: int) -> int:
        """
        Returns a valid value for group_width when the intermediate channels aren't a multiple of group_width.
        """
        inter_channels = channels // bottleneck_ratio
        # if group_width is higher than the Conv channels, fallback to a regular Conv with group_width = channels.
        if group_width > inter_channels:
            return inter_channels
        group_pow = int(math.log2(group_width))
        for pow in range(group_pow, -1, -1):
            if (inter_channels / 2**pow) % 1 == 0:
                return int(2**pow)
        return 1
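
To make the fallback concrete, here is a small worked example with invented values; it only calls the static helper and does not build a stage.

# Worked example of the group-width fallback (values are invented):
#   channels=96, bottleneck_ratio=1 -> inter_channels = 96
#   requested group_width=48        -> int(log2(48)) = 5, so try 2**5 = 32
#   96 is divisible by 32           -> the stage uses group_width = 32
print(RegnetXStage._get_divisable_group_width(channels=96, bottleneck_ratio=1, group_width=48))  # -> 32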

RepVGGStage

Bases: ConvBaseStage

RepVGG stage with RepVGGBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class RepVGGStage(ConvBaseStage):
    """
    RepVGG stage with RepVGGBlock as building block.
    """

    def build_conv_block(self, in_channels: int, out_channels: int, stride: int):
        return RepVGGBlock(in_channels, out_channels, stride=stride)

STDCStage

Bases: BackboneStage

STDC stage with STDCBlock as building block.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@register_unet_backbone_stage()
class STDCStage(BackboneStage):
    """
    STDC stage with STDCBlock as building block.
    """

    def build_stage(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, steps: int, stdc_downsample_mode: str, **kwargs):
        """
        :param steps: The total number of convs in this module: one 1x1 conv and (steps - 1) 3x3 convs.
        :param stdc_downsample_mode: downsample mode in the STDC block; supported modes are `avg_pool` for
         average pooling and `dw_conv` for depthwise convolution.
        :return:
        """
        self.assert_divisible_channels(out_channels, steps)
        blocks = []
        # STDC blocks
        blocks.extend(
            [
                STDCBlock(in_channels, out_channels, stride=stride, steps=steps, stdc_downsample_mode=stdc_downsample_mode),
                *[STDCBlock(out_channels, out_channels, stride=1, steps=steps, stdc_downsample_mode=stdc_downsample_mode) for _ in range(num_blocks - 1)],
            ]
        )
        return nn.Sequential(*blocks)

    @staticmethod
    def assert_divisible_channels(num_channels: int, steps: int):
        """
        The STDC block refactors the convolution operator by applying several smaller convolutions whose number of
        filters decreases with the number of steps. The ratio between the stage channels and the smallest number of
        channels is `2 ** (steps - 1)`, so this method asserts that the stage number of channels is divisible by that ratio.
        """
        channels_ratio = 2 ** (steps - 1)
        if num_channels % channels_ratio != 0:
            raise AssertionError(
                f"Num channels: {num_channels}, isn't divisible by the channels width ratio:"
                f" {channels_ratio}, when initiating an STDC block with steps: {steps}"
            )
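
A quick, hedged illustration of the channel constraint described above, using invented channel counts.

# Illustrative check of the divisibility rule: with steps=4 the ratio is
# 2 ** (4 - 1) = 8, so 64 channels pass while 60 channels raise.
STDCStage.assert_divisible_channels(num_channels=64, steps=4)  # passes silently
try:
    STDCStage.assert_divisible_channels(num_channels=60, steps=4)
except AssertionError as error:
    print(error)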

assert_divisible_channels(num_channels, steps) staticmethod

The STDC block refactors the convolution operator by applying several smaller convolutions whose number of filters decreases with the number of steps. The ratio between the stage channels and the smallest number of channels is 2 ** (steps - 1), so this method asserts that the stage number of channels is divisible by that ratio.

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
@staticmethod
def assert_divisible_channels(num_channels: int, steps: int):
    """
    The STDC block refactors the convolution operator by applying several smaller convolutions whose number of
    filters decreases with the number of steps. The ratio between the stage channels and the smallest number of
    channels is `2 ** (steps - 1)`, so this method asserts that the stage number of channels is divisible by that ratio.
    """
    channels_ratio = 2 ** (steps - 1)
    if num_channels % channels_ratio != 0:
        raise AssertionError(
            f"Num channels: {num_channels}, isn't divisible by the channels width ratio:"
            f" {channels_ratio}, when initiating an STDC block with steps: {steps}"
        )

build_stage(in_channels, out_channels, stride, num_blocks, steps, stdc_downsample_mode, **kwargs)

Parameters:

Name Type Description Default
steps int

The total number of convs in this module: one 1x1 conv and (steps - 1) 3x3 convs.

required
stdc_downsample_mode str

Downsample mode in the STDC block; supported modes are avg_pool for average pooling and dw_conv for depthwise convolution.

required

Source code in src/super_gradients/training/models/segmentation_models/unet/unet_encoder.py
def build_stage(self, in_channels: int, out_channels: int, stride: int, num_blocks: int, steps: int, stdc_downsample_mode: str, **kwargs):
    """
    :param steps: The total number of convs in this module: one 1x1 conv and (steps - 1) 3x3 convs.
    :param stdc_downsample_mode: downsample mode in the STDC block; supported modes are `avg_pool` for
     average pooling and `dw_conv` for depthwise convolution.
    :return:
    """
    self.assert_divisible_channels(out_channels, steps)
    blocks = []
    # STDC blocks
    blocks.extend(
        [
            STDCBlock(in_channels, out_channels, stride=stride, steps=steps, stdc_downsample_mode=stdc_downsample_mode),
            *[STDCBlock(out_channels, out_channels, stride=1, steps=steps, stdc_downsample_mode=stdc_downsample_mode) for _ in range(num_blocks - 1)],
        ]
    )
    return nn.Sequential(*blocks)

SgModule

Bases: nn.Module, SupportsReplaceInputChannels, SupportsFineTune

Source code in src/super_gradients/training/models/sg_module.py
class SgModule(nn.Module, SupportsReplaceInputChannels, SupportsFineTune):
    def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
        """

        :return: list of dictionaries containing the key 'named_params' with a list of named params
        """
        return [{"named_params": self.named_parameters()}]

    def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
        """

        :param param_groups: list of dictionaries containing the params
        :return: list of dictionaries containing the params
        """
        for param_group in param_groups:
            param_group["lr"] = lr
        return param_groups

    def get_include_attributes(self) -> list:
        """
        This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
        are updated to the EMA model along with the model weights.
        By default, all attributes are updated except for private attributes (starting with '_')
        You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
        you override the default behaviour and only attributes named in this list will be updated.
        Note: This will also override the get_exclude_attributes list.
            :return: list of attributes to update from main model to EMA model
        """
        return []

    def get_exclude_attributes(self) -> list:
        """
        This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
        are updated to the EMA model along with the model weights.
        By default, all attributes are updated except for private attributes (starting with '_')
        You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
        you override the default behaviour and attributes named in this list will also be excluded from update.
        Note: if get_include_attributes is not empty, it will override this list.
            :return: list of attributes to not update from main model to EMA model
        """
        return []

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """

    def replace_head(self, **kwargs):
        """
        Replace final layer for pretrained models. Since this varies between architectures, we leave it to the inheriting
        class to implement.
        """

        raise NotImplementedError

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        """
        Returns a dictionary mapping lr to the unfrozen part of the network, in the same fashion as using initial_lr in training_params
         when calling Trainer.train().
        For example:
            def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
                return {"default": 0, "head": lr}

        :param lr: float, learning rate for the part of the network to be tuned.
        :return: learning rate mapping that can be used by
         super_gradients.training.utils.optimizer_utils.initialize_param_groups
        """
        raise NotImplementedError("Finetune is not implemented for this model, it is required to implement get_finetune_lr_dict.")

get_exclude_attributes()

This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training) are updated to the EMA model along with the model weights. By default, all attributes are updated except for private attributes (starting with '_'). You can either set include_attributes or exclude_attributes. By returning a non-empty list from this function, you override the default behaviour, and attributes named in this list will also be excluded from the update. Note: if get_include_attributes is not empty, it will override this list.

Returns:

Type Description
list

list of attributes to not update from the main model to the EMA model

Source code in src/super_gradients/training/models/sg_module.py
def get_exclude_attributes(self) -> list:
    """
    This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
    are updated to the EMA model along with the model weights.
    By default, all attributes are updated except for private attributes (starting with '_')
    You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
    you override the default behaviour and attributes named in this list will also be excluded from update.
    Note: if get_include_attributes is not empty, it will override this list.
        :return: list of attributes to not update from main model to EMA model
    """
    return []

get_finetune_lr_dict(lr)

Returns a dictionary mapping lr to the unfrozen part of the network, in the same fashion as using initial_lr in training_params when calling Trainer.train(). For example:

    def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
        return {"default": 0, "head": lr}

Parameters:

Name Type Description Default
lr float

float, learning rate for the part of the network to be tuned.

required

Returns:

Type Description
Dict[str, float]

learning rate mapping that can be used by super_gradients.training.utils.optimizer_utils.initialize_param_groups

Source code in src/super_gradients/training/models/sg_module.py
def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
    """
    Returns a dictionary mapping lr to the unfrozen part of the network, in the same fashion as using initial_lr in training_params
     when calling Trainer.train().
    For example:
        def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
            return {"default": 0, "head": lr}

    :param lr: float, learning rate for the part of the network to be tuned.
    :return: learning rate mapping that can be used by
     super_gradients.training.utils.optimizer_utils.initialize_param_groups
    """
    raise NotImplementedError("Finetune is not implemented for this model, it is required to implement get_finetune_lr_dict.")

get_include_attributes()

This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training) are updated to the EMA model along with the model weights. By default, all attributes are updated except for private attributes (starting with '_'). You can either set include_attributes or exclude_attributes. By returning a non-empty list from this function, you override the default behaviour and only attributes named in this list will be updated. Note: This will also override the get_exclude_attributes list.

Returns:

Type Description
list

list of attributes to update from the main model to the EMA model

Source code in src/super_gradients/training/models/sg_module.py
def get_include_attributes(self) -> list:
    """
    This function is used by the EMA. When updating the EMA model, some attributes of the main model (used in training)
    are updated to the EMA model along with the model weights.
    By default, all attributes are updated except for private attributes (starting with '_')
    You can either set include_attributes or exclude_attributes. By returning a non empty list from this function,
    you override the default behaviour and only attributes named in this list will be updated.
    Note: This will also override the get_exclude_attributes list.
        :return: list of attributes to update from main model to EMA model
    """
    return []

initialize_param_groups(lr, training_params)

Returns:

Type Description
list

list of dictionaries containing the key 'named_params' with a list of named params

Source code in src/super_gradients/training/models/sg_module.py
def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list:
    """

    :return: list of dictionaries containing the key 'named_params' with a list of named params
    """
    return [{"named_params": self.named_parameters()}]

prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules with convertible substitutes and remove all auxiliary or training related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in src/super_gradients/training/models/sg_module.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """

replace_head(**kwargs)

Replace final layer for pretrained models. Since this varies between architectures, we leave it to the inheriting class to implement.

Source code in src/super_gradients/training/models/sg_module.py
def replace_head(self, **kwargs):
    """
    Replace final layer for pretrained models. Since this varies between architectures, we leave it to the inheriting
    class to implement.
    """

    raise NotImplementedError

update_param_groups(param_groups, lr, epoch, iter, training_params, total_batch)

Parameters:

Name Type Description Default
param_groups list

list of dictionaries containing the params

required

Returns:

Type Description
list

list of dictionaries containing the params

Source code in src/super_gradients/training/models/sg_module.py
def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list:
    """

    :param param_groups: list of dictionaries containing the params
    :return: list of dictionaries containing the params
    """
    for param_group in param_groups:
        param_group["lr"] = lr
    return param_groups