Models

get_arch_params(config_name, overriding_params=None, recipes_dir_path=None)

Creates an arch-params dictionary, taking defaults from the YAML files in src/super_gradients/recipes/arch_params.

Parameters:

config_name (str, required): Name of the yaml to load (e.g. "resnet18_cifar_arch_params").
overriding_params (Dict, default None): Dictionary-like object containing entries to override.
recipes_dir_path (Optional[str], default None): Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include an "arch_params" folder, which itself should include the config file named after config_name.

Source code in V3_2/src/super_gradients/training/models/arch_params_factory.py
def get_arch_params(config_name: str, overriding_params: Dict = None, recipes_dir_path: Optional[str] = None) -> DictConfig:
    """
    Class for creating arch parameters dictionary, taking defaults from yaml
     files in src/super_gradients/recipes/arch_params.

    :param config_name:         Name of the yaml to load (e.g. "resnet18_cifar_arch_params")
    :param overriding_params: Dict, dictionary like object containing entries to override.
    :param recipes_dir_path:    Optional. Main directory where every recipe are stored. (e.g. ../super_gradients/recipes)
                                This directory should include a "arch_params" folder,
                                which itself should include the config file named after config_name.
    """
    overriding_params = overriding_params if overriding_params else dict()

    arch_params = load_arch_params(config_name=config_name, recipes_dir_path=recipes_dir_path)
    arch_params = hydra.utils.instantiate(arch_params)

    arch_params.update(**overriding_params)

    return arch_params
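
A minimal usage sketch, assuming the default recipes shipped with SuperGradients; the "num_classes" override is only an illustrative key:

from super_gradients.training.models.arch_params_factory import get_arch_params

# Load defaults from recipes/arch_params/resnet18_cifar_arch_params.yaml and override one entry
arch_params = get_arch_params("resnet18_cifar_arch_params", overriding_params={"num_classes": 10})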

BaseClassifier

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/base_classifer.py
class BaseClassifier(SgModule):
    def __init__(
        self,
    ):
        super(BaseClassifier, self).__init__()

    @resolve_param("image_processor", ProcessingFactory())
    def set_dataset_processing_params(
        self,
        class_names: Optional[List[str]] = None,
        image_processor: Optional[Processing] = None,
    ) -> None:
        """Set the processing parameters for the dataset.

        :param class_names:     (Optional) Names of the dataset the model was trained on.
        :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
        """
        self._class_names = class_names or self._class_names
        self._image_processor = image_processor or self._image_processor

    @lru_cache(maxsize=1)
    def _get_pipeline(self, fuse_model: bool = True) -> ClassificationPipeline:
        """Instantiate the prediction pipeline of this model.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        if None in (self._class_names, self._image_processor):
            raise RuntimeError(
                "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first."
            )

        pipeline = ClassificationPipeline(
            model=self,
            image_processor=self._image_processor,
            class_names=self._class_names,
            fuse_model=fuse_model,
        )
        return pipeline

    def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True) -> ImagesPredictions:
        """Predict an image or a list of images.

        :param images:  Images to predict.
        :param batch_size:  Maximum number of images to process at the same time.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model)
        return pipeline(images, batch_size=batch_size)  # type: ignore

    def predict_webcam(self, fuse_model: bool = True):
        """Predict using webcam.
        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
        """
        pipeline = self._get_pipeline(fuse_model=fuse_model)
        pipeline.predict_webcam()
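
A hedged usage sketch: a classifier deriving from BaseClassifier exposes predict once the dataset processing parameters have been set (the class names, Processing object and image path below are placeholders, not values from this page):

model.set_dataset_processing_params(
    class_names=["cat", "dog"],          # placeholder class names
    image_processor=my_image_processor,  # a Processing instance, assumed to exist
)
predictions = model.predict("path/to/image.jpg", batch_size=16)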

predict(images, batch_size=32, fuse_model=True)

Predict an image or a list of images.

Parameters:

images (ImageSource, required): Images to predict.
batch_size (int, default 32): Maximum number of images to process at the same time.
fuse_model (bool, default True): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

Source code in V3_2/src/super_gradients/training/models/classification_models/base_classifer.py
def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True) -> ImagesPredictions:
    """Predict an image or a list of images.

    :param images:  Images to predict.
    :param batch_size:  Maximum number of images to process at the same time.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model)
    return pipeline(images, batch_size=batch_size)  # type: ignore

predict_webcam(fuse_model=True)

Predict using webcam.

Parameters:

fuse_model (bool, default True): If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.

Source code in V3_2/src/super_gradients/training/models/classification_models/base_classifer.py
def predict_webcam(self, fuse_model: bool = True):
    """Predict using webcam.
    :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
    """
    pipeline = self._get_pipeline(fuse_model=fuse_model)
    pipeline.predict_webcam()

set_dataset_processing_params(class_names=None, image_processor=None)

Set the processing parameters for the dataset.

Parameters:

class_names (Optional[List[str]], default None): (Optional) Class names of the dataset the model was trained on.
image_processor (Optional[Processing], default None): (Optional) Image processing objects to reproduce the dataset preprocessing used for training.

Source code in V3_2/src/super_gradients/training/models/classification_models/base_classifer.py
@resolve_param("image_processor", ProcessingFactory())
def set_dataset_processing_params(
    self,
    class_names: Optional[List[str]] = None,
    image_processor: Optional[Processing] = None,
) -> None:
    """Set the processing parameters for the dataset.

    :param class_names:     (Optional) Names of the dataset the model was trained on.
    :param image_processor: (Optional) Image processing objects to reproduce the dataset preprocessing used for training.
    """
    self._class_names = class_names or self._class_names
    self._image_processor = image_processor or self._image_processor

BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)

Model from official source: https://github.com/microsoft/unilm/tree/master/beit

At this point only the 1k fine-tuned classification weights and model configs have been added; see the original source above for pre-training models and procedure.

Modifications by / Copyright 2021 Ross Wightman, original copyrights below

Beit

Bases: SgModule

Vision Transformer with support for patch or hybrid CNN input stage

Source code in V3_2/src/super_gradients/training/models/classification_models/beit.py
class Beit(SgModule):
    """Vision Transformer with support for patch or hybrid CNN input stage"""

    def __init__(
        self,
        image_size=(224, 224),
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        global_pool="avg",
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        use_abs_pos_emb=True,
        use_rel_pos_bias=False,
        use_shared_rel_pos_bias=False,
        head_init_scale=0.001,
        **kwargs,
    ):
        super().__init__()
        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.grad_checkpointing = False

        self.patch_embed = PatchEmbed(img_size=image_size, patch_size=patch_size, in_channels=in_chans, hidden_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None
        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.grid_size, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    init_values=init_values,
                    window_size=self.patch_embed.grid_size if use_rel_pos_bias else None,
                )
                for i in range(depth)
            ]
        )
        use_fc_norm = self.global_pool == "avg"
        self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else None
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)
        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=0.02)
        trunc_normal_(self.cls_token, std=0.02)
        # trunc_normal_(self.mask_token, std=.02)
        self.fix_init_weight()
        if isinstance(self.head, nn.Linear):
            trunc_normal_(self.head.weight, std=0.02)
            self.head.weight.data.mul_(head_init_scale)
            self.head.bias.data.mul_(head_init_scale)

    def fix_init_weight(self):
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        nwd = {"pos_embed", "cls_token"}
        for n, _ in self.named_parameters():
            if "relative_position_bias_table" in n:
                nwd.add(n)
        return nwd

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        matcher = dict(
            stem=r"^cls_token|pos_embed|patch_embed|rel_pos_bias",  # stem and embed
            blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
        )
        return matcher

    @torch.jit.ignore
    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=None):
        self.num_classes = num_classes
        if global_pool is not None:
            self.global_pool = global_pool
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
            else:
                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
        x = self.norm(x)
        return x

    def forward_head(self, x, pre_logits: bool = False):
        if self.fc_norm is not None:
            x = x[:, 1:].mean(dim=1)
            x = self.fc_norm(x)
        else:
            x = x[:, 0]
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head = nn.Linear(self.head.in_features, new_num_classes)
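
A hedged construction sketch with a ViT-Base/16-like configuration; in practice these arch params normally come from the SuperGradients recipes rather than being passed by hand:

import torch

# Default-sized Beit (embed_dim=768, depth=12, num_heads=12) on 224x224 inputs
model = Beit(image_size=(224, 224), patch_size=16, embed_dim=768, depth=12, num_heads=12, num_classes=1000)
logits = model(torch.randn(1, 3, 224, 224))  # expected shape: (1, 1000)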

Mlp

Bases: nn.Module

MLP as used in Vision Transformer, MLP-Mixer and related networks

Source code in V3_2/src/super_gradients/training/models/classification_models/beit.py
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x
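
A small usage sketch of the Mlp block with ViT-Base-like dimensions (values are illustrative):

import torch

mlp = Mlp(in_features=768, hidden_features=3072, drop=0.1)
out = mlp(torch.randn(2, 197, 768))  # shape is preserved: (2, 197, 768)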

trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0)

Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution $\mathcal{N}(\text{mean}, \text{std}^2)$ with values outside $[a, b]$ redrawn until they are within the bounds. The method used for generating the random values works best when $a \leq \text{mean} \leq b$.

Parameters:

tensor (required): an n-dimensional torch.Tensor
mean (default 0.0): the mean of the normal distribution
std (default 1.0): the standard deviation of the normal distribution
a (default -2.0): the minimum cutoff value
b (default 2.0): the maximum cutoff value

Examples:

>>> w = torch.empty(3, 5)
>>> nn.init.trunc_normal_(w)

Source code in V3_2/src/super_gradients/training/models/classification_models/beit.py
def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    # type: (Tensor, float, float, float, float) -> Tensor
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    :param tensor: an n-dimensional `torch.Tensor`
    :param mean: the mean of the normal distribution
    :param std: the standard deviation of the normal distribution
    :param a: the minimum cutoff value
    :param b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)

DenseNet

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/densenet.py
class DenseNet(SgModule):
    def __init__(self, growth_rate: int, structure: list, num_init_features: int, bn_size: int, drop_rate: float, num_classes: int):
        """
        :param growth_rate:         number of filter to add each layer (noted as 'k' in the paper)
        :param structure:           how many layers in each pooling block - sequentially
        :param num_init_features:   the number of filters to learn in the first convolutional layer
        :param bn_size:             multiplicative factor for the number of bottle neck layers
                                        (i.e. bn_size * k featurs in the bottleneck)
        :param drop_rate:           dropout rate after each dense layer
        :param num_classes:         number of classes in the classification task
        """
        super(DenseNet, self).__init__()

        # First convolution
        self.features = nn.Sequential(
            OrderedDict(
                [
                    ("conv0", nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                    ("norm0", nn.BatchNorm2d(num_init_features)),
                    ("relu0", nn.ReLU(inplace=True)),
                    ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
                ]
            )
        )

        # Each denseblock
        num_features = num_init_features
        for i, num_layers in enumerate(structure):
            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            self.features.add_module("denseblock%d" % (i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != len(structure) - 1:
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
                self.features.add_module("transition%d" % (i + 1), trans)
                num_features = num_features // 2

        # Final batch norm
        self.features.add_module("norm5", nn.BatchNorm2d(num_features))

        # Linear layer
        self.classifier = nn.Linear(num_features, num_classes)

        # Official init from torch repo.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = torch.flatten(out, 1)
        out = self.classifier(out)
        return out
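
A hedged construction sketch with a DenseNet-121-style configuration (growth_rate=32, structure=[6, 12, 24, 16]); in SuperGradients these values normally come from the arch-params recipes:

import torch

model = DenseNet(growth_rate=32, structure=[6, 12, 24, 16], num_init_features=64, bn_size=4, drop_rate=0.0, num_classes=1000)
logits = model(torch.randn(1, 3, 224, 224))  # expected shape: (1, 1000)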

__init__(growth_rate, structure, num_init_features, bn_size, drop_rate, num_classes)

Parameters:

growth_rate (int, required): number of filters to add per layer (noted as 'k' in the paper)
structure (list, required): how many layers in each pooling block - sequentially
num_init_features (int, required): the number of filters to learn in the first convolutional layer
bn_size (int, required): multiplicative factor for the number of bottleneck layers (i.e. bn_size * k features in the bottleneck)
drop_rate (float, required): dropout rate after each dense layer
num_classes (int, required): number of classes in the classification task

Source code in V3_2/src/super_gradients/training/models/classification_models/densenet.py
def __init__(self, growth_rate: int, structure: list, num_init_features: int, bn_size: int, drop_rate: float, num_classes: int):
    """
    :param growth_rate:         number of filter to add each layer (noted as 'k' in the paper)
    :param structure:           how many layers in each pooling block - sequentially
    :param num_init_features:   the number of filters to learn in the first convolutional layer
    :param bn_size:             multiplicative factor for the number of bottle neck layers
                                    (i.e. bn_size * k featurs in the bottleneck)
    :param drop_rate:           dropout rate after each dense layer
    :param num_classes:         number of classes in the classification task
    """
    super(DenseNet, self).__init__()

    # First convolution
    self.features = nn.Sequential(
        OrderedDict(
            [
                ("conv0", nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                ("norm0", nn.BatchNorm2d(num_init_features)),
                ("relu0", nn.ReLU(inplace=True)),
                ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
            ]
        )
    )

    # Each denseblock
    num_features = num_init_features
    for i, num_layers in enumerate(structure):
        block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
        self.features.add_module("denseblock%d" % (i + 1), block)
        num_features = num_features + num_layers * growth_rate
        if i != len(structure) - 1:
            trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
            self.features.add_module("transition%d" % (i + 1), trans)
            num_features = num_features // 2

    # Final batch norm
    self.features.add_module("norm5", nn.BatchNorm2d(num_features))

    # Linear layer
    self.classifier = nn.Linear(num_features, num_classes)

    # Official init from torch repo.
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.constant_(m.bias, 0)

Dual Path Networks in PyTorch.

Credits: https://github.com/kuangliu/pytorch-cifar/blob/master/models/dpn.py

EfficientNet model class, based on "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" (https://arxiv.org/abs/1905.11946). Code source: https://github.com/lukemelas/EfficientNet-PyTorch. Pre-trained checkpoints, converted to Deci's code base with the reported accuracy, can be found in the S3 repo.

BlockDecoder

Bases: object

Block Decoder for readability, straight from the official TensorFlow repository.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
class BlockDecoder(object):
    """Block Decoder for readability, straight from the official TensorFlow repository."""

    @staticmethod
    def _decode_block_string(block_string: str) -> BlockArgs:
        """Get a block through a string notation of arguments.

        :param block_string: A string notation of arguments. Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
        :return:     BlockArgs: The namedtuple defined at the top of this file.
        """
        assert isinstance(block_string, str)

        ops = block_string.split("_")
        options = {}
        for op in ops:
            splits = re.split(r"(\d.*)", op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        # Check stride
        assert ("s" in options and len(options["s"]) == 1) or (len(options["s"]) == 2 and options["s"][0] == options["s"][1])

        return BlockArgs(
            num_repeat=int(options["r"]),
            kernel_size=int(options["k"]),
            stride=[int(options["s"][0])],
            expand_ratio=int(options["e"]),
            input_filters=int(options["i"]),
            output_filters=int(options["o"]),
            se_ratio=float(options["se"]) if "se" in options else None,
            id_skip=("noskip" not in block_string),
        )

    @staticmethod
    def _encode_block_string(block) -> str:
        """Encode a block to a string.

        :param block: A BlockArgs type argument (NamedTuple)
        :return: block_string: A String form of BlockArgs.
        """
        args = [
            "r%d" % block.num_repeat,
            "k%d" % block.kernel_size,
            "s%d%d" % (block.strides[0], block.strides[1]),
            "e%s" % block.expand_ratio,
            "i%d" % block.input_filters,
            "o%d" % block.output_filters,
        ]
        if 0 < block.se_ratio <= 1:
            args.append("se%s" % block.se_ratio)
        if block.id_skip is False:
            args.append("noskip")
        return "_".join(args)

    @staticmethod
    def decode(string_list: List[str]) -> List[BlockArgs]:
        """Decode a list of string notations to specify blocks inside the network.

        :param string_list:     List of strings, each string is a notation of block.
        :return blocks_args:    List of BlockArgs namedtuples of block args.
        """
        assert isinstance(string_list, list)
        blocks_args = []
        for block_string in string_list:
            blocks_args.append(BlockDecoder._decode_block_string(block_string))
        return blocks_args

    @staticmethod
    def encode(blocks_args: List):
        """Encode a list of BlockArgs to a list of strings.

        :param blocks_args: A list of BlockArgs namedtuples of block args. (list[namedtuples])
        :return: block_strings: A list of strings, each string is a notation of block.
        """
        block_strings = []
        for block in blocks_args:
            block_strings.append(BlockDecoder._encode_block_string(block))
        return block_strings
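
A short sketch of decoding the block-string notation (the strings below follow the standard EfficientNet-B0 convention and are illustrative):

blocks_args = BlockDecoder.decode([
    "r1_k3_s11_e1_i32_o16_se0.25",
    "r2_k3_s22_e6_i16_o24_se0.25",
])
first = blocks_args[0]
print(first.num_repeat, first.kernel_size, first.stride)  # 1 3 [1]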

decode(string_list) staticmethod

Decode a list of string notations to specify blocks inside the network.

Parameters:

string_list (List[str], required): List of strings, each string is a notation of block.

Returns:

List[BlockArgs]: List of BlockArgs namedtuples of block args.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
@staticmethod
def decode(string_list: List[str]) -> List[BlockArgs]:
    """Decode a list of string notations to specify blocks inside the network.

    :param string_list:     List of strings, each string is a notation of block.
    :return blocks_args:    List of BlockArgs namedtuples of block args.
    """
    assert isinstance(string_list, list)
    blocks_args = []
    for block_string in string_list:
        blocks_args.append(BlockDecoder._decode_block_string(block_string))
    return blocks_args

encode(blocks_args) staticmethod

Encode a list of BlockArgs to a list of strings.

Parameters:

blocks_args (List, required): A list of BlockArgs namedtuples of block args. (list[namedtuples])

Returns:

block_strings: A list of strings, each string is a notation of block.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
@staticmethod
def encode(blocks_args: List):
    """Encode a list of BlockArgs to a list of strings.

    :param blocks_args: A list of BlockArgs namedtuples of block args. (list[namedtuples])
    :return: block_strings: A list of strings, each string is a notation of block.
    """
    block_strings = []
    for block in blocks_args:
        block_strings.append(BlockDecoder._encode_block_string(block))
    return block_strings

Conv2dDynamicSamePadding

Bases: nn.Conv2d

2D convolutions like TensorFlow's 'SAME' mode, for a dynamic image size. The padding is computed dynamically in the forward function.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
class Conv2dDynamicSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow, for a dynamic image size.
    The padding is operated in forward function by calculating dynamically.
    """

    # Tips for 'SAME' mode padding.
    #     Given the following:
    #         i: width or height
    #         s: stride
    #         k: kernel size
    #         d: dilation
    #         p: padding
    #     Output after Conv2d:
    #         o = floor((i+p-((k-1)*d+1))/s+1)
    # If o equals i, i = floor((i+p-((k-1)*d+1))/s+1),
    # => p = (i-1)*s+((k-1)*d+1)-i

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
        super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

    def forward(self, x):
        ih, iw = x.size()[-2:]
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)  # change the output size according to stride ! ! !
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
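
A quick sketch of the dynamic 'SAME' padding behaviour: the spatial output size is ceil(input / stride) regardless of the input resolution (shapes are illustrative):

import torch

conv = Conv2dDynamicSamePadding(in_channels=3, out_channels=16, kernel_size=3, stride=2)
out = conv(torch.randn(1, 3, 225, 113))
print(out.shape)  # torch.Size([1, 16, 113, 57]) -> ceil(225/2), ceil(113/2)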

Conv2dStaticSamePadding

Bases: nn.Conv2d

2D convolutions like TensorFlow's 'SAME' mode, with the given input image size. The padding module is computed in the constructor and then used in forward.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
class Conv2dStaticSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
    The padding mudule is calculated in construction function, then used in forward.
    """

    # With the same calculation as Conv2dDynamicSamePadding

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
        super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

        # Calculate padding based on image size and save it
        assert image_size is not None
        ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            self.static_padding = nn.ZeroPad2d((pad_w - pad_w // 2, pad_w // 2, pad_h - pad_h // 2, pad_h // 2))
        else:
            self.static_padding = Identity()

    def forward(self, x):
        x = self.static_padding(x)
        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
        return x

EfficientNet

Bases: SgModule

EfficientNet model.

References: [1] https://arxiv.org/abs/1905.11946 (EfficientNet)

Parameters:

width_coefficient (float, required): model's width coefficient. Used as the multiplier.
depth_coefficient (float, required): model's depth coefficient. Used as the multiplier.
image_size (int, required): Size of input image.
dropout_rate (float, required): Dropout probability in final layer
num_classes (int, required): Number of classes.
batch_norm_momentum (Optional[float], default 0.99): Value used for the running_mean and running_var computation
batch_norm_epsilon (Optional[float], default 0.001): Value added to the denominator for numerical stability
drop_connect_rate (Optional[float], default 0.2): Connection dropout probability
depth_divisor (Optional[int], default 8): Model's depth divisor. Used as the divisor.
min_depth (Optional[int], default None): Model's minimal depth, if given.
backbone_mode (Optional[bool], default False): If true, drops the final linear layer
blocks_args (Optional[list], default None): List of BlockArgs to construct blocks. (list[namedtuple])

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
class EfficientNet(SgModule):
    """
    EfficientNet model.

    References:
        [1] https://arxiv.org/abs/1905.11946 (EfficientNet)


    :param width_coefficient:   model's width coefficient. Used as the multiplier.
    :param depth_coefficient:   model's depth coefficient. Used as the multiplier.
    :param image_size:          Size of input image.
    :param dropout_rate:        Dropout probability in final layer
    :param num_classes:         Number of classes.
    :param batch_norm_momentum: Value used for the running_mean and running_var computation
    :param batch_norm_epsilon:  Value added to the denominator for numerical stability
    :param drop_connect_rate:   Connection dropout probability
    :param depth_divisor:       Model's depth divisor. Used as the divisor.
    :param min_depth:           Model's minimal depth, if given.
    :param backbone_mode:       If true, dropping the final linear layer
    :param blocks_args:         List of BlockArgs to construct blocks. (list[namedtuple])
    """

    def __init__(
        self,
        width_coefficient: float,
        depth_coefficient: float,
        image_size: int,
        dropout_rate: float,
        num_classes: int,
        batch_norm_momentum: Optional[float] = 0.99,
        batch_norm_epsilon: Optional[float] = 1e-3,
        drop_connect_rate: Optional[float] = 0.2,
        depth_divisor: Optional[int] = 8,
        min_depth: Optional[int] = None,
        backbone_mode: Optional[bool] = False,
        blocks_args: Optional[list] = None,
    ):
        super().__init__()
        assert isinstance(blocks_args, list), "blocks_args should be a list"
        assert len(blocks_args) > 0, "block args must be greater than 0"

        self._blocks_args = blocks_args
        self.backbone_mode = backbone_mode
        self.drop_connect_rate = drop_connect_rate

        # Batch norm parameters
        bn_mom = 1 - batch_norm_momentum
        bn_eps = batch_norm_epsilon

        # Get stem static or dynamic convolution depending on image size
        Conv2d = get_same_padding_conv2d(image_size=image_size)

        # Stem
        in_channels = 3  # rgb
        out_channels = round_filters(32, width_coefficient, depth_divisor, min_depth)  # number of output channels
        self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
        image_size = calculate_output_image_size(image_size, 2)

        # Build blocks
        self._blocks = nn.ModuleList([])
        for block_args in self._blocks_args:

            # Update block input and output filters based on depth multiplier.
            block_args = block_args._replace(
                input_filters=round_filters(block_args.input_filters, width_coefficient, depth_divisor, min_depth),
                output_filters=round_filters(block_args.output_filters, width_coefficient, depth_divisor, min_depth),
                num_repeat=round_repeats(block_args.num_repeat, depth_coefficient),
            )

            # The first block needs to take care of stride and filter size increase.
            self._blocks.append(MBConvBlock(block_args, batch_norm_momentum, batch_norm_epsilon, image_size=image_size))
            image_size = calculate_output_image_size(image_size, block_args.stride)
            if block_args.num_repeat > 1:  # modify block_args to keep same output size
                block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
            for _ in range(block_args.num_repeat - 1):
                self._blocks.append(MBConvBlock(block_args, batch_norm_momentum, batch_norm_epsilon, image_size=image_size))
                # image_size = calculate_output_image_size(image_size, block_args.stride)  # stride = 1

        # Head
        in_channels = block_args.output_filters  # output of final block
        out_channels = round_filters(1280, width_coefficient, depth_divisor, min_depth)
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Final linear layer
        if not self.backbone_mode:
            self._avg_pooling = nn.AdaptiveAvgPool2d(1)
            self._dropout = nn.Dropout(dropout_rate)
            self._fc = nn.Linear(out_channels, num_classes)
        self._swish = nn.functional.silu

    def extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Use convolution layer to extract feature.

        :param inputs: Input tensor.
        :return: Output of the final convolution layer in the efficientnet model.
        """

        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x

    def forward(self, inputs):
        """
        EfficientNet's forward function.
        Calls extract_features to extract features, applies final linear layer, and returns logits.

        :param inputs: Input tensor.
        :return: Output of this model after processing.
        """
        bs = inputs.size(0)

        # Convolution layers
        x = self.extract_features(inputs)

        # Pooling and final linear layer, not needed for backbone mode
        if not self.backbone_mode:
            x = self._avg_pooling(x)
            x = x.view(bs, -1)
            x = self._dropout(x)
            x = self._fc(x)

        return x

    def replace_head(self, new_num_classes: Optional[int] = None, new_head: Optional[nn.Module] = None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self._fc = new_head
        else:
            self._fc = nn.Linear(self._fc.in_features, new_num_classes)

    def load_state_dict(self, state_dict: dict, strict: bool = True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights

            pretrained_model_weights_dict = pretrained_backbone_weights_dict

        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_model_weights_dict, strict)
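
A hedged end-to-end construction sketch using the standard EfficientNet-B0 block strings decoded by BlockDecoder; in SuperGradients these values are normally supplied through the arch-params recipes rather than written by hand:

import torch

blocks_args = BlockDecoder.decode([
    "r1_k3_s11_e1_i32_o16_se0.25",
    "r2_k3_s22_e6_i16_o24_se0.25",
    "r2_k5_s22_e6_i24_o40_se0.25",
    "r3_k3_s22_e6_i40_o80_se0.25",
    "r3_k5_s11_e6_i80_o112_se0.25",
    "r4_k5_s22_e6_i112_o192_se0.25",
    "r1_k3_s11_e6_i192_o320_se0.25",
])
model = EfficientNet(
    width_coefficient=1.0,
    depth_coefficient=1.0,
    image_size=224,
    dropout_rate=0.2,
    num_classes=1000,
    blocks_args=blocks_args,
)
logits = model(torch.randn(1, 3, 224, 224))  # expected shape: (1, 1000)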

extract_features(inputs)

Use convolution layer to extract feature.

Parameters:

inputs (torch.Tensor, required): Input tensor.

Returns:

torch.Tensor: Output of the final convolution layer in the efficientnet model.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
    """
    Use convolution layer to extract feature.

    :param inputs: Input tensor.
    :return: Output of the final convolution layer in the efficientnet model.
    """

    # Stem
    x = self._swish(self._bn0(self._conv_stem(inputs)))

    # Blocks
    for idx, block in enumerate(self._blocks):
        drop_connect_rate = self.drop_connect_rate
        if drop_connect_rate:
            drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
        x = block(x, drop_connect_rate=drop_connect_rate)

    # Head
    x = self._swish(self._bn1(self._conv_head(x)))

    return x

forward(inputs)

EfficientNet's forward function. Calls extract_features to extract features, applies final linear layer, and returns logits.

Parameters:

inputs (required): Input tensor.

Returns:

Output of this model after processing.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def forward(self, inputs):
    """
    EfficientNet's forward function.
    Calls extract_features to extract features, applies final linear layer, and returns logits.

    :param inputs: Input tensor.
    :return: Output of this model after processing.
    """
    bs = inputs.size(0)

    # Convolution layers
    x = self.extract_features(inputs)

    # Pooling and final linear layer, not needed for backbone mode
    if not self.backbone_mode:
        x = self._avg_pooling(x)
        x = x.view(bs, -1)
        x = self._dropout(x)
        x = self._fc(x)

    return x

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

state_dict (dict, required): The state_dict to load
strict (bool, default True): strict loading (see super() docs)

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def load_state_dict(self, state_dict: dict, strict: bool = True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights

        pretrained_model_weights_dict = pretrained_backbone_weights_dict

    # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
    super().load_state_dict(pretrained_model_weights_dict, strict)

Identity

Bases: nn.Module

Identity mapping. Send input to output directly.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
class Identity(nn.Module):
    """Identity mapping.
    Send input to output directly.
    """

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input

MBConvBlock

Bases: nn.Module

Mobile Inverted Residual Bottleneck Block.

References: [1] https://arxiv.org/abs/1704.04861 (MobileNet v1) [2] https://arxiv.org/abs/1801.04381 (MobileNet v2) [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)

Parameters:

block_args (BlockArgs, required): BlockArgs.
batch_norm_momentum (float, required): Batch norm momentum.
batch_norm_epsilon (float, required): Batch norm epsilon.
image_size (Union[Tuple, List], default None): [image_height, image_width].

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
class MBConvBlock(nn.Module):
    """Mobile Inverted Residual Bottleneck Block.

    References:
        [1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
        [2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
        [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)

    :param block_args: BlockArgs.
    :param batch_norm_momentum: Batch norm momentum.
    :param batch_norm_epsilon: Batch norm epsilon.
    :param image_size: [image_height, image_width].
    """

    def __init__(self, block_args: BlockArgs, batch_norm_momentum: float, batch_norm_epsilon: float, image_size: Union[Tuple, List] = None):
        super().__init__()
        self._block_args = block_args
        self._bn_mom = 1 - batch_norm_momentum  # pytorch's difference from tensorflow
        self._bn_eps = batch_norm_epsilon
        self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
        self.id_skip = block_args.id_skip  # whether to use skip connection and drop connect

        # Expansion phase (Inverted Bottleneck)
        inp = self._block_args.input_filters  # number of input channels
        oup = self._block_args.input_filters * self._block_args.expand_ratio  # number of output channels
        if self._block_args.expand_ratio != 1:
            Conv2d = get_same_padding_conv2d(image_size=image_size)
            self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
            self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

        # Depthwise convolution phase
        k = self._block_args.kernel_size
        s = self._block_args.stride
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._depthwise_conv = Conv2d(in_channels=oup, out_channels=oup, groups=oup, kernel_size=k, stride=s, bias=False)  # groups makes it depthwise
        self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
        image_size = calculate_output_image_size(image_size, s)

        # Squeeze and Excitation layer, if desired
        if self.has_se:
            Conv2d = get_same_padding_conv2d(image_size=(1, 1))
            num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
            self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
            self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)

        # Pointwise convolution phase
        final_oup = self._block_args.output_filters
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
        self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
        self._swish = nn.functional.silu

    def forward(self, inputs: torch.Tensor, drop_connect_rate: Optional[float] = None) -> torch.Tensor:
        """MBConvBlock's forward function.

        :param inputs:              Input tensor.
        :param drop_connect_rate:   Drop connect rate (float, between 0 and 1).
        :return:                    Output of this block after processing.
        """

        # Expansion and Depthwise Convolution
        x = inputs
        if self._block_args.expand_ratio != 1:
            x = self._expand_conv(inputs)
            x = self._bn0(x)
            x = self._swish(x)

        x = self._depthwise_conv(x)
        x = self._bn1(x)
        x = self._swish(x)

        # Squeeze and Excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_reduce(x_squeezed)
            x_squeezed = self._swish(x_squeezed)
            x_squeezed = self._se_expand(x_squeezed)
            x = torch.sigmoid(x_squeezed) * x

        # Pointwise Convolution
        x = self._project_conv(x)
        x = self._bn2(x)

        # Skip connection and drop connect
        input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
        if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
            # The combination of skip connection and drop connect brings about stochastic depth.
            if drop_connect_rate:
                x = drop_connect(x, p=drop_connect_rate, training=self.training)
            x = x + inputs  # skip connection
        return x
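
A hedged sketch wiring a single block from a decoded BlockArgs (values illustrative; inside EfficientNet this is done per block in the constructor shown above):

import torch

block_args = BlockDecoder.decode(["r1_k3_s11_e1_i32_o16_se0.25"])[0]
block = MBConvBlock(block_args, batch_norm_momentum=0.99, batch_norm_epsilon=1e-3, image_size=[112, 112])
out = block(torch.randn(1, 32, 112, 112), drop_connect_rate=0.2)
print(out.shape)  # torch.Size([1, 16, 112, 112]) -> projected from 32 to 16 channels, spatial size kept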

forward(inputs, drop_connect_rate=None)

MBConvBlock's forward function.

Parameters:

inputs (torch.Tensor, required): Input tensor.
drop_connect_rate (Optional[float], default None): Drop connect rate (float, between 0 and 1).

Returns:

torch.Tensor: Output of this block after processing.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def forward(self, inputs: torch.Tensor, drop_connect_rate: Optional[float] = None) -> torch.Tensor:
    """MBConvBlock's forward function.

    :param inputs:              Input tensor.
    :param drop_connect_rate:   Drop connect rate (float, between 0 and 1).
    :return:                    Output of this block after processing.
    """

    # Expansion and Depthwise Convolution
    x = inputs
    if self._block_args.expand_ratio != 1:
        x = self._expand_conv(inputs)
        x = self._bn0(x)
        x = self._swish(x)

    x = self._depthwise_conv(x)
    x = self._bn1(x)
    x = self._swish(x)

    # Squeeze and Excitation
    if self.has_se:
        x_squeezed = F.adaptive_avg_pool2d(x, 1)
        x_squeezed = self._se_reduce(x_squeezed)
        x_squeezed = self._swish(x_squeezed)
        x_squeezed = self._se_expand(x_squeezed)
        x = torch.sigmoid(x_squeezed) * x

    # Pointwise Convolution
    x = self._project_conv(x)
    x = self._bn2(x)

    # Skip connection and drop connect
    input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
    if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
        # The combination of skip connection and drop connect brings about stochastic depth.
        if drop_connect_rate:
            x = drop_connect(x, p=drop_connect_rate, training=self.training)
        x = x + inputs  # skip connection
    return x

calculate_output_image_size(input_image_size, stride)

Calculates the output image size when using Conv2dSamePadding with a stride. Necessary for static padding. Thanks to mannatsingh for pointing this out.

Parameters:

input_image_size (Union[int, Tuple, List], required): Size of input image.
stride (Union[int, Tuple, List], required): Conv2d operation's stride.

Returns:

Optional[List[int]]: output_image_size: A list [H, W].

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def calculate_output_image_size(input_image_size: Union[int, Tuple, List], stride: Union[int, Tuple, List]) -> Optional[List[int]]:
    """Calculates the output image size when using Conv2dSamePadding with a stride.
    Necessary for static padding. Thanks to mannatsingh for pointing this out.

    :param input_image_size:    Size of input image.
    :param stride:              Conv2d operation's stride.
    :return: output_image_size: A list [H,W].
    """
    if input_image_size is None:
        return None
    elif isinstance(input_image_size, int):
        input_image_size = (input_image_size, input_image_size)

    image_height, image_width = input_image_size
    stride = stride if isinstance(stride, int) else stride[0]
    image_height = int(math.ceil(image_height / stride))
    image_width = int(math.ceil(image_width / stride))
    return [image_height, image_width]
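
A few illustrative calls:

calculate_output_image_size(224, 2)         # [112, 112]
calculate_output_image_size((224, 112), 2)  # [112, 56]
calculate_output_image_size(None, 2)        # None (dynamic-padding path)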

drop_connect(inputs, p, training)

Drop connect.

Parameters:

inputs (torch.Tensor, required): Input of this structure. (tensor: BCWH)
p (float, required): Probability of drop connection. (float: 0.0~1.0)
training (bool, required): Running mode.

Returns:

torch.Tensor: output: Output after drop connection.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def drop_connect(inputs: torch.Tensor, p: float, training: bool) -> torch.Tensor:
    """Drop connect.

    :param inputs :     Input of this structure. (tensor: BCWH)
    :param p :          Probability of drop connection. (float: 0.0~1.0)
    :param training:    Running mode.
    :return: output: Output after drop connection.
    """
    assert p >= 0 and p <= 1, "p must be in range of [0,1]"

    if not training:
        return inputs

    batch_size = inputs.shape[0]
    keep_prob = 1 - p

    # generate binary_tensor mask according to probability (p for 0, 1-p for 1)
    random_tensor = keep_prob
    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
    binary_tensor = torch.floor(random_tensor)

    output = inputs / keep_prob * binary_tensor
    return output
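
Because surviving samples are rescaled by 1 / keep_prob, the expected value of the output matches the input; in eval mode the input passes through unchanged. A small sanity check, assuming the same import path as above:

import torch
from super_gradients.training.models.classification_models.efficientnet import drop_connect

x = torch.ones(1000, 3, 4, 4)
y = drop_connect(x, p=0.2, training=True)

# Roughly 20% of the samples are zeroed; the survivors are scaled by 1 / 0.8 = 1.25
zeroed_fraction = (y.flatten(1).abs().sum(dim=1) == 0).float().mean()
print(f"zeroed ~ {zeroed_fraction:.2f}, mean ~ {y.mean():.2f}")  # ~0.20, ~1.00

# In eval mode drop connect is a no-op
assert torch.equal(drop_connect(x, p=0.2, training=False), x)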

get_same_padding_conv2d(image_size=None)

Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models.

Parameters:

Name Type Description Default
image_size Optional[Union[int, Tuple[int, int]]]

Size of the image.

None

Returns:

Type Description

Conv2dDynamicSamePadding or Conv2dStaticSamePadding.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def get_same_padding_conv2d(image_size: Optional[Union[int, Tuple[int, int]]] = None):
    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
       Static padding is necessary for ONNX exporting of models.

    :param image_size: Size of the image.
    :return: Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
    """
    if image_size is None:
        return Conv2dDynamicSamePadding
    else:
        return partial(Conv2dStaticSamePadding, image_size=image_size)
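
A short sketch of how the returned constructor is typically used; the module path is assumed from the source location above, and the constructor arguments are assumed to mirror nn.Conv2d:

from super_gradients.training.models.classification_models.efficientnet import get_same_padding_conv2d

# Dynamic padding: pad at runtime based on each input's size (flexible, but not ONNX-friendly)
DynamicConv2d = get_same_padding_conv2d(image_size=None)
dynamic_conv = DynamicConv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2)

# Static padding: padding is fixed at construction time for a known image size, as required for ONNX export
StaticConv2d = get_same_padding_conv2d(image_size=224)
static_conv = StaticConv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2)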

round_filters(filters, width_coefficient, depth_divisor, min_depth)

Calculate and round number of filters based on width multiplier. Use width_coefficient, depth_divisor and min_depth.

Parameters:

Name Type Description Default
filters int

Number of filters to be calculated. (The following parameters come from arch_params.)

required
width_coefficient int

model's width coefficient. Used as the multiplier.

required
depth_divisor int

model's depth divisor. Used as the divisor.

required
min_depth int

model's minimal depth, if given.

required

Returns:

Type Description

new_filters: New filters number after calculating.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def round_filters(filters: int, width_coefficient: int, depth_divisor: int, min_depth: int):
    """Calculate and round number of filters based on width multiplier.
       Use width_coefficient, depth_divisor and min_depth.

    :param filters: Filters number to be calculated. Params from arch_params:
    :param width_coefficient: model's width coefficient. Used as the multiplier.
    :param depth_divisor: model's depth divisor. Used as the divisor.
    :param min_depth: model's minimal depth, if given.
    :return: new_filters: New filters number after calculating.
    """
    if not width_coefficient:
        return filters
    min_depth = min_depth
    filters *= width_coefficient
    min_depth = min_depth or depth_divisor  # pay attention to this line when using min_depth
    # follow the formula transferred from official TensorFlow implementation
    new_filters = max(min_depth, int(filters + depth_divisor / 2) // depth_divisor * depth_divisor)
    if new_filters < 0.9 * filters:  # prevent rounding by more than 10%
        new_filters += depth_divisor
    return int(new_filters)
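
A worked example: with width_coefficient=1.2 and depth_divisor=8, 32 base filters scale to 38.4 and are rounded to the nearest multiple of 8, i.e. 40 (import path assumed as above):

from super_gradients.training.models.classification_models.efficientnet import round_filters

print(round_filters(32, width_coefficient=1.2, depth_divisor=8, min_depth=None))   # 40
print(round_filters(32, width_coefficient=None, depth_divisor=8, min_depth=None))  # 32 - no width scaling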

round_repeats(repeats, depth_coefficient)

Calculate module's repeat number of a block based on depth multiplier. Use depth_coefficient.

Parameters:

Name Type Description Default
repeats int

num_repeat to be calculated.

required
depth_coefficient int

The depth coefficient of the model; this function uses it as the multiplier.

required

Returns:

Type Description

new repeat: New repeat number after calculating.

Source code in V3_2/src/super_gradients/training/models/classification_models/efficientnet.py
def round_repeats(repeats: int, depth_coefficient: int):
    """Calculate module's repeat number of a block based on depth multiplier.
       Use depth_coefficient.

    :param repeats: num_repeat to be calculated.
    :param depth_coefficient: the depth coefficient of the model. this func uses it as the multiplier.
    :return: new repeat: New repeat number after calculating.
    """
    if not depth_coefficient:
        return repeats
    # follow the formula transferred from official TensorFlow implementation
    return int(math.ceil(depth_coefficient * repeats))
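
A worked example: with depth_coefficient=1.2, a stage of 2 repeats becomes ceil(2.4) = 3 (import path assumed as above):

from super_gradients.training.models.classification_models.efficientnet import round_repeats

print(round_repeats(2, depth_coefficient=1.2))   # 3
print(round_repeats(2, depth_coefficient=None))  # 2 - no depth scaling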

GoogLeNet code based on https://pytorch.org/vision/stable/_modules/torchvision/models/googlenet.html

GoogLeNet

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/googlenet.py
class GoogLeNet(SgModule):
    def __init__(self, num_classes=1000, aux_logits=True, init_weights=True, backbone_mode=False, dropout=0.3):
        super(GoogLeNet, self).__init__()

        self.num_classes = num_classes
        self.backbone_mode = backbone_mode

        self.aux_logits = aux_logits
        self.dropout_p = dropout

        self.conv1 = BasicConv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
        self.conv2 = BasicConv2d(64, 64, kernel_size=1)
        self.conv3 = BasicConv2d(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)

        if aux_logits:
            self.aux1 = InceptionAux(512, num_classes)
            self.aux2 = InceptionAux(528, num_classes)
        else:
            self.aux1 = None
            self.aux2 = None

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        if not self.backbone_mode:
            self.dropout = nn.Dropout(self.dropout_p)
            self.fc = nn.Linear(1024, num_classes)

        if init_weights:
            self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                import scipy.stats as stats

                x = stats.truncnorm(-2, 2, scale=0.01)
                values = torch.as_tensor(x.rvs(m.weight.numel()), dtype=m.weight.dtype)
                values = values.view(m.weight.size())
                with torch.no_grad():
                    m.weight.copy_(values)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _forward(self, x):
        # N x 3 x 224 x 224
        x = self.conv1(x)
        # N x 64 x 112 x 112
        x = self.maxpool1(x)
        # N x 64 x 56 x 56
        x = self.conv2(x)
        # N x 64 x 56 x 56
        x = self.conv3(x)
        # N x 192 x 56 x 56
        x = self.maxpool2(x)

        # N x 192 x 28 x 28
        x = self.inception3a(x)
        # N x 256 x 28 x 28
        x = self.inception3b(x)
        # N x 480 x 28 x 28
        x = self.maxpool3(x)
        # N x 480 x 14 x 14
        x = self.inception4a(x)
        # N x 512 x 14 x 14
        aux1 = None
        if self.aux1 is not None and self.training:
            aux1 = self.aux1(x)

        x = self.inception4b(x)
        # N x 512 x 14 x 14
        x = self.inception4c(x)
        # N x 512 x 14 x 14
        x = self.inception4d(x)
        # N x 528 x 14 x 14
        aux2 = None
        if self.aux2 is not None and self.training:
            aux2 = self.aux2(x)

        x = self.inception4e(x)
        # N x 832 x 14 x 14
        x = self.maxpool4(x)
        # N x 832 x 7 x 7
        x = self.inception5a(x)
        # N x 832 x 7 x 7
        x = self.inception5b(x)
        # N x 1024 x 7 x 7

        x = self.avgpool(x)
        # N x 1024 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 1024
        if not self.backbone_mode:
            x = self.dropout(x)
            x = self.fc(x)
        # N x num_classes
        return x, aux2, aux1

    def forward(self, x):
        x, aux1, aux2 = self._forward(x)
        if self.training and self.aux_logits:
            return GoogLeNetOutputs(x, aux2, aux1)
        else:
            return x

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights
            c_temp = torch.nn.Linear(1024, self.num_classes)
            torch.nn.init.xavier_uniform(c_temp.weight)
            pretrained_backbone_weights_dict["fc.weight"] = c_temp.weight
            pretrained_backbone_weights_dict["fc.bias"] = c_temp.bias
            # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
            super().load_state_dict(pretrained_backbone_weights_dict, strict)
        else:
            super().load_state_dict(pretrained_model_weights_dict, strict)
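
A minimal usage sketch, assuming the class is importable from the googlenet module shown above: in training mode with aux_logits=True the forward pass returns a named tuple containing the main logits and the two auxiliary heads, while in eval mode only the main logits are returned.

import torch
from super_gradients.training.models.classification_models.googlenet import GoogLeNet

model = GoogLeNet(num_classes=10, aux_logits=True)

model.train()
outputs = model(torch.randn(2, 3, 224, 224))  # GoogLeNetOutputs: main logits + two auxiliary logits

model.eval()
logits = model(torch.randn(2, 3, 224, 224))   # plain tensor of shape (2, 10)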

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in V3_2/src/super_gradients/training/models/classification_models/googlenet.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights
        c_temp = torch.nn.Linear(1024, self.num_classes)
        torch.nn.init.xavier_uniform(c_temp.weight)
        pretrained_backbone_weights_dict["fc.weight"] = c_temp.weight
        pretrained_backbone_weights_dict["fc.bias"] = c_temp.bias
        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_backbone_weights_dict, strict)
    else:
        super().load_state_dict(pretrained_model_weights_dict, strict)

LeNet in PyTorch.

https://yann.lecun.com/exdb/lenet/

MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" for more details.

Block

Bases: nn.Module

Depthwise conv + Pointwise conv

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenet.py
class Block(nn.Module):
    """Depthwise conv + Pointwise conv"""

    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out

MobileNet

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenet.py
class MobileNet(SgModule):
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, 128, (128, 2), 256, (256, 2), 512, 512, 512, 512, 512, (512, 2), 1024, (1024, 2)]

    def __init__(self, num_classes=10, backbone_mode=False, up_to_layer=None, in_channels: int = 3):
        super(MobileNet, self).__init__()
        self.backbone_mode = backbone_mode
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32, up_to_layer=up_to_layer if up_to_layer is not None else len(self.cfg))

        if not self.backbone_mode:
            self.linear = nn.Linear(self.cfg[-1], num_classes)

    def _make_layers(self, in_planes, up_to_layer):
        layers = []
        for x in self.cfg[:up_to_layer]:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: input tensor. Note: up_to_layer is configured in __init__, not passed to forward.
        """
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)

        if not self.backbone_mode:
            out = F.avg_pool2d(out, 2)
            out = out.view(out.size(0), -1)
            out = self.linear(out)

        return out

forward(x)

Parameters:

Name Type Description Default
x

input tensor. Note: up_to_layer is configured in __init__, not passed to forward; the forward pass runs the layers built at construction time.

required
Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenet.py
def forward(self, x):
    """
    :param x: input tensor. Note: up_to_layer is configured in __init__, not passed to forward.
    """
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.layers(out)

    if not self.backbone_mode:
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)

    return out
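
A short usage sketch; the spatial resolution is assumed to be at least 64x64 so that the final 2x2 average pooling is valid:

import torch
from super_gradients.training.models.classification_models.mobilenet import MobileNet

model = MobileNet(num_classes=10)
logits = model(torch.randn(2, 3, 64, 64))       # (2, 10)

# In backbone mode the classifier head is skipped and the feature map is returned
backbone = MobileNet(backbone_mode=True)
features = backbone(torch.randn(2, 3, 64, 64))  # (2, 1024, 2, 2)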

This is a PyTorch implementation of the MobileNetV2 architecture, as described in the paper: Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation. https://arxiv.org/pdf/1801.04381

Code taken from https://github.com/tonylins/pytorch-mobilenet-v2 License: Apache Version 2.0, January 2004 http://www.apache.org/licenses/

Pre-trained ImageNet model: 'deci-model-repository/mobilenet_v2/ckpt_best.pth'

CustomMobileNetV2

Bases: MobileNetV2

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.CUSTOM_MOBILENET_V2)
class CustomMobileNetV2(MobileNetV2):
    def __init__(self, arch_params):
        """
        :param arch_params: HpmStruct
            must contain:
                'num_classes': int
                'width_mult': float
                'structure' : list. specify the mobilenetv2 architecture
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=arch_params.width_mult,
            structure=arch_params.structure,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

Parameters:

Name Type Description Default
arch_params

HpmStruct. Must contain: 'num_classes': int, 'width_mult': float, 'structure': list specifying the MobileNetV2 architecture.

required
Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    :param arch_params: HpmStruct
        must contain:
            'num_classes': int
            'width_mult': float
            'structure' : list. specify the mobilenetv2 architecture
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=arch_params.width_mult,
        structure=arch_params.structure,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

InvertedResidual

Bases: nn.Module

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, grouped_conv_size=1):
        """
        :param inp: number of input channels
        :param oup: number of output channels
        :param stride: conv stride
        :param expand_ratio: expansion ratio of the hidden layer after pointwise conv
        :param grouped_conv_size: number of channels per grouped convolution; for depth-wise-separable convolution, use grouped_conv_size=1
        """
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expand_ratio)
        groups = int(hidden_dim / grouped_conv_size)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
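
The residual connection is only used when stride == 1 and the input and output channel counts match; a quick illustration (module path assumed from the source location above):

import torch
from super_gradients.training.models.classification_models.mobilenetv2 import InvertedResidual

with_skip = InvertedResidual(inp=32, oup=32, stride=1, expand_ratio=6)
without_skip = InvertedResidual(inp=32, oup=64, stride=2, expand_ratio=6)

x = torch.randn(1, 32, 56, 56)
print(with_skip.use_res_connect, with_skip(x).shape)        # True,  (1, 32, 56, 56)
print(without_skip.use_res_connect, without_skip(x).shape)  # False, (1, 64, 28, 28)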

__init__(inp, oup, stride, expand_ratio, grouped_conv_size=1)

Parameters:

Name Type Description Default
inp

number of input channels

required
oup

number of output channels

required
stride

conv stride

required
expand_ratio

expansion ratio of the hidden layer after pointwise conv

required
grouped_conv_size

number of channels per grouped convolution; for depth-wise-separable convolution, use grouped_conv_size=1

1
Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, inp, oup, stride, expand_ratio, grouped_conv_size=1):
    """
    :param inp: number of input channels
    :param oup: number of output channels
    :param stride: conv stride
    :param expand_ratio: expansion ratio of the hidden layer after pointwise conv
    :param grouped_conv_size: number of channels per grouped convolution; for depth-wise-separable convolution, use grouped_conv_size=1
    """
    super(InvertedResidual, self).__init__()
    self.stride = stride
    assert stride in [1, 2]

    hidden_dim = int(inp * expand_ratio)
    groups = int(hidden_dim / grouped_conv_size)
    self.use_res_connect = self.stride == 1 and inp == oup

    if expand_ratio == 1:
        self.conv = nn.Sequential(
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        )
    else:
        self.conv = nn.Sequential(
            # pw
            nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=groups, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        )

MobileNetV2

Bases: MobileNetBase

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
class MobileNetV2(MobileNetBase):
    def __init__(
        self,
        num_classes,
        dropout: float,
        width_mult=1.0,
        structure=None,
        backbone_mode: bool = False,
        grouped_conv_size=1,
        in_channels=3,
    ) -> object:
        super(MobileNetV2, self).__init__()
        self.in_channels = in_channels
        block = InvertedResidual
        last_channel = 1280
        # IF STRUCTURE IS NONE - USE THE DEFAULT STRUCTURE NOTED
        #                                                  t, c,  n, s    stage-0 is the first conv_bn layer
        self.interverted_residual_setting = structure or [
            [1, 16, 1, 1],  # stage-1
            [6, 24, 2, 2],  # stage-2
            [6, 32, 3, 2],  # stage-3
            [6, 64, 4, 2],  # stage-4
            [6, 96, 3, 1],  # stage-5
            [6, 160, 3, 2],  # stage-6
            [6, 320, 1, 1],
        ]  # stage-7
        #                                                                   stage-8  is the last_layer
        self.last_channel = make_divisible(last_channel * width_mult) if width_mult > 1.0 else last_channel

        curr_channels = 32
        self.features = [conv_bn(in_channels, curr_channels, 2)]
        # building inverted residual blocks
        for t, c, n, s in self.interverted_residual_setting:
            output_channel = make_divisible(c * width_mult) if t > 1 else c
            for i in range(n):
                if i == 0:
                    self.features.append(block(curr_channels, output_channel, s, expand_ratio=t, grouped_conv_size=grouped_conv_size))
                else:
                    self.features.append(block(curr_channels, output_channel, 1, expand_ratio=t, grouped_conv_size=grouped_conv_size))
                curr_channels = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(curr_channels, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)
        self.backbone_mode = backbone_mode

        if self.backbone_mode:
            self.classifier = nn.Identity()
            # TODO: remove during migration of YOLOs to the new base
            self.backbone_connection_channels = self._extract_connection_layers_input_channel_size()
        else:
            # building classifier
            self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(self.last_channel, num_classes))
        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        if self.backbone_mode:
            return x
        else:
            x = x.mean(3).mean(2)
            return self.classifier(x)

    def _extract_connection_layers_input_channel_size(self):
        """
        Extracts the number of channels out when using mobilenetV2 as yolo backbone
        """
        curr_layer_input = torch.rand(1, self.in_channels, 320, 320)  # input dims are used to extract number of channels
        layers_num_to_extract = [np.array(self.interverted_residual_setting)[:stage, 2].sum() for stage in [3, 5]]
        connection_layers_input_channel_size = []
        for layer_idx, feature in enumerate(self.features):
            curr_layer_input = feature(curr_layer_input)
            if layer_idx in layers_num_to_extract:
                connection_layers_input_channel_size.append(curr_layer_input.shape[1])
        connection_layers_input_channel_size.append(self.last_channel)
        connection_layers_input_channel_size.reverse()
        return connection_layers_input_channel_size

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
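
A minimal sketch of direct construction; the registered variants below wrap this class and fill these arguments from arch_params:

import torch
from super_gradients.training.models.classification_models.mobilenetv2 import MobileNetV2

model = MobileNetV2(num_classes=1000, dropout=0.2)
logits = model(torch.randn(2, 3, 224, 224))       # (2, 1000)

# In backbone mode the classifier is replaced by nn.Identity and the 1280-channel feature map is returned
backbone = MobileNetV2(num_classes=1000, dropout=0.0, backbone_mode=True)
features = backbone(torch.randn(2, 3, 224, 224))  # (2, 1280, 7, 7)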

MobileNetV2Base

Bases: MobileNetV2

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.MOBILENET_V2)
class MobileNetV2Base(MobileNetV2):
    def __init__(self, arch_params):
        """
        :param arch_params: HpmStruct
            must contain: 'num_classes': int
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=1.0,
            structure=None,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int

required
Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    :param arch_params: HpmStruct
        must contain: 'num_classes': int
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=1.0,
        structure=None,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )

MobileNetV2_135

Bases: MobileNetV2

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
@register_model(Models.MOBILE_NET_V2_135)
class MobileNetV2_135(MobileNetV2):
    def __init__(self, arch_params):
        """
        This model achieves 75.73% on ImageNet - similar to ResNet50
        :param arch_params: HpmStruct
            must contain: 'num_classes': int
        """
        super().__init__(
            num_classes=arch_params.num_classes,
            width_mult=1.35,
            structure=None,
            dropout=get_param(arch_params, "dropout", 0.0),
            in_channels=get_param(arch_params, "in_channels", 3),
        )

__init__(arch_params)

This model achieves 75.73% on ImageNet - similar to ResNet50

Parameters:

Name Type Description Default
arch_params

HpmStruct must contain: 'num_classes': int

required
Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv2.py
def __init__(self, arch_params):
    """
    This model achieves 75.73% on ImageNet - similar to ResNet50
    :param arch_params: HpmStruct
        must contain: 'num_classes': int
    """
    super().__init__(
        num_classes=arch_params.num_classes,
        width_mult=1.35,
        structure=None,
        dropout=get_param(arch_params, "dropout", 0.0),
        in_channels=get_param(arch_params, "in_channels", 3),
    )
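
The registered variants are built from an arch_params structure; a minimal sketch, assuming HpmStruct is importable from super_gradients.training.utils:

from super_gradients.training.utils import HpmStruct
from super_gradients.training.models.classification_models.mobilenetv2 import MobileNetV2Base, MobileNetV2_135

arch_params = HpmStruct(num_classes=1000)  # 'num_classes' is the only required entry
base = MobileNetV2Base(arch_params)        # width_mult = 1.0
wide = MobileNetV2_135(arch_params)        # width_mult = 1.35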

Creates a MobileNetV3 model as defined in: Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam. (2019). Searching for MobileNetV3. arXiv preprint arXiv:1905.02244.

mobilenetv3_custom

Bases: MobileNetV3

Constructs a MobileNetV3-Customized model

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_CUSTOM)
class mobilenetv3_custom(MobileNetV3):
    """
    Constructs a MobileNetV3-Customized model
    """

    def __init__(self, arch_params):
        super().__init__(
            cfgs=arch_params.structure,
            mode=arch_params.mode,
            num_classes=arch_params.num_classes,
            width_mult=arch_params.width_mult,
            in_channels=get_param(arch_params, "in_channels", 3),
        )

mobilenetv3_large

Bases: MobileNetV3

Constructs a MobileNetV3-Large model

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_LARGE)
class mobilenetv3_large(MobileNetV3):
    """
    Constructs a MobileNetV3-Large model
    """

    def __init__(self, arch_params):
        width_mult = arch_params.width_mult if hasattr(arch_params, "width_mult") else 1.0
        cfgs = [
            # k, t, c, SE, HS, s
            [3, 1, 16, 0, 0, 1],
            [3, 4, 24, 0, 0, 2],
            [3, 3, 24, 0, 0, 1],
            [5, 3, 40, 1, 0, 2],
            [5, 3, 40, 1, 0, 1],
            [5, 3, 40, 1, 0, 1],
            [3, 6, 80, 0, 1, 2],
            [3, 2.5, 80, 0, 1, 1],
            [3, 2.3, 80, 0, 1, 1],
            [3, 2.3, 80, 0, 1, 1],
            [3, 6, 112, 1, 1, 1],
            [3, 6, 112, 1, 1, 1],
            [5, 6, 160, 1, 1, 2],
            [5, 6, 160, 1, 1, 1],
            [5, 6, 160, 1, 1, 1],
        ]
        super().__init__(cfgs, mode="large", num_classes=arch_params.num_classes, width_mult=width_mult, in_channels=get_param(arch_params, "in_channels", 3))

mobilenetv3_small

Bases: MobileNetV3

Constructs a MobileNetV3-Small model

Source code in V3_2/src/super_gradients/training/models/classification_models/mobilenetv3.py
@register_model(Models.MOBILENET_V3_SMALL)
class mobilenetv3_small(MobileNetV3):
    """
    Constructs a MobileNetV3-Small model
    """

    def __init__(self, arch_params):
        width_mult = arch_params.width_mult if hasattr(arch_params, "width_mult") else 1.0
        cfgs = [
            # k, t, c, SE, HS, s
            [3, 1, 16, 1, 0, 2],
            [3, 4.5, 24, 0, 0, 2],
            [3, 3.67, 24, 0, 0, 1],
            [5, 4, 40, 1, 1, 2],
            [5, 6, 40, 1, 1, 1],
            [5, 6, 40, 1, 1, 1],
            [5, 3, 48, 1, 1, 1],
            [5, 3, 48, 1, 1, 1],
            [5, 6, 96, 1, 1, 2],
            [5, 6, 96, 1, 1, 1],
            [5, 6, 96, 1, 1, 1],
        ]
        super().__init__(cfgs, mode="small", num_classes=arch_params.num_classes, width_mult=width_mult, in_channels=get_param(arch_params, "in_channels", 3))

PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search

https://github.com/kuangliu/pytorch-cifar/blob/master/models/pnasnet.py

SepConv

Bases: nn.Module

Separable Convolution.

Source code in V3_2/src/super_gradients/training/models/classification_models/pnasnet.py
class SepConv(nn.Module):
    """Separable Convolution."""

    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding=(kernel_size - 1) // 2, bias=False, groups=in_planes)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        return self.bn1(self.conv1(x))

Pre-activation ResNet in PyTorch.

Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Identity Mappings in Deep Residual Networks. arXiv:1603.05027

Based on https://github.com/kuangliu/pytorch-cifar/blob/master/models/preact_resnet.py

PreActBlock

Bases: nn.Module

Pre-activation version of the BasicBlock.

Source code in V3_2/src/super_gradients/training/models/classification_models/preact_resnet.py
class PreActBlock(nn.Module):
    """Pre-activation version of the BasicBlock."""

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False))

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, "shortcut") else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out += shortcut
        return out
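
In the pre-activation ordering, batch norm and ReLU are applied before each convolution, and a 1x1 projection shortcut is created only when the stride or channel count changes; a quick check (module path assumed from the source location above):

import torch
from super_gradients.training.models.classification_models.preact_resnet import PreActBlock

identity_block = PreActBlock(in_planes=64, planes=64, stride=1)     # no projection shortcut
downsample_block = PreActBlock(in_planes=64, planes=128, stride=2)  # 1x1 conv shortcut

x = torch.randn(1, 64, 32, 32)
print(hasattr(identity_block, "shortcut"), identity_block(x).shape)      # False, (1, 64, 32, 32)
print(hasattr(downsample_block, "shortcut"), downsample_block(x).shape)  # True,  (1, 128, 16, 16)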

PreActBottleneck

Bases: nn.Module

Pre-activation version of the original Bottleneck module.

Source code in V3_2/src/super_gradients/training/models/classification_models/preact_resnet.py
class PreActBottleneck(nn.Module):
    """Pre-activation version of the original Bottleneck module."""

    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False))

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, "shortcut") else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.conv3(F.relu(self.bn3(out)))
        out += shortcut
        return out

RegNet - from the paper: Designing Network Design Spaces - https://arxiv.org/pdf/2003.13678.pdf. Implementation of the paradigm described in the paper published by Facebook AI Research (FAIR). @author: Signatrix GmbH. Code taken from: https://github.com/signatrix/regnet - MIT Licence.

CustomAnyNet

Bases: AnyNetX

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.CUSTOM_ANYNET)
class CustomAnyNet(AnyNetX):
    def __init__(self, arch_params):
        """All parameters must be provided in arch_params other than SE"""
        super().__init__(
            ls_num_blocks=arch_params.ls_num_blocks,
            ls_block_width=arch_params.ls_block_width,
            ls_bottleneck_ratio=arch_params.ls_bottleneck_ratio,
            ls_group_width=arch_params.ls_group_width,
            stride=arch_params.stride,
            num_classes=arch_params.num_classes,
            se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
            backbone_mode=get_param(arch_params, "backbone_mode", False),
            dropout_prob=get_param(arch_params, "dropout_prob", 0),
            droppath_prob=get_param(arch_params, "droppath_prob", 0),
            input_channels=get_param(arch_params, "input_channels", 3),
        )

__init__(arch_params)

All parameters must be provided in arch_params other than SE

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters must be provided in arch_params other than SE"""
    super().__init__(
        ls_num_blocks=arch_params.ls_num_blocks,
        ls_block_width=arch_params.ls_block_width,
        ls_bottleneck_ratio=arch_params.ls_bottleneck_ratio,
        ls_group_width=arch_params.ls_group_width,
        stride=arch_params.stride,
        num_classes=arch_params.num_classes,
        se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
        backbone_mode=get_param(arch_params, "backbone_mode", False),
        dropout_prob=get_param(arch_params, "dropout_prob", 0),
        droppath_prob=get_param(arch_params, "droppath_prob", 0),
        input_channels=get_param(arch_params, "input_channels", 3),
    )

CustomRegNet

Bases: RegNetX

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.CUSTOM_REGNET)
class CustomRegNet(RegNetX):
    def __init__(self, arch_params):
        """All parameters must be provided in arch_params other than SE"""
        super().__init__(
            initial_width=arch_params.initial_width,
            slope=arch_params.slope,
            quantized_param=arch_params.quantized_param,
            network_depth=arch_params.network_depth,
            bottleneck_ratio=arch_params.bottleneck_ratio,
            group_width=arch_params.group_width,
            stride=arch_params.stride,
            arch_params=arch_params,
            se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
            input_channels=get_param(arch_params, "input_channels", 3),
        )

__init__(arch_params)

All parameters must be provided in arch_params other than SE

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters must be provided in arch_params other than SE"""
    super().__init__(
        initial_width=arch_params.initial_width,
        slope=arch_params.slope,
        quantized_param=arch_params.quantized_param,
        network_depth=arch_params.network_depth,
        bottleneck_ratio=arch_params.bottleneck_ratio,
        group_width=arch_params.group_width,
        stride=arch_params.stride,
        arch_params=arch_params,
        se_ratio=arch_params.se_ratio if hasattr(arch_params, "se_ratio") else None,
        input_channels=get_param(arch_params, "input_channels", 3),
    )

NASRegNet

Bases: RegNetX

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
@register_model(Models.NAS_REGNET)
class NASRegNet(RegNetX):
    def __init__(self, arch_params):
        """All parameters are provided as a single structure list: arch_params.structure"""
        structure = arch_params.structure
        super().__init__(
            initial_width=structure[0],
            slope=structure[1],
            quantized_param=structure[2],
            network_depth=structure[3],
            bottleneck_ratio=structure[4],
            group_width=structure[5],
            stride=structure[6],
            se_ratio=structure[7] if structure[7] > 0 else None,
            arch_params=arch_params,
        )

__init__(arch_params)

All parameters are provided as a single structure list: arch_params.structure

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
def __init__(self, arch_params):
    """All parameters are provided as a single structure list: arch_params.structure"""
    structure = arch_params.structure
    super().__init__(
        initial_width=structure[0],
        slope=structure[1],
        quantized_param=structure[2],
        network_depth=structure[3],
        bottleneck_ratio=structure[4],
        group_width=structure[5],
        stride=structure[6],
        se_ratio=structure[7] if structure[7] > 0 else None,
        arch_params=arch_params,
    )

verify_correctness_of_parameters(ls_num_blocks, ls_block_width, ls_bottleneck_ratio, ls_group_width)

VERIFY THAT THE GIVEN PARAMETERS FIT THE SEARCH SPACE DEFINED IN THE REGNET PAPER

Source code in V3_2/src/super_gradients/training/models/classification_models/regnet.py
def verify_correctness_of_parameters(ls_num_blocks, ls_block_width, ls_bottleneck_ratio, ls_group_width):
    """VERIFY THAT THE GIVEN PARAMETERS FIT THE SEARCH SPACE DEFINED IN THE REGNET PAPER"""
    err_message = "Parameters don't fit"
    assert len(set(ls_bottleneck_ratio)) == 1, f"{err_message} AnyNetXb"
    assert len(set(ls_group_width)) == 1, f"{err_message} AnyNetXc"
    assert all(i <= j for i, j in zip(ls_block_width, ls_block_width[1:])) is True, f"{err_message} AnyNetXd"
    if len(ls_num_blocks) > 2:
        assert all(i <= j for i, j in zip(ls_num_blocks[:-2], ls_num_blocks[1:-1])) is True, f"{err_message} AnyNetXe"
    # For each stage & each layer, number of channels (block width / bottleneck ratio) must be divisible by group width
    for block_width, bottleneck_ratio, group_width in zip(ls_block_width, ls_bottleneck_ratio, ls_group_width):
        assert int(block_width // bottleneck_ratio) % group_width == 0
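
A worked example of parameters that satisfy every check - constant bottleneck ratio and group width, non-decreasing block widths, and widths divisible by the group width (import path assumed from the source location above):

from super_gradients.training.models.classification_models.regnet import verify_correctness_of_parameters

verify_correctness_of_parameters(
    ls_num_blocks=[1, 2, 7, 12],
    ls_block_width=[32, 64, 160, 384],
    ls_bottleneck_ratio=[1, 1, 1, 1],
    ls_group_width=[16, 16, 16, 16],
)  # passes silently; violating any constraint raises an AssertionError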

RepVGG PyTorch implementation. This model is trained as a VGG-style network with residual branches, but at inference time (in deployment mode) the model is converted to a plain VGG. Pretrained models: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq References: [1] https://github.com/DingXiaoH/RepVGG [2] https://arxiv.org/pdf/2101.03697.pdf

Based on https://github.com/DingXiaoH/RepVGG

RepVGG

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/repvgg.py
class RepVGG(SgModule):
    def __init__(
        self,
        struct,
        num_classes=1000,
        width_multiplier=None,
        build_residual_branches=True,
        use_se=False,
        backbone_mode=False,
        in_channels=3,
    ):
        """
        :param struct: list containing number of blocks per repvgg stage
        :param num_classes: number of classes if not in backbone mode
        :param width_multiplier: list of per stage width multiplier or float if using single value for all stages
        :param build_residual_branches: whether to add residual connections or not
        :param use_se: use squeeze and excitation layers
        :param backbone_mode: if true, dropping the final linear layer
        :param in_channels: input channels
        """
        super(RepVGG, self).__init__()

        if isinstance(width_multiplier, float):
            width_multiplier = [width_multiplier] * 4
        else:
            assert len(width_multiplier) == 4

        self.build_residual_branches = build_residual_branches
        self.use_se = use_se
        self.backbone_mode = backbone_mode

        self.in_planes = int(64 * width_multiplier[0])

        self.stem = RepVGGBlock(
            in_channels=in_channels,
            out_channels=self.in_planes,
            stride=2,
            build_residual_branches=build_residual_branches,
            activation_type=nn.ReLU,
            activation_kwargs=dict(inplace=True),
            se_type=SEBlock if self.use_se else nn.Identity,
            se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
        )
        self.cur_layer_idx = 1
        self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
        self.stage2 = self._make_stage(int(128 * width_multiplier[1]), struct[1], stride=2)
        self.stage3 = self._make_stage(int(256 * width_multiplier[2]), struct[2], stride=2)
        self.stage4 = self._make_stage(int(512 * width_multiplier[3]), struct[3], stride=2)
        if not self.backbone_mode:
            self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
            self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)

        if not build_residual_branches:
            self.eval()  # fusing has to be made in eval mode. When called in init, model will be built in eval mode
            fuse_repvgg_blocks_residual_branches(self)

        self.final_width_mult = width_multiplier[3]

    def _make_stage(self, planes, struct, stride):
        strides = [stride] + [1] * (struct - 1)
        blocks = []
        for stride in strides:
            blocks.append(
                RepVGGBlock(
                    in_channels=self.in_planes,
                    out_channels=planes,
                    stride=stride,
                    groups=1,
                    build_residual_branches=self.build_residual_branches,
                    activation_type=nn.ReLU,
                    activation_kwargs=dict(inplace=True),
                    se_type=SEBlock if self.use_se else nn.Identity,
                    se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
                )
            )
            self.in_planes = planes
            self.cur_layer_idx += 1
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = self.stem(x)
        out = self.stage1(out)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        if not self.backbone_mode:
            out = self.avgpool(out)
            out = out.view(out.size(0), -1)
            out = self.linear(out)
        return out

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        if self.build_residual_branches:
            fuse_repvgg_blocks_residual_branches(self)

    def train(self, mode: bool = True):

        assert (
            not mode or self.build_residual_branches
        ), "Trying to train a model without residual branches, set arch_params.build_residual_branches to True and retrain the model"
        super(RepVGG, self).train(mode=mode)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.linear = new_head
        else:
            self.linear = nn.Linear(int(512 * self.final_width_mult), new_num_classes)

__init__(struct, num_classes=1000, width_multiplier=None, build_residual_branches=True, use_se=False, backbone_mode=False, in_channels=3)

Parameters:

Name Type Description Default
struct

list containing number of blocks per repvgg stage

required
num_classes

number of classes if not in backbone mode

1000
width_multiplier

list of per stage width multiplier or float if using single value for all stages

None
build_residual_branches

whether to add residual connections or not

True
use_se

use squeeze and excitation layers

False
backbone_mode

if true, dropping the final linear layer

False
in_channels

input channels

3
Source code in V3_2/src/super_gradients/training/models/classification_models/repvgg.py
def __init__(
    self,
    struct,
    num_classes=1000,
    width_multiplier=None,
    build_residual_branches=True,
    use_se=False,
    backbone_mode=False,
    in_channels=3,
):
    """
    :param struct: list containing number of blocks per repvgg stage
    :param num_classes: number of classes if not in backbone mode
    :param width_multiplier: list of per stage width multiplier or float if using single value for all stages
    :param build_residual_branches: whether to add residual connections or not
    :param use_se: use squeeze and excitation layers
    :param backbone_mode: if true, dropping the final linear layer
    :param in_channels: input channels
    """
    super(RepVGG, self).__init__()

    if isinstance(width_multiplier, float):
        width_multiplier = [width_multiplier] * 4
    else:
        assert len(width_multiplier) == 4

    self.build_residual_branches = build_residual_branches
    self.use_se = use_se
    self.backbone_mode = backbone_mode

    self.in_planes = int(64 * width_multiplier[0])

    self.stem = RepVGGBlock(
        in_channels=in_channels,
        out_channels=self.in_planes,
        stride=2,
        build_residual_branches=build_residual_branches,
        activation_type=nn.ReLU,
        activation_kwargs=dict(inplace=True),
        se_type=SEBlock if self.use_se else nn.Identity,
        se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
    )
    self.cur_layer_idx = 1
    self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
    self.stage2 = self._make_stage(int(128 * width_multiplier[1]), struct[1], stride=2)
    self.stage3 = self._make_stage(int(256 * width_multiplier[2]), struct[2], stride=2)
    self.stage4 = self._make_stage(int(512 * width_multiplier[3]), struct[3], stride=2)
    if not self.backbone_mode:
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)

    if not build_residual_branches:
        self.eval()  # fusing has to be made in eval mode. When called in init, model will be built in eval mode
        fuse_repvgg_blocks_residual_branches(self)

    self.final_width_mult = width_multiplier[3]
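
A minimal deployment sketch with an illustrative RepVGG-A0-like configuration (the struct and width_multiplier values are assumptions for the example): train with residual branches, then fuse them into plain 3x3 convolutions before export.

import torch
from super_gradients.training.models.classification_models.repvgg import RepVGG

model = RepVGG(struct=[2, 4, 14, 1], num_classes=1000, width_multiplier=[0.75, 0.75, 0.75, 2.5])

model.eval()
model.prep_model_for_conversion()            # fuses the residual branches into single convolutions
logits = model(torch.randn(1, 3, 224, 224))  # (1, 1000)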

ResNet in PyTorch. For Pre-activation ResNet, see 'preact_resnet.py'. Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Deep Residual Learning for Image Recognition. arXiv:1512.03385

Pre-trained ImageNet models: 'deci-model-repository/resnet?/ckpt_best.pth' => ? = the type of resnet (e.g. 18, 34...) Pre-trained CIFAR10 models: 'deci-model-repository/CIFAR_NAS_#??????/ckpt_best.pth' => ? = num of model, structure, width_mult

Code adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

ResNet

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/resnet.py
class ResNet(SgModule):
    def __init__(
        self,
        block,
        num_blocks: list,
        num_classes: int = 10,
        width_mult: float = 1,
        expansion: int = 1,
        droppath_prob=0.0,
        input_batchnorm: bool = False,
        backbone_mode: bool = False,
    ):
        super(ResNet, self).__init__()
        self.expansion = expansion
        self.backbone_mode = backbone_mode
        self.structure = [num_blocks, width_mult]
        self.in_planes = width_multiplier(64, width_mult)
        self.input_batchnorm = input_batchnorm
        if self.input_batchnorm:
            self.bn0 = nn.BatchNorm2d(3)

        self.conv1 = nn.Conv2d(3, width_multiplier(64, width_mult), kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(width_multiplier(64, width_mult))
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, width_multiplier(64, width_mult), num_blocks[0], stride=1, droppath_prob=droppath_prob)
        self.layer2 = self._make_layer(block, width_multiplier(128, width_mult), num_blocks[1], stride=2, droppath_prob=droppath_prob)
        self.layer3 = self._make_layer(block, width_multiplier(256, width_mult), num_blocks[2], stride=2, droppath_prob=droppath_prob)
        self.layer4 = self._make_layer(block, width_multiplier(512, width_mult), num_blocks[3], stride=2, droppath_prob=droppath_prob)

        if not self.backbone_mode:
            # IF RESNET IS IN BACK_BONE MODE WE DON'T NEED THE FINAL CLASSIFIER LAYERS, BUT ONLY THE NET BLOCK STRUCTURE
            self.linear = nn.Linear(width_multiplier(512, width_mult) * self.expansion, num_classes)
            self.avgpool = nn.AdaptiveAvgPool2d(1)

        self.width_mult = width_mult

    def _make_layer(self, block, planes, num_blocks, stride, droppath_prob):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        if num_blocks == 0:
            # When the number of blocks is zero but spatial dimension and/or number of filters about to change we put 1
            # 3X3 conv layer to make this change to the new dimensions.
            if stride != 1 or self.in_planes != planes:
                layers.append(nn.Sequential(nn.Conv2d(self.in_planes, planes, kernel_size=3, stride=stride, bias=False, padding=1), nn.BatchNorm2d(planes)))
                self.in_planes = planes

        else:
            for stride in strides:
                layers.append(block(self.in_planes, planes, stride, droppath_prob=droppath_prob))
                self.in_planes = planes * self.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        if self.input_batchnorm:
            x = self.bn0(x)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        if not self.backbone_mode:
            # IF RESNET IS *NOT* IN BACK_BONE MODE WE  NEED THE FINAL CLASSIFIER LAYERS OUTPUTS
            out = self.avgpool(out)
            out = out.squeeze(dim=2).squeeze(dim=2)
            out = self.linear(out)

        return out

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
            pretrained_model_weights_dict.popitem()
            pretrained_model_weights_dict.popitem()

            pretrained_backbone_weights_dict = OrderedDict()
            for layer_name, weights in pretrained_model_weights_dict.items():
                # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
                name_without_module_prefix = layer_name.split("module.")[1]

                # MAKE SURE THESE ARE NOT THE FINAL LAYERS
                pretrained_backbone_weights_dict[name_without_module_prefix] = weights

            # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
            super().load_state_dict(pretrained_backbone_weights_dict, strict)
        else:
            super().load_state_dict(pretrained_model_weights_dict, strict)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.linear = new_head
        else:
            self.linear = nn.Linear(width_multiplier(512, self.width_mult) * self.expansion, new_num_classes)
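
For orientation, a minimal usage sketch. It assumes the "resnet18" architecture name is registered with models.get (as used elsewhere on this page); the input size and class counts are illustrative.

import torch
from super_gradients.training import models

# Assumption: "resnet18" resolves to this ResNet class through the model registry.
model = models.get("resnet18", num_classes=10)
model.eval()

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # expected: torch.Size([1, 10])

# replace_head swaps the final linear layer, e.g. when fine-tuning on a new label set.
model.replace_head(new_num_classes=5)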

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in V3_2/src/super_gradients/training/models/classification_models/resnet.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # FIRST LET'S POP THE LAST TWO LAYERS - NO NEED TO LOAD THEIR VALUES SINCE THEY ARE IRRELEVANT AS A BACKBONE
        pretrained_model_weights_dict.popitem()
        pretrained_model_weights_dict.popitem()

        pretrained_backbone_weights_dict = OrderedDict()
        for layer_name, weights in pretrained_model_weights_dict.items():
            # GET THE LAYER NAME WITHOUT THE 'module.' PREFIX
            name_without_module_prefix = layer_name.split("module.")[1]

            # MAKE SURE THESE ARE NOT THE FINAL LAYERS
            pretrained_backbone_weights_dict[name_without_module_prefix] = weights

        # RETURNING THE UNMODIFIED/MODIFIED STATE DICT DEPENDING ON THE backbone_mode VALUE
        super().load_state_dict(pretrained_backbone_weights_dict, strict)
    else:
        super().load_state_dict(pretrained_model_weights_dict, strict)

ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.

Code adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

GroupedConvBlock

Bases: nn.Module

Grouped convolution block.

Source code in V3_2/src/super_gradients/training/models/classification_models/resnext.py
class GroupedConvBlock(nn.Module):
    """Grouped convolution block."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None):
        super(GroupedConvBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self.norm_layer = norm_layer
        width = int(planes * (base_width / 64.0)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

conv1x1(in_planes, out_planes, stride=1)

1x1 convolution

Source code in V3_2/src/super_gradients/training/models/classification_models/resnext.py
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1)

3x3 convolution with padding

Source code in V3_2/src/super_gradients/training/models/classification_models/resnext.py
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation)

SENet in PyTorch.

SENet is the winner of ImageNet-2017. The paper is not released yet.

Code adapted from https://github.com/fastai/imagenet-fast/blob/master/cifar10/models/cifar10/senet.py

ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.

https://github.com/kuangliu/pytorch-cifar/blob/master/models/shufflenet.py

ShuffleBlock

Bases: nn.Module

Source code in V3_2/src/super_gradients/training/models/classification_models/shufflenet.py
class ShuffleBlock(nn.Module):
    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
        N, C, H, W = x.size()
        g = self.groups
        return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)

forward(x)

Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]

Source code in V3_2/src/super_gradients/training/models/classification_models/shufflenet.py
def forward(self, x):
    """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
    N, C, H, W = x.size()
    g = self.groups
    return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)

ShuffleNetV2 in PyTorch.

See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. (https://arxiv.org/abs/1807.11164)

Code taken from torchvision/models/shufflenetv2.py

ChannelShuffleInvertedResidual

Bases: nn.Module

Implements the Inverted Residual block from [https://arxiv.org/abs/1807.11164], Fig. 3 (c) & (d):

  • When stride > 1:
    - the whole input goes through branch1,
    - the whole input goes through branch2, and an arbitrary number of output channels is produced.
  • When stride == 1:
    - half of the input channels are passed through as identity,
    - the other half of the input channels goes through branch2, and the number of output channels after the block remains the same as in the input.

Channel shuffle is performed on the concatenation in both cases, as sketched below.
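
A minimal shape-check sketch, assuming the import path matches the source file shown below; the channel counts and spatial size are arbitrary.

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ChannelShuffleInvertedResidual

x = torch.randn(1, 24, 56, 56)

# stride == 1: the number of output channels must equal the number of input channels
same = ChannelShuffleInvertedResidual(inp=24, out=24, stride=1)
print(same(x).shape)  # torch.Size([1, 24, 56, 56])

# stride == 2: spatial size is halved and an arbitrary output channel count is allowed
down = ChannelShuffleInvertedResidual(inp=24, out=116, stride=2)
print(down(x).shape)  # torch.Size([1, 116, 28, 28])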

Source code in V3_2/src/super_gradients/training/models/classification_models/shufflenetv2.py
class ChannelShuffleInvertedResidual(nn.Module):
    """
    Implement Inverted Residual block as in [https://arxiv.org/abs/1807.11164] in Fig.3 (c) & (d):

    * When stride > 1
      - the whole input goes through branch1,
      - the whole input goes through branch2 ,
      and the arbitrary number of output channels are produced.
    * When stride == 1
      - half of input channels in are passed as identity,
      - another half of input channels goes through branch2,
      and the number of output channels after the block remains the same as in input.

    Channel shuffle is performed on a concatenation in both cases.
    """

    def __init__(self, inp: int, out: int, stride: int) -> None:
        super(ChannelShuffleInvertedResidual, self).__init__()

        assert 1 <= stride <= 3, "Illegal stride value"
        assert (stride != 1) or (inp == out), "When stride == 1 num of input channels should be equal to the requested num of out output channels"

        self.stride = stride
        # half of requested out channels will be produced by each branch
        branch_features = out // 2

        if self.stride > 1:
            self.branch1 = nn.Sequential(
                nn.Conv2d(inp, inp, kernel_size=3, stride=self.stride, padding=1, bias=False, groups=inp),
                nn.BatchNorm2d(inp),
                nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(branch_features),
                nn.ReLU(inplace=True),
            )
        else:
            # won't be called if self.stride == 1
            self.branch1 = nn.Identity()

        self.branch2 = nn.Sequential(
            # branch 2 operates on the whole input when stride > 1 and on half of it otherwise
            nn.Conv2d(inp if (self.stride > 1) else inp // 2, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(branch_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1, bias=False, groups=branch_features),
            nn.BatchNorm2d(branch_features),
            nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(branch_features),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def channel_shuffle(x: Tensor, groups: int) -> Tensor:
        """
        From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164):
            A “channel shuffle” operation is then introduced to enable
            information communication between different groups of channels and improve accuracy.

        The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

        Example:
            If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle),
            then activation maps in x are:
            from_B1, from_B1, ... from_B2, from_B2
            After channel_shuffle activation maps in x will be:
            from_B1, from_B2, ... from_B1, from_B2
        """

        batch_size, num_channels, height, width = x.size()
        channels_per_group = num_channels // groups

        # reshape
        x = x.view(batch_size, groups, channels_per_group, height, width)
        x = torch.transpose(x, 1, 2).contiguous()

        # flatten
        x = x.view(batch_size, -1, height, width)
        return x

    def forward(self, x: Tensor) -> Tensor:
        if self.stride == 1:
            # num channels remains the same due to assert that inp == out in __init__
            x1, x2 = x.chunk(2, dim=1)
            out = torch.cat((x1, self.branch2(x2)), dim=1)
        else:
            # inp num channels can change to a requested arbitrary out num channels
            out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)

        out = self.channel_shuffle(out, 2)
        return out

channel_shuffle(x, groups) staticmethod

From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164): A “channel shuffle” operation is then introduced to enable information communication between different groups of channels and improve accuracy.

The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

Example: If groups = 2 (i.e. 2 branches with the same number of activation maps were concatenated before channel_shuffle), then the activation maps in x are ordered as:
from_B1, from_B1, ... from_B2, from_B2
After channel_shuffle the activation maps in x will be ordered as:
from_B1, from_B2, ... from_B1, from_B2
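
A tiny numeric sketch of the shuffle itself (plain PyTorch ops mirroring the staticmethod below), showing how channels from the two branches end up interleaved:

import torch

# Toy tensor: batch=1, 4 channels (first two "from branch 1", last two "from branch 2"), 1x1 spatial.
x = torch.arange(4.0).view(1, 4, 1, 1)

groups = 2
b, c, h, w = x.size()
# reshape -> transpose -> flatten, exactly as in channel_shuffle
shuffled = x.view(b, groups, c // groups, h, w).transpose(1, 2).contiguous().view(b, -1, h, w)

print(x.flatten().tolist())         # [0.0, 1.0, 2.0, 3.0]
print(shuffled.flatten().tolist())  # [0.0, 2.0, 1.0, 3.0] -> channels interleaved across groups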

Source code in V3_2/src/super_gradients/training/models/classification_models/shufflenetv2.py
@staticmethod
def channel_shuffle(x: Tensor, groups: int) -> Tensor:
    """
    From "ShuffleNet V2: Practical Guidelines for EfficientCNN Architecture Design" (https://arxiv.org/abs/1807.11164):
        A “channel shuffle” operation is then introduced to enable
        information communication between different groups of channels and improve accuracy.

    The operation preserves x.size(), but shuffles its channels in the manner explained further in the example.

    Example:
        If group = 2 (2 branches with the same # of activation maps were concatenated before channel_shuffle),
        then activation maps in x are:
        from_B1, from_B1, ... from_B2, from_B2
        After channel_shuffle activation maps in x will be:
        from_B1, from_B2, ... from_B1, from_B2
    """

    batch_size, num_channels, height, width = x.size()
    channels_per_group = num_channels // groups

    # reshape
    x = x.view(batch_size, groups, channels_per_group, height, width)
    x = torch.transpose(x, 1, 2).contiguous()

    # flatten
    x = x.view(batch_size, -1, height, width)
    return x

ShuffleNetV2Base

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/shufflenetv2.py
class ShuffleNetV2Base(SgModule):
    def __init__(
        self,
        structure: List[int],
        stages_out_channels: List[int],
        backbone_mode: bool = False,
        num_classes: int = 1000,
        block: nn.Module = ChannelShuffleInvertedResidual,
    ):
        super(ShuffleNetV2Base, self).__init__()

        self.backbone_mode = backbone_mode

        if len(structure) != 3:
            raise ValueError("expected structure as list of 3 positive ints")
        if len(stages_out_channels) != 5:
            raise ValueError("expected stages_out_channels as list of 5 positive ints")
        self.structure = structure
        self.out_channels = stages_out_channels

        input_channels = 3
        output_channels = self.out_channels[0]
        self.conv1 = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
        )
        input_channels = output_channels

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Static annotations for mypy
        self.layer2 = self._make_layer(block, input_channels, self.out_channels[1], self.structure[0])
        self.layer3 = self._make_layer(block, self.out_channels[1], self.out_channels[2], self.structure[1])
        self.layer4 = self._make_layer(block, self.out_channels[2], self.out_channels[3], self.structure[2])

        input_channels = self.out_channels[3]
        output_channels = self.out_channels[-1]
        self.conv5 = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
        )

        if not self.backbone_mode:
            self.avgpool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(output_channels, num_classes)

    @staticmethod
    def _make_layer(block, input_channels, output_channels, repeats):
        # add first block with stride 2 to downsize the input
        seq = [block(input_channels, output_channels, 2)]

        for _ in range(repeats - 1):
            seq.append(block(output_channels, output_channels, 1))
        return nn.Sequential(*seq)

    def load_state_dict(self, state_dict, strict=True):
        """
        load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
        :param state_dict:  The state_dict to load
        :param strict:      strict loading (see super() docs)
        """
        pretrained_model_weights_dict = state_dict.copy()

        if self.backbone_mode:
            # removing fc weights first not to break strict loading
            fc_weights_keys = [k for k in pretrained_model_weights_dict if "fc" in k]

            for key in fc_weights_keys:
                pretrained_model_weights_dict.pop(key)

        super().load_state_dict(pretrained_model_weights_dict, strict)

    def forward(self, x: Tensor) -> Tensor:
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.conv5(x)

        if not self.backbone_mode:
            x = self.avgpool(x)
            x = x.view(x.size(0), -1)
            x = self.fc(x)
        return x
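
A brief construction sketch, assuming the import path matches the source file shown above; the structure and channel values follow the common ShuffleNetV2 1.0x configuration and are only illustrative here.

import torch
from super_gradients.training.models.classification_models.shufflenetv2 import ShuffleNetV2Base

model = ShuffleNetV2Base(structure=[4, 8, 4], stages_out_channels=[24, 116, 232, 464, 1024], num_classes=1000)
model.eval()

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])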

load_state_dict(state_dict, strict=True)

load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone

Parameters:

Name Type Description Default
state_dict

The state_dict to load

required
strict

strict loading (see super() docs)

True
Source code in V3_2/src/super_gradients/training/models/classification_models/shufflenetv2.py
def load_state_dict(self, state_dict, strict=True):
    """
    load_state_dict - Overloads the base method and calls it to load a modified dict for usage as a backbone
    :param state_dict:  The state_dict to load
    :param strict:      strict loading (see super() docs)
    """
    pretrained_model_weights_dict = state_dict.copy()

    if self.backbone_mode:
        # removing fc weights first not to break strict loading
        fc_weights_keys = [k for k in pretrained_model_weights_dict if "fc" in k]

        for key in fc_weights_keys:
            pretrained_model_weights_dict.pop(key)

    super().load_state_dict(pretrained_model_weights_dict, strict)

VGG11/13/16/19 in PyTorch. Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py

Vision Transformer in PyTorch. Reference: [1] Dosovitskiy, Alexey, et al. "An image is worth 16x16 words: Transformers for image recognition at scale." arXiv preprint arXiv:2010.11929 (2020)

Code adapted from https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py

Attention

Bases: nn.Module

self attention layer with residual connection

Source code in V3_2/src/super_gradients/training/models/classification_models/vit.py
class Attention(nn.Module):
    """
    self attention layer with residual connection
    """

    def __init__(self, hidden_dim, heads=8):
        super().__init__()
        dim_head = hidden_dim // heads
        inner_dim = dim_head * heads

        self.heads = heads
        self.scale = dim_head**-0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(hidden_dim, inner_dim * 3, bias=True)  # Qx, Kx, Vx are calculated at once
        self.proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):

        B, N, C = x.shape
        # computing query, key and value matrices at once
        qkv = self.to_qkv(x).reshape(B, N, 3, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        out = (attn @ v).transpose(1, 2).reshape(B, N, C)

        out = self.proj(out)

        return out

FeedForward

Bases: nn.Module

feed forward block with residual connection

Source code in V3_2/src/super_gradients/training/models/classification_models/vit.py
class FeedForward(nn.Module):
    """
    feed forward block with residual connection
    """

    def __init__(self, hidden_dim, mlp_dim, dropout=0.0):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, mlp_dim)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(mlp_dim, hidden_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.act(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        return out

PatchEmbed

Bases: nn.Module

2D Image to Patch Embedding Using Conv layers (Faster than rearranging + Linear)

Source code in V3_2/src/super_gradients/training/models/classification_models/vit.py
class PatchEmbed(nn.Module):
    """
    2D Image to Patch Embedding Using Conv layers (Faster than rearranging + Linear)
    """

    def __init__(self, img_size: tuple, patch_size: tuple, in_channels=3, hidden_dim=768, norm_layer=None, flatten=True):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_channels, hidden_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(hidden_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x
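
A short shape sketch, assuming the import path follows the source file shown above; the sizes are the usual 224/16 ViT setup and purely illustrative.

import torch
from super_gradients.training.models.classification_models.vit import PatchEmbed

embed = PatchEmbed(img_size=(224, 224), patch_size=(16, 16), in_channels=3, hidden_dim=768)
tokens = embed(torch.randn(1, 3, 224, 224))
print(tokens.shape)  # torch.Size([1, 196, 768]) -> 14 x 14 patches, each embedded to 768 dims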

ViT

Bases: SgModule

Source code in V3_2/src/super_gradients/training/models/classification_models/vit.py
class ViT(SgModule):
    def __init__(
        self,
        image_size: tuple,
        patch_size: tuple,
        num_classes: int,
        hidden_dim: int,
        depth: int,
        heads: int,
        mlp_dim: int,
        in_channels=3,
        dropout_prob=0.0,
        emb_dropout_prob=0.0,
        backbone_mode=False,
    ):
        """
        :param image_size: Image size tuple for data processing into patches done within the model.
        :param patch_size: Patch size tuple for data processing into patches done within the model.
        :param num_classes: Number of classes for the classification head.
        :param hidden_dim: Output dimension of each transformer block.
        :param depth: Number of transformer blocks
        :param heads: Number of attention heads
        :param mlp_dim: Intermediate dimension of the transformer block's feed forward
        :param in_channels: input channels
        :param dropout_prob: Dropout ratio between the feed forward layers.
        :param emb_dropout_prob: Dropout ratio after the embedding layer.
        :param backbone_mode: If True output after pooling layer
        """

        super().__init__()
        image_height, image_width = image_size
        patch_height, patch_width = patch_size

        assert image_height % patch_height == 0 and image_width % patch_width == 0, "Image dimensions must be divisible by the patch size."
        assert hidden_dim % heads == 0, "Hidden dimension must be divisible by the number of heads."

        num_patches = (image_height // patch_height) * (image_width // patch_width)

        self.patch_embedding = PatchEmbed(image_size, patch_size, in_channels=in_channels, hidden_dim=hidden_dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
        self.dropout = nn.Dropout(emb_dropout_prob)

        self.transformer = Transformer(hidden_dim, depth, heads, mlp_dim, dropout_prob)

        self.backbone_mode = backbone_mode
        self.pre_head_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
        self.head = nn.Linear(hidden_dim, num_classes)

    def forward(self, img):
        x = self.patch_embedding(img)  # Convert image to patches and embed
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, : (n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = self.pre_head_norm(x)
        x = x[:, 0]
        if self.backbone_mode:
            return x
        else:
            return self.head(x)

    def replace_head(self, new_num_classes=None, new_head=None):
        if new_num_classes is None and new_head is None:
            raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
        if new_head is not None:
            self.head = new_head
        else:
            self.head = nn.Linear(self.head.in_features, new_num_classes)
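
A minimal forward-pass sketch; the hyper-parameters below are arbitrary small values for illustration, not a released configuration, and the import path is assumed to follow the source file above.

import torch
from super_gradients.training.models.classification_models.vit import ViT

model = ViT(
    image_size=(224, 224),
    patch_size=(16, 16),
    num_classes=10,
    hidden_dim=192,   # must be divisible by heads
    depth=4,
    heads=3,
    mlp_dim=384,
)
logits = model(torch.randn(2, 3, 224, 224))
print(logits.shape)  # torch.Size([2, 10])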

__init__(image_size, patch_size, num_classes, hidden_dim, depth, heads, mlp_dim, in_channels=3, dropout_prob=0.0, emb_dropout_prob=0.0, backbone_mode=False)

Parameters:

Name Type Description Default
image_size tuple

Image size tuple for data processing into patches done within the model.

required
patch_size tuple

Patch size tuple for data processing into patches done within the model.

required
num_classes int

Number of classes for the classification head.

required
hidden_dim int

Output dimension of each transformer block.

required
depth int

Number of transformer blocks

required
heads int

Number of attention heads

required
mlp_dim int

Intermediate dimension of the transformer block's feed forward

required
in_channels

input channels

3
dropout_prob

Dropout ratio between the feed forward layers.

0.0
emb_dropout_prob

Dropout ratio after the embedding layer.

0.0
backbone_mode

If True output after pooling layer

False
Source code in V3_2/src/super_gradients/training/models/classification_models/vit.py
def __init__(
    self,
    image_size: tuple,
    patch_size: tuple,
    num_classes: int,
    hidden_dim: int,
    depth: int,
    heads: int,
    mlp_dim: int,
    in_channels=3,
    dropout_prob=0.0,
    emb_dropout_prob=0.0,
    backbone_mode=False,
):
    """
    :param image_size: Image size tuple for data processing into patches done within the model.
    :param patch_size: Patch size tuple for data processing into patches done within the model.
    :param num_classes: Number of classes for the classification head.
    :param hidden_dim: Output dimension of each transformer block.
    :param depth: Number of transformer blocks
    :param heads: Number of attention heads
    :param mlp_dim: Intermediate dimension of the transformer block's feed forward
    :param in_channels: input channels
    :param dropout_prob: Dropout ratio between the feed forward layers.
    :param emb_dropout_prob: Dropout ratio after the embedding layer.
    :param backbone_mode: If True output after pooling layer
    """

    super().__init__()
    image_height, image_width = image_size
    patch_height, patch_width = patch_size

    assert image_height % patch_height == 0 and image_width % patch_width == 0, "Image dimensions must be divisible by the patch size."
    assert hidden_dim % heads == 0, "Hidden dimension must be divisible by the number of heads."

    num_patches = (image_height // patch_height) * (image_width // patch_width)

    self.patch_embedding = PatchEmbed(image_size, patch_size, in_channels=in_channels, hidden_dim=hidden_dim)
    self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
    self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
    self.dropout = nn.Dropout(emb_dropout_prob)

    self.transformer = Transformer(hidden_dim, depth, heads, mlp_dim, dropout_prob)

    self.backbone_mode = backbone_mode
    self.pre_head_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
    self.head = nn.Linear(hidden_dim, num_classes)

ConvertableCompletePipelineModel

Bases: torch.nn.Module

Exportable nn.Module that wraps the model, preprocessing and postprocessing.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, the main model. Takes input from pre_process' output, and feeds post_process.

required
pre_process torch.nn.Module

torch.nn.Module, preprocessing module, its output will be model's input. When None (default), set to Identity().

None
post_process torch.nn.Module

torch.nn.Module, postprocessing module, its output is the final output. When None (default), set to Identity().

None
**prep_model_for_conversion_kwargs

for SgModules- args to be passed to model.prep_model_for_conversion prior to torch.onnx.export call.

{}
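
A toy wrapping sketch; the backbone and the Identity pre/post-processing below are placeholders, only meant to show how the pieces compose.

import torch
from torch import nn
from super_gradients.training.models.conversion import ConvertableCompletePipelineModel

# Placeholder model standing in for a trained SgModule / nn.Module.
backbone = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 5))

pipeline = ConvertableCompletePipelineModel(model=backbone, pre_process=nn.Identity(), post_process=nn.Identity())
out = pipeline(torch.randn(1, 3, 64, 64))
print(out.shape)  # torch.Size([1, 5])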
Source code in V3_2/src/super_gradients/training/models/conversion.py
class ConvertableCompletePipelineModel(torch.nn.Module):
    """
    Exportable nn.Module that wraps the model, preprocessing and postprocessing.

    :param model: torch.nn.Module, the main model. Takes input from pre_process' output, and feeds post_process.
    :param pre_process: torch.nn.Module, preprocessing module, its output will be model's input. When none (default), set to Identity().
    :param post_process: torch.nn.Module, postprocessing module, its output is the final output. When none (default), set to Identity().
    :param **prep_model_for_conversion_kwargs: for SgModules- args to be passed to model.prep_model_for_conversion
            prior to torch.onnx.export call.
    """

    def __init__(self, model: torch.nn.Module, pre_process: torch.nn.Module = None, post_process: torch.nn.Module = None, **prep_model_for_conversion_kwargs):
        super(ConvertableCompletePipelineModel, self).__init__()
        model.eval()
        pre_process = pre_process or Identity()
        post_process = post_process or Identity()
        if hasattr(model, "prep_model_for_conversion"):
            model.prep_model_for_conversion(**prep_model_for_conversion_kwargs)
        self.model = model
        self.pre_process = pre_process
        self.post_process = post_process

    def forward(self, x):
        return self.post_process(self.model(self.pre_process(x)))

convert_from_config(cfg)

Exports model according to cfg.

See: super_gradients/recipes/conversion_params/default_conversion_params.yaml for the full cfg content documentation, and super_gradients/examples/convert_recipe_example/convert_recipe_example.py for usage.

Parameters:

Name Type Description Default
cfg DictConfig required

Returns:

Type Description
str

out_path, the path of the saved .onnx file.

Source code in V3_2/src/super_gradients/training/models/conversion.py
def convert_from_config(cfg: DictConfig) -> str:
    """
    Exports model according to cfg.

    See:
     super_gradients/recipes/conversion_params/default_conversion_params.yaml for the full cfg content documentation,
     and super_gradients/examples/convert_recipe_example/convert_recipe_example.py for usage.
    :param cfg:
    :return: out_path, the path of the saved .onnx file.
    """
    cfg, experiment_cfg = prepare_conversion_cfgs(cfg)
    model = models.get(
        model_name=experiment_cfg.architecture,
        num_classes=experiment_cfg.arch_params.num_classes,
        arch_params=experiment_cfg.arch_params,
        strict_load=cfg.strict_load,
        checkpoint_path=cfg.checkpoint_path,
    )
    cfg = parse_args(cfg, models.convert_to_onnx)
    out_path = models.convert_to_onnx(model=model, **cfg)
    logger.info(f"Successfully exported model at {out_path}")
    return out_path

convert_to_coreml(model, out_path, input_size=None, pre_process=None, post_process=None, prep_model_for_conversion_kwargs=None, export_as_ml_program=False, torch_trace_kwargs=None)

Exports a given SG model to CoreML mlprogram or package.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, model to export to CoreML.

required
out_path str

str, destination path for the .mlmodel file.

required
input_size tuple

Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.

None
pre_process torch.nn.Module

torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()

None
post_process torch.nn.Module

torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()

None
prep_model_for_conversion_kwargs

dict, for SgModules- args to be passed to model.prep_model_for_conversion prior to ct.convert call. Supported keys are: - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.

None
export_as_ml_program

Whether to convert to the new program format (better) or legacy coreml proto file (Supports more iOS versions and devices, but this format will be deprecated at some point).

False
torch_trace_kwargs

kwargs for torch.jit.trace

None

Returns:

Type Description

Path
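
A hedged usage sketch (requires coremltools to be installed); the placeholder model and sizes are illustrative only.

from torch import nn
from super_gradients.training.models.conversion import convert_to_coreml

# Placeholder model; a real export would typically use a trained SgModule.
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())

out_path = convert_to_coreml(
    model=model,
    out_path="toy_model",  # ".mlpackage"/".mlmodel" suffix is appended automatically
    prep_model_for_conversion_kwargs={"input_size": (1, 3, 64, 64)},
)
print(out_path)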

Source code in V3_2/src/super_gradients/training/models/conversion.py
@resolve_param("pre_process", TransformsFactory())
@resolve_param("post_process", TransformsFactory())
def convert_to_coreml(
    model: torch.nn.Module,
    out_path: str,
    input_size: tuple = None,
    pre_process: torch.nn.Module = None,
    post_process: torch.nn.Module = None,
    prep_model_for_conversion_kwargs=None,
    export_as_ml_program=False,
    torch_trace_kwargs=None,
):
    """
        Exports a given SG model to CoreML mlprogram or package.

        :param model: torch.nn.Module, model to export to CoreML.
        :param out_path: str, destination path for the .mlmodel file.
        :param input_size: Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.
        :param pre_process: torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()
        :param post_process: torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()
        :param prep_model_for_conversion_kwargs: dict, for SgModules- args to be passed to model.prep_model_for_conversion
         prior to ct.convert call. Supported keys are:
        - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.
        :param export_as_ml_program: Whether to convert to the new program format (better) or legacy coreml proto file
                            (Supports more iOS versions and devices, but this format will be deprecated at some point).
        :param torch_trace_kwargs: kwargs for torch.jit.trace
    :return: Path
    """
    if ct is None:
        raise ImportError(
            '"coremltools" is required for CoreML export, but is not installed. Please install CoreML Tools using:\n'
            '   "python3 -m pip install coremltools" and try again (Tested with version 6.3.0);'
        )

    logger.debug("Building model...")
    logger.debug(model)
    logger.debug("Model child nodes:")
    logger.debug(next(model.named_children()))

    if not os.path.isdir(pathlib.Path(out_path).parent.resolve()):
        raise FileNotFoundError(f"Could not find destination directory {out_path} for the CoreML file.")
    torch_trace_kwargs = torch_trace_kwargs or dict()
    prep_model_for_conversion_kwargs = prep_model_for_conversion_kwargs or dict()

    if input_size is not None:
        input_size = (1, *input_size)
        logger.warning(
            f"input_shape is deprecated and will be removed in the next major release."
            f"Use the convert_to_coreml(..., prep_model_for_conversion_kwargs(input_size={input_size})) instead"
        )
        prep_model_for_conversion_kwargs["input_size"] = input_size

    if "input_size" not in prep_model_for_conversion_kwargs:
        raise KeyError("input_size must be provided in prep_model_for_conversion_kwargs")

    input_size = prep_model_for_conversion_kwargs["input_size"]

    # TODO: support more than 1 input when prep_for_conversoin will support it.
    example_inputs = [torch.Tensor(np.zeros(input_size))]

    if not out_path.endswith(".mlpackage") and not out_path.endswith(".mlmodel"):
        out_path += ".mlpackage" if export_as_ml_program else ".mlmodel"

    complete_model = ConvertableCompletePipelineModel(model, pre_process, post_process, **prep_model_for_conversion_kwargs)

    # Set the model in evaluation mode.
    complete_model.eval()

    logger.info("Creating torch jit trace...")
    traced_model = torch.jit.trace(complete_model, example_inputs, **torch_trace_kwargs)
    logger.info("Tracing the model with the provided inputs...")
    out = traced_model(*example_inputs)  # using * because example_inputs is a list
    logger.info(f"Inferred output shapes: {[o.shape for o in out]}")
    if export_as_ml_program:
        coreml_model = ct.convert(
            traced_model, convert_to="mlprogram", inputs=[ct.ImageType(name=f"x_{i + 1}", shape=_.shape) for i, _ in enumerate(example_inputs)]
        )
    else:
        coreml_model = ct.convert(traced_model, inputs=[ct.ImageType(name=f"x_{i + 1}", shape=_.shape) for i, _ in enumerate(example_inputs)])

    spec = coreml_model.get_spec()
    logger.debug(spec.description)

    # Changing the input names:
    #   In CoreML, the input name is compiled into classes (named keyword argument in predict).
    #   We want to re-use the same names among different models to make research easier.
    #   We normalize the inputs names to be x_1, x_2, etc.
    for i, _input in enumerate(spec.description.input):
        new_input_name = "x_" + str(i + 1)
        logger.info(f"Renaming input {_input.name} to {new_input_name}")
        ct.utils.rename_feature(spec, _input.name, new_input_name)

    # Re-Initializing the model with the new spec
    coreml_model = ct.models.MLModel(spec, weights_dir=coreml_model.weights_dir)

    # Saving the model
    coreml_model.save(out_path)
    logger.info(f"CoreML model successfully save to {os.path.abspath(out_path)}")
    return out_path

convert_to_onnx(model, out_path, input_shape=None, pre_process=None, post_process=None, prep_model_for_conversion_kwargs=None, torch_onnx_export_kwargs=None, simplify=True)

Exports model to ONNX.

Parameters:

Name Type Description Default
model torch.nn.Module

torch.nn.Module, model to export to ONNX.

required
out_path str

str, destination path for the .onnx file.

required
input_shape tuple

Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1. Deprecated: use the input_size key in prep_model_for_conversion_kwargs instead.

None
pre_process torch.nn.Module

torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()

None
post_process torch.nn.Module

torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()

None
prep_model_for_conversion_kwargs

dict, for SgModules- args to be passed to model.prep_model_for_conversion prior to torch.onnx.export call. Supported keys are: - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.

None
torch_onnx_export_kwargs

kwargs to be unpacked in the torch.onnx.export call (excluding the first 3 kwargs: model, f, args)

None
simplify bool

bool, whether to apply the onnx simplifier, same as `python -m onnxsim onnx_path onnx_sim_path`. When True, the simplified model will be saved in out_path (default=True).

True

Returns:

Type Description

out_path
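
A hedged usage sketch; the placeholder model, opset choice, and input size are illustrative (simplify is disabled here so the sketch does not depend on onnxsim).

from torch import nn
from super_gradients.training.models.conversion import convert_to_onnx

# Placeholder model; any SgModule or plain nn.Module can be exported this way.
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))

out_path = convert_to_onnx(
    model=model,
    out_path="toy_model.onnx",
    prep_model_for_conversion_kwargs={"input_size": (1, 3, 64, 64)},
    torch_onnx_export_kwargs={"opset_version": 13},
    simplify=False,
)
print(out_path)  # toy_model.onnx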

Source code in V3_2/src/super_gradients/training/models/conversion.py
@resolve_param("pre_process", TransformsFactory())
@resolve_param("post_process", TransformsFactory())
def convert_to_onnx(
    model: torch.nn.Module,
    out_path: str,
    input_shape: tuple = None,
    pre_process: torch.nn.Module = None,
    post_process: torch.nn.Module = None,
    prep_model_for_conversion_kwargs=None,
    torch_onnx_export_kwargs=None,
    simplify: bool = True,
):
    """
    Exports model to ONNX.

    :param model: torch.nn.Module, model to export to ONNX.
    :param out_path: str, destination path for the .onnx file.
    :param input_shape: Input shape without batch dimensions ([C,H,W]). Batch size assumed to be 1.
    DEPRECATED USE input_size KWARG IN prep_model_for_conversion_kwargs INSTEAD.
    :param pre_process: torch.nn.Module, preprocessing pipeline, will be resolved by TransformsFactory()
    :param post_process: torch.nn.Module, postprocessing pipeline, will be resolved by TransformsFactory()
    :param prep_model_for_conversion_kwargs: dict, for SgModules- args to be passed to model.prep_model_for_conversion
     prior to torch.onnx.export call. Supported keys are:
    - input_size - Shape of inputs with batch dimension, [C,H,W] for image inputs.
    :param torch_onnx_export_kwargs: kwargs (EXCLUDING: FIRST 3 KWARGS- MODEL, F, ARGS). to be unpacked in torch.onnx.export call
    :param simplify: bool,whether to apply onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path.
     When true, the simplified model will be saved in out_path (default=True).

    :return: out_path
    """
    if not os.path.isdir(pathlib.Path(out_path).parent.resolve()):
        raise FileNotFoundError(f"Could not find destination directory {out_path} for the ONNX file.")
    torch_onnx_export_kwargs = torch_onnx_export_kwargs or dict()
    prep_model_for_conversion_kwargs = prep_model_for_conversion_kwargs or dict()

    if input_shape is not None:
        input_size = (1, *input_shape)
        logger.warning(
            f"input_shape is deprecated and will be removed in the next major release."
            f"Use the convert_to_onnx(..., prep_model_for_conversion_kwargs(input_size={input_size})) instead"
        )
        prep_model_for_conversion_kwargs["input_size"] = input_size

    if "input_size" not in prep_model_for_conversion_kwargs:
        raise KeyError("input_size must be provided in prep_model_for_conversion_kwargs")

    input_size = prep_model_for_conversion_kwargs["input_size"]

    onnx_input = torch.Tensor(np.zeros(input_size))
    if not out_path.endswith(".onnx"):
        out_path = out_path + ".onnx"
    complete_model = ConvertableCompletePipelineModel(model, pre_process, post_process, **prep_model_for_conversion_kwargs)

    torch.onnx.export(model=complete_model, args=onnx_input, f=out_path, **torch_onnx_export_kwargs)
    if simplify:
        onnx_simplify(out_path, out_path)
    return out_path

onnx_simplify(onnx_path, onnx_sim_path)

onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path`

Parameters:

Name Type Description Default
onnx_path str

path to onnx model

required
onnx_sim_path str

path for output onnx simplified model

required
Source code in V3_2/src/super_gradients/training/models/conversion.py
def onnx_simplify(onnx_path: str, onnx_sim_path: str):
    """
    onnx simplifier method, same as `python -m onnxsim onnx_path onnx_sim_path
    :param onnx_path: path to onnx model
    :param onnx_sim_path: path for output onnx simplified model
    """
    model_sim, check = simplify(model=onnx_path)
    if not check:
        raise RuntimeError("Simplified ONNX model could not be validated")
    onnx.save_model(model_sim, onnx_sim_path)

prepare_conversion_cfgs(cfg)

Builds the cfg (i.e. conversion_params) and experiment_cfg (i.e. the recipe config according to cfg.experiment_name) to be used by convert_recipe_example

Parameters:

Name Type Description Default
cfg DictConfig

DictConfig, conversion_params config

required

Returns:

Type Description

cfg, experiment_cfg

Source code in V3_2/src/super_gradients/training/models/conversion.py
def prepare_conversion_cfgs(cfg: DictConfig):
    """
    Builds the cfg (i.e conversion_params) and experiment_cfg (i.e recipe config according to cfg.experiment_name)
     to be used by convert_recipe_example

    :param cfg: DictConfig, converion_params config
    :return: cfg, experiment_cfg
    """
    cfg = hydra.utils.instantiate(cfg)
    # CREATE THE EXPERIMENT CFG
    experiment_cfg = load_experiment_cfg(cfg.experiment_name, cfg.ckpt_root_dir)
    hydra.utils.instantiate(experiment_cfg)
    if cfg.checkpoint_path is None:
        logger.info(
            "checkpoint_params.checkpoint_path was not provided, so the model will be converted using weights from "
            "checkpoints_dir/training_hyperparams.ckpt_name "
        )
        checkpoints_dir = Path(get_checkpoints_dir_path(experiment_name=cfg.experiment_name, ckpt_root_dir=cfg.ckpt_root_dir))
        cfg.checkpoint_path = str(checkpoints_dir / cfg.ckpt_name)
    cfg.out_path = cfg.out_path or cfg.checkpoint_path.replace(".pth", ".onnx")
    logger.info(f"Exporting checkpoint: {cfg.checkpoint_path} to ONNX.")
    return cfg, experiment_cfg

CSP Darknet

CSPLayer

Bases: nn.Module

CSP Bottleneck with 3 convolutions

Parameters:

Name Type Description Default
in_channels int

int, input channels.

required
out_channels int

int, output channels.

required
num_bottlenecks int

int, number of bottleneck conv layers.

required
act Type[nn.Module]

Type[nn.module], activation type.

required
shortcut bool

bool, whether to apply a shortcut (i.e. add the input to the result) in bottlenecks (default=True).

True
depthwise bool

bool, whether to use GroupedConvBlock in last conv in bottlenecks (default=False).

False
expansion float

float, determines the number of hidden channels (default=0.5).

0.5
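
A quick shape check, assuming the import path matches the source file shown below; the channel counts and activation are arbitrary.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import CSPLayer

layer = CSPLayer(in_channels=64, out_channels=128, num_bottlenecks=2, act=nn.SiLU)
out = layer(torch.randn(1, 64, 32, 32))
print(out.shape)  # torch.Size([1, 128, 32, 32])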
Source code in V3_2/src/super_gradients/training/models/detection_models/csp_darknet53.py
class CSPLayer(nn.Module):
    """
    CSP Bottleneck with 3 convolutions

    :param in_channels: int, input channels.
    :param out_channels: int, output channels.
    :param num_bottlenecks: int, number of bottleneck conv layers.
    :param act: Type[nn.module], activation type.
    :param shortcut: bool, whether to apply shortcut (i.e add input to result) in bottlenecks (default=True).
    :param depthwise: bool, whether to use GroupedConvBlock in last conv in bottlenecks (default=False).
    :param expansion: float, determines the number of hidden channels (default=0.5).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int,
        act: Type[nn.Module],
        shortcut: bool = True,
        depthwise: bool = False,
        expansion: float = 0.5,
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=act)
        self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=act)
        self.conv3 = Conv(2 * hidden_channels, out_channels, 1, stride=1, activation_type=act)
        module_list = [Bottleneck(hidden_channels, hidden_channels, shortcut, act, depthwise) for _ in range(num_bottlenecks)]
        self.bottlenecks = nn.Sequential(*module_list)

    def forward(self, x):
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = torch.cat((x_1, x_2), dim=1)
        return self.conv3(x)

GroupedConvBlock

Bases: nn.Module

Grouped Conv KxK -> usual Conv 1x1

Source code in V3_2/src/super_gradients/training/models/detection_models/csp_darknet53.py
class GroupedConvBlock(nn.Module):
    """
    Grouped Conv KxK -> usual Conv 1x1
    """

    def __init__(self, input_channels, output_channels, kernel, stride, activation_type: Type[nn.Module], padding: int = None, groups: int = None):
        """
        :param groups:  num of groups in the first conv; if None depthwise separable conv will be used
                        (groups = input channels)
        """
        super().__init__()

        self.dconv = Conv(input_channels, input_channels, kernel, stride, activation_type, padding, groups=groups or input_channels)
        self.conv = Conv(input_channels, output_channels, 1, 1, activation_type)

    def forward(self, x):
        return self.conv(self.dconv(x))

__init__(input_channels, output_channels, kernel, stride, activation_type, padding=None, groups=None)

Parameters:

Name Type Description Default
groups int

num of groups in the first conv; if None depthwise separable conv will be used (groups = input channels)

None
Source code in V3_2/src/super_gradients/training/models/detection_models/csp_darknet53.py
def __init__(self, input_channels, output_channels, kernel, stride, activation_type: Type[nn.Module], padding: int = None, groups: int = None):
    """
    :param groups:  num of groups in the first conv; if None depthwise separable conv will be used
                    (groups = input channels)
    """
    super().__init__()

    self.dconv = Conv(input_channels, input_channels, kernel, stride, activation_type, padding, groups=groups or input_channels)
    self.conv = Conv(input_channels, output_channels, 1, 1, activation_type)

SPP

Bases: BaseDetectionModule

Source code in V3_2/src/super_gradients/training/models/detection_models/csp_darknet53.py
@register_detection_module()
class SPP(BaseDetectionModule):
    # SPATIAL PYRAMID POOLING LAYER
    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(self, in_channels, output_channels, k: Tuple, activation_type: Type[nn.Module]):
        super().__init__(in_channels)
        self._output_channels = output_channels

        hidden_channels = in_channels // 2
        self.cv1 = Conv(in_channels, hidden_channels, 1, 1, activation_type)
        self.cv2 = Conv(hidden_channels * (len(k) + 1), output_channels, 1, 1, activation_type)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))

    @property
    def out_channels(self):
        """
        :return: channels of tensor(s) that will be returned by a module  in forward
        """
        return self._output_channels

out_channels property

Returns:

Type Description

channels of tensor(s) that will be returned by a module in forward
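
A quick shape check, assuming the import path matches the source file shown above; the kernel sizes (5, 9, 13) are the typical YOLO-style choice and the channel counts are illustrative.

import torch
from torch import nn
from super_gradients.training.models.detection_models.csp_darknet53 import SPP

spp = SPP(in_channels=256, output_channels=256, k=(5, 9, 13), activation_type=nn.SiLU)
out = spp(torch.randn(1, 256, 20, 20))
print(out.shape)         # torch.Size([1, 256, 20, 20])
print(spp.out_channels)  # 256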

ViewModule

Bases: nn.Module

Returns a reshaped (flattened) version of the input, to be used in non-backbone mode

Source code in V3_2/src/super_gradients/training/models/detection_models/csp_darknet53.py
157
158
159
160
161
162
163
164
165
166
167
class ViewModule(nn.Module):
    """
    Returns a reshaped version of the input, to be used in None-Backbone Mode
    """

    def __init__(self, features=1024):
        super(ViewModule, self).__init__()
        self.features = features

    def forward(self, x):
        return x.view(-1, self.features)

CSPResNetBackbone

Bases: nn.Module

CSPResNet backbone

Source code in V3_2/src/super_gradients/training/models/detection_models/csp_resnet.py
@register_detection_module()
class CSPResNetBackbone(nn.Module):
    """
    CSPResNet backbone
    """

    @resolve_param("activation", ActivationsTypeFactory())
    def __init__(
        self,
        layers: Tuple[int, ...],
        channels: Tuple[int, ...],
        activation: Type[nn.Module],
        return_idx: Tuple[int, int, int],
        use_large_stem: bool,
        width_mult: float,
        depth_mult: float,
        use_alpha: bool,
        pretrained_weights: Optional[str] = None,
        in_channels: int = 3,
    ):
        """

        :param layers: Number of blocks in each stage
        :param channels: Number of channels [stem, stage 0, stage 1, stage 2, ...]
        :param activation: Used activation type for all child modules.
        :param return_idx: Indexes of returned feature maps
        :param use_large_stem: If True, uses 3 conv+bn+act instead of 2 in stem blocks
        :param width_mult: Scaling factor for a number of channels
        :param depth_mult: Scaling factor for a number of blocks in each stage
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
        :param pretrained_weights:
        :param in_channels: Number of input channels. Default: 3
        """
        super().__init__()
        channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
        layers = [max(round(num_layers * depth_mult), 1) for num_layers in layers]

        if use_large_stem:
            self.stem = nn.Sequential(
                collections.OrderedDict(
                    [
                        (
                            "conv1",
                            ConvBNAct(in_channels, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                        ),
                        (
                            "conv2",
                            ConvBNAct(
                                channels[0] // 2,
                                channels[0] // 2,
                                3,
                                stride=1,
                                padding=1,
                                activation_type=activation,
                                bias=False,
                            ),
                        ),
                        (
                            "conv3",
                            ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                        ),
                    ]
                )
            )
        else:
            self.stem = nn.Sequential(
                collections.OrderedDict(
                    [
                        (
                            "conv1",
                            ConvBNAct(3, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                        ),
                        (
                            "conv2",
                            ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                        ),
                    ]
                )
            )

        n = len(channels) - 1
        self.stages = nn.ModuleList(
            [
                CSPResStage(
                    channels[i],
                    channels[i + 1],
                    layers[i],
                    stride=2,
                    activation_type=activation,
                    use_alpha=use_alpha,
                )
                for i in range(n)
            ]
        )

        self._out_channels = channels[1:]
        self._out_strides = [4 * 2**i for i in range(n)]
        self.return_idx = tuple(return_idx)

        if pretrained_weights:
            if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
                state_dict = torch.load(str(pretrained_weights), map_location="cpu")
            elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
                with wait_for_the_master(get_local_rank()):
                    state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
            else:
                raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
            self.load_state_dict(state_dict)

    def forward(self, x: Tensor) -> List[Tensor]:
        x = self.stem(x)
        outs = []
        for idx, stage in enumerate(self.stages):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)

        return outs

    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
        """
        Prepare the model to be converted to ONNX or other frameworks.
        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
        with convertible substitutes and remove all auxiliary or training related parts.
        :param input_size: [H,W]
        """
        for module in self.modules():
            if isinstance(module, RepVGGBlock):
                module.fuse_block_residual_branches()

    @property
    def out_channels(self) -> Tuple[int]:
        return tuple(self._out_channels)

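A construction and forward-pass sketch with illustrative hyper-parameters (these values are not taken from an official recipe, and passing the activation class directly is assumed to be accepted by the ActivationsTypeFactory):

import torch
from torch import nn

backbone = CSPResNetBackbone(
    layers=(3, 6, 6, 3),
    channels=(64, 128, 256, 512, 1024),
    activation=nn.SiLU,        # assumption: the factory passes an nn.Module class through unchanged
    return_idx=(1, 2, 3),
    use_large_stem=True,
    width_mult=1.0,
    depth_mult=1.0,
    use_alpha=False,
)

features = backbone(torch.randn(1, 3, 640, 640))
# One feature map per entry in return_idx, with strides 8/16/32 for this configuration:
print([tuple(f.shape) for f in features])   # [(1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)]
print(backbone.out_channels)                # (128, 256, 512, 1024)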
__init__(layers, channels, activation, return_idx, use_large_stem, width_mult, depth_mult, use_alpha, pretrained_weights=None, in_channels=3)

Parameters:

Name Type Description Default
layers Tuple[int, ...]

Number of blocks in each stage

required
channels Tuple[int, ...]

Number of channels [stem, stage 0, stage 1, stage 2, ...]

required
activation Type[nn.Module]

Used activation type for all child modules.

required
return_idx Tuple[int, int, int]

Indexes of returned feature maps

required
use_large_stem bool

If True, uses 3 conv+bn+act instead of 2 in stem blocks

required
width_mult float

Scaling factor for a number of channels

required
depth_mult float

Scaling factor for a number of blocks in each stage

required
use_alpha bool

If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock

required
pretrained_weights Optional[str]

Path to a local checkpoint file or URL of remote weights to load into the backbone

None
in_channels int

Number of input channels. Default: 3

3
Source code in V3_2/src/super_gradients/training/models/detection_models/csp_resnet.py
@resolve_param("activation", ActivationsTypeFactory())
def __init__(
    self,
    layers: Tuple[int, ...],
    channels: Tuple[int, ...],
    activation: Type[nn.Module],
    return_idx: Tuple[int, int, int],
    use_large_stem: bool,
    width_mult: float,
    depth_mult: float,
    use_alpha: bool,
    pretrained_weights: Optional[str] = None,
    in_channels: int = 3,
):
    """

    :param layers: Number of blocks in each stage
    :param channels: Number of channels [stem, stage 0, stage 1, stage 2, ...]
    :param activation: Used activation type for all child modules.
    :param return_idx: Indexes of returned feature maps
    :param use_large_stem: If True, uses 3 conv+bn+act instead of 2 in stem blocks
    :param width_mult: Scaling factor for a number of channels
    :param depth_mult: Scaling factor for a number of blocks in each stage
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
    :param pretrained_weights: Path to a local checkpoint file or URL of remote weights to load into the backbone (optional).
    :param in_channels: Number of input channels. Default: 3
    """
    super().__init__()
    channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
    layers = [max(round(num_layers * depth_mult), 1) for num_layers in layers]

    if use_large_stem:
        self.stem = nn.Sequential(
            collections.OrderedDict(
                [
                    (
                        "conv1",
                        ConvBNAct(in_channels, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                    ),
                    (
                        "conv2",
                        ConvBNAct(
                            channels[0] // 2,
                            channels[0] // 2,
                            3,
                            stride=1,
                            padding=1,
                            activation_type=activation,
                            bias=False,
                        ),
                    ),
                    (
                        "conv3",
                        ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                    ),
                ]
            )
        )
    else:
        self.stem = nn.Sequential(
            collections.OrderedDict(
                [
                    (
                        "conv1",
                        ConvBNAct(3, channels[0] // 2, 3, stride=2, padding=1, activation_type=activation, bias=False),
                    ),
                    (
                        "conv2",
                        ConvBNAct(channels[0] // 2, channels[0], 3, stride=1, padding=1, activation_type=activation, bias=False),
                    ),
                ]
            )
        )

    n = len(channels) - 1
    self.stages = nn.ModuleList(
        [
            CSPResStage(
                channels[i],
                channels[i + 1],
                layers[i],
                stride=2,
                activation_type=activation,
                use_alpha=use_alpha,
            )
            for i in range(n)
        ]
    )

    self._out_channels = channels[1:]
    self._out_strides = [4 * 2**i for i in range(n)]
    self.return_idx = tuple(return_idx)

    if pretrained_weights:
        if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
            state_dict = torch.load(str(pretrained_weights), map_location="cpu")
        elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
            with wait_for_the_master(get_local_rank()):
                state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
        else:
            raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
        self.load_state_dict(state_dict)

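The width_mult and depth_mult parameters rescale the nominal channel and block counts before anything is built; the formula below is the one used at the top of __init__, applied here with illustrative multipliers:

channels = (64, 128, 256, 512, 1024)
layers = (3, 6, 6, 3)
width_mult, depth_mult = 0.50, 0.33

scaled_channels = [max(round(c * width_mult), 1) for c in channels]
scaled_layers = [max(round(n * depth_mult), 1) for n in layers]
print(scaled_channels)   # [32, 64, 128, 256, 512]
print(scaled_layers)     # [1, 2, 2, 1]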
prep_model_for_conversion(input_size=None, **kwargs)

Prepare the model to be converted to ONNX or other frameworks. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules with convertible substitutes and remove all auxiliary or training related parts.

Parameters:

Name Type Description Default
input_size Union[tuple, list]

[H,W]

None
Source code in V3_2/src/super_gradients/training/models/detection_models/csp_resnet.py
def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
    """
    Prepare the model to be converted to ONNX or other frameworks.
    Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
    with convertible substitutes and remove all auxiliary or training related parts.
    :param input_size: [H,W]
    """
    for module in self.modules():
        if isinstance(module, RepVGGBlock):
            module.fuse_block_residual_branches()

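A typical export flow, sketched under the assumption that backbone is an already-constructed CSPResNetBackbone (for instance the one built above) and that the file name and input size are placeholders: fusing the RepVGG branches first leaves a plain convolutional graph for the ONNX exporter.

import torch

backbone.eval()
backbone.prep_model_for_conversion(input_size=(640, 640))
torch.onnx.export(backbone, torch.randn(1, 3, 640, 640), "csp_resnet_backbone.onnx")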
CSPResNetBasicBlock

Bases: nn.Module

Source code in V3_2/src/super_gradients/training/models/detection_models/csp_resnet.py
class CSPResNetBasicBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, activation_type: Type[nn.Module], use_residual_connection: bool = True, use_alpha=False):
        """

        :param in_channels:
        :param out_channels:
        :param activation_type:
        :param use_residual_connection: Whether to add input x to the output
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
        """
        super().__init__()
        if use_residual_connection and in_channels != out_channels:
            raise RuntimeError(
                f"Number of input channels (got {in_channels}) must be equal to the "
                f"number of output channels (got {out_channels}) when use_residual_connection=True"
            )
        self.conv1 = ConvBNAct(in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation_type=activation_type, bias=False)
        self.conv2 = RepVGGBlock(
            out_channels, out_channels, activation_type=activation_type, se_type=nn.Identity, use_residual_connection=False, use_alpha=use_alpha
        )
        self.use_residual_connection = use_residual_connection

    def forward(self, x):
        y = self.conv1(x)
        y = self.conv2(y)
        if self.use_residual_connection:
            return x + y
        else:
            return y

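A quick sketch of the two supported configurations (shapes and channel counts are illustrative): with the residual connection enabled, the input and output widths must match, and the block preserves spatial resolution since both convolutions use stride 1.

import torch
from torch import nn

block = CSPResNetBasicBlock(in_channels=64, out_channels=64, activation_type=nn.SiLU)
y = block(torch.randn(2, 64, 32, 32))
print(y.shape)   # torch.Size([2, 64, 32, 32])

# Changing the channel count requires disabling the residual connection:
block = CSPResNetBasicBlock(64, 128, activation_type=nn.SiLU, use_residual_connection=False)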
__init__(in_channels, out_channels, activation_type, use_residual_connection=True, use_alpha=False)

Parameters:

Name Type Description Default
in_channels int required
out_channels int required
activation_type Type[nn.Module] required
use_residual_connection bool

Whether to add input x to the output

True
use_alpha

If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock

False
Source code in V3_2/src/super_gradients/training/models/detection_models/csp_resnet.py
def __init__(self, in_channels: int, out_channels: int, activation_type: Type[nn.Module], use_residual_connection: bool = True, use_alpha=False):
    """

    :param in_channels:
    :param out_channels:
    :param activation_type:
    :param use_residual_connection: Whether to add input x to the output
    :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
    """
    super().__init__()
    if use_residual_connection and in_channels != out_channels:
        raise RuntimeError(
            f"Number of input channels (got {in_channels}) must be equal to the "
            f"number of output channels (got {out_channels}) when use_residual_connection=True"
        )
    self.conv1 = ConvBNAct(in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation_type=activation_type, bias=False)
    self.conv2 = RepVGGBlock(
        out_channels, out_channels, activation_type=activation_type, se_type=nn.Identity, use_residual_connection=False, use_alpha=use_alpha
    )
    self.use_residual_connection = use_residual_connection

CSPResStage

Bases: nn.Module

Source code in V3_2/src/super_gradients/training/models/detection_models/csp_resnet.py
class CSPResStage(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks,
        stride: int,
        activation_type: Type[nn.Module],
        use_attention: bool = True,
        use_alpha: bool = False,
    ):
        """

        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param num_blocks: Number of blocks in stage
        :param stride: Desired down-sampling for the stage (Usually 2)
        :param activation_type: Non-linearity type used in child modules.
        :param use_attention: If True, adds EffectiveSEBlock at the end of each stage
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in underlying RepVGG blocks (PP-Yolo-E Plus)
        """
        super().__init__()

        mid_channels = (in_channels + out_channels) // 2
        half_mid_channels = mid_channels // 2
        mid_channels = 2 * half_mid_channels

        if stride != 1:
            self.conv_down = ConvBNAct(in_channels, mid_channels, 3, stride=stride, padding=1, activation_type=activation_type, bias=False)
        else:
            self.conv_down = None
        self.conv1 = ConvBNAct(mid_channels, half_mid_channels, kernel_size=1, stride=1, padding=0, activation_type=activation_type, bias=