
Losses

BCEDiceLoss

Bases: torch.nn.Module

Binary Cross Entropy + Dice Loss

Weighted average of BCE and Dice loss

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `loss_weights` | `List[float]` | List of size 2 such that `loss_weights[0]` and `loss_weights[1]` are the weights for BCE and Dice, respectively. | `[0.5, 0.5]` |
| `logits` | `bool` | Whether the predictions are raw logits (sigmoid is then applied inside the Dice term). | `True` |
Source code in V3_6/src/super_gradients/training/losses/bce_dice_loss.py
@register_loss(name=Losses.BCE_DICE_LOSS, deprecated_name="bce_dice_loss")
class BCEDiceLoss(torch.nn.Module):
    """
    Binary Cross Entropy + Dice Loss

    Weighted average of BCE and Dice loss

    :param loss_weights: List of size 2 s.t loss_weights[0], loss_weights[1] are the weights for BCE, Dice respectively.
    :param logits:       Whether to use logits or not.
    """

    def __init__(self, loss_weights: List[float] = [0.5, 0.5], logits: bool = True):
        super(BCEDiceLoss, self).__init__()
        self.loss_weights = loss_weights
        self.bce = BCE()
        self.dice = BinaryDiceLoss(apply_sigmoid=logits)

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """

        :param input: Network's raw output shaped (N,1,H,W)
        :param target: Ground truth shaped (N,H,W)
        """

        return self.loss_weights[0] * self.bce(input, target) + self.loss_weights[1] * self.dice(input, target)

forward(input, target)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `input` | `torch.Tensor` | Network's raw output, shaped (N, 1, H, W). | required |
| `target` | `torch.Tensor` | Ground truth, shaped (N, H, W). | required |
Source code in V3_6/src/super_gradients/training/losses/bce_dice_loss.py
def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """

    :param input: Network's raw output shaped (N,1,H,W)
    :param target: Ground truth shaped (N,H,W)
    """

    return self.loss_weights[0] * self.bce(input, target) + self.loss_weights[1] * self.dice(input, target)
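
A minimal usage sketch follows; the import path is assumed from the source location shown above, and the tensor shapes follow the forward() docstring.

```python
# Minimal usage sketch (import path assumed from the source location above).
import torch
from super_gradients.training.losses.bce_dice_loss import BCEDiceLoss

criterion = BCEDiceLoss(loss_weights=[0.7, 0.3], logits=True)

logits = torch.randn(4, 1, 64, 64)                  # raw network output, (N, 1, H, W)
target = torch.randint(0, 2, (4, 64, 64)).float()   # binary ground-truth mask, (N, H, W)

loss = criterion(logits, target)
```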

BCE

Bases: BCEWithLogitsLoss

Binary Cross Entropy Loss

Source code in V3_6/src/super_gradients/training/losses/bce_loss.py
class BCE(BCEWithLogitsLoss):
    """
    Binary Cross Entropy Loss
    """

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """

        :param input: Network's raw output shaped (N,1,*)
        :param target: Ground truth shaped (N,*)
        """
        return super(BCE, self).forward(input.squeeze(1), target.float())

forward(input, target)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `input` | `torch.Tensor` | Network's raw output, shaped (N, 1, *). | required |
| `target` | `torch.Tensor` | Ground truth, shaped (N, *). | required |
Source code in V3_6/src/super_gradients/training/losses/bce_loss.py
def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """

    :param input: Network's raw output shaped (N,1,*)
    :param target: Ground truth shaped (N,*)
    """
    return super(BCE, self).forward(input.squeeze(1), target.float())
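
Because forward() squeezes the channel dimension and casts the target to float, the loss can be fed a single-channel logit map and an integer binary mask directly. A small sketch, import path assumed:

```python
# Sketch: single-channel logits + integer binary mask (import path assumed).
import torch
from super_gradients.training.losses.bce_loss import BCE

bce = BCE()
logits = torch.randn(4, 1, 64, 64)            # (N, 1, H, W) -> squeezed to (N, H, W) internally
target = torch.randint(0, 2, (4, 64, 64))     # cast to float internally
loss = bce(logits, target)
```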

ChannelWiseKnowledgeDistillationLoss

Bases: nn.Module

Implementation of Channel-wise Knowledge distillation loss.

paper: "Channel-wise Knowledge Distillation for Dense Prediction", https://arxiv.org/abs/2011.13256 Official implementation: https://github.com/irfanICMLL/TorchDistiller/tree/main/SemSeg-distill

Source code in V3_6/src/super_gradients/training/losses/cwd_loss.py
class ChannelWiseKnowledgeDistillationLoss(nn.Module):
    """
    Implementation of Channel-wise Knowledge distillation loss.

    paper: "Channel-wise Knowledge Distillation for Dense Prediction", https://arxiv.org/abs/2011.13256
    Official implementation: https://github.com/irfanICMLL/TorchDistiller/tree/main/SemSeg-distill
    """

    def __init__(self, normalization_mode: str = "channel_wise", temperature: float = 4.0, ignore_index: Optional[int] = None):
        """
        :param normalization_mode: default is for `channel-wise` normalization as implemented in the original paper,
         softmax is applied upon the spatial dimensions. For vanilla normalization, to apply softmax upon the channel
         dimension, set this value as `spatial_wise`.
        :param temperature: temperature relaxation value applied upon the logits before the normalization. default value
         is set to `4.0` as the original implementation.
        """
        super().__init__()
        self.T = temperature
        self.ignore_index = ignore_index

        self.kl_div = nn.KLDivLoss(reduction="sum" if ignore_index is None else "none")

        if normalization_mode not in ["channel_wise", "spatial_wise"]:
            raise ValueError(f"Unsupported normalization mode: {normalization_mode}")

        self.normalization_mode = normalization_mode

    def forward(self, student_preds: torch.Tensor, teacher_preds: torch.Tensor, target: Optional[torch.Tensor] = None):
        B, C, H, W = student_preds.size()

        # set the normalization axis and the averaging scalar.
        norm_axis = -1 if self.normalization_mode == "channel_wise" else 1
        averaging_scalar = (B * C) if self.normalization_mode == "channel_wise" else (B * H * W)

        # Softmax normalization
        softmax_teacher = torch.softmax(teacher_preds.view(B, C, -1) / self.T, dim=norm_axis)
        log_softmax_student = torch.log_softmax(student_preds.view(B, C, -1) / self.T, dim=norm_axis)

        loss = self.kl_div(log_softmax_student, softmax_teacher)

        if self.ignore_index is not None:
            valid_mask = target.view(B, -1).ne(self.ignore_index).unsqueeze(1).expand_as(loss)
            loss = (loss * valid_mask).sum()

        loss = loss * (self.T**2) / averaging_scalar
        return loss

__init__(normalization_mode='channel_wise', temperature=4.0, ignore_index=None)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `normalization_mode` | `str` | The default, `channel_wise`, applies softmax over the spatial dimensions as implemented in the original paper. Set to `spatial_wise` for vanilla normalization, i.e. softmax over the channel dimension. | `'channel_wise'` |
| `temperature` | `float` | Temperature relaxation applied to the logits before normalization. The default of `4.0` follows the original implementation. | `4.0` |
| `ignore_index` | `Optional[int]` | Target label to exclude from the loss (masked out before summation); if `None`, the KL divergence is summed over all positions. | `None` |
Source code in V3_6/src/super_gradients/training/losses/cwd_loss.py
def __init__(self, normalization_mode: str = "channel_wise", temperature: float = 4.0, ignore_index: Optional[int] = None):
    """
    :param normalization_mode: default is for `channel-wise` normalization as implemented in the original paper,
     softmax is applied upon the spatial dimensions. For vanilla normalization, to apply softmax upon the channel
     dimension, set this value as `spatial_wise`.
    :param temperature: temperature relaxation value applied upon the logits before the normalization. default value
     is set to `4.0` as the original implementation.
    """
    super().__init__()
    self.T = temperature
    self.ignore_index = ignore_index

    self.kl_div = nn.KLDivLoss(reduction="sum" if ignore_index is None else "none")

    if normalization_mode not in ["channel_wise", "spatial_wise"]:
        raise ValueError(f"Unsupported normalization mode: {normalization_mode}")

    self.normalization_mode = normalization_mode
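
A hedged usage sketch for distilling a segmentation student from a teacher; the shapes follow the forward() signature above and the import path is assumed from the source location.

```python
# Sketch of channel-wise KD between a student and a (frozen) teacher head output.
import torch
from super_gradients.training.losses.cwd_loss import ChannelWiseKnowledgeDistillationLoss

cwd = ChannelWiseKnowledgeDistillationLoss(normalization_mode="channel_wise", temperature=4.0, ignore_index=255)

student_logits = torch.randn(2, 19, 64, 64)        # (B, C, H, W)
teacher_logits = torch.randn(2, 19, 64, 64)        # same shape as the student output
target = torch.randint(0, 19, (2, 64, 64))         # only needed because ignore_index is set

loss = cwd(student_logits, teacher_logits, target)
```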

DDRNetLoss

Bases: OhemCELoss

Source code in V3_6/src/super_gradients/training/losses/ddrnet_loss.py
class DDRNetLoss(OhemCELoss):
    def __init__(
        self,
        threshold: float = 0.7,
        ohem_percentage: float = 0.1,
        weights: List[float] = [1.0, 0.4],
        ignore_label: int = 255,
        num_pixels_exclude_ignored: bool = False,
    ):
        """
        This loss is an extension of the Ohem (Online Hard Example Mining Cross Entropy) Loss.

        as define in paper:
        Accurate Semantic Segmentation of Road Scenes ( https://arxiv.org/pdf/2101.06085.pdf )

        :param threshold: threshold to th hard example mining algorithm
        :param ohem_percentage: minimum percentage of total pixels for the hard example mining algorithm
        (taking only the largest) losses
        :param weights: weights per each input of the loss. This loss supports a multi output (like in DDRNet with
        an auxiliary head). the losses of each head can be weighted.
        :param ignore_label: targets label to be ignored
        :param num_pixels_exclude_ignored: whether to exclude ignore pixels when calculating the mining percentage.
        see OhemCELoss doc for more details.
        """
        super().__init__(threshold=threshold, mining_percent=ohem_percentage, ignore_lb=ignore_label, num_pixels_exclude_ignored=num_pixels_exclude_ignored)
        self.weights = weights

    def forward(self, predictions_list: Union[list, tuple, torch.Tensor], targets: torch.Tensor):
        if isinstance(predictions_list, torch.Tensor):
            predictions_list = (predictions_list,)

        assert len(predictions_list) == len(self.weights), "num of prediction must be the same as num of loss weights"

        losses = []
        unweighted_losses = []
        for predictions, weight in zip(predictions_list, self.weights):
            unweighted_loss = super().forward(predictions, targets)
            unweighted_losses.append(unweighted_loss)
            losses.append(unweighted_loss * weight)
        total_loss = sum(losses)
        unweighted_losses.append(total_loss)

        return total_loss, torch.stack(unweighted_losses, dim=0).detach()

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["main_loss", "aux_loss1", "loss"]

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.

__init__(threshold=0.7, ohem_percentage=0.1, weights=[1.0, 0.4], ignore_label=255, num_pixels_exclude_ignored=False)

This loss is an extension of the OHEM (Online Hard Example Mining) Cross Entropy loss, as defined in the paper "Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes" (https://arxiv.org/pdf/2101.06085.pdf).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `threshold` | `float` | Threshold for the hard example mining algorithm. | `0.7` |
| `ohem_percentage` | `float` | Minimum percentage of total pixels kept by the hard example mining algorithm (only the largest losses are taken). | `0.1` |
| `weights` | `List[float]` | Weight per loss input. This loss supports multiple outputs (e.g. DDRNet with an auxiliary head), and the loss of each head can be weighted. | `[1.0, 0.4]` |
| `ignore_label` | `int` | Target label to be ignored. | `255` |
| `num_pixels_exclude_ignored` | `bool` | Whether to exclude ignored pixels when calculating the mining percentage. See the OhemCELoss docs for more details. | `False` |
Source code in V3_6/src/super_gradients/training/losses/ddrnet_loss.py
def __init__(
    self,
    threshold: float = 0.7,
    ohem_percentage: float = 0.1,
    weights: List[float] = [1.0, 0.4],
    ignore_label: int = 255,
    num_pixels_exclude_ignored: bool = False,
):
    """
    This loss is an extension of the Ohem (Online Hard Example Mining Cross Entropy) Loss.

    as define in paper:
    Accurate Semantic Segmentation of Road Scenes ( https://arxiv.org/pdf/2101.06085.pdf )

    :param threshold: threshold to th hard example mining algorithm
    :param ohem_percentage: minimum percentage of total pixels for the hard example mining algorithm
    (taking only the largest) losses
    :param weights: weights per each input of the loss. This loss supports a multi output (like in DDRNet with
    an auxiliary head). the losses of each head can be weighted.
    :param ignore_label: targets label to be ignored
    :param num_pixels_exclude_ignored: whether to exclude ignore pixels when calculating the mining percentage.
    see OhemCELoss doc for more details.
    """
    super().__init__(threshold=threshold, mining_percent=ohem_percentage, ignore_lb=ignore_label, num_pixels_exclude_ignored=num_pixels_exclude_ignored)
    self.weights = weights
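
A hedged usage sketch: during training, DDRNet-style models return one prediction per head (main + auxiliary), and each prediction is weighted by the corresponding entry of `weights`. The import path is assumed from the source location above.

```python
# Sketch: two-headed output (main + auxiliary) weighted by `weights`.
import torch
from super_gradients.training.losses.ddrnet_loss import DDRNetLoss

criterion = DDRNetLoss(weights=[1.0, 0.4], ignore_label=255)

main_out = torch.randn(2, 19, 128, 128)             # main head logits
aux_out = torch.randn(2, 19, 128, 128)              # auxiliary head logits
targets = torch.randint(0, 19, (2, 128, 128))       # class indices per pixel

total_loss, loss_items = criterion((main_out, aux_out), targets)
# loss_items is ordered as component_names: [main_loss, aux_loss1, loss]
```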

DEKRLoss

Bases: nn.Module

Implementation of the loss function from the "Bottom-Up Human Pose Estimation Via Disentangled Keypoint Regression" paper (https://arxiv.org/abs/2104.02300)

This loss should be used in conjunction with DEKRTargetsGenerator.

Source code in V3_6/src/super_gradients/training/losses/dekr_loss.py
@register_loss(name=Losses.DEKR_LOSS, deprecated_name="dekr_loss")
class DEKRLoss(nn.Module):
    """
    Implementation of the loss function from the "Bottom-Up Human Pose Estimation Via Disentangled Keypoint Regression"
    paper (https://arxiv.org/abs/2104.02300)

    This loss should be used in conjunction with DEKRTargetsGenerator.
    """

    def __init__(self, heatmap_loss_factor: float = 1.0, offset_loss_factor: float = 0.1, heatmap_loss: str = "mse"):
        """
        Instantiate the DEKR loss function. It is two-component loss function, consisting of a heatmap (MSE) loss and an offset (Smooth L1) losses.
        The total loss is the sum of the two individual losses, weighted by the corresponding factors.

        :param heatmap_loss_factor: Weighting factor for heatmap loss
        :param offset_loss_factor: Weighting factor for offset loss
        :param heatmap_loss: Type of heatmap loss to use. Can be "mse" (Used in DEKR paper) or "qfl" (Quality Focal Loss).
                             We use QFL in our recipe as it produces better results.
        """
        super().__init__()
        self.heatmap_loss_factor = float(heatmap_loss_factor)
        self.offset_loss_factor = float(offset_loss_factor)
        self.heatmap_loss = {"mse": self.heatmap_mse_loss, "qfl": self.heatmap_qfl_loss}[heatmap_loss]

    @property
    def component_names(self):
        """
        Names of individual loss components for logging during training.
        """
        return ["heatmap", "offset", "total"]

    def forward(self, predictions: Tuple[Tensor, Tensor], targets: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor]:
        """

        :param predictions: Tuple of (heatmap, offset) predictions.
            heatmap is of shape (B, NumJoints + 1, H, W)
            offset is of shape (B, NumJoints * 2, H, W)

        :param targets: Tuple of (heatmap, mask, offset, offset_weight).
            heatmap is of shape (B, NumJoints + 1, H, W)
            mask is of shape (B, NumJoints + 1, H, W)
            offset is of shape (B, NumJoints * 2, H, W)
            offset_weight is of shape (B, NumJoints * 2, H, W)

        :return: Tuple of (loss, loss_components)
            loss is a scalar tensor with the total loss
            loss_components is a tensor of shape (3,) containing the individual loss components for logging (detached from the graph)
        """
        pred_heatmap, pred_offset = predictions
        gt_heatmap, mask, gt_offset, offset_weight = targets

        heatmap_loss = self.heatmap_loss(pred_heatmap, gt_heatmap, mask) * self.heatmap_loss_factor
        offset_loss = self.offset_loss(pred_offset, gt_offset, offset_weight) * self.offset_loss_factor

        loss = heatmap_loss + offset_loss
        components = torch.cat(
            (
                heatmap_loss.unsqueeze(0),
                offset_loss.unsqueeze(0),
                loss.unsqueeze(0),
            )
        ).detach()

        return loss, components

    def heatmap_mse_loss(self, pred_heatmap, true_heatmap, mask):
        loss = torch.nn.functional.mse_loss(pred_heatmap, true_heatmap, reduction="none") * mask
        loss = loss.mean()
        return loss

    def heatmap_qfl_loss(self, pred_heatmap, true_heatmap, mask):
        scale_factor = (true_heatmap - pred_heatmap.sigmoid()).abs().pow(2)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(pred_heatmap, true_heatmap, reduction="none") * scale_factor
        loss = loss.mean()
        return loss

    def offset_loss(self, pred_offsets, true_offsets, weights):
        num_pos = torch.nonzero(weights > 0).size()[0]
        loss = torch.nn.functional.smooth_l1_loss(pred_offsets, true_offsets, reduction="none", beta=1.0 / 9) * weights
        if num_pos == 0:
            num_pos = 1.0
        loss = loss.sum() / num_pos
        return loss

component_names property

Names of individual loss components for logging during training.

__init__(heatmap_loss_factor=1.0, offset_loss_factor=0.1, heatmap_loss='mse')

Instantiate the DEKR loss function. It is a two-component loss consisting of a heatmap (MSE) loss and an offset (Smooth L1) loss. The total loss is the sum of the two individual losses, weighted by the corresponding factors.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `heatmap_loss_factor` | `float` | Weighting factor for the heatmap loss. | `1.0` |
| `offset_loss_factor` | `float` | Weighting factor for the offset loss. | `0.1` |
| `heatmap_loss` | `str` | Type of heatmap loss to use. Can be "mse" (used in the DEKR paper) or "qfl" (Quality Focal Loss). We use QFL in our recipe as it produces better results. | `'mse'` |
Source code in V3_6/src/super_gradients/training/losses/dekr_loss.py
def __init__(self, heatmap_loss_factor: float = 1.0, offset_loss_factor: float = 0.1, heatmap_loss: str = "mse"):
    """
    Instantiate the DEKR loss function. It is two-component loss function, consisting of a heatmap (MSE) loss and an offset (Smooth L1) losses.
    The total loss is the sum of the two individual losses, weighted by the corresponding factors.

    :param heatmap_loss_factor: Weighting factor for heatmap loss
    :param offset_loss_factor: Weighting factor for offset loss
    :param heatmap_loss: Type of heatmap loss to use. Can be "mse" (Used in DEKR paper) or "qfl" (Quality Focal Loss).
                         We use QFL in our recipe as it produces better results.
    """
    super().__init__()
    self.heatmap_loss_factor = float(heatmap_loss_factor)
    self.offset_loss_factor = float(offset_loss_factor)
    self.heatmap_loss = {"mse": self.heatmap_mse_loss, "qfl": self.heatmap_qfl_loss}[heatmap_loss]

forward(predictions, targets)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `predictions` | `Tuple[Tensor, Tensor]` | Tuple of (heatmap, offset) predictions. heatmap is of shape (B, NumJoints + 1, H, W); offset is of shape (B, NumJoints * 2, H, W). | required |
| `targets` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple of (heatmap, mask, offset, offset_weight). heatmap and mask are of shape (B, NumJoints + 1, H, W); offset and offset_weight are of shape (B, NumJoints * 2, H, W). | required |

Returns:

| Type | Description |
|------|-------------|
| `Tuple[Tensor, Tensor]` | Tuple of (loss, loss_components). loss is a scalar tensor with the total loss; loss_components is a tensor of shape (3,) containing the individual loss components for logging (detached from the graph). |

Source code in V3_6/src/super_gradients/training/losses/dekr_loss.py
def forward(self, predictions: Tuple[Tensor, Tensor], targets: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor]:
    """

    :param predictions: Tuple of (heatmap, offset) predictions.
        heatmap is of shape (B, NumJoints + 1, H, W)
        offset is of shape (B, NumJoints * 2, H, W)

    :param targets: Tuple of (heatmap, mask, offset, offset_weight).
        heatmap is of shape (B, NumJoints + 1, H, W)
        mask is of shape (B, NumJoints + 1, H, W)
        offset is of shape (B, NumJoints * 2, H, W)
        offset_weight is of shape (B, NumJoints * 2, H, W)

    :return: Tuple of (loss, loss_components)
        loss is a scalar tensor with the total loss
        loss_components is a tensor of shape (3,) containing the individual loss components for logging (detached from the graph)
    """
    pred_heatmap, pred_offset = predictions
    gt_heatmap, mask, gt_offset, offset_weight = targets

    heatmap_loss = self.heatmap_loss(pred_heatmap, gt_heatmap, mask) * self.heatmap_loss_factor
    offset_loss = self.offset_loss(pred_offset, gt_offset, offset_weight) * self.offset_loss_factor

    loss = heatmap_loss + offset_loss
    components = torch.cat(
        (
            heatmap_loss.unsqueeze(0),
            offset_loss.unsqueeze(0),
            loss.unsqueeze(0),
        )
    ).detach()

    return loss, components
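
A hedged sketch of the expected input layout, with random tensors standing in for real network outputs and for targets produced by DEKRTargetsGenerator; shapes follow the forward() docstring.

```python
# Sketch: DEKR loss inputs with 17 joints (random stand-ins for real predictions/targets).
import torch
from super_gradients.training.losses.dekr_loss import DEKRLoss

B, num_joints, H, W = 2, 17, 32, 32
criterion = DEKRLoss(heatmap_loss="qfl")

pred_heatmap = torch.randn(B, num_joints + 1, H, W)
pred_offset = torch.randn(B, num_joints * 2, H, W)

gt_heatmap = torch.rand(B, num_joints + 1, H, W)
mask = torch.ones(B, num_joints + 1, H, W)
gt_offset = torch.randn(B, num_joints * 2, H, W)
offset_weight = torch.rand(B, num_joints * 2, H, W)

loss, components = criterion((pred_heatmap, pred_offset), (gt_heatmap, mask, gt_offset, offset_weight))
# components = [heatmap, offset, total], detached for logging
```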

DiceCEEdgeLoss

Bases: _Loss

Source code in V3_6/src/super_gradients/training/losses/dice_ce_edge_loss.py
@register_loss(name=Losses.DICE_CE_EDGE_LOSS, deprecated_name="dice_ce_edge_loss")
class DiceCEEdgeLoss(_Loss):
    def __init__(
        self,
        num_classes: int,
        num_aux_heads: int = 2,
        num_detail_heads: int = 1,
        weights: Union[tuple, list] = (1, 1, 1, 1),
        dice_ce_weights: Union[tuple, list] = (1, 1),
        ignore_index: int = -100,
        edge_kernel: int = 3,
        ce_edge_weights: Union[tuple, list] = (0.5, 0.5),
    ):
        """
        Total loss is computed as follows:

            Loss-cls-edge = λ1 * CE + λ2 * M * CE , where [λ1, λ2] are ce_edge_weights.

        For each Main feature maps and auxiliary heads the loss is calculated as:

            Loss-main-aux = λ3 * Loss-cls-edge + λ4 * Loss-Dice, where [λ3, λ4] are dice_ce_weights.

        For Feature maps defined as detail maps that predicts only the edge mask, the loss is computed as follow:

            Loss-detail = BinaryCE + BinaryDice

        Finally the total loss is computed as follows for the whole feature maps:

            Loss = Σw[i] * Loss-main-aux[i] + Σw[j] * Loss-detail[j], where `w` is defined as the `weights` argument
                `i` in [0, 1 + num_aux_heads], 1 is for the main feature map.
                `j` in [1 + num_aux_heads, 1 + num_aux_heads + num_detail_heads].


        :param num_aux_heads: num of auxiliary heads.
        :param num_detail_heads: num of detail heads.
        :param weights: Loss lambda weights.
        :param dice_ce_weights: weights lambdas between (Dice, CE) losses.
        :param edge_kernel: kernel size of dilation erosion convolutions for creating the edge feature map.
        :param ce_edge_weights: weights lambdas between regular CE and edge attention CE.
        """
        super().__init__()
        # Check that arguments are valid.
        assert len(weights) == num_aux_heads + num_detail_heads + 1, "Lambda loss weights must be in same size as loss items."
        assert len(dice_ce_weights) == 2, f"dice_ce_weights must an iterable with size 2, found: {len(dice_ce_weights)}"
        assert len(ce_edge_weights) == 2, f"dice_ce_weights must an iterable with size 2, found: {len(ce_edge_weights)}"

        self.edge_kernel = edge_kernel
        self.num_classes = num_classes
        self.ignore_index = ignore_index
        self.weights = weights
        self.dice_ce_weights = dice_ce_weights
        self.use_detail = num_detail_heads > 0

        self.num_aux_heads = num_aux_heads
        self.num_detail_heads = num_detail_heads

        if self.use_detail:
            self.bce = nn.BCEWithLogitsLoss()
            self.binary_dice = BinaryDiceLoss(apply_sigmoid=True)

        self.ce_edge = MaskAttentionLoss(criterion=nn.CrossEntropyLoss(reduction="none", ignore_index=ignore_index), loss_weights=ce_edge_weights)
        self.dice_loss = DiceLoss(apply_softmax=True, ignore_index=None if ignore_index < 0 else ignore_index)

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        names = ["main_loss"]
        # Append aux losses names
        names += [f"aux_loss{i}" for i in range(self.num_aux_heads)]
        # Append detail losses names
        names += [f"detail_loss{i}" for i in range(self.num_detail_heads)]
        names += ["loss"]
        return names

    def forward(self, preds: Tuple[torch.Tensor], target: torch.Tensor):
        """
        :param preds: Model output predictions, must be in the followed format:
         [Main-feats, Aux-feats[0], ..., Aux-feats[num_auxs-1], Detail-feats[0], ..., Detail-feats[num_details-1]
        """
        assert (
            len(preds) == self.num_aux_heads + self.num_detail_heads + 1
        ), f"Wrong num of predictions tensors, expected {self.num_aux_heads + self.num_detail_heads + 1} found {len(preds)}"

        edge_target = target_to_binary_edge(
            target, num_classes=self.num_classes, kernel_size=self.edge_kernel, ignore_index=self.ignore_index, flatten_channels=True
        )
        losses = []
        total_loss = 0
        # Main and auxiliaries feature maps losses
        for i in range(0, 1 + self.num_aux_heads):
            ce_loss = self.ce_edge(preds[i], target, edge_target)
            dice_loss = self.dice_loss(preds[i], target)

            loss = ce_loss * self.dice_ce_weights[0] + dice_loss * self.dice_ce_weights[1]
            total_loss += self.weights[i] * loss
            losses.append(loss)

        # Detail feature maps losses
        if self.use_detail:
            for i in range(1 + self.num_aux_heads, len(preds)):
                bce_loss = self.bce(preds[i], edge_target)
                dice_loss = self.binary_dice(preds[i], edge_target)

                loss = bce_loss * self.dice_ce_weights[0] + dice_loss * self.dice_ce_weights[1]
                total_loss += self.weights[i] * loss
                losses.append(loss)

        losses.append(total_loss)

        return total_loss, torch.stack(losses, dim=0).detach()

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.

__init__(num_classes, num_aux_heads=2, num_detail_heads=1, weights=(1, 1, 1, 1), dice_ce_weights=(1, 1), ignore_index=-100, edge_kernel=3, ce_edge_weights=(0.5, 0.5))

Total loss is computed as follows:

Loss-cls-edge = λ1 * CE + λ2 * M * CE, where [λ1, λ2] are ce_edge_weights.

For the main feature map and each auxiliary head, the loss is calculated as:

Loss-main-aux = λ3 * Loss-cls-edge + λ4 * Loss-Dice, where [λ3, λ4] are dice_ce_weights.

For feature maps defined as detail maps, which predict only the edge mask, the loss is computed as:

Loss-detail = BinaryCE + BinaryDice

Finally, the total loss over all feature maps is computed as:

Loss = Σ w[i] * Loss-main-aux[i] + Σ w[j] * Loss-detail[j], where `w` is the `weights` argument,
    `i` in [0, 1 + num_aux_heads], where the extra 1 accounts for the main feature map,
    `j` in [1 + num_aux_heads, 1 + num_aux_heads + num_detail_heads].

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `num_classes` | `int` | Number of semantic-segmentation classes. | required |
| `num_aux_heads` | `int` | Number of auxiliary heads. | `2` |
| `num_detail_heads` | `int` | Number of detail heads. | `1` |
| `weights` | `Union[tuple, list]` | Loss lambda weights. | `(1, 1, 1, 1)` |
| `dice_ce_weights` | `Union[tuple, list]` | Weight lambdas between the (Dice, CE) losses. | `(1, 1)` |
| `ignore_index` | `int` | Target label to be ignored. | `-100` |
| `edge_kernel` | `int` | Kernel size of the dilation/erosion convolutions used to create the edge feature map. | `3` |
| `ce_edge_weights` | `Union[tuple, list]` | Weight lambdas between regular CE and edge-attention CE. | `(0.5, 0.5)` |
Source code in V3_6/src/super_gradients/training/losses/dice_ce_edge_loss.py
def __init__(
    self,
    num_classes: int,
    num_aux_heads: int = 2,
    num_detail_heads: int = 1,
    weights: Union[tuple, list] = (1, 1, 1, 1),
    dice_ce_weights: Union[tuple, list] = (1, 1),
    ignore_index: int = -100,
    edge_kernel: int = 3,
    ce_edge_weights: Union[tuple, list] = (0.5, 0.5),
):
    """
    Total loss is computed as follows:

        Loss-cls-edge = λ1 * CE + λ2 * M * CE , where [λ1, λ2] are ce_edge_weights.

    For each Main feature maps and auxiliary heads the loss is calculated as:

        Loss-main-aux = λ3 * Loss-cls-edge + λ4 * Loss-Dice, where [λ3, λ4] are dice_ce_weights.

    For Feature maps defined as detail maps that predicts only the edge mask, the loss is computed as follow:

        Loss-detail = BinaryCE + BinaryDice

    Finally the total loss is computed as follows for the whole feature maps:

        Loss = Σw[i] * Loss-main-aux[i] + Σw[j] * Loss-detail[j], where `w` is defined as the `weights` argument
            `i` in [0, 1 + num_aux_heads], 1 is for the main feature map.
            `j` in [1 + num_aux_heads, 1 + num_aux_heads + num_detail_heads].


    :param num_aux_heads: num of auxiliary heads.
    :param num_detail_heads: num of detail heads.
    :param weights: Loss lambda weights.
    :param dice_ce_weights: weights lambdas between (Dice, CE) losses.
    :param edge_kernel: kernel size of dilation erosion convolutions for creating the edge feature map.
    :param ce_edge_weights: weights lambdas between regular CE and edge attention CE.
    """
    super().__init__()
    # Check that arguments are valid.
    assert len(weights) == num_aux_heads + num_detail_heads + 1, "Lambda loss weights must be in same size as loss items."
    assert len(dice_ce_weights) == 2, f"dice_ce_weights must an iterable with size 2, found: {len(dice_ce_weights)}"
    assert len(ce_edge_weights) == 2, f"dice_ce_weights must an iterable with size 2, found: {len(ce_edge_weights)}"

    self.edge_kernel = edge_kernel
    self.num_classes = num_classes
    self.ignore_index = ignore_index
    self.weights = weights
    self.dice_ce_weights = dice_ce_weights
    self.use_detail = num_detail_heads > 0

    self.num_aux_heads = num_aux_heads
    self.num_detail_heads = num_detail_heads

    if self.use_detail:
        self.bce = nn.BCEWithLogitsLoss()
        self.binary_dice = BinaryDiceLoss(apply_sigmoid=True)

    self.ce_edge = MaskAttentionLoss(criterion=nn.CrossEntropyLoss(reduction="none", ignore_index=ignore_index), loss_weights=ce_edge_weights)
    self.dice_loss = DiceLoss(apply_softmax=True, ignore_index=None if ignore_index < 0 else ignore_index)

forward(preds, target)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `preds` | `Tuple[torch.Tensor]` | Model output predictions, in the following format: [Main-feats, Aux-feats[0], ..., Aux-feats[num_auxs-1], Detail-feats[0], ..., Detail-feats[num_details-1]]. | required |
Source code in V3_6/src/super_gradients/training/losses/dice_ce_edge_loss.py
def forward(self, preds: Tuple[torch.Tensor], target: torch.Tensor):
    """
    :param preds: Model output predictions, must be in the followed format:
     [Main-feats, Aux-feats[0], ..., Aux-feats[num_auxs-1], Detail-feats[0], ..., Detail-feats[num_details-1]
    """
    assert (
        len(preds) == self.num_aux_heads + self.num_detail_heads + 1
    ), f"Wrong num of predictions tensors, expected {self.num_aux_heads + self.num_detail_heads + 1} found {len(preds)}"

    edge_target = target_to_binary_edge(
        target, num_classes=self.num_classes, kernel_size=self.edge_kernel, ignore_index=self.ignore_index, flatten_channels=True
    )
    losses = []
    total_loss = 0
    # Main and auxiliaries feature maps losses
    for i in range(0, 1 + self.num_aux_heads):
        ce_loss = self.ce_edge(preds[i], target, edge_target)
        dice_loss = self.dice_loss(preds[i], target)

        loss = ce_loss * self.dice_ce_weights[0] + dice_loss * self.dice_ce_weights[1]
        total_loss += self.weights[i] * loss
        losses.append(loss)

    # Detail feature maps losses
    if self.use_detail:
        for i in range(1 + self.num_aux_heads, len(preds)):
            bce_loss = self.bce(preds[i], edge_target)
            dice_loss = self.binary_dice(preds[i], edge_target)

            loss = bce_loss * self.dice_ce_weights[0] + dice_loss * self.dice_ce_weights[1]
            total_loss += self.weights[i] * loss
            losses.append(loss)

    losses.append(total_loss)

    return total_loss, torch.stack(losses, dim=0).detach()
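
A hedged sketch of the prediction layout expected with the default head configuration (1 main + 2 auxiliary + 1 detail head); the detail head is assumed to predict a single-channel edge mask, matching how forward() feeds it to the binary BCE/Dice terms.

```python
# Sketch: default DiceCEEdgeLoss layout (main, aux0, aux1, detail). Shapes are assumptions
# derived from how forward() consumes each tensor.
import torch
from super_gradients.training.losses.dice_ce_edge_loss import DiceCEEdgeLoss

B, num_classes, H, W = 2, 19, 64, 64
criterion = DiceCEEdgeLoss(num_classes=num_classes)

main = torch.randn(B, num_classes, H, W)
aux0 = torch.randn(B, num_classes, H, W)
aux1 = torch.randn(B, num_classes, H, W)
detail = torch.randn(B, 1, H, W)                     # single-channel edge-mask head
target = torch.randint(0, num_classes, (B, H, W))

total_loss, loss_items = criterion((main, aux0, aux1, detail), target)
# loss_items order follows component_names: main_loss, aux_loss0, aux_loss1, detail_loss0, loss
```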

BinaryDiceLoss

Bases: DiceLoss

Computes the Dice loss for binary tasks (1 class only). Expects the target to be a binary map with 0 and 1 values.

Source code in V3_6/src/super_gradients/training/losses/dice_loss.py
class BinaryDiceLoss(DiceLoss):
    """
    Compute Dice Loss for binary class tasks (1 class only).
    Except target to be a binary map with 0 and 1 values.
    """

    def __init__(self, apply_sigmoid: bool = True, smooth: float = 1.0, eps: float = 1e-5):
        """
        :param apply_sigmoid: Whether to apply sigmoid to the predictions.
        :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the dice
            coefficient is to 1, which can be used as a regularization effect.
            As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
        :param eps: epsilon value to avoid inf.
        """
        super().__init__(apply_softmax=False, ignore_index=None, smooth=smooth, eps=eps, reduce_over_batches=False)
        self.apply_sigmoid = apply_sigmoid

    def forward(self, predict: torch.tensor, target: torch.tensor) -> torch.tensor:
        if self.apply_sigmoid:
            predict = torch.sigmoid(predict)
        return super().forward(predict=predict, target=target)

__init__(apply_sigmoid=True, smooth=1.0, eps=1e-05)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `apply_sigmoid` | `bool` | Whether to apply sigmoid to the predictions. | `True` |
| `smooth` | `float` | Laplace smoothing, also known as additive smoothing. The larger the smooth value, the closer the Dice coefficient is to 1, which can act as a regularizer. See https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895. | `1.0` |
| `eps` | `float` | Epsilon value to avoid inf. | `1e-05` |
Source code in V3_6/src/super_gradients/training/losses/dice_loss.py
def __init__(self, apply_sigmoid: bool = True, smooth: float = 1.0, eps: float = 1e-5):
    """
    :param apply_sigmoid: Whether to apply sigmoid to the predictions.
    :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the dice
        coefficient is to 1, which can be used as a regularization effect.
        As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
    :param eps: epsilon value to avoid inf.
    """
    super().__init__(apply_softmax=False, ignore_index=None, smooth=smooth, eps=eps, reduce_over_batches=False)
    self.apply_sigmoid = apply_sigmoid
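
A short usage sketch, assuming the same (N, 1, H, W) logits / (N, H, W) binary mask convention that BCEDiceLoss uses when it calls this loss:

```python
# Sketch: binary Dice on raw logits (sigmoid is applied internally since apply_sigmoid=True).
import torch
from super_gradients.training.losses.dice_loss import BinaryDiceLoss

dice = BinaryDiceLoss(apply_sigmoid=True)
logits = torch.randn(4, 1, 64, 64)
target = torch.randint(0, 2, (4, 64, 64)).float()
loss = dice(logits, target)
```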

DiceLoss

Bases: AbstarctSegmentationStructureLoss

Computes the average Dice loss between two tensors; it supports both multi-class and binary tasks. Defined in the paper "V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation".

Source code in V3_6/src/super_gradients/training/losses/dice_loss.py
class DiceLoss(AbstarctSegmentationStructureLoss):
    """
    Compute average Dice loss between two tensors, It can support both multi-classes and binary tasks.
    Defined in the paper: "V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation"
    """

    def _calc_numerator_denominator(self, labels_one_hot: torch.tensor, predict: torch.tensor) -> Tuple[torch.tensor, torch.tensor]:
        """
        Calculate dice metric's numerator and denominator.

        :param labels_one_hot: target in one hot format.   shape: [BS, num_classes, img_width, img_height]
        :param predict: predictions tensor.                shape: [BS, num_classes, img_width, img_height]
        :return:
            numerator = intersection between predictions and target. shape: [BS, num_classes, img_width, img_height]
            denominator = sum of predictions and target areas.       shape: [BS, num_classes, img_width, img_height]
        """
        numerator = labels_one_hot * predict
        denominator = labels_one_hot + predict
        return numerator, denominator

    def _calc_loss(self, numerator: torch.tensor, denominator: torch.tensor) -> torch.tensor:
        """
        Calculate dice loss.
        All tensors are of shape [BS] if self.reduce_over_batches else [num_classes].

        :param numerator: intersection between predictions and target.
        :param denominator: total number of pixels of prediction and target.
        """
        loss = 1.0 - ((2.0 * numerator + self.smooth) / (denominator + self.eps + self.smooth))
        return loss
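
For intuition, the following standalone sketch reproduces the per-class soft Dice that the two helpers above implement; the summation axes and the final reduction are handled by the parent class and are an assumption here (reduce_over_batches=False keeps one value per class).

```python
# Standalone sketch of the per-class soft Dice implemented by the helpers above.
import torch

def soft_dice_loss(predict: torch.Tensor, labels_one_hot: torch.Tensor, smooth: float = 1.0, eps: float = 1e-5) -> torch.Tensor:
    # predict and labels_one_hot: [BS, num_classes, H, W]; predict is already softmax-ed.
    dims = (0, 2, 3)                                   # reduce over batch + spatial axes, keep classes
    numerator = (labels_one_hot * predict).sum(dim=dims)
    denominator = (labels_one_hot + predict).sum(dim=dims)
    return 1.0 - (2.0 * numerator + smooth) / (denominator + eps + smooth)   # one value per class
```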

GeneralizedDiceLoss

Bases: DiceLoss

Computes the Generalised Dice loss, where the contribution of each label is normalized by the inverse of its volume in order to deal with class imbalance. Defined in the paper "Generalised Dice overlap as a deep learning loss function for highly unbalanced segmentations".

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `smooth` | `float` | Default is 0; Laplace smoothing is not recommended with GeneralizedDiceLoss because the weighted values to be added are very small. | `0.0` |
| `eps` | `float` | Default is 1e-17; must be a very small value, because the weighted `intersection` and `denominator` become very small after multiplication with `1 / counts ** 2`. | `1e-17` |
Source code in V3_6/src/super_gradients/training/losses/dice_loss.py
class GeneralizedDiceLoss(DiceLoss):
    """
    Compute the Generalised Dice loss, contribution of each label is normalized by the inverse of its volume, in order
     to deal with class imbalance.
    Defined in the paper: "Generalised Dice overlap as a deep learning loss function for highly unbalanced
     segmentations"

    :param smooth:  default value is 0, smooth laplacian is not recommended to be used with GeneralizedDiceLoss.
         because the weighted values to be added are very small.
    :param eps:     default value is 1e-17, must be a very small value, because weighted `intersection` and
        `denominator` are very small after multiplication with `1 / counts ** 2`
    """

    def __init__(
        self,
        apply_softmax: bool = True,
        ignore_index: int = None,
        smooth: float = 0.0,
        eps: float = 1e-17,
        reduce_over_batches: bool = False,
        reduction: Union[LossReduction, str] = "mean",
    ):
        """
        :param apply_softmax: Whether to apply softmax to the predictions.
        :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the dice
            coefficient is to 1, which can be used as a regularization effect.
            As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
        :param eps: epsilon value to avoid inf.
        :param reduce_over_batches: Whether to apply reduction over the batch axis if set True,
         default is `False` to average over the classes axis.
        :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
            `none`: no reduction will be applied.
            `mean`: the sum of the output will be divided by the number of elements in the output.
            `sum`: the output will be summed.
            Default: `mean`
        """
        super().__init__(
            apply_softmax=apply_softmax,
            ignore_index=ignore_index,
            smooth=smooth,
            eps=eps,
            reduce_over_batches=reduce_over_batches,
            generalized_metric=True,
            weight=None,
            reduction=reduction,
        )

__init__(apply_softmax=True, ignore_index=None, smooth=0.0, eps=1e-17, reduce_over_batches=False, reduction='mean')

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `apply_softmax` | `bool` | Whether to apply softmax to the predictions. | `True` |
| `smooth` | `float` | Laplace smoothing, also known as additive smoothing. The larger the smooth value, the closer the Dice coefficient is to 1, which can act as a regularizer. See https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895. | `0.0` |
| `eps` | `float` | Epsilon value to avoid inf. | `1e-17` |
| `reduce_over_batches` | `bool` | If `True`, the reduction is applied over the batch axis; the default `False` averages over the classes axis. | `False` |
| `reduction` | `Union[LossReduction, str]` | Specifies the reduction to apply to the output: `none` (no reduction), `mean` (the sum of the output is divided by the number of elements in the output), or `sum` (the output is summed). | `'mean'` |
Source code in V3_6/src/super_gradients/training/losses/dice_loss.py
def __init__(
    self,
    apply_softmax: bool = True,
    ignore_index: int = None,
    smooth: float = 0.0,
    eps: float = 1e-17,
    reduce_over_batches: bool = False,
    reduction: Union[LossReduction, str] = "mean",
):
    """
    :param apply_softmax: Whether to apply softmax to the predictions.
    :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the dice
        coefficient is to 1, which can be used as a regularization effect.
        As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
    :param eps: epsilon value to avoid inf.
    :param reduce_over_batches: Whether to apply reduction over the batch axis if set True,
     default is `False` to average over the classes axis.
    :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
        `none`: no reduction will be applied.
        `mean`: the sum of the output will be divided by the number of elements in the output.
        `sum`: the output will be summed.
        Default: `mean`
    """
    super().__init__(
        apply_softmax=apply_softmax,
        ignore_index=ignore_index,
        smooth=smooth,
        eps=eps,
        reduce_over_batches=reduce_over_batches,
        generalized_metric=True,
        weight=None,
        reduction=reduction,
    )
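
The generalized variant differs only in the per-class weighting; below is a hedged sketch of that weighting, following the `1 / counts ** 2` factor mentioned in the docstring, with the exact reduction left to the parent class.

```python
# Sketch of the generalized per-class weighting: each class term is scaled by the inverse
# squared class volume before the Dice ratio is formed (final reduction simplified to a mean).
import torch

def generalized_dice_loss(predict: torch.Tensor, labels_one_hot: torch.Tensor, eps: float = 1e-17) -> torch.Tensor:
    dims = (0, 2, 3)                                     # keep one value per class
    counts = labels_one_hot.sum(dim=dims)
    weights = 1.0 / (counts ** 2).clamp_min(eps)         # inverse squared class volume
    numerator = weights * (labels_one_hot * predict).sum(dim=dims)
    denominator = weights * (labels_one_hot + predict).sum(dim=dims)
    return (1.0 - (2.0 * numerator) / (denominator + eps)).mean()
```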

FocalLoss

Bases: _Loss

Wraps a focal loss around an existing loss_fcn(), e.g. criterion = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5).

Source code in V3_6/src/super_gradients/training/losses/focal_loss.py
class FocalLoss(_Loss):
    """Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)"""

    def __init__(self, loss_fcn: nn.BCEWithLogitsLoss, gamma: float = 1.5, alpha: float = 0.25):
        super(FocalLoss, self).__init__()
        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = loss_fcn.reduction
        self.loss_fcn.reduction = "none"  # required to apply FocalLoss to each element

    def forward(self, pred: torch.tensor, true: torch.tensor) -> torch.tensor:
        loss = self.loss_fcn(pred, true)

        pred_prob = torch.sigmoid(pred)  # prob from logits
        p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
        alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
        modulating_factor = (1.0 - p_t) ** self.gamma
        loss *= alpha_factor * modulating_factor

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        else:  # 'none'
            return loss
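
A short usage sketch matching the class docstring; the wrapped loss must be an nn.BCEWithLogitsLoss so that the focal term can be applied element-wise before reduction.

```python
# Sketch: wrapping BCEWithLogitsLoss with a focal term.
import torch
import torch.nn as nn
from super_gradients.training.losses.focal_loss import FocalLoss

criterion = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5, alpha=0.25)

pred = torch.randn(8, 80)                      # raw logits
true = torch.randint(0, 2, (8, 80)).float()    # binary targets
loss = criterion(pred, true)                   # reduced with the wrapped loss's original reduction ("mean")
```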

bbox_ciou_loss(pred_bboxes, target_bboxes, eps)

Compute CIoU loss between predicted and target bboxes.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `pred_bboxes` | `Tensor` | Predicted boxes in xyxy format, of shape [D0, D1, ..., Di, 4]. | required |
| `target_bboxes` | `Tensor` | Target boxes in xyxy format, of shape [D0, D1, ..., Di, 4]. | required |
| `eps` | `float` | Small constant added to denominators to avoid division by zero. | required |

Returns:

| Type | Description |
|------|-------------|
| `Tensor` | CIoU loss per box, as a tensor of shape [D0, D1, ..., Di]. |

Source code in V3_6/src/super_gradients/training/losses/functional.py
def bbox_ciou_loss(pred_bboxes: Tensor, target_bboxes: Tensor, eps: float) -> Tensor:
    """
    Compute CIoU loss between predicted and target bboxes.
    :param pred_bboxes:   Predicted boxes in xyxy format of [D0, D1,...Di, 4] shape
    :param target_bboxes: Target boxes in xyxy format of [D0, D1,...Di, 4] shape
    :return:              CIoU loss per each box as tensor of shape [D0, D1,...Di]
    """

    b1_x1, b1_y1, b1_x2, b1_y2 = pred_bboxes.chunk(4, dim=-1)
    b2_x1, b2_y1, b2_x2, b2_y2 = target_bboxes.chunk(4, dim=-1)

    box1 = (b1_x1, b1_y1, b1_x2, b1_y2)
    box2 = (b2_x1, b2_y1, b2_x2, b2_y2)
    iou, overlap, union = bbox_overlap(box1, box2, eps)

    iou_term = 1 - iou

    xc1 = torch.minimum(b1_x1, b2_x1)
    yc1 = torch.minimum(b1_y1, b2_y1)
    xc2 = torch.maximum(b1_x2, b2_x2)
    yc2 = torch.maximum(b1_y2, b2_y2)

    cw = xc2 - xc1
    ch = yc2 - yc1

    # convex diagonal squared
    diagonal_distance_squared = cw**2 + ch**2

    # compute center distance squared
    b1_cx = (b1_x1 + b1_x2) / 2
    b1_cy = (b1_y1 + b1_y2) / 2
    b2_cx = (b2_x1 + b2_x2) / 2
    b2_cy = (b2_y1 + b2_y2) / 2

    centers_distance_squared = (b1_cx - b2_cx) ** 2 + (b1_cy - b2_cy) ** 2
    distance_term = centers_distance_squared / (diagonal_distance_squared + eps)

    c2 = cw**2 + ch**2 + eps  # noqa

    # centerpoint distance squared
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1
    v = (4 / math.pi**2) * torch.pow(
        torch.atan(w2 / h2) - torch.atan(w1 / h1),
        2,
    )
    with torch.no_grad():
        alpha = v / ((1 - iou) + v).clamp_min(eps)

    aspect_ratio_term = v * alpha

    return iou_term + distance_term + aspect_ratio_term  # CIoU
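
A hedged usage sketch with two boxes in xyxy format; `eps` guards the divisions, and the import path is assumed from the source location above.

```python
# Sketch: CIoU loss between two pairs of xyxy boxes.
import torch
from super_gradients.training.losses.functional import bbox_ciou_loss

pred = torch.tensor([[10.0, 10.0, 50.0, 60.0], [0.0, 0.0, 20.0, 20.0]])
target = torch.tensor([[12.0, 8.0, 48.0, 62.0], [5.0, 5.0, 25.0, 25.0]])

loss_per_box = bbox_ciou_loss(pred, target, eps=1e-7)   # one CIoU loss term per box
```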

bbox_overlap(box1, box2, eps=1e-10)

Calculate the iou of box1 and box2. Shape of box1 and box2 should be the same, or broadcastable to the same shape.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `box1` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple containing the x1, y1, x2, y2 coordinates of box1. | required |
| `box2` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple containing the x1, y1, x2, y2 coordinates of box2. | required |
| `eps` | `float` | Epsilon to avoid division by zero. | `1e-10` |

Returns:

| Type | Description |
|------|-------------|
| `Tuple[Tensor, Tensor, Tensor]` | Tuple containing (iou, overlap, union): the IoU of box1 and box2, their overlap (intersection area), and their union. |

Source code in V3_6/src/super_gradients/training/losses/functional.py
def bbox_overlap(box1: Tuple[Tensor, Tensor, Tensor, Tensor], box2: Tuple[Tensor, Tensor, Tensor, Tensor], eps: float = 1e-10) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Calculate the iou of box1 and box2.
    Shape of box1 and box2 should be the same, or broadcastable to the same shape.

    :param box1:    Tuple containing the x1, y1, x2, y2 coordinates of box1
    :param box2:    Tuple containing the x1, y1, x2, y2 coordinates of box2
    :param eps:     epsilon to avoid divide by zero
    :return:        Tuple containing (iou, overlap, union)
                    - iou:      iou of box1 and box2
                    - overlap:  overlap of box1 and box2
                    - union:    union of box1 and box2
    """
    x1, y1, x2, y2 = box1
    x1g, y1g, x2g, y2g = box2

    xkis1 = torch.maximum(x1, x1g)
    ykis1 = torch.maximum(y1, y1g)
    xkis2 = torch.minimum(x2, x2g)
    ykis2 = torch.minimum(y2, y2g)
    w_inter = (xkis2 - xkis1).clip(0)
    h_inter = (ykis2 - ykis1).clip(0)
    overlap = w_inter * h_inter

    area1 = (x2 - x1) * (y2 - y1)
    area2 = (x2g - x1g) * (y2g - y1g)
    union = area1 + area2 - overlap + eps
    iou = overlap / union

    return iou, overlap, union
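
Note that bbox_overlap takes per-coordinate tensors rather than [N, 4] boxes; a small sketch with the import path assumed:

```python
# Sketch: bbox_overlap expects (x1, y1, x2, y2) coordinate tensors, e.g. from chunk().
import torch
from super_gradients.training.losses.functional import bbox_overlap

box1 = torch.tensor([[10.0, 10.0, 50.0, 60.0]]).chunk(4, dim=-1)   # tuple of (x1, y1, x2, y2)
box2 = torch.tensor([[12.0, 8.0, 48.0, 62.0]]).chunk(4, dim=-1)

iou, overlap, union = bbox_overlap(box1, box2, eps=1e-10)
```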

get_bbox_center(bbox)

Compute the center of a bounding box from X1, Y1, X2, Y2 coordinates

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `bbox` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple of (x1, y1, x2, y2) tensors of arbitrary shape. | required |

Returns:

| Type | Description |
|------|-------------|
| `Tuple[Tensor, Tensor]` | Tuple of (cx, cy) tensors of the same shape as the input coordinates. |

Source code in V3_6/src/super_gradients/training/losses/functional.py
def get_bbox_center(bbox: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor]:
    """
    Compute the center of a bounding box from X1, Y1, X2, Y2 coordinates
    :param bbox: Tuple of (x1, y1, x2, y2) tensors of arbitrary shape
    :return:     Tuple of (cx, cy) tensors of the same shape as bbox
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = bbox
    cx = (b1_x1 + b1_x2) * 0.5
    cy = (b1_y1 + b1_y2) * 0.5
    return cx, cy

get_bbox_width_height(bbox)

Compute the width and height of the bounding box from X1, Y1, X2, Y2 coordinates

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `bbox` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple of (x1, y1, x2, y2) tensors of arbitrary shape. | required |

Returns:

| Type | Description |
|------|-------------|
| `Tuple[Tensor, Tensor]` | Tuple of (w, h) tensors of the same shape as the input coordinates. |

Source code in V3_6/src/super_gradients/training/losses/functional.py
def get_bbox_width_height(bbox: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor]:
    """
    Compute the width and height of the bounding box from X1, Y1, X2, Y2 coordinates
    :param bbox:  Tuple of (x1, y1, x2, y2) tensors of arbitrary shape
    :return:      Tuple of (w, h) tensors of the same shape as bbox
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = bbox
    w = b1_x2 - b1_x1
    h = b1_y2 - b1_y1
    return w, h

get_convex_bbox(box1, box2)

Compute the convex bounding box around box1 and box2

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `box1` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple containing the x1, y1, x2, y2 coordinates of box1. | required |
| `box2` | `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple containing the x1, y1, x2, y2 coordinates of box2. | required |

Returns:

| Type | Description |
|------|-------------|
| `Tuple[Tensor, Tensor, Tensor, Tensor]` | Tuple containing the x1, y1, x2, y2 coordinates of the convex bounding box. |

Source code in V3_6/src/super_gradients/training/losses/functional.py
def get_convex_bbox(box1: Tuple[Tensor, Tensor, Tensor, Tensor], box2: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """
    Compute the convex bounding box around box1 and box2
    :param box1: Tuple containing the x1, y1, x2, y2 coordinates of box1
    :param box2: Tuple containing the x1, y1, x2, y2 coordinates of box2
    :return:     Tuple containing the x1, y1, x2, y2 coordinates of the convex bounding box
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = box1
    b2_x1, b2_y1, b2_x2, b2_y2 = box2

    xc1 = torch.minimum(b1_x1, b2_x1)
    yc1 = torch.minimum(b1_y1, b2_y1)
    xc2 = torch.maximum(b1_x2, b2_x2)
    yc2 = torch.maximum(b1_y2, b2_y2)

    return xc1, yc1, xc2, yc2

BinaryIoULoss

Bases: IoULoss

Computes the IoU loss for binary tasks (1 class only). Expects the target to be a binary map with 0 and 1 values.

Source code in V3_6/src/super_gradients/training/losses/iou_loss.py
class BinaryIoULoss(IoULoss):
    """
    Compute IoU Loss for binary class tasks (1 class only).
    Except target to be a binary map with 0 and 1 values.
    """

    def __init__(self, apply_sigmoid: bool = True, smooth: float = 1.0, eps: float = 1e-5):
        """
        :param apply_sigmoid: Whether to apply sigmoid to the predictions.
        :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the IoU
            coefficient is to 1, which can be used as a regularization effect.
            As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
        :param eps: epsilon value to avoid inf.
        """
        super().__init__(apply_softmax=False, ignore_index=None, smooth=smooth, eps=eps, reduce_over_batches=False)
        self.apply_sigmoid = apply_sigmoid

    def forward(self, predict: torch.tensor, target: torch.tensor) -> torch.tensor:
        if self.apply_sigmoid:
            predict = torch.sigmoid(predict)
        return super().forward(predict=predict, target=target)

__init__(apply_sigmoid=True, smooth=1.0, eps=1e-05)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `apply_sigmoid` | `bool` | Whether to apply sigmoid to the predictions. | `True` |
| `smooth` | `float` | Laplace smoothing, also known as additive smoothing. The larger the smooth value, the closer the IoU coefficient is to 1, which can act as a regularizer. See https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895. | `1.0` |
| `eps` | `float` | Epsilon value to avoid inf. | `1e-05` |
Source code in V3_6/src/super_gradients/training/losses/iou_loss.py
def __init__(self, apply_sigmoid: bool = True, smooth: float = 1.0, eps: float = 1e-5):
    """
    :param apply_sigmoid: Whether to apply sigmoid to the predictions.
    :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the IoU
        coefficient is to 1, which can be used as a regularization effect.
        As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
    :param eps: epsilon value to avoid inf.
    """
    super().__init__(apply_softmax=False, ignore_index=None, smooth=smooth, eps=eps, reduce_over_batches=False)
    self.apply_sigmoid = apply_sigmoid

GeneralizedIoULoss

Bases: IoULoss

Computes the Generalised IoU loss, where the contribution of each label is normalized by the inverse of its volume in order to deal with class imbalance.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `smooth` | `float` | Default is 0; Laplace smoothing is not recommended with GeneralizedIoULoss because the weighted values to be added are very small. | `0.0` |
| `eps` | `float` | Default is 1e-17; must be a very small value, because the weighted `intersection` and `denominator` become very small after multiplication with `1 / counts ** 2`. | `1e-17` |
Source code in V3_6/src/super_gradients/training/losses/iou_loss.py
class GeneralizedIoULoss(IoULoss):
    """
    Compute the Generalised IoU loss, contribution of each label is normalized by the inverse of its volume, in order
     to deal with class imbalance.

    # FIXME: Why duplicate some parats in class and __init__ docstring ? (+they have different description)
    :param smooth (float): default value is 0, smooth laplacian is not recommended to be used with GeneralizedIoULoss.
         because the weighted values to be added are very small.
    :param eps (float): default value is 1e-17, must be a very small value, because weighted `intersection` and
        `denominator` are very small after multiplication with `1 / counts ** 2`
    """

    def __init__(
        self,
        apply_softmax: bool = True,
        ignore_index: int = None,
        smooth: float = 0.0,
        eps: float = 1e-17,
        reduce_over_batches: bool = False,
        reduction: Union[LossReduction, str] = "mean",
    ):
        """
        :param apply_softmax: Whether to apply softmax to the predictions.
        :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the iou
            coefficient is to 1, which can be used as a regularization effect.
            As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
        :param eps: epsilon value to avoid inf.
        :param reduce_over_batches: Whether to apply reduction over the batch axis if set True,
         default is `False` to average over the classes axis.
        :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
            `none`: no reduction will be applied.
            `mean`: the sum of the output will be divided by the number of elements in the output.
            `sum`: the output will be summed.
            Default: `mean`
        """
        super().__init__(
            apply_softmax=apply_softmax,
            ignore_index=ignore_index,
            smooth=smooth,
            eps=eps,
            reduce_over_batches=reduce_over_batches,
            generalized_metric=True,
            weight=None,
            reduction=reduction,
        )

__init__(apply_softmax=True, ignore_index=None, smooth=0.0, eps=1e-17, reduce_over_batches=False, reduction='mean')

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| apply_softmax | bool | Whether to apply softmax to the predictions. | True |
| smooth | float | Laplace (additive) smoothing. The larger the smooth value, the closer the IoU coefficient is to 1, which can be used as a regularization effect. As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895 | 0.0 |
| eps | float | Epsilon value to avoid inf. | 1e-17 |
| reduce_over_batches | bool | Whether to apply reduction over the batch axis; if True, reduce over batches, otherwise (default False) average over the classes axis. | False |
| reduction | Union[LossReduction, str] | Specifies the reduction to apply to the output: `none`, `mean` or `sum`. `none`: no reduction will be applied. `mean`: the sum of the output will be divided by the number of elements in the output. `sum`: the output will be summed. | 'mean' |
Source code in V3_6/src/super_gradients/training/losses/iou_loss.py
def __init__(
    self,
    apply_softmax: bool = True,
    ignore_index: int = None,
    smooth: float = 0.0,
    eps: float = 1e-17,
    reduce_over_batches: bool = False,
    reduction: Union[LossReduction, str] = "mean",
):
    """
    :param apply_softmax: Whether to apply softmax to the predictions.
    :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the iou
        coefficient is to 1, which can be used as a regularization effect.
        As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
    :param eps: epsilon value to avoid inf.
    :param reduce_over_batches: Whether to apply reduction over the batch axis if set True,
     default is `False` to average over the classes axis.
    :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
        `none`: no reduction will be applied.
        `mean`: the sum of the output will be divided by the number of elements in the output.
        `sum`: the output will be summed.
        Default: `mean`
    """
    super().__init__(
        apply_softmax=apply_softmax,
        ignore_index=ignore_index,
        smooth=smooth,
        eps=eps,
        reduce_over_batches=reduce_over_batches,
        generalized_metric=True,
        weight=None,
        reduction=reduction,
    )
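
The "normalize each label by the inverse of its volume" idea can be sketched directly. The exact weighting lives in the shared parent class, which is not shown on this page, so the snippet below is only an approximation under the assumption that per-class intersection and union are weighted by 1 / counts ** 2 before the final ratio, which is also why the defaults move to smooth=0.0 and a tiny eps=1e-17.

```python
import torch

def generalized_iou_loss(probs: torch.Tensor, labels_one_hot: torch.Tensor, eps: float = 1e-17) -> torch.Tensor:
    # probs, labels_one_hot: [BS, num_classes, H, W]. Sketch only; class weights assumed to be 1 / counts**2.
    intersection = (labels_one_hot * probs).sum(dim=(0, 2, 3))
    union = (labels_one_hot + probs - labels_one_hot * probs).sum(dim=(0, 2, 3))
    counts = labels_one_hot.sum(dim=(0, 2, 3))
    weights = 1.0 / counts.clamp(min=1.0) ** 2          # rare classes get a larger weight
    return 1.0 - (weights * intersection).sum() / ((weights * union).sum() + eps)

probs = torch.softmax(torch.randn(2, 3, 8, 8), dim=1)   # apply_softmax=True behaviour
labels = torch.randint(0, 3, (2, 8, 8))
labels_one_hot = torch.nn.functional.one_hot(labels, 3).permute(0, 3, 1, 2).float()
print(generalized_iou_loss(probs, labels_one_hot))
```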

IoULoss

Bases: AbstarctSegmentationStructureLoss

Compute the average IoU loss between two tensors. It supports both multi-class and binary tasks.

Source code in V3_6/src/super_gradients/training/losses/iou_loss.py
class IoULoss(AbstarctSegmentationStructureLoss):
    """
    Compute average IoU loss between two tensors, It can support both multi-classes and binary tasks.
    """

    def _calc_numerator_denominator(self, labels_one_hot: torch.tensor, predict: torch.tensor) -> Tuple[torch.tensor, torch.tensor]:
        """
        Calculate iou metric's numerator and denominator.

        :param labels_one_hot: target in one hot format.   shape: [BS, num_classes, img_width, img_height]
        :param predict: predictions tensor.                shape: [BS, num_classes, img_width, img_height]
        :return:
            numerator = intersection between predictions and target.    shape: [BS, num_classes, img_width, img_height]
            denominator = area of union between predictions and target. shape: [BS, num_classes, img_width, img_height]
        """
        numerator = labels_one_hot * predict
        denominator = labels_one_hot + predict - numerator
        return numerator, denominator

    def _calc_loss(self, numerator, denominator):
        """
        Calculate iou loss.
        All tensors are of shape [BS] if self.reduce_over_batches else [num_classes]

        :param numerator: intersection between predictions and target.
        :param denominator: area of union between prediction pixels and target pixels.
        """
        loss = 1.0 - ((numerator + self.smooth) / (denominator + self.eps + self.smooth))
        return loss
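
For reference, here is what _calc_numerator_denominator and _calc_loss amount to for soft (probability) predictions; the summation over the batch/class axes and the final reduction are handled by the parent class and are simplified to a per-class sum and a plain mean here.

```python
import torch

probs = torch.softmax(torch.randn(2, 3, 4, 4), dim=1)                        # [BS, num_classes, H, W]
labels = torch.randint(0, 3, (2, 4, 4))
labels_one_hot = torch.nn.functional.one_hot(labels, 3).permute(0, 3, 1, 2).float()

numerator = labels_one_hot * probs                                            # soft intersection
denominator = labels_one_hot + probs - numerator                              # soft union
smooth, eps = 1.0, 1e-5

# Reduce per class (reduce_over_batches=False), then 1 - IoU as in _calc_loss.
num_c = numerator.sum(dim=(0, 2, 3))
den_c = denominator.sum(dim=(0, 2, 3))
loss_per_class = 1.0 - (num_c + smooth) / (den_c + eps + smooth)
print(loss_per_class.mean())                                                  # "mean" reduction over classes
```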

KDLogitsLoss

Bases: _Loss

Knowledge distillation loss, wraps the task loss and distillation loss

Source code in V3_6/src/super_gradients/training/losses/kd_losses.py
@register_loss(name=Losses.KD_LOSS, deprecated_name="kd_loss")
class KDLogitsLoss(_Loss):
    """Knowledge distillation loss, wraps the task loss and distillation loss"""

    @resolve_param("task_loss_fn", LossesFactory())
    def __init__(self, task_loss_fn: _Loss, distillation_loss_fn: _Loss = KDklDivLoss(), distillation_loss_coeff: float = 0.5):
        """
        :param task_loss_fn: task loss. E.g., CrossEntropyLoss
        :param distillation_loss_fn: distillation loss. E.g., KLDivLoss
        :param distillation_loss_coeff:
        """

        super(KDLogitsLoss, self).__init__()
        self.task_loss_fn = task_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.distillation_loss_coeff = distillation_loss_coeff

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["Loss", "Task Loss", "Distillation Loss"]

    def forward(self, kd_module_output, target):
        task_loss = self.task_loss_fn(kd_module_output.student_output, target)
        if isinstance(task_loss, tuple):  # SOME LOSS FUNCTIONS RETURNS LOSS AND LOG_ITEMS
            task_loss = task_loss[0]
        distillation_loss = self.distillation_loss_fn(kd_module_output.student_output, kd_module_output.teacher_output)
        loss = task_loss * (1 - self.distillation_loss_coeff) + distillation_loss * self.distillation_loss_coeff

        return loss, torch.cat((loss.unsqueeze(0), task_loss.unsqueeze(0), distillation_loss.unsqueeze(0))).detach()

component_names property

Component names for logging during training. These correspond to the 2nd item of the tuple returned by self.forward(...). See super_gradients.Trainer.train() docs for more info.

__init__(task_loss_fn, distillation_loss_fn=KDklDivLoss(), distillation_loss_coeff=0.5)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| task_loss_fn | _Loss | Task loss, e.g. CrossEntropyLoss. | required |
| distillation_loss_fn | _Loss | Distillation loss, e.g. KLDivLoss. | KDklDivLoss() |
| distillation_loss_coeff | float | Relative weight of the distillation loss in the total loss (the task loss is weighted by 1 - distillation_loss_coeff, see forward). | 0.5 |
Source code in V3_6/src/super_gradients/training/losses/kd_losses.py
@resolve_param("task_loss_fn", LossesFactory())
def __init__(self, task_loss_fn: _Loss, distillation_loss_fn: _Loss = KDklDivLoss(), distillation_loss_coeff: float = 0.5):
    """
    :param task_loss_fn: task loss. E.g., CrossEntropyLoss
    :param distillation_loss_fn: distillation loss. E.g., KLDivLoss
    :param distillation_loss_coeff:
    """

    super(KDLogitsLoss, self).__init__()
    self.task_loss_fn = task_loss_fn
    self.distillation_loss_fn = distillation_loss_fn
    self.distillation_loss_coeff = distillation_loss_coeff

KDklDivLoss

Bases: KLDivLoss

KL divergence wrapper for knowledge distillation

Source code in V3_6/src/super_gradients/training/losses/kd_losses.py
class KDklDivLoss(KLDivLoss):
    """KL divergence wrapper for knowledge distillation"""

    def __init__(self):
        super(KDklDivLoss, self).__init__(reduction="batchmean")

    def forward(self, student_output, teacher_output):
        return super(KDklDivLoss, self).forward(torch.log_softmax(student_output, dim=1), torch.softmax(teacher_output, dim=1))
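
Putting the two classes together: KDklDivLoss is a batch-mean KL divergence between the softened student and teacher distributions, and KDLogitsLoss blends it with the task loss using distillation_loss_coeff. A minimal plain-PyTorch sketch of that combination (the real loss receives a KD module output object carrying student_output and teacher_output rather than raw tensors):

```python
import torch
import torch.nn.functional as F

student_logits = torch.randn(8, 10)           # (batch, num_classes)
teacher_logits = torch.randn(8, 10)
targets = torch.randint(0, 10, (8,))
coeff = 0.5                                    # distillation_loss_coeff

task_loss = F.cross_entropy(student_logits, targets)
distillation_loss = F.kl_div(
    torch.log_softmax(student_logits, dim=1),
    torch.softmax(teacher_logits, dim=1),
    reduction="batchmean",
)
loss = task_loss * (1 - coeff) + distillation_loss * coeff
print(loss, task_loss, distillation_loss)      # the three values KDLogitsLoss logs as components
```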

CrossEntropyLoss

Bases: nn.CrossEntropyLoss

CrossEntropyLoss - with the ability to receive a distribution as targets, and optional label smoothing

Source code in V3_6/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py
@register_loss(name=Losses.CROSS_ENTROPY, deprecated_name="cross_entropy")
class CrossEntropyLoss(nn.CrossEntropyLoss):
    """CrossEntropyLoss - with ability to recieve distrbution as targets, and optional label smoothing"""

    def __init__(self, weight=None, ignore_index=-100, reduction="mean", smooth_eps=None, smooth_dist=None, from_logits=True):
        super(CrossEntropyLoss, self).__init__(weight=weight, ignore_index=ignore_index, reduction=reduction)
        self.smooth_eps = smooth_eps
        self.smooth_dist = smooth_dist
        self.from_logits = from_logits

    def forward(self, input, target, smooth_dist=None):
        if smooth_dist is None:
            smooth_dist = self.smooth_dist
        loss = cross_entropy(
            input,
            target,
            weight=self.weight,
            ignore_index=self.ignore_index,
            reduction=self.reduction,
            smooth_eps=self.smooth_eps,
            smooth_dist=smooth_dist,
            from_logits=self.from_logits,
        )
        # CHANGED TO THE CURRENT FORMAT- OUR CRITERION FUNCTIONS SHOULD ALL NPW RETURN A TUPLE OF (LOSS_FOR_BACKPROP, ADDITIONAL_ITEMS)
        # WHERE ADDITIONAL ITEMS ARE TORCH TENSORS OF SIZE (N_ITEMS,...) DETACHED FROM THEIR GRADIENTS FOR LOGGING
        return loss, loss.unsqueeze(0).detach()

cross_entropy(inputs, target, weight=None, ignore_index=-100, reduction='mean', smooth_eps=None, smooth_dist=None, from_logits=True)

Cross entropy loss, with support for target distributions and label smoothing (https://arxiv.org/abs/1512.00567).

Source code in V3_6/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py
def cross_entropy(inputs, target, weight=None, ignore_index=-100, reduction="mean", smooth_eps=None, smooth_dist=None, from_logits=True):  # noqa: C901
    """cross entropy loss, with support for target distributions and label smoothing https://arxiv.org/abs/1512.00567"""
    smooth_eps = smooth_eps or 0

    # ordinary log-liklihood - use cross_entropy from nn
    if _is_long(target) and smooth_eps == 0:
        if from_logits:
            return F.cross_entropy(inputs, target, weight, ignore_index=ignore_index, reduction=reduction)
        else:
            return F.nll_loss(inputs, target, weight, ignore_index=ignore_index, reduction=reduction)

    if from_logits:
        # log-softmax of inputs
        lsm = F.log_softmax(inputs, dim=-1)
    else:
        lsm = inputs

    masked_indices = None
    num_classes = inputs.size(-1)

    if _is_long(target) and ignore_index >= 0:
        masked_indices = target.eq(ignore_index)

    if smooth_eps > 0 and smooth_dist is not None:
        if _is_long(target):
            target = onehot(target, num_classes).type_as(inputs)
        if smooth_dist.dim() < target.dim():
            smooth_dist = smooth_dist.unsqueeze(0)
        target.lerp_(smooth_dist, smooth_eps)

    if weight is not None:
        lsm = lsm * weight.unsqueeze(0)

    if _is_long(target):
        eps_nll = 1.0 - smooth_eps
        likelihood = lsm.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        loss = -(eps_nll * likelihood + smooth_eps * lsm.mean(-1))
    else:
        loss = -(target * lsm).sum(-1)

    if masked_indices is not None:
        loss.masked_fill_(masked_indices, 0)

    if reduction == "sum":
        loss = loss.sum()
    elif reduction == "mean":
        if masked_indices is None:
            loss = loss.mean()
        else:
            loss = loss.sum() / float(loss.size(0) - masked_indices.sum())

    return loss
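
For long (integer) targets with label smoothing and no explicit target distribution, the branch above reduces to mixing the negative log-likelihood of the true class with the mean log-probability over all classes. A small standalone sketch of that formula:

```python
import torch
import torch.nn.functional as F

inputs = torch.randn(4, 5)                     # logits, (batch, num_classes)
target = torch.randint(0, 5, (4,))
smooth_eps = 0.1

lsm = F.log_softmax(inputs, dim=-1)
likelihood = lsm.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
loss = -((1.0 - smooth_eps) * likelihood + smooth_eps * lsm.mean(-1))
print(loss.mean())                             # matches the "mean" reduction path above
```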

onehot(indexes, N=None, ignore_index=None)

Creates a one-hot representation of indexes with N possible entries. If N is not specified, it is inferred from the maximum index appearing. indexes is a long tensor of indexes; positions equal to ignore_index are zero in the one-hot representation.

Source code in V3_6/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py
def onehot(indexes, N=None, ignore_index=None):
    """
    Creates a one-hot representation of indexes with N possible entries
    if N is not specified, it will suit the maximum index appearing.
    indexes is a long-tensor of indexes
    ignore_index will be zero in onehot representation
    """
    if N is None:
        N = indexes.max() + 1
    sz = list(indexes.size())
    output = indexes.new().byte().resize_(*sz, N).zero_()
    output.scatter_(-1, indexes.unsqueeze(-1), 1)
    if ignore_index is not None and ignore_index >= 0:
        output.masked_fill_(indexes.eq(ignore_index).unsqueeze(-1), 0)
    return output
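
A tiny standalone illustration of what onehot produces, using the same scatter_ trick as above (the inputs are made up):

```python
import torch

indexes = torch.tensor([0, 2, 1, 2])
N = 3
out = torch.zeros(*indexes.shape, N, dtype=torch.uint8)
out.scatter_(-1, indexes.unsqueeze(-1), 1)
print(out)
# tensor([[1, 0, 0],
#         [0, 0, 1],
#         [0, 1, 0],
#         [0, 0, 1]], dtype=torch.uint8)
# With ignore_index=2, the rows for index 2 would additionally be zeroed out.
```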

MaskAttentionLoss

Bases: _Loss

Pixel mask attention loss, for semantic segmentation use with 4D tensors.

Source code in V3_6/src/super_gradients/training/losses/mask_loss.py
class MaskAttentionLoss(_Loss):
    """
    Pixel mask attention loss. For semantic segmentation usages with 4D tensors.
    """

    def __init__(self, criterion: _Loss, loss_weights: Union[list, tuple] = (1.0, 1.0), reduction: Union[LossReduction, str] = "mean"):
        """
        :param criterion: _Loss object, loss function that apply per pixel cost penalty are supported, i.e
            CrossEntropyLoss, BCEWithLogitsLoss, MSELoss, SL1Loss.
            criterion reduction must be `none`.
        :param loss_weights: Weight to apply for each part of the loss contributions,
            [regular loss, masked loss] respectively.
        :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
            `none`: no reduction will be applied.
            `mean`: the sum of the output will be divided by the number of elements in the output.
            `sum`: the output will be summed.
            Default: `mean`
        """
        super().__init__(reduction=reduction.value if isinstance(reduction, LossReduction) else reduction)
        # Check that the arguments are valid.
        if criterion.reduction != "none":
            raise ValueError(f"criterion reduction must be `none`, for computing the mask contribution loss values," f" found reduction: {criterion.reduction}")
        if len(loss_weights) != 2:
            raise ValueError(f"loss_weights must have 2 values, found: {len(loss_weights)}")
        if loss_weights[1] <= 0:
            raise ValueError("If no loss weight is applied on mask samples, consider using simply criterion")

        self.criterion = criterion
        self.loss_weights = loss_weights

    def forward(self, predict: torch.Tensor, target: torch.Tensor, mask: torch.Tensor):
        criterion_loss = self.criterion(predict, target)

        mask = self._broadcast_mask(mask, criterion_loss.size())
        mask_loss = criterion_loss * mask

        if self.reduction == LossReduction.NONE.value:
            return criterion_loss * self.loss_weights[0] + mask_loss * self.loss_weights[1]
        mask_loss = mask_loss[mask == 1]  # consider only mask samples for mask loss computing
        # If mask doesn't include foreground values, set mask_loss as 0.
        if mask_loss.numel() == 0:
            mask_loss = torch.tensor(0.0)

        mask_loss = apply_reduce(mask_loss, self.reduction)
        criterion_loss = apply_reduce(criterion_loss, self.reduction)

        loss = criterion_loss * self.loss_weights[0] + mask_loss * self.loss_weights[1]
        return loss

    def _broadcast_mask(self, mask: torch.Tensor, size: torch.Size):
        """
        Broadcast the mask tensor before elementwise multiplication.
        """
        # Assert that batch size and spatial size are the same.
        if mask.size()[-2:] != size[-2:] or mask.size(0) != size[0]:
            raise AssertionError(
                "Mask broadcast is allowed only in channels dimension, found shape mismatch between" f"mask shape: {mask.size()}, and target shape: {size}"
            )
        # when mask is [B, 1, H, W] | [B, H, W] and size is [B, H, W]
        # or when mask is [B, 1, H, W] | [B, H, W] and size is [B, 1, H, W]
        if len(size) == 3 or (len(size) == 4 and size[1] == 1):
            mask = mask.view(*size)

        # when mask is [B, C, H, W] | [B, 1, H, W] | [B, H, W] and size is [B, C, H, W]
        else:
            mask = mask if len(mask.size()) == 4 else mask.unsqueeze(1)
            if mask.size(1) not in [1, size[1]]:
                raise AssertionError(
                    f"Broadcast is not allowed, num mask channels must be 1 or same as target channels" f"mask shape: {mask.size()}, and target shape: {size}"
                )
            mask = mask if mask.size() == size else mask.expand(*size)
        return mask

__init__(criterion, loss_weights=(1.0, 1.0), reduction='mean')

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| criterion | _Loss | _Loss object; loss functions that apply a per-pixel cost penalty are supported, e.g. CrossEntropyLoss, BCEWithLogitsLoss, MSELoss, SL1Loss. The criterion reduction must be `none`. | required |
| loss_weights | Union[list, tuple] | Weights to apply to each part of the loss contributions, [regular loss, masked loss] respectively. | (1.0, 1.0) |
| reduction | Union[LossReduction, str] | Specifies the reduction to apply to the output: `none`, `mean` or `sum`. `none`: no reduction will be applied. `mean`: the sum of the output will be divided by the number of elements in the output. `sum`: the output will be summed. | 'mean' |
Source code in V3_6/src/super_gradients/training/losses/mask_loss.py
def __init__(self, criterion: _Loss, loss_weights: Union[list, tuple] = (1.0, 1.0), reduction: Union[LossReduction, str] = "mean"):
    """
    :param criterion: _Loss object, loss function that apply per pixel cost penalty are supported, i.e
        CrossEntropyLoss, BCEWithLogitsLoss, MSELoss, SL1Loss.
        criterion reduction must be `none`.
    :param loss_weights: Weight to apply for each part of the loss contributions,
        [regular loss, masked loss] respectively.
    :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
        `none`: no reduction will be applied.
        `mean`: the sum of the output will be divided by the number of elements in the output.
        `sum`: the output will be summed.
        Default: `mean`
    """
    super().__init__(reduction=reduction.value if isinstance(reduction, LossReduction) else reduction)
    # Check that the arguments are valid.
    if criterion.reduction != "none":
        raise ValueError(f"criterion reduction must be `none`, for computing the mask contribution loss values," f" found reduction: {criterion.reduction}")
    if len(loss_weights) != 2:
        raise ValueError(f"loss_weights must have 2 values, found: {len(loss_weights)}")
    if loss_weights[1] <= 0:
        raise ValueError("If no loss weight is applied on mask samples, consider using simply criterion")

    self.criterion = criterion
    self.loss_weights = loss_weights
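
A hedged usage sketch: wrap a per-pixel criterion configured with reduction="none" and up-weight the pixels selected by the mask. The import path below is inferred from the source path shown above and should be treated as an assumption.

```python
import torch
from torch import nn
# Import path inferred from the source path above; treat it as an assumption.
from super_gradients.training.losses.mask_loss import MaskAttentionLoss

criterion = nn.CrossEntropyLoss(reduction="none")           # per-pixel loss, reduction must be "none"
loss_fn = MaskAttentionLoss(criterion=criterion, loss_weights=(1.0, 2.0), reduction="mean")

predict = torch.randn(2, 5, 16, 16)                         # (N, C, H, W) logits
target = torch.randint(0, 5, (2, 16, 16))                   # (N, H, W) class indices
mask = (torch.rand(2, 1, 16, 16) > 0.7).float()             # attention mask, e.g. object edges

loss = loss_fn(predict, target, mask)
```

With loss_weights=(1.0, 2.0) and the "mean" reduction, the mean per-pixel loss is combined with twice the mean loss over the masked pixels.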

OhemBCELoss

Bases: OhemLoss

OhemBCELoss - Online Hard Example Mining Binary Cross Entropy Loss

Source code in V3_6/src/super_gradients/training/losses/ohem_ce_loss.py
class OhemBCELoss(OhemLoss):
    """
    OhemBCELoss - Online Hard Example Mining Binary Cross Entropy Loss
    """

    def __init__(
        self,
        threshold: float,
        mining_percent: float = 0.1,
        ignore_lb: int = -100,
        num_pixels_exclude_ignored: bool = True,
    ):
        super(OhemBCELoss, self).__init__(
            threshold=threshold,
            mining_percent=mining_percent,
            ignore_lb=ignore_lb,
            num_pixels_exclude_ignored=num_pixels_exclude_ignored,
            criteria=nn.BCEWithLogitsLoss(reduction="none"),
        )

    def forward(self, logits, labels):

        # REMOVE SINGLE CLASS CHANNEL WHEN DEALING WITH BINARY DATA
        if logits.shape[1] == 1:
            logits = logits.squeeze(1)
        return super(OhemBCELoss, self).forward(logits, labels.float())

OhemCELoss

Bases: OhemLoss

OhemCELoss - Online Hard Example Mining Cross Entropy Loss

Source code in V3_6/src/super_gradients/training/losses/ohem_ce_loss.py
class OhemCELoss(OhemLoss):
    """
    OhemLoss - Online Hard Example Mining Cross Entropy Loss
    """

    def __init__(self, threshold: float, mining_percent: float = 0.1, ignore_lb: int = -100, num_pixels_exclude_ignored: bool = True):
        ignore_lb = -100 if ignore_lb is None or ignore_lb < 0 else ignore_lb
        criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction="none")
        super(OhemCELoss, self).__init__(
            threshold=threshold, mining_percent=mining_percent, ignore_lb=ignore_lb, num_pixels_exclude_ignored=num_pixels_exclude_ignored, criteria=criteria
        )

OhemLoss

Bases: _Loss

OhemLoss - Online Hard Example Mining Cross Entropy Loss

Source code in V3_6/src/super_gradients/training/losses/ohem_ce_loss.py
class OhemLoss(_Loss):
    """
    OhemLoss - Online Hard Example Mining Cross Entropy Loss
    """

    def __init__(self, threshold: float, mining_percent: float = 0.1, ignore_lb: int = -100, num_pixels_exclude_ignored: bool = True, criteria: _Loss = None):
        """
        :param threshold: Sample below probability threshold, is considered hard.
        :param num_pixels_exclude_ignored: How to calculate total pixels from which extract mining percent of the
         samples.
        :param ignore_lb: label index to be ignored in loss calculation.
        :param criteria: loss to mine the examples from.

         i.e for num_pixels=100, ignore_pixels=30, mining_percent=0.1:
         num_pixels_exclude_ignored=False => num_mining = 100 * 0.1 = 10
         num_pixels_exclude_ignored=True  => num_mining = (100 - 30) * 0.1 = 7
        """
        super().__init__()

        if mining_percent < 0 or mining_percent > 1:
            raise IllegalRangeForLossAttributeException((0, 1), "mining percent")

        self.thresh = -torch.log(torch.tensor(threshold, dtype=torch.float))
        self.mining_percent = mining_percent
        self.ignore_lb = ignore_lb
        self.num_pixels_exclude_ignored = num_pixels_exclude_ignored

        if criteria.reduction != "none":
            raise RequiredLossComponentReductionException("criteria", criteria.reduction, "none")
        self.criteria = criteria

    def forward(self, logits, labels):
        loss = self.criteria(logits, labels).view(-1)
        if self.num_pixels_exclude_ignored:
            # remove ignore label elements
            loss = loss[labels.view(-1) != self.ignore_lb]
            # num pixels in a batch -> num_pixels = batch_size * width * height - ignore_pixels
            num_pixels = loss.numel()
        else:
            num_pixels = labels.numel()
        # if all pixels are ignore labels, return empty loss tensor
        if num_pixels == 0:
            return torch.tensor([0.0]).requires_grad_(True).to(logits.device)

        num_mining = int(self.mining_percent * num_pixels)
        # in case mining_percent=1, prevent out of bound exception
        num_mining = min(num_mining, num_pixels - 1)

        self.thresh = self.thresh.to(logits.device)
        loss, _ = torch.sort(loss, descending=True)
        if loss[num_mining] > self.thresh:
            loss = loss[loss > self.thresh]
        else:
            loss = loss[:num_mining]
        return torch.mean(loss)

__init__(threshold, mining_percent=0.1, ignore_lb=-100, num_pixels_exclude_ignored=True, criteria=None)

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| threshold | float | A sample whose probability is below this threshold is considered hard. | required |
| num_pixels_exclude_ignored | bool | Whether ignored pixels are excluded when computing the total pixel count from which the mining percent of samples is extracted. E.g. for num_pixels=100, ignore_pixels=30, mining_percent=0.1: num_pixels_exclude_ignored=False => num_mining = 100 * 0.1 = 10; num_pixels_exclude_ignored=True => num_mining = (100 - 30) * 0.1 = 7. | True |
| ignore_lb | int | Label index to be ignored in the loss calculation. | -100 |
| criteria | _Loss | Loss to mine the examples from. | None |
Source code in V3_6/src/super_gradients/training/losses/ohem_ce_loss.py
def __init__(self, threshold: float, mining_percent: float = 0.1, ignore_lb: int = -100, num_pixels_exclude_ignored: bool = True, criteria: _Loss = None):
    """
    :param threshold: Sample below probability threshold, is considered hard.
    :param num_pixels_exclude_ignored: How to calculate total pixels from which extract mining percent of the
     samples.
    :param ignore_lb: label index to be ignored in loss calculation.
    :param criteria: loss to mine the examples from.

     i.e for num_pixels=100, ignore_pixels=30, mining_percent=0.1:
     num_pixels_exclude_ignored=False => num_mining = 100 * 0.1 = 10
     num_pixels_exclude_ignored=True  => num_mining = (100 - 30) * 0.1 = 7
    """
    super().__init__()

    if mining_percent < 0 or mining_percent > 1:
        raise IllegalRangeForLossAttributeException((0, 1), "mining percent")

    self.thresh = -torch.log(torch.tensor(threshold, dtype=torch.float))
    self.mining_percent = mining_percent
    self.ignore_lb = ignore_lb
    self.num_pixels_exclude_ignored = num_pixels_exclude_ignored

    if criteria.reduction != "none":
        raise RequiredLossComponentReductionException("criteria", criteria.reduction, "none")
    self.criteria = criteria
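
The mining step itself is easy to reproduce with plain tensors: compute an un-reduced per-pixel loss, sort it, and keep either every pixel above the probability-derived threshold or the hardest mining_percent fraction, whichever the threshold test selects. A standalone sketch (ignored labels left out for brevity):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(2, 5, 16, 16)
labels = torch.randint(0, 5, (2, 16, 16))
threshold, mining_percent = 0.7, 0.1

per_pixel = F.cross_entropy(logits, labels, reduction="none").view(-1)
thresh = -torch.log(torch.tensor(threshold))             # loss value corresponding to p = threshold
num_mining = int(mining_percent * per_pixel.numel())

sorted_loss, _ = torch.sort(per_pixel, descending=True)
if sorted_loss[num_mining] > thresh:
    mined = sorted_loss[sorted_loss > thresh]             # many hard pixels: keep all above the threshold
else:
    mined = sorted_loss[:num_mining]                      # otherwise keep the hardest fixed fraction
print(mined.mean())
```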

ATSSAssigner

Bases: nn.Module

Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
class ATSSAssigner(nn.Module):
    """Bridging the Gap Between Anchor-based and Anchor-free Detection
    via Adaptive Training Sample Selection
    """

    __shared__ = ["num_classes"]

    def __init__(self, topk=9, num_classes=80, force_gt_matching=False, eps=1e-9):
        """

        :param topk: Maximum number of achors that is selected for each gt box
        :param num_classes:
        :param force_gt_matching: Guarantee that each gt box is matched to at least one anchor.
            If two gt boxes match to the same anchor, the one with the larger area will be selected.
            And the second-best achnor will be assigned to the other gt box.
        :param eps: Small constant for numerical stability
        """
        super(ATSSAssigner, self).__init__()
        self.topk = topk
        self.num_classes = num_classes
        self.force_gt_matching = force_gt_matching
        self.eps = eps

    def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, pad_gt_mask: Optional[Tensor]):
        gt2anchor_distances_list = torch.split(gt2anchor_distances, num_anchors_list, dim=-1)
        num_anchors_index = np.cumsum(num_anchors_list).tolist()
        num_anchors_index = [
            0,
        ] + num_anchors_index[:-1]
        is_in_topk_list = []
        topk_idxs_list = []
        for distances, anchors_index in zip(gt2anchor_distances_list, num_anchors_index):
            num_anchors = distances.shape[-1]
            _, topk_idxs = torch.topk(distances, self.topk, dim=-1, largest=False)
            topk_idxs_list.append(topk_idxs + anchors_index)
            is_in_topk = torch.nn.functional.one_hot(topk_idxs, num_anchors).sum(dim=-2).type_as(gt2anchor_distances)
            if pad_gt_mask is not None:
                is_in_topk = is_in_topk * pad_gt_mask
            is_in_topk_list.append(is_in_topk)
        is_in_topk_list = torch.cat(is_in_topk_list, dim=-1)
        topk_idxs_list = torch.cat(topk_idxs_list, dim=-1)
        return is_in_topk_list, topk_idxs_list

    @torch.no_grad()
    def forward(
        self,
        anchor_bboxes: Tensor,
        num_anchors_list: list,
        gt_labels: Tensor,
        gt_bboxes: Tensor,
        pad_gt_mask: Optional[Tensor],
        bg_index: int,
        gt_scores: Optional[Tensor] = None,
        pred_bboxes: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """
        This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py

        The assignment is done in following steps
        1. compute iou between all bbox (bbox of all pyramid levels) and gt
        2. compute center distance between all bbox and gt
        3. on each pyramid level, for each gt, select k bbox whose center
           are closest to the gt center, so we total select k*l bbox as
           candidates for each gt
        4. get corresponding iou for the these candidates, and compute the
           mean and std, set mean + std as the iou threshold
        5. select these candidates whose iou are greater than or equal to
           the threshold as positive
        6. limit the positive sample's center in gt
        7. if an anchor box is assigned to multiple gts, the one with the
           highest iou will be selected.

        :param anchor_bboxes:       Tensor(float32) - pre-defined anchors, shape(L, 4), "xmin, xmax, ymin, ymax" format
        :param num_anchors_list:    Number of anchors in each level
        :param gt_labels:           Tensor (int64|int32) - Label of gt_bboxes, shape(B, n, 1)
        :param gt_bboxes:           Tensor (float32) - Ground truth bboxes, shape(B, n, 4)
        :param pad_gt_mask:         Tensor (float32) - 1 means bbox, 0 means no bbox, shape(B, n, 1)
        :param bg_index:            Background index
        :param gt_scores:           Tensor (float32) - Score of gt_bboxes, shape(B, n, 1), if None, then it will initialize with one_hot label
        :param pred_bboxes:         Tensor (float32) - predicted bounding boxes, shape(B, L, 4)
        :return:
            - assigned_labels: Tensor of shape (B, L)
            - assigned_bboxes: Tensor of shape (B, L, 4)
            - assigned_scores: Tensor of shape (B, L, C), if pred_bboxes is not None, then output ious
        """
        assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3

        num_anchors, _ = anchor_bboxes.shape
        batch_size, num_max_boxes, _ = gt_bboxes.shape

        # negative batch
        if num_max_boxes == 0:
            assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=anchor_bboxes.device)
            assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=anchor_bboxes.device)
            assigned_scores = torch.zeros([batch_size, num_anchors, self.num_classes], device=anchor_bboxes.device)
            return assigned_labels, assigned_bboxes, assigned_scores

        # 1. compute iou between gt and anchor bbox, [B, n, L]
        ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
        ious = ious.reshape([batch_size, -1, num_anchors])

        # 2. compute center distance between all anchors and gt, [B, n, L]
        gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)
        anchor_centers = bbox_center(anchor_bboxes)
        # gt2anchor_distances = (
        #     (gt_centers - anchor_centers.unsqueeze(0)).norm(2, dim=-1).reshape([batch_size, -1, num_anchors])
        # )

        gt2anchor_distances = torch.norm(gt_centers - anchor_centers.unsqueeze(0), p=2, dim=-1).reshape([batch_size, -1, num_anchors])

        # 3. on each pyramid level, selecting top-k closest candidates
        # based on the center distance, [B, n, L]
        is_in_topk, topk_idxs = self._gather_topk_pyramid(gt2anchor_distances, num_anchors_list, pad_gt_mask)

        # 4. get corresponding iou for the these candidates, and compute the
        # mean and std, 5. set mean + std as the iou threshold
        iou_candidates = ious * is_in_topk

        iou_threshold = torch.gather(iou_candidates.flatten(end_dim=-2), dim=1, index=topk_idxs.flatten(end_dim=-2))

        iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
        iou_threshold = iou_threshold.mean(dim=-1, keepdim=True) + iou_threshold.std(dim=-1, keepdim=True)
        is_in_topk = torch.where(iou_candidates > iou_threshold, is_in_topk, torch.zeros_like(is_in_topk))

        # 6. check the positive sample's center in gt, [B, n, L]
        is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)

        # select positive sample, [B, n, L]
        mask_positive = is_in_topk * is_in_gts
        if pad_gt_mask is not None:
            mask_positive = mask_positive * pad_gt_mask

        # 7. if an anchor box is assigned to multiple gts,
        # the one with the highest iou will be selected.
        mask_positive_sum = mask_positive.sum(dim=-2)
        if mask_positive_sum.max() > 1:
            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
            is_max_iou = compute_max_iou_anchor(ious)
            mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
            mask_positive_sum = mask_positive.sum(dim=-2)
        # 8. make sure every gt_bbox matches the anchor
        if self.force_gt_matching:
            is_max_iou = compute_max_iou_gt(ious)
            if pad_gt_mask is not None:
                is_max_iou = is_max_iou * pad_gt_mask
            mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile([1, num_max_boxes, 1])
            mask_positive = torch.where(mask_max_iou, is_max_iou, mask_positive)
            mask_positive_sum = mask_positive.sum(dim=-2)
        assigned_gt_index = mask_positive.argmax(dim=-2)

        # assigned target
        batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
        assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
        assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))

        # assigned_bboxes = torch.gather(gt_bboxes.reshape([-1, 4]), index=assigned_gt_index.flatten(), dim=0)
        assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])

        assigned_scores = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1).float()
        ind = list(range(self.num_classes + 1))
        ind.remove(bg_index)
        assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device), dim=-1)
        if pred_bboxes is not None:
            # assigned iou
            ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive
            ious = ious.max(dim=-2).values.unsqueeze(-1)
            assigned_scores *= ious
        elif gt_scores is not None:
            gather_scores = torch.gather(gt_scores.flatten(), assigned_gt_index.flatten(), dim=0)
            gather_scores = gather_scores.reshape([batch_size, num_anchors])
            gather_scores = torch.where(mask_positive_sum > 0, gather_scores, torch.zeros_like(gather_scores))
            assigned_scores *= gather_scores.unsqueeze(-1)

        return assigned_labels, assigned_bboxes, assigned_scores

__init__(topk=9, num_classes=80, force_gt_matching=False, eps=1e-09)

Parameters:

Name Type Description Default
topk

Maximum number of achors that is selected for each gt box

9
num_classes 80
force_gt_matching

Guarantee that each gt box is matched to at least one anchor. If two gt boxes match to the same anchor, the one with the larger area will be selected. And the second-best achnor will be assigned to the other gt box.

False
eps

Small constant for numerical stability

1e-09
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def __init__(self, topk=9, num_classes=80, force_gt_matching=False, eps=1e-9):
    """

    :param topk: Maximum number of achors that is selected for each gt box
    :param num_classes:
    :param force_gt_matching: Guarantee that each gt box is matched to at least one anchor.
        If two gt boxes match to the same anchor, the one with the larger area will be selected.
        And the second-best achnor will be assigned to the other gt box.
    :param eps: Small constant for numerical stability
    """
    super(ATSSAssigner, self).__init__()
    self.topk = topk
    self.num_classes = num_classes
    self.force_gt_matching = force_gt_matching
    self.eps = eps

forward(anchor_bboxes, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None, pred_bboxes=None)

This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py

The assignment is done in the following steps:

1. Compute the IoU between all bboxes (bboxes of all pyramid levels) and gt.
2. Compute the center distance between all bboxes and gt.
3. On each pyramid level, for each gt, select the k bboxes whose centers are closest to the gt center, so in total k*l bboxes are selected as candidates for each gt.
4. Get the corresponding IoUs for these candidates, and compute their mean and std; set mean + std as the IoU threshold.
5. Select the candidates whose IoU is greater than or equal to the threshold as positive.
6. Limit the positive samples' centers to lie inside the gt.
7. If an anchor box is assigned to multiple gts, the one with the highest IoU is selected.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| anchor_bboxes | Tensor | Tensor (float32) - pre-defined anchors, shape (L, 4), "xmin, xmax, ymin, ymax" format | required |
| num_anchors_list | list | Number of anchors in each level | required |
| gt_labels | Tensor | Tensor (int64 or int32) - labels of gt_bboxes, shape (B, n, 1) | required |
| gt_bboxes | Tensor | Tensor (float32) - ground truth bboxes, shape (B, n, 4) | required |
| pad_gt_mask | Optional[Tensor] | Tensor (float32) - 1 means bbox, 0 means no bbox, shape (B, n, 1) | required |
| bg_index | int | Background index | required |
| gt_scores | Optional[Tensor] | Tensor (float32) - score of gt_bboxes, shape (B, n, 1); if None, it is initialized with the one-hot label | None |
| pred_bboxes | Optional[Tensor] | Tensor (float32) - predicted bounding boxes, shape (B, L, 4) | None |

Returns:

| Type | Description |
|------|-------------|
| Tuple[Tensor, Tensor, Tensor] | assigned_labels: Tensor of shape (B, L); assigned_bboxes: Tensor of shape (B, L, 4); assigned_scores: Tensor of shape (B, L, C), where the output includes IoUs if pred_bboxes is not None |
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
@torch.no_grad()
def forward(
    self,
    anchor_bboxes: Tensor,
    num_anchors_list: list,
    gt_labels: Tensor,
    gt_bboxes: Tensor,
    pad_gt_mask: Optional[Tensor],
    bg_index: int,
    gt_scores: Optional[Tensor] = None,
    pred_bboxes: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py

    The assignment is done in following steps
    1. compute iou between all bbox (bbox of all pyramid levels) and gt
    2. compute center distance between all bbox and gt
    3. on each pyramid level, for each gt, select k bbox whose center
       are closest to the gt center, so we total select k*l bbox as
       candidates for each gt
    4. get corresponding iou for the these candidates, and compute the
       mean and std, set mean + std as the iou threshold
    5. select these candidates whose iou are greater than or equal to
       the threshold as positive
    6. limit the positive sample's center in gt
    7. if an anchor box is assigned to multiple gts, the one with the
       highest iou will be selected.

    :param anchor_bboxes:       Tensor(float32) - pre-defined anchors, shape(L, 4), "xmin, xmax, ymin, ymax" format
    :param num_anchors_list:    Number of anchors in each level
    :param gt_labels:           Tensor (int64|int32) - Label of gt_bboxes, shape(B, n, 1)
    :param gt_bboxes:           Tensor (float32) - Ground truth bboxes, shape(B, n, 4)
    :param pad_gt_mask:         Tensor (float32) - 1 means bbox, 0 means no bbox, shape(B, n, 1)
    :param bg_index:            Background index
    :param gt_scores:           Tensor (float32) - Score of gt_bboxes, shape(B, n, 1), if None, then it will initialize with one_hot label
    :param pred_bboxes:         Tensor (float32) - predicted bounding boxes, shape(B, L, 4)
    :return:
        - assigned_labels: Tensor of shape (B, L)
        - assigned_bboxes: Tensor of shape (B, L, 4)
        - assigned_scores: Tensor of shape (B, L, C), if pred_bboxes is not None, then output ious
    """
    assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3

    num_anchors, _ = anchor_bboxes.shape
    batch_size, num_max_boxes, _ = gt_bboxes.shape

    # negative batch
    if num_max_boxes == 0:
        assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=anchor_bboxes.device)
        assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=anchor_bboxes.device)
        assigned_scores = torch.zeros([batch_size, num_anchors, self.num_classes], device=anchor_bboxes.device)
        return assigned_labels, assigned_bboxes, assigned_scores

    # 1. compute iou between gt and anchor bbox, [B, n, L]
    ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
    ious = ious.reshape([batch_size, -1, num_anchors])

    # 2. compute center distance between all anchors and gt, [B, n, L]
    gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)
    anchor_centers = bbox_center(anchor_bboxes)
    # gt2anchor_distances = (
    #     (gt_centers - anchor_centers.unsqueeze(0)).norm(2, dim=-1).reshape([batch_size, -1, num_anchors])
    # )

    gt2anchor_distances = torch.norm(gt_centers - anchor_centers.unsqueeze(0), p=2, dim=-1).reshape([batch_size, -1, num_anchors])

    # 3. on each pyramid level, selecting top-k closest candidates
    # based on the center distance, [B, n, L]
    is_in_topk, topk_idxs = self._gather_topk_pyramid(gt2anchor_distances, num_anchors_list, pad_gt_mask)

    # 4. get corresponding iou for the these candidates, and compute the
    # mean and std, 5. set mean + std as the iou threshold
    iou_candidates = ious * is_in_topk

    iou_threshold = torch.gather(iou_candidates.flatten(end_dim=-2), dim=1, index=topk_idxs.flatten(end_dim=-2))

    iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
    iou_threshold = iou_threshold.mean(dim=-1, keepdim=True) + iou_threshold.std(dim=-1, keepdim=True)
    is_in_topk = torch.where(iou_candidates > iou_threshold, is_in_topk, torch.zeros_like(is_in_topk))

    # 6. check the positive sample's center in gt, [B, n, L]
    is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)

    # select positive sample, [B, n, L]
    mask_positive = is_in_topk * is_in_gts
    if pad_gt_mask is not None:
        mask_positive = mask_positive * pad_gt_mask

    # 7. if an anchor box is assigned to multiple gts,
    # the one with the highest iou will be selected.
    mask_positive_sum = mask_positive.sum(dim=-2)
    if mask_positive_sum.max() > 1:
        mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
        is_max_iou = compute_max_iou_anchor(ious)
        mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
        mask_positive_sum = mask_positive.sum(dim=-2)
    # 8. make sure every gt_bbox matches the anchor
    if self.force_gt_matching:
        is_max_iou = compute_max_iou_gt(ious)
        if pad_gt_mask is not None:
            is_max_iou = is_max_iou * pad_gt_mask
        mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile([1, num_max_boxes, 1])
        mask_positive = torch.where(mask_max_iou, is_max_iou, mask_positive)
        mask_positive_sum = mask_positive.sum(dim=-2)
    assigned_gt_index = mask_positive.argmax(dim=-2)

    # assigned target
    batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
    assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
    assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
    assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
    assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))

    # assigned_bboxes = torch.gather(gt_bboxes.reshape([-1, 4]), index=assigned_gt_index.flatten(), dim=0)
    assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
    assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])

    assigned_scores = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1).float()
    ind = list(range(self.num_classes + 1))
    ind.remove(bg_index)
    assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device), dim=-1)
    if pred_bboxes is not None:
        # assigned iou
        ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive
        ious = ious.max(dim=-2).values.unsqueeze(-1)
        assigned_scores *= ious
    elif gt_scores is not None:
        gather_scores = torch.gather(gt_scores.flatten(), assigned_gt_index.flatten(), dim=0)
        gather_scores = gather_scores.reshape([batch_size, num_anchors])
        gather_scores = torch.where(mask_positive_sum > 0, gather_scores, torch.zeros_like(gather_scores))
        assigned_scores *= gather_scores.unsqueeze(-1)

    return assigned_labels, assigned_bboxes, assigned_scores
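
Steps 4 and 5 (turning candidate IoUs into a per-gt adaptive threshold) are the core of ATSS and are compact enough to sketch on their own; the rest of forward is masking and bookkeeping. A toy example with a single gt box and its gathered top-k candidates:

```python
import torch

# IoUs between one gt box and its top-k candidate anchors (gathered across pyramid levels).
candidate_ious = torch.tensor([0.05, 0.10, 0.35, 0.40, 0.55, 0.60])

iou_threshold = candidate_ious.mean() + candidate_ious.std()    # adaptive, per-gt threshold
positive = candidate_ious >= iou_threshold                       # step 5: candidates at or above the threshold
print(iou_threshold, positive)
# A gt surrounded by mostly poor candidates gets a low threshold, a well-covered gt a high one,
# so the number of positives adapts to the object instead of using one global IoU cut-off.
```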

GIoULoss

Bases: object

Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| loss_weight | float | GIoU loss weight; default is 1.0. | 1.0 |
| eps | float | Epsilon to avoid division by zero; default is 1e-10. | 1e-10 |
| reduction | str | Options are "none", "mean" and "sum"; default is "none". | 'none' |
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
class GIoULoss(object):
    """
    Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630

    :param loss_weight: giou loss weight, default as 1
    :param eps:         epsilon to avoid divide by zero, default as 1e-10
    :param reduction:   Options are "none", "mean" and "sum". default as none
    """

    def __init__(self, loss_weight: float = 1.0, eps: float = 1e-10, reduction: str = "none"):
        self.loss_weight = loss_weight
        self.eps = eps
        assert reduction in ("none", "mean", "sum")
        self.reduction = reduction

    def bbox_overlap(self, box1: Tensor, box2: Tensor, eps: float = 1e-10) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Calculate the iou of box1 and box2.

        :param box1:    box1 with the shape (..., 4)
        :param box2:    box2 with the shape (..., 4)
        :param eps:     epsilon to avoid divide by zero
        :return:
            - iou:      iou of box1 and box2
            - overlap:  overlap of box1 and box2
            - union:    union of box1 and box2
        """
        x1, y1, x2, y2 = box1
        x1g, y1g, x2g, y2g = box2

        xkis1 = torch.maximum(x1, x1g)
        ykis1 = torch.maximum(y1, y1g)
        xkis2 = torch.minimum(x2, x2g)
        ykis2 = torch.minimum(y2, y2g)
        w_inter = (xkis2 - xkis1).clip(0)
        h_inter = (ykis2 - ykis1).clip(0)
        overlap = w_inter * h_inter

        area1 = (x2 - x1) * (y2 - y1)
        area2 = (x2g - x1g) * (y2g - y1g)
        union = area1 + area2 - overlap + eps
        iou = overlap / union

        return iou, overlap, union

    def __call__(self, pbox: Tensor, gbox: Tensor, iou_weight=1.0, loc_reweight=None):
        # x1, y1, x2, y2 = torch.split(pbox, split_size_or_sections=4, dim=-1)
        # x1g, y1g, x2g, y2g = torch.split(gbox, split_size_or_sections=4, dim=-1)

        x1, y1, x2, y2 = pbox.chunk(4, dim=-1)
        x1g, y1g, x2g, y2g = gbox.chunk(4, dim=-1)

        box1 = [x1, y1, x2, y2]
        box2 = [x1g, y1g, x2g, y2g]
        iou, overlap, union = self.bbox_overlap(box1, box2, self.eps)
        xc1 = torch.minimum(x1, x1g)
        yc1 = torch.minimum(y1, y1g)
        xc2 = torch.maximum(x2, x2g)
        yc2 = torch.maximum(y2, y2g)

        area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps
        miou = iou - ((area_c - union) / area_c)
        if loc_reweight is not None:
            loc_reweight = torch.reshape(loc_reweight, shape=(-1, 1))
            loc_thresh = 0.9
            giou = 1 - (1 - loc_thresh) * miou - loc_thresh * miou * loc_reweight
        else:
            giou = 1 - miou
        if self.reduction == "none":
            loss = giou
        elif self.reduction == "sum":
            loss = torch.sum(giou * iou_weight)
        else:
            loss = torch.mean(giou * iou_weight)
        return loss * self.loss_weight

bbox_overlap(box1, box2, eps=1e-10)

Calculate the iou of box1 and box2.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| box1 | Tensor | box1 with the shape (..., 4) | required |
| box2 | Tensor | box2 with the shape (..., 4) | required |
| eps | float | Epsilon to avoid division by zero | 1e-10 |

Returns:

| Type | Description |
|------|-------------|
| Tuple[Tensor, Tensor, Tensor] | iou: IoU of box1 and box2; overlap: overlap of box1 and box2; union: union of box1 and box2 |
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def bbox_overlap(self, box1: Tensor, box2: Tensor, eps: float = 1e-10) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Calculate the iou of box1 and box2.

    :param box1:    box1 with the shape (..., 4)
    :param box2:    box2 with the shape (..., 4)
    :param eps:     epsilon to avoid divide by zero
    :return:
        - iou:      iou of box1 and box2
        - overlap:  overlap of box1 and box2
        - union:    union of box1 and box2
    """
    x1, y1, x2, y2 = box1
    x1g, y1g, x2g, y2g = box2

    xkis1 = torch.maximum(x1, x1g)
    ykis1 = torch.maximum(y1, y1g)
    xkis2 = torch.minimum(x2, x2g)
    ykis2 = torch.minimum(y2, y2g)
    w_inter = (xkis2 - xkis1).clip(0)
    h_inter = (ykis2 - ykis1).clip(0)
    overlap = w_inter * h_inter

    area1 = (x2 - x1) * (y2 - y1)
    area2 = (x2g - x1g) * (y2g - y1g)
    union = area1 + area2 - overlap + eps
    iou = overlap / union

    return iou, overlap, union
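
A small worked example of the GIoU term computed by __call__ for a single predicted/ground-truth box pair, using the same intermediate quantities as bbox_overlap (the eps terms are dropped for readability):

```python
import torch

pbox = torch.tensor([[0.0, 0.0, 2.0, 2.0]])     # predicted box, (x1, y1, x2, y2)
gbox = torch.tensor([[1.0, 1.0, 3.0, 3.0]])     # ground-truth box

x1, y1, x2, y2 = pbox.chunk(4, dim=-1)
x1g, y1g, x2g, y2g = gbox.chunk(4, dim=-1)

overlap = (torch.minimum(x2, x2g) - torch.maximum(x1, x1g)).clip(0) * (torch.minimum(y2, y2g) - torch.maximum(y1, y1g)).clip(0)
union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - overlap
iou = overlap / union                                             # 1 / 7, roughly 0.143

# Smallest enclosing box spans (0, 0) to (3, 3), so its area is 9.
area_c = (torch.maximum(x2, x2g) - torch.minimum(x1, x1g)) * (torch.maximum(y2, y2g) - torch.minimum(y1, y1g))
giou_loss = 1 - (iou - (area_c - union) / area_c)                 # roughly 1 - (0.143 - 2/9) = 1.079
print(giou_loss)
```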

PPYoloELoss

Bases: nn.Module

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
@register_loss(name=Losses.PPYOLOE_LOSS, deprecated_name="ppyoloe_loss")
class PPYoloELoss(nn.Module):
    def __init__(
        self,
        num_classes: int,
        use_varifocal_loss: bool = True,
        use_static_assigner: bool = True,
        reg_max=None,
        classification_loss_weight: float = 1.0,
        iou_loss_weight: float = 2.5,
        dfl_loss_weight: float = 0.5,
        use_batched_assignment: bool = True,
    ):
        """
        :param num_classes:                Number of classes
        :param use_varifocal_loss:         Whether to use Varifocal loss for classification loss; otherwise use Focal loss
        :param classification_loss_weight: Classification loss weight
        :param iou_loss_weight:            IoU loss weight
        :param dfl_loss_weight:            DFL loss weight
        :param reg_max:                    (Deprecated) Number of regression bins. Default is None (will be inferred from model's outputs)
        :param use_batched_assignment:     Whether to use batched targets assignment or sequential (per-image).
                                           Default is True (batched).
                                           Batched assignment can be faster when the number of targets per image is more or
                                           less the same across the batch, but it has higher peak GPU memory usage.
                                           Sequential assignment has lower peak GPU memory usage and is preferable for cases
                                           when the number of targets per image varies a lot.
        """
        if reg_max is not None:
            warnings.warn(
                "A reg_max argument is not needed for PPYoloE loss anymore. It is deprecated since SG 3.6.0 and will be removed in the SG 3.8.0."
                "You can safely omit this argument as it is not used anymore and we infer it automatically from model's outputs",
                DeprecationWarning,
            )
        super().__init__()
        self.use_varifocal_loss = use_varifocal_loss
        self.classification_loss_weight = classification_loss_weight
        self.dfl_loss_weight = dfl_loss_weight
        self.iou_loss_weight = iou_loss_weight

        self.iou_loss = GIoULoss()
        self.static_assigner = ATSSAssigner(topk=9, num_classes=num_classes)
        self.assigner = TaskAlignedAssigner(topk=13, alpha=1.0, beta=6.0)
        self.use_static_assigner = use_static_assigner
        self.num_classes = num_classes
        self.reg_max = reg_max
        self.use_batched_assignment = use_batched_assignment

    def get_proj_conv_for_reg_max(self, reg_max: int, device: torch.device) -> Tensor:
        """
        Get projection convolution for regression range [0, reg_max] to convert distribution to bbox coordinates
        :param reg_max: Number of regression bins
        :param device:  The device to create projection convolution on
        :return:        Tensor of shape (1, reg_max + 1, 1, 1)
        """
        proj = torch.linspace(0, reg_max, reg_max + 1, device=device).reshape([1, reg_max + 1, 1, 1])
        return proj

    @torch.no_grad()
    def _get_targets_for_sequential_assigner(self, flat_targets, batch_size: int) -> Tuple[List[Tensor], List[Tensor]]:
        """
        Unpack input targets into list of targets for each sample in batch
        :param flat_targets: (N, 6)
        :return: Tuple of two lists. Each list has [batch_size] elements
                 - List of tensors holding class indexes for each target in image
                 - List of tensors holding bbox coordinates (XYXY) for each target in image
        """

        image_index = flat_targets[:, 0]
        gt_class = flat_targets[:, 1:2].long()
        gt_bbox = cxcywh_to_xyxy(flat_targets[:, 2:6], image_shape=None)

        gt_class_list = []
        gt_bbox_list = []

        for i in range(batch_size):
            mask = image_index == i

            image_labels = gt_class[mask]
            image_bboxes = gt_bbox[mask, :]

            gt_class_list.append(image_labels)
            gt_bbox_list.append(image_bboxes)

        return gt_class_list, gt_bbox_list

    @torch.no_grad()
    def _get_targets_for_batched_assigner(self, targets: torch.Tensor, batch_size: int) -> Mapping[str, torch.Tensor]:
        """
        Convert targets from YoloX format to PPYolo since it's the easiest (not the cleanest) way to
        have PP Yolo training & metrics computed

        :param targets: (N, 6) format of bboxes is meant to be LABEL_CXCYWH (index, c, cx, cy, w, h)
        :return: (Dictionary [str,Tensor]) with keys:
         - gt_class: (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
         - gt_bbox: (Tensor, float32): Ground truth bboxes, shape(B, n, 4) in x1y1x2y2 format
         - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
        """
        image_index = targets[:, 0]
        gt_class = targets[:, 1:2].long()
        gt_bbox = cxcywh_to_xyxy(targets[:, 2:6], image_shape=None)

        per_image_class = []
        per_image_bbox = []
        per_image_pad_mask = []

        max_boxes = 0
        for i in range(batch_size):
            mask = image_index == i

            image_labels = gt_class[mask]
            image_bboxes = gt_bbox[mask, :]
            valid_bboxes = image_bboxes.sum(dim=1, keepdims=True) > 0

            per_image_class.append(image_labels)
            per_image_bbox.append(image_bboxes)
            per_image_pad_mask.append(valid_bboxes)

            max_boxes = max(max_boxes, mask.sum().item())

        for i in range(batch_size):
            elements_to_pad = max_boxes - len(per_image_class[i])
            padding_left = 0
            padding_right = 0
            padding_top = 0
            padding_bottom = elements_to_pad
            pad = padding_left, padding_right, padding_top, padding_bottom
            per_image_class[i] = F.pad(per_image_class[i], pad, mode="constant", value=0)
            per_image_bbox[i] = F.pad(per_image_bbox[i], pad, mode="constant", value=0)
            per_image_pad_mask[i] = F.pad(per_image_pad_mask[i], pad, mode="constant", value=0)

        return {
            "gt_class": torch.stack(per_image_class, dim=0),
            "gt_bbox": torch.stack(per_image_bbox, dim=0),
            "pad_gt_mask": torch.stack(per_image_pad_mask, dim=0),
        }

    def _forward_batched(
        self,
        predictions: Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor],
        targets: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """
        Compute the loss using batched targets-anchors assignment.
        This is the default way to compute the loss; however, it may cause OOM errors when the number of targets
        per image varies a lot within a batch or when there are many targets per image.

        :param predictions: Model's predictions
        :param targets:     List of targets in flat format (N, 6)
        :return:            Tuple of (classification loss, iou loss, dfl loss, assigned scores sum)
        """
        (
            pred_scores,
            pred_distri,
            anchors,
            anchor_points,
            num_anchors_list,
            stride_tensor,
        ) = predictions

        targets = self._get_targets_for_batched_assigner(targets, batch_size=pred_scores.size(0))  # yolox -> ppyolo

        anchor_points_s = anchor_points / stride_tensor
        pred_bboxes, reg_max, proj_conv = self._bbox_decode(anchor_points_s, pred_distri)

        gt_labels = targets["gt_class"]
        gt_bboxes = targets["gt_bbox"]
        pad_gt_mask = targets["pad_gt_mask"]

        # label assignment
        if self.use_static_assigner:
            assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
                anchor_bboxes=anchors,
                num_anchors_list=num_anchors_list,
                gt_labels=gt_labels,
                gt_bboxes=gt_bboxes,
                pad_gt_mask=pad_gt_mask,
                bg_index=self.num_classes,
                pred_bboxes=pred_bboxes.detach() * stride_tensor,
            )
            alpha_l = 0.25
        else:
            assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
                pred_scores=pred_scores.detach().sigmoid(),  # Pred scores are logits on training for numerical stability
                pred_bboxes=pred_bboxes.detach() * stride_tensor,
                anchor_points=anchor_points,
                num_anchors_list=num_anchors_list,
                gt_labels=gt_labels,
                gt_bboxes=gt_bboxes,
                pad_gt_mask=pad_gt_mask,
                bg_index=self.num_classes,
            )
            alpha_l = -1
        # cls loss
        if self.use_varifocal_loss:
            one_hot_label = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1)[..., :-1]
            cls_loss_sum = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label)
        else:
            cls_loss_sum = self._focal_loss(pred_scores, assigned_scores, alpha_l)

        assigned_scores_sum = assigned_scores.sum()

        iou_loss_sum, dfl_loss_sum = self._bbox_loss(
            pred_distri,
            pred_bboxes,
            anchor_points_s,
            assigned_labels,
            assigned_bboxes / stride_tensor,  # rescale bbox
            assigned_scores,
            reg_max,
        )

        return cls_loss_sum, iou_loss_sum, dfl_loss_sum, assigned_scores_sum

    def _forward_sequential(
        self,
        predictions: Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor],
        targets: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """
        Compute the loss using sequential (per-image) targets-anchors assignment.
        It computes assignment & loss per image, which does not cause OOM errors and in some cases can be faster
        than batched assignment (when the number of targets per image varies a lot within a batch).

        :param predictions: Model's predictions
        :param targets:     List of targets in flat format (N, 6)
        :return:            Tuple of (classification loss, iou loss, dfl loss, assigned scores sum)
        """
        (
            cls_score_list,
            reg_distri_list,
            anchors,
            anchor_points,
            num_anchors_list,
            stride_tensor,
        ) = predictions

        anchor_points_s = anchor_points / stride_tensor

        batch_size = cls_score_list.size(0)
        gt_class_list, gt_bbox_list = self._get_targets_for_sequential_assigner(targets, batch_size=batch_size)

        cls_loss_sum = 0
        iou_loss_sum = 0
        dfl_loss_sum = 0
        assigned_scores_sum_total = 0

        for gt_class, gt_bbox, pred_scores, pred_distri in zip(gt_class_list, gt_bbox_list, cls_score_list, reg_distri_list):
            pred_scores = pred_scores.unsqueeze(0)  # Add dummy batch dimension
            pred_distri = pred_distri.unsqueeze(0)  # Add dummy batch dimension

            pred_bboxes, reg_max, proj_conv = self._bbox_decode(anchor_points_s, pred_distri)

            # label assignment
            if self.use_static_assigner:
                assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
                    anchor_bboxes=anchors,
                    num_anchors_list=num_anchors_list,
                    gt_labels=gt_class.unsqueeze(0),
                    gt_bboxes=gt_bbox.unsqueeze(0),
                    pad_gt_mask=None,
                    bg_index=self.num_classes,
                    pred_bboxes=pred_bboxes.detach() * stride_tensor,
                )
                alpha_l = 0.25
            else:
                assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
                    pred_scores=pred_scores.detach().sigmoid(),  # Pred scores are logits on training for numerical stability
                    pred_bboxes=pred_bboxes.detach() * stride_tensor,
                    anchor_points=anchor_points,
                    num_anchors_list=num_anchors_list,
                    gt_labels=gt_class.unsqueeze(0),
                    gt_bboxes=gt_bbox.unsqueeze(0),
                    pad_gt_mask=None,
                    bg_index=self.num_classes,
                )
                alpha_l = -1

            # cls loss
            if self.use_varifocal_loss:
                one_hot_label = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1)[..., :-1]
                cls_loss = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label)
            else:
                cls_loss = self._focal_loss(pred_scores, assigned_scores, alpha_l)

            assigned_scores_sum = assigned_scores.sum()

            loss_iou, loss_dfl = self._bbox_loss(
                pred_distri,
                pred_bboxes,
                anchor_points_s,
                assigned_labels,
                assigned_bboxes / stride_tensor,  # rescale bbox
                assigned_scores,
                reg_max,
            )

            cls_loss_sum = cls_loss + cls_loss_sum
            iou_loss_sum = loss_iou + iou_loss_sum
            dfl_loss_sum = loss_dfl + dfl_loss_sum
            assigned_scores_sum_total = assigned_scores_sum + assigned_scores_sum_total

        return cls_loss_sum, iou_loss_sum, dfl_loss_sum, assigned_scores_sum_total

    def forward(
        self,
        outputs: Union[
            Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor], Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]]
        ],
        targets: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        :param outputs: Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor
        :param targets: Targets in flat format (N, 6); each row is (image_index, class, cx, cy, w, h) in LABEL_CXCYWH format
        :return:        Tuple of (loss, log_losses), where log_losses stacks the detached cls/iou/dfl/total loss values
        """
        # in test/eval mode the model outputs a tuple where the second item is the raw predictions
        if isinstance(outputs, tuple) and len(outputs) == 2:
            # in test/eval mode the Yolo model outputs a tuple where the second item is the raw predictions
            _, predictions = outputs
        else:
            predictions = outputs

        if self.use_batched_assignment:
            cls_loss_sum, iou_loss_sum, dfl_loss_sum, assigned_scores_sum = self._forward_batched(predictions, targets)
        else:
            cls_loss_sum, iou_loss_sum, dfl_loss_sum, assigned_scores_sum = self._forward_sequential(predictions, targets)

        if super_gradients.is_distributed():
            torch.distributed.all_reduce(cls_loss_sum, op=torch.distributed.ReduceOp.SUM)
            torch.distributed.all_reduce(iou_loss_sum, op=torch.distributed.ReduceOp.SUM)
            torch.distributed.all_reduce(dfl_loss_sum, op=torch.distributed.ReduceOp.SUM)
            torch.distributed.all_reduce(assigned_scores_sum, op=torch.distributed.ReduceOp.SUM)
            # This is not an error, it will cancel out since loss is reduced using averaging in DDP
            assigned_scores_sum /= get_world_size()

        assigned_scores_sum = torch.clip(assigned_scores_sum, min=1.0)

        cls_loss = self.classification_loss_weight * cls_loss_sum / assigned_scores_sum
        iou_loss = self.iou_loss_weight * iou_loss_sum / assigned_scores_sum
        dfl_loss = self.dfl_loss_weight * dfl_loss_sum / assigned_scores_sum
        loss = cls_loss + iou_loss + dfl_loss

        log_losses = torch.stack([cls_loss.detach(), iou_loss.detach(), dfl_loss.detach(), loss.detach()])

        return loss, log_losses

    @property
    def component_names(self):
        return ["loss_cls", "loss_iou", "loss_dfl", "loss"]

    def _df_loss(self, pred_dist: Tensor, target: Tensor) -> Tensor:
        target_left = target.long()
        target_right = target_left + 1
        weight_left = target_right.float() - target
        weight_right = 1 - weight_left

        # [B,L,C] -> [B,C,L] to make compatible with torch.nn.functional.cross_entropy
        # which expects channel dim to be at index 1
        pred_dist = torch.moveaxis(pred_dist, -1, 1)

        loss_left = torch.nn.functional.cross_entropy(pred_dist, target_left, reduction="none") * weight_left
        loss_right = torch.nn.functional.cross_entropy(pred_dist, target_right, reduction="none") * weight_right
        return (loss_left + loss_right).mean(dim=-1, keepdim=True)

    def _bbox_loss(
        self,
        pred_dist,
        pred_bboxes,
        anchor_points,
        assigned_labels,
        assigned_bboxes,
        assigned_scores,
        reg_max: int,
    ) -> Tuple[Tensor, Tensor]:
        """
        Compute IoU and DFL terms of the loss
        :param pred_dist:
        :param pred_bboxes:
        :param anchor_points:
        :param assigned_labels:
        :param assigned_bboxes:
        :param assigned_scores:
        :return: (Tensor, Tensor) Tuple of IoU and DFL losses, respectively.
                 Both are single-element tensors with the sum of loss values for all positive targets.
        """
        # select positive samples mask
        mask_positive = assigned_labels != self.num_classes
        num_pos = mask_positive.sum()
        # pos/neg loss
        if num_pos > 0:
            # l1 + iou
            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
            pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = torch.masked_select(assigned_bboxes, bbox_mask).reshape([-1, 4])
            bbox_weight = torch.masked_select(assigned_scores.sum(-1), mask_positive).unsqueeze(-1)

            loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight
            loss_iou = loss_iou.sum()

            dist_mask = mask_positive.unsqueeze(-1).tile([1, 1, (reg_max + 1) * 4])
            pred_dist_pos = torch.masked_select(pred_dist, dist_mask).reshape([-1, 4, reg_max + 1])
            assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes, reg_max)
            assigned_ltrb_pos = torch.masked_select(assigned_ltrb, bbox_mask).reshape([-1, 4])
            loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos) * bbox_weight
            loss_dfl = loss_dfl.sum()
        else:
            loss_iou = torch.zeros([], device=pred_bboxes.device)
            loss_dfl = pred_dist.sum() * 0.0
        return loss_iou, loss_dfl

    def _bbox_decode(self, anchor_points: Tensor, pred_dist: Tensor):
        b, l, *_ = pred_dist.size()
        pred_dist = pred_dist.reshape([b, l, 4, -1])
        reg_max = pred_dist.size(-1) - 1
        proj_conv = self.get_proj_conv_for_reg_max(reg_max, device=pred_dist.device)
        pred_dist = torch.softmax(pred_dist, dim=-1)
        pred_dist = torch.nn.functional.conv2d(pred_dist.permute(0, 3, 1, 2), proj_conv).squeeze(1)
        return batch_distance2bbox(anchor_points, pred_dist), reg_max, proj_conv

    def _bbox2distance(self, points, bbox, reg_max: int):
        x1y1, x2y2 = torch.split(bbox, 2, -1)
        lt = points - x1y1
        rb = x2y2 - points
        return torch.cat([lt, rb], dim=-1).clip(0, reg_max - 0.01)

    @staticmethod
    def _focal_loss(pred_logits: Tensor, label: Tensor, alpha=0.25, gamma=2.0) -> Tensor:
        pred_score = pred_logits.sigmoid()
        weight = (pred_score - label).pow(gamma)
        if alpha > 0:
            alpha_t = alpha * label + (1 - alpha) * (1 - label)
            weight *= alpha_t
        loss = weight * torch.nn.functional.binary_cross_entropy_with_logits(pred_logits, label, reduction="none")
        return loss.sum()

    @staticmethod
    def _varifocal_loss(pred_logits: Tensor, gt_score: Tensor, label: Tensor, alpha=0.75, gamma=2.0) -> Tensor:
        pred_score = pred_logits.sigmoid()
        weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
        loss = weight * torch.nn.functional.binary_cross_entropy_with_logits(pred_logits, gt_score, reduction="none")
        return loss.sum()

__init__(num_classes, use_varifocal_loss=True, use_static_assigner=True, reg_max=None, classification_loss_weight=1.0, iou_loss_weight=2.5, dfl_loss_weight=0.5, use_batched_assignment=True)

Parameters:

Name Type Description Default
num_classes int

Number of classes

required
use_varifocal_loss bool

Whether to use Varifocal loss for classification loss; otherwise use Focal loss

True
classification_loss_weight float

Classification loss weight

1.0
iou_loss_weight float

IoU loss weight

2.5
dfl_loss_weight float

DFL loss weight

0.5
reg_max

(Deprecated) Number of regression bins. Default is None (will be inferred from model's outputs)

None
use_batched_assignment bool

Whether to use batched targets assignment or sequential (per-image). Default is True (batched). Batched assignment can be faster when the number of targets per image is more or less the same across the batch, but it has higher peak GPU memory usage. Sequential assignment has lower peak GPU memory usage and is preferable for cases when the number of targets per image varies a lot.

True
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def __init__(
    self,
    num_classes: int,
    use_varifocal_loss: bool = True,
    use_static_assigner: bool = True,
    reg_max=None,
    classification_loss_weight: float = 1.0,
    iou_loss_weight: float = 2.5,
    dfl_loss_weight: float = 0.5,
    use_batched_assignment: bool = True,
):
    """
    :param num_classes:                Number of classes
    :param use_varifocal_loss:         Whether to use Varifocal loss for classification loss; otherwise use Focal loss
    :param classification_loss_weight: Classification loss weight
    :param iou_loss_weight:            IoU loss weight
    :param dfl_loss_weight:            DFL loss weight
    :param reg_max:                    (Deprecated) Number of regression bins. Default is None (will be inferred from model's outputs)
    :param use_batched_assignment:     Whether to use batched targets assignment or sequential (per-image).
                                       Default is True (batched).
                                       Batched assignment can be faster when the number of targets per image is more or
                                       less the same across the batch, but it has higher peak GPU memory usage.
                                       Sequential assignment has lower peak GPU memory usage and is preferable for cases
                                       when the number of targets per image varies a lot.
    """
    if reg_max is not None:
        warnings.warn(
            "A reg_max argument is not needed for PPYoloE loss anymore. It is deprecated since SG 3.6.0 and will be removed in the SG 3.8.0."
            "You can safely omit this argument as it is not used anymore and we infer it automatically from model's outputs",
            DeprecationWarning,
        )
    super().__init__()
    self.use_varifocal_loss = use_varifocal_loss
    self.classification_loss_weight = classification_loss_weight
    self.dfl_loss_weight = dfl_loss_weight
    self.iou_loss_weight = iou_loss_weight

    self.iou_loss = GIoULoss()
    self.static_assigner = ATSSAssigner(topk=9, num_classes=num_classes)
    self.assigner = TaskAlignedAssigner(topk=13, alpha=1.0, beta=6.0)
    self.use_static_assigner = use_static_assigner
    self.num_classes = num_classes
    self.reg_max = reg_max
    self.use_batched_assignment = use_batched_assignment
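
A minimal construction sketch; the import path is assumed from the source location shown above, and the argument values are illustrative:

# Import path assumed from the source file location; verify against your installed version.
from super_gradients.training.losses import PPYoloELoss

criterion = PPYoloELoss(
    num_classes=80,               # e.g. a COCO-like dataset
    use_varifocal_loss=True,      # Varifocal classification loss (default)
    use_static_assigner=False,    # use the TaskAlignedAssigner instead of the ATSS assigner
    use_batched_assignment=True,  # set False to trade speed for lower peak GPU memory
)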

forward(outputs, targets)

Parameters:

Name Type Description Default
outputs Union[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor], Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]]]

Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor

required
targets Tensor

Targets in flat format (N, 6); each row is (image_index, class, cx, cy, w, h) in LABEL_CXCYWH format.

required

Returns:

Type Description
Tuple[Tensor, Tensor]

Tuple of (loss, log_losses), where log_losses stacks the detached classification, IoU, DFL, and total loss values.
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def forward(
    self,
    outputs: Union[
        Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor], Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]]
    ],
    targets: Tensor,
) -> Tuple[Tensor, Tensor]:
    """
    :param outputs: Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor
    :param targets: Targets in flat format (N, 6); each row is (image_index, class, cx, cy, w, h) in LABEL_CXCYWH format
    :return:        Tuple of (loss, log_losses), where log_losses stacks the detached cls/iou/dfl/total loss values
    """
    # in test/eval mode the model outputs a tuple where the second item is the raw predictions
    if isinstance(outputs, tuple) and len(outputs) == 2:
        # in test/eval mode the Yolo model outputs a tuple where the second item is the raw predictions
        _, predictions = outputs
    else:
        predictions = outputs

    if self.use_batched_assignment:
        cls_loss_sum, iou_loss_sum, dfl_loss_sum, assigned_scores_sum = self._forward_batched(predictions, targets)
    else:
        cls_loss_sum, iou_loss_sum, dfl_loss_sum, assigned_scores_sum = self._forward_sequential(predictions, targets)

    if super_gradients.is_distributed():
        torch.distributed.all_reduce(cls_loss_sum, op=torch.distributed.ReduceOp.SUM)
        torch.distributed.all_reduce(iou_loss_sum, op=torch.distributed.ReduceOp.SUM)
        torch.distributed.all_reduce(dfl_loss_sum, op=torch.distributed.ReduceOp.SUM)
        torch.distributed.all_reduce(assigned_scores_sum, op=torch.distributed.ReduceOp.SUM)
        # This is not an error, it will cancel out since loss is reduced using averaging in DDP
        assigned_scores_sum /= get_world_size()

    assigned_scores_sum = torch.clip(assigned_scores_sum, min=1.0)

    cls_loss = self.classification_loss_weight * cls_loss_sum / assigned_scores_sum
    iou_loss = self.iou_loss_weight * iou_loss_sum / assigned_scores_sum
    dfl_loss = self.dfl_loss_weight * dfl_loss_sum / assigned_scores_sum
    loss = cls_loss + iou_loss + dfl_loss

    log_losses = torch.stack([cls_loss.detach(), iou_loss.detach(), dfl_loss.detach(), loss.detach()])

    return loss, log_losses
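
In forward, each per-component sum is normalized by the clipped total of assigned scores and scaled by its loss weight; the stacked log_losses follow component_names ("loss_cls", "loss_iou", "loss_dfl", "loss"). A small numeric illustration with made-up sums and the default weights:

import torch

# Made-up per-component sums, as produced by _forward_batched / _forward_sequential.
cls_loss_sum = torch.tensor(120.0)
iou_loss_sum = torch.tensor(30.0)
dfl_loss_sum = torch.tensor(45.0)
assigned_scores_sum = torch.clip(torch.tensor(60.0), min=1.0)

# Default weights from __init__: 1.0 / 2.5 / 0.5.
cls_loss = 1.0 * cls_loss_sum / assigned_scores_sum   # 2.0
iou_loss = 2.5 * iou_loss_sum / assigned_scores_sum   # 1.25
dfl_loss = 0.5 * dfl_loss_sum / assigned_scores_sum   # 0.375
loss = cls_loss + iou_loss + dfl_loss                 # 3.625

log_losses = torch.stack([cls_loss, iou_loss, dfl_loss, loss])
print(log_losses)  # tensor([2.0000, 1.2500, 0.3750, 3.6250])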

get_proj_conv_for_reg_max(reg_max, device)

Get projection convolution for regression range [0, reg_max] to convert distribution to bbox coordinates

Parameters:

Name Type Description Default
reg_max int

Number of regression bins

required
device torch.device

The device to create projection convolution on

required

Returns:

Type Description
Tensor

Tensor of shape (1, reg_max + 1, 1, 1)

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def get_proj_conv_for_reg_max(self, reg_max: int, device: torch.device) -> Tensor:
    """
    Get projection convolution for regression range [0, reg_max] to convert distribution to bbox coordinates
    :param reg_max: Number of regression bins
    :param device:  The device to create projection convolution on
    :return:        Tensor of shape (1, reg_max + 1, 1, 1)
    """
    proj = torch.linspace(0, reg_max, reg_max + 1, device=device).reshape([1, reg_max + 1, 1, 1])
    return proj
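
To make the projection concrete: softmax over the reg_max + 1 bins followed by a dot product with [0, 1, ..., reg_max] gives the expected bin value, which _bbox_decode computes for all four box edges at once via conv2d. A 1-D illustrative sketch with made-up logits:

import torch

reg_max = 16
proj = torch.linspace(0, reg_max, reg_max + 1)  # [0, 1, ..., 16]

# Illustrative logits for one box edge over reg_max + 1 distance bins:
# almost all probability mass is split between bins 3 and 4.
logits = torch.zeros(reg_max + 1)
logits[3], logits[4] = 8.0, 8.0

distance = (torch.softmax(logits, dim=-1) * proj).sum()
print(distance)  # ~3.5, the expected value of the bin distribution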

TaskAlignedAssigner

Bases: nn.Module

TOOD: Task-aligned One-stage Object Detection

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
class TaskAlignedAssigner(nn.Module):
    """TOOD: Task-aligned One-stage Object Detection"""

    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
        """

        :param topk: Maximum number of anchors that are selected for each gt box
        :param alpha: Power factor for class probabilities of predicted boxes (used to compute the alignment metric)
        :param beta: Power factor for IoU score of predicted boxes (used to compute the alignment metric)
        :param eps: Small constant for numerical stability
        """
        super(TaskAlignedAssigner, self).__init__()
        self.topk = topk
        self.alpha = alpha
        self.beta = beta
        self.eps = eps

    @torch.no_grad()
    def forward(
        self,
        pred_scores: Tensor,
        pred_bboxes: Tensor,
        anchor_points: Tensor,
        num_anchors_list: list,
        gt_labels: Tensor,
        gt_bboxes: Tensor,
        pad_gt_mask: Optional[Tensor],
        bg_index: int,
        gt_scores: Optional[Tensor] = None,
    ):
        """
        This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py

        The assignment is done in the following steps
        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
        2. select top-k bbox as candidates for each gt
        3. limit the positive sample's center in gt (because the anchor-free detector
           only can predict positive distance)
        4. if an anchor box is assigned to multiple gts, the one with the
           highest iou will be selected.

        :param pred_scores: Tensor (float32): predicted class probability, shape(B, L, C)
        :param pred_bboxes: Tensor (float32): predicted bounding boxes, shape(B, L, 4)
        :param anchor_points: Tensor (float32): pre-defined anchors, shape(L, 2), "cxcy" format
        :param num_anchors_list: List of the number of anchors in each level, shape (L)
        :param gt_labels: Tensor (int64|int32): Label of gt_bboxes, shape(B, n, 1)
        :param gt_bboxes: Tensor (float32): Ground truth bboxes, shape(B, n, 4)
        :param pad_gt_mask: Tensor (float32): 1 means bbox, 0 means no bbox, shape(B, n, 1).
                            Can be None, which means all gt_bboxes are valid.
        :param bg_index: Background index (int)
        :param gt_scores: Tensor (float32, optional): Score of gt_bboxes, shape(B, n, 1)
        :return:
            - assigned_labels, Tensor of shape (B, L)
            - assigned_bboxes, Tensor of shape (B, L, 4)
            - assigned_scores, Tensor of shape (B, L, C)
        """
        assert pred_scores.ndim == pred_bboxes.ndim
        assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3

        batch_size, num_anchors, num_classes = pred_scores.shape
        _, num_max_boxes, _ = gt_bboxes.shape

        # negative batch
        if num_max_boxes == 0:
            assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=gt_labels.device)
            assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=gt_labels.device)
            assigned_scores = torch.zeros([batch_size, num_anchors, num_classes], device=gt_labels.device)
            return assigned_labels, assigned_bboxes, assigned_scores

        # compute iou between gt and pred bbox, [B, n, L]
        ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
        # gather pred bboxes class score
        pred_scores = torch.permute(pred_scores, [0, 2, 1])
        batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
        gt_labels_ind = torch.stack([batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], dim=-1)

        bbox_cls_scores = pred_scores[gt_labels_ind[..., 0], gt_labels_ind[..., 1]]

        # compute alignment metrics, [B, n, L]
        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(self.beta)

        # check the positive sample's center in gt, [B, n, L]
        is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)

        # select topk largest alignment metrics pred bbox as candidates
        # for each gt, [B, n, L]
        is_in_topk = gather_topk_anchors(alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)

        # select positive sample, [B, n, L]
        mask_positive = is_in_topk * is_in_gts
        if pad_gt_mask is not None:
            mask_positive *= pad_gt_mask

        # if an anchor box is assigned to multiple gts,
        # the one with the highest iou will be selected, [B, n, L]
        mask_positive_sum = mask_positive.sum(dim=-2)
        if mask_positive_sum.max() > 1:
            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
            is_max_iou = compute_max_iou_anchor(ious)
            mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
            mask_positive_sum = mask_positive.sum(dim=-2)
        assigned_gt_index = mask_positive.argmax(dim=-2)

        # assigned target
        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
        assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
        assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))

        assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])

        assigned_scores = torch.nn.functional.one_hot(assigned_labels, num_classes + 1)
        ind = list(range(num_classes + 1))
        ind.remove(bg_index)
        assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device, dtype=torch.long), dim=-1)
        # rescale alignment metrics
        alignment_metrics *= mask_positive
        max_metrics_per_instance = alignment_metrics.max(dim=-1, keepdim=True).values
        max_ious_per_instance = (ious * mask_positive).max(dim=-1, keepdim=True).values
        alignment_metrics = alignment_metrics / (max_metrics_per_instance + self.eps) * max_ious_per_instance
        alignment_metrics = alignment_metrics.max(dim=-2).values.unsqueeze(-1)
        assigned_scores = assigned_scores * alignment_metrics

        return assigned_labels, assigned_bboxes, assigned_scores

__init__(topk=13, alpha=1.0, beta=6.0, eps=1e-09)

Parameters:

Name Type Description Default
topk

Maximum number of anchors that are selected for each gt box

13
alpha

Power factor for class probabilities of predicted boxes (used to compute the alignment metric)

1.0
beta

Power factor for IoU score of predicted boxes (used to compute the alignment metric)

6.0
eps

Small constant for numerical stability

1e-09
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
    """

    :param topk: Maximum number of anchors that are selected for each gt box
    :param alpha: Power factor for class probabilities of predicted boxes (used to compute the alignment metric)
    :param beta: Power factor for IoU score of predicted boxes (used to compute the alignment metric)
    :param eps: Small constant for numerical stability
    """
    super(TaskAlignedAssigner, self).__init__()
    self.topk = topk
    self.alpha = alpha
    self.beta = beta
    self.eps = eps

forward(pred_scores, pred_bboxes, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None)

This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py

The assignment is done in the following steps:

1. Compute the alignment metric between every predicted bbox (across all pyramid levels) and every gt (illustrated in the sketch below).
2. Select the top-k bboxes as candidates for each gt.
3. Keep only candidates whose center lies inside the gt (the anchor-free detector can only predict positive distances).
4. If an anchor box is assigned to multiple gts, the one with the highest IoU is selected.
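
Step 1 boils down to the alignment metric bbox_cls_scores ** alpha * ious ** beta computed in the source below; a minimal sketch with made-up tensors (shapes follow the B, n, L convention used here, and alpha/beta are the defaults from __init__):

import torch

# Illustrative shapes: B=1 image, n=2 ground-truth boxes, L=3 anchors (values are random).
bbox_cls_scores = torch.rand(1, 2, 3)  # predicted score of each anchor for its gt's class
ious = torch.rand(1, 2, 3)             # IoU between each gt box and each predicted box

alpha, beta = 1.0, 6.0                 # defaults from TaskAlignedAssigner.__init__
alignment_metrics = bbox_cls_scores.pow(alpha) * ious.pow(beta)  # shape (1, 2, 3)
print(alignment_metrics.shape)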

Parameters:

Name Type Description Default
pred_scores Tensor

Tensor (float32): predicted class probability, shape(B, L, C)

required
pred_bboxes Tensor

Tensor (float32): predicted bounding boxes, shape(B, L, 4)

required
anchor_points Tensor

Tensor (float32): pre-defined anchors, shape(L, 2), "cxcy" format

required
num_anchors_list list

List of the number of anchors in each level, shape (L)

required
gt_labels Tensor

Tensor (int64|int32): Label of gt_bboxes, shape(B, n, 1)

required
gt_bboxes Tensor

Tensor (float32): Ground truth bboxes, shape(B, n, 4)

required
pad_gt_mask Optional[Tensor]

Tensor (float32): 1 means bbox, 0 means no bbox, shape(B, n, 1). Can be None, which means all gt_bboxes are valid.

required
bg_index int

Background index (int)

required
gt_scores Optional[Tensor]

Tensor (float32, optional): Score of gt_bboxes, shape(B, n, 1)

None

Returns:

Type Description
- assigned_labels: Tensor of shape (B, L)
- assigned_bboxes: Tensor of shape (B, L, 4)
- assigned_scores: Tensor of shape (B, L, C)
Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
@torch.no_grad()
def forward(
    self,
    pred_scores: Tensor,
    pred_bboxes: Tensor,
    anchor_points: Tensor,
    num_anchors_list: list,
    gt_labels: Tensor,
    gt_bboxes: Tensor,
    pad_gt_mask: Optional[Tensor],
    bg_index: int,
    gt_scores: Optional[Tensor] = None,
):
    """
    This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py

    The assignment is done in the following steps
    1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
    2. select top-k bbox as candidates for each gt
    3. limit the positive sample's center in gt (because the anchor-free detector
       only can predict positive distance)
    4. if an anchor box is assigned to multiple gts, the one with the
       highest iou will be selected.

    :param pred_scores: Tensor (float32): predicted class probability, shape(B, L, C)
    :param pred_bboxes: Tensor (float32): predicted bounding boxes, shape(B, L, 4)
    :param anchor_points: Tensor (float32): pre-defined anchors, shape(L, 2), "cxcy" format
    :param num_anchors_list: List of the number of anchors in each level, shape (L)
    :param gt_labels: Tensor (int64|int32): Label of gt_bboxes, shape(B, n, 1)
    :param gt_bboxes: Tensor (float32): Ground truth bboxes, shape(B, n, 4)
    :param pad_gt_mask: Tensor (float32): 1 means bbox, 0 means no bbox, shape(B, n, 1).
                        Can be None, which means all gt_bboxes are valid.
    :param bg_index: Background index (int)
    :param gt_scores: Tensor (float32, optional): Score of gt_bboxes, shape(B, n, 1)
    :return:
        - assigned_labels, Tensor of shape (B, L)
        - assigned_bboxes, Tensor of shape (B, L, 4)
        - assigned_scores, Tensor of shape (B, L, C)
    """
    assert pred_scores.ndim == pred_bboxes.ndim
    assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3

    batch_size, num_anchors, num_classes = pred_scores.shape
    _, num_max_boxes, _ = gt_bboxes.shape

    # negative batch
    if num_max_boxes == 0:
        assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=gt_labels.device)
        assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=gt_labels.device)
        assigned_scores = torch.zeros([batch_size, num_anchors, num_classes], device=gt_labels.device)
        return assigned_labels, assigned_bboxes, assigned_scores

    # compute iou between gt and pred bbox, [B, n, L]
    ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
    # gather pred bboxes class score
    pred_scores = torch.permute(pred_scores, [0, 2, 1])
    batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
    gt_labels_ind = torch.stack([batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], dim=-1)

    bbox_cls_scores = pred_scores[gt_labels_ind[..., 0], gt_labels_ind[..., 1]]

    # compute alignment metrics, [B, n, L]
    alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(self.beta)

    # check the positive sample's center in gt, [B, n, L]
    is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)

    # select topk largest alignment metrics pred bbox as candidates
    # for each gt, [B, n, L]
    is_in_topk = gather_topk_anchors(alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)

    # select positive sample, [B, n, L]
    mask_positive = is_in_topk * is_in_gts
    if pad_gt_mask is not None:
        mask_positive *= pad_gt_mask

    # if an anchor box is assigned to multiple gts,
    # the one with the highest iou will be selected, [B, n, L]
    mask_positive_sum = mask_positive.sum(dim=-2)
    if mask_positive_sum.max() > 1:
        mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
        is_max_iou = compute_max_iou_anchor(ious)
        mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
        mask_positive_sum = mask_positive.sum(dim=-2)
    assigned_gt_index = mask_positive.argmax(dim=-2)

    # assigned target
    assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
    assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
    assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
    assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))

    assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
    assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])

    assigned_scores = torch.nn.functional.one_hot(assigned_labels, num_classes + 1)
    ind = list(range(num_classes + 1))
    ind.remove(bg_index)
    assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device, dtype=torch.long), dim=-1)
    # rescale alignment metrics
    alignment_metrics *= mask_positive
    max_metrics_per_instance = alignment_metrics.max(dim=-1, keepdim=True).values
    max_ious_per_instance = (ious * mask_positive).max(dim=-1, keepdim=True).values
    alignment_metrics = alignment_metrics / (max_metrics_per_instance + self.eps) * max_ious_per_instance
    alignment_metrics = alignment_metrics.max(dim=-2).values.unsqueeze(-1)
    assigned_scores = assigned_scores * alignment_metrics

    return assigned_labels, assigned_bboxes, assigned_scores

batch_iou_similarity(box1, box2, eps=1e-09)

Calculate iou of box1 and box2 in batch. Bboxes are expected to be in x1y1x2y2 format.

Parameters:

Name Type Description Default
box1 torch.Tensor

box with the shape [N, M1, 4]

required
box2 torch.Tensor

box with the shape [N, M2, 4]

required

Returns:

Type Description
torch.Tensor

iou between box1 and box2 with the shape [N, M1, M2]

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def batch_iou_similarity(box1: torch.Tensor, box2: torch.Tensor, eps: float = 1e-9) -> torch.Tensor:
    """Calculate iou of box1 and box2 in batch. Bboxes are expected to be in x1y1x2y2 format.

    :param box1: box with the shape [N, M1, 4]
    :param box2: box with the shape [N, M2, 4]
    :return iou: iou between box1 and box2 with the shape [N, M1, M2]

    """
    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
    x1y1 = torch.maximum(px1y1, gx1y1)
    x2y2 = torch.minimum(px2y2, gx2y2)
    overlap = (x2y2 - x1y1).clip(0).prod(-1)
    area1 = (px2y2 - px1y1).clip(0).prod(-1)
    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
    union = area1 + area2 - overlap + eps
    return overlap / union
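
A small usage sketch; the import path is assumed from the source location shown above, and the boxes are illustrative:

import torch

# Import path assumed from the source file location; verify against your installed version.
from super_gradients.training.losses.ppyolo_loss import batch_iou_similarity

# One image (N=1) with one gt box (M1=1) and two candidate boxes (M2=2), x1y1x2y2 format.
gt = torch.tensor([[[0.0, 0.0, 10.0, 10.0]]])
preds = torch.tensor([[[0.0, 0.0, 10.0, 10.0],
                       [5.0, 5.0, 15.0, 15.0]]])

print(batch_iou_similarity(gt, preds))  # tensor([[[1.0000, 0.1429]]]), shape (N, M1, M2)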

bbox_center(boxes)

Get bbox centers from boxes.

Parameters:

Name Type Description Default
boxes Tensor

Boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.

required

Returns:

Type Description
Tensor

Boxes centers with shape (..., 2), "cx, cy" format.

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def bbox_center(boxes: Tensor) -> Tensor:
    """
    Get bbox centers from boxes.

    :param boxes:   Boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
    :return:        Boxes centers with shape (..., 2), "cx, cy" format.
    """
    boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2
    boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2
    return torch.stack([boxes_cx, boxes_cy], dim=-1)

bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-06)

Calculate overlap between two set of bboxes.

If is_aligned is False, then calculate the overlaps between each bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of bboxes1 and bboxes2.

Parameters:

Name Type Description Default
bboxes1 torch.Tensor

shape (B, m, 4) in <x1, y1, x2, y2> format or empty.

required
bboxes2 torch.Tensor

shape (B, n, 4) in <x1, y1, x2, y2> format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If is_aligned is True, then m and n must be equal.

required
mode str

Either "iou" (intersection over union) or "iof" (intersection over foreground).

'iou'
is_aligned bool

If True, then m and n must be equal. Default False.

False
eps float

A value added to the denominator for numerical stability. Default 1e-6.

1e-06

Returns:

Type Description
torch.Tensor

Tensor of shape (m, n) if is_aligned is False else shape (m,)

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def bbox_overlaps(bboxes1: torch.Tensor, bboxes2: torch.Tensor, mode: str = "iou", is_aligned: bool = False, eps: float = 1e-6) -> torch.Tensor:
    """
    Calculate overlap between two set of bboxes.

    If ``is_aligned `` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    :param bboxes1:     shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
    :param bboxes2:     shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
                                B indicates the batch dim, in shape (B1, B2, ..., Bn).
                                If ``is_aligned `` is ``True``, then m and n must be equal.
    :param mode:        Either "iou" (intersection over union) or "iof" (intersection over foreground).
    :param is_aligned:  If True, then m and n must be equal. Default False.
    :param eps:         A value added to the denominator for numerical stability. Default 1e-6.
    :return:            Tensor of shape (m, n) if ``is_aligned `` is False else shape (m,)
    """
    assert mode in ["iou", "iof", "giou"], "Unsupported mode {}".format(mode)
    # Either the boxes are empty or the length of the boxes' last dimension is 4
    assert bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0
    assert bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return np.random.random(batch_shape + (rows,))
        else:
            return np.random.random(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = (rb - lt).clip(min=0)  # [B, rows, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ["iou", "giou"]:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == "giou":
            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = np.maximum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = np.minimum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ["iou", "giou"]:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == "giou":
            enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])
            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])

    eps = np.array([eps])
    union = np.maximum(union, eps)
    ious = overlap / union
    if mode in ["iou", "iof"]:
        return ious
    # calculate gious
    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = np.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious

check_points_inside_bboxes(points, bboxes, center_radius_tensor=None, eps=1e-09)

Parameters:

Name Type Description Default
points Tensor

Tensor (float32) of shape[L, 2], "xy" format, L: num_anchors

required
bboxes Tensor

Tensor (float32) of shape[B, n, 4], "xmin, ymin, xmax, ymax" format

required
center_radius_tensor Optional[Tensor]

Tensor (float32) of shape [L, 1]. Default: None.

None
eps float

Default: 1e-9

1e-09

Returns:

Type Description
Tensor

Tensor (float32) of shape[B, n, L], value=1. means selected

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def check_points_inside_bboxes(points: Tensor, bboxes: Tensor, center_radius_tensor: Optional[Tensor] = None, eps: float = 1e-9) -> Tensor:
    """

    :param points:                  Tensor (float32) of shape[L, 2], "xy" format, L: num_anchors
    :param bboxes:                  Tensor (float32) of shape[B, n, 4], "xmin, ymin, xmax, ymax" format
    :param center_radius_tensor:    Tensor (float32) of shape [L, 1]. Default: None.
    :param eps:                     Default: 1e-9

    :return is_in_bboxes: Tensor (float32) of shape[B, n, L], value=1. means selected
    """
    points = points.unsqueeze(0).unsqueeze(0)
    x, y = points.chunk(2, dim=-1)
    xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, dim=-1)
    # check whether `points` is in `bboxes`
    left = x - xmin
    top = y - ymin
    right = xmax - x
    bottom = ymax - y
    delta_ltrb = torch.cat([left, top, right, bottom], dim=-1)
    is_in_bboxes = delta_ltrb.min(dim=-1).values > eps
    if center_radius_tensor is not None:
        # check whether `points` is in `center_radius`
        center_radius_tensor = center_radius_tensor.unsqueeze(0).unsqueeze(0)
        cx = (xmin + xmax) * 0.5
        cy = (ymin + ymax) * 0.5
        left = x - (cx - center_radius_tensor)
        top = y - (cy - center_radius_tensor)
        right = (cx + center_radius_tensor) - x
        bottom = (cy + center_radius_tensor) - y
        delta_ltrb_c = torch.cat([left, top, right, bottom], dim=-1)
        is_in_center = delta_ltrb_c.min(dim=-1).values > eps
        return (torch.logical_and(is_in_bboxes, is_in_center), torch.logical_or(is_in_bboxes, is_in_center))

    return is_in_bboxes.type_as(bboxes)
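
A small usage sketch; the import path is assumed from the source location shown above, and the points and boxes are illustrative:

import torch

# Import path assumed from the source file location; verify against your installed version.
from super_gradients.training.losses.ppyolo_loss import check_points_inside_bboxes

points = torch.tensor([[2.0, 2.0], [20.0, 20.0]])   # L=2 anchor centers, "xy" format
bboxes = torch.tensor([[[0.0, 0.0, 10.0, 10.0]]])   # B=1 image with n=1 box

print(check_points_inside_bboxes(points, bboxes))   # tensor([[[1., 0.]]]), shape (B, n, L)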

compute_max_iou_anchor(ious)

For each anchor, find the GT with the largest IOU.

Parameters:

Name Type Description Default
ious Tensor

Tensor (float32) of shape[B, n, L], n: num_gts, L: num_anchors

required

Returns:

Type Description
Tensor

is_max_iou is Tensor (float32) of shape[B, n, L], value=1. means selected

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def compute_max_iou_anchor(ious: Tensor) -> Tensor:
    r"""
    For each anchor, find the GT with the largest IOU.

    :param ious: Tensor (float32) of shape[B, n, L], n: num_gts, L: num_anchors
    :return: is_max_iou is Tensor (float32) of shape[B, n, L], value=1. means selected
    """
    num_max_boxes = ious.shape[-2]
    max_iou_index = ious.argmax(dim=-2)
    is_max_iou: Tensor = torch.nn.functional.one_hot(max_iou_index, num_max_boxes).permute([0, 2, 1])
    return is_max_iou.type_as(ious)

compute_max_iou_gt(ious)

For each GT, find the anchor with the largest IOU.

Parameters:

Name Type Description Default
ious Tensor

Tensor (float32) of shape[B, n, L], n: num_gts, L: num_anchors

required

Returns:

Type Description
Tensor

is_max_iou, Tensor (float32) of shape[B, n, L], value=1. means selected

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def compute_max_iou_gt(ious: Tensor) -> Tensor:
    """
    For each GT, find the anchor with the largest IOU.

    :param ious: Tensor (float32) of shape[B, n, L], n: num_gts, L: num_anchors
    :return:    is_max_iou, Tensor (float32) of shape[B, n, L], value=1. means selected
    """
    num_anchors = ious.shape[-1]
    max_iou_index = ious.argmax(dim=-1)
    is_max_iou = torch.nn.functional.one_hot(max_iou_index, num_anchors)
    return is_max_iou.type_as(ious)

gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-09)

Parameters:

Name Type Description Default
metrics Tensor

Tensor(float32) of shape[B, n, L], n: num_gts, L: num_anchors

required
topk int

The number of top elements to look for along the axis.

required
largest bool

If set to true, algorithm will sort by descending order, otherwise sort by ascending order.

True
topk_mask Optional[Tensor]

Tensor(float32) of shape[B, n, 1], mask of valid (non-padded) gt boxes; if None, the mask is derived from the top-k metrics.

None
eps float

Default: 1e-9

1e-09

Returns:

Type Description
Tensor

is_in_topk, Tensor (float32) of shape[B, n, L], value=1. means selected

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def gather_topk_anchors(metrics: Tensor, topk: int, largest: bool = True, topk_mask: Optional[Tensor] = None, eps: float = 1e-9) -> Tensor:
    """

    :param metrics:     Tensor(float32) of shape[B, n, L], n: num_gts, L: num_anchors
    :param topk:        The number of top elements to look for along the axis.
    :param largest:     If set to true, algorithm will sort by descending order, otherwise sort by ascending order.
    :param topk_mask:   Tensor(float32) of shape[B, n, 1], mask of valid (non-padded) gt boxes; if None, the mask is derived from the top-k metrics
    :param eps:         Default: 1e-9

    :return: is_in_topk, Tensor (float32) of shape[B, n, L], value=1. means selected
    """
    num_anchors = metrics.shape[-1]
    topk_metrics, topk_idxs = torch.topk(metrics, topk, dim=-1, largest=largest)
    if topk_mask is None:
        topk_mask = (topk_metrics.max(dim=-1, keepdim=True).values > eps).type_as(metrics)
    is_in_topk = torch.nn.functional.one_hot(topk_idxs, num_anchors).sum(dim=-2).type_as(metrics)
    return is_in_topk * topk_mask
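
A short usage sketch of the top-k selection above; the metric values are illustrative, and the topk_mask handling is left to the function itself.

import torch

# Toy metrics of shape [B, n, L]: 1 batch, 1 GT, 4 anchors; keep the top-2 anchors.
metrics = torch.tensor([[[0.2, 0.9, 0.4, 0.7]]])
topk_metrics, topk_idxs = torch.topk(metrics, k=2, dim=-1)   # values [0.9, 0.7], indices [1, 3]
is_in_topk = torch.nn.functional.one_hot(topk_idxs, metrics.shape[-1]).sum(dim=-2).float()
print(is_in_topk)  # tensor([[[0., 1., 0., 1.]]]) -> anchors 1 and 3 are selected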

iou_similarity(box1, box2, eps=1e-10)

Calculate iou of box1 and box2. Bboxes are expected to be in x1y1x2y2 format.

Parameters:

Name Type Description Default
box1 torch.Tensor

box with the shape [M1, 4]

required
box2 torch.Tensor

box with the shape [M2, 4]

required

Returns:

Type Description
float

iou between box1 and box2 with the shape [M1, M2]

Source code in V3_6/src/super_gradients/training/losses/ppyolo_loss.py
def iou_similarity(box1: torch.Tensor, box2: torch.Tensor, eps: float = 1e-10) -> float:
    """
    Calculate iou of box1 and box2. Bboxes are expected to be in x1y1x2y2 format.

    :param box1: box with the shape [M1, 4]
    :param box2: box with the shape [M2, 4]

    :return iou: iou between box1 and box2 with the shape [M1, M2]
    """
    box1 = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]
    box2 = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]
    px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4]
    gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4]
    x1y1 = torch.maximum(px1y1, gx1y1)
    x2y2 = torch.minimum(px2y2, gx2y2)
    overlap = (x2y2 - x1y1).clip(0).prod(-1)
    area1 = (px2y2 - px1y1).clip(0).prod(-1)
    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
    union = area1 + area2 - overlap + eps
    return overlap / union
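
A self-contained sketch of the pairwise IoU computed above, with hand-picked boxes so the result is easy to verify; the formula is repeated inline instead of importing the function.

import torch

box1 = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                     [5.0, 5.0, 15.0, 15.0]])   # [M1, 4] in x1y1x2y2
box2 = torch.tensor([[0.0, 0.0, 10.0, 10.0]])   # [M2, 4]

b1, b2 = box1.unsqueeze(1), box2.unsqueeze(0)   # broadcast to [M1, M2, 4]
overlap = (torch.minimum(b1[..., 2:], b2[..., 2:]) - torch.maximum(b1[..., :2], b2[..., :2])).clip(0).prod(-1)
area1 = (b1[..., 2:] - b1[..., :2]).clip(0).prod(-1)
area2 = (b2[..., 2:] - b2[..., :2]).clip(0).prod(-1)
iou = overlap / (area1 + area2 - overlap + 1e-10)
print(iou)  # tensor([[1.0000], [0.1429]]) -> identical boxes give IoU 1, partial overlap gives 25/175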

RSquaredLoss

Bases: _Loss

Source code in V3_6/src/super_gradients/training/losses/r_squared_loss.py
@register_loss(name=Losses.R_SQUARED_LOSS, deprecated_name="r_squared_loss")
class RSquaredLoss(_Loss):
    def forward(self, output, target):
        # FIXME - THIS NEEDS TO BE CHANGED SUCH THAT THIS CLASS INHERETS FROM _Loss (TAKE A LOOK AT YoLoV3DetectionLoss)
        """Computes the R-squared for the output and target values
        :param output: Tensor / Numpy / List
            The prediction
        :param target: Tensor / Numpy / List
            The corresponding lables
        """
        # Convert to tensor
        output = convert_to_tensor(output)
        target = convert_to_tensor(target)

        criterion_mse = nn.MSELoss()
        return 1 - criterion_mse(output, target).item() / torch.var(target).item()
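
A hedged sanity check of the formula above, which reduces to 1 - MSE(output, target) / Var(target); the data is illustrative.

import torch
import torch.nn as nn

target = torch.tensor([1.0, 2.0, 3.0, 4.0])
perfect = target.clone()
noisy = target + torch.tensor([0.5, -0.5, 0.5, -0.5])

mse = nn.MSELoss()
r_squared = lambda out, tgt: 1 - mse(out, tgt).item() / torch.var(tgt).item()
print(r_squared(perfect, target))  # 1.0
print(r_squared(noisy, target))    # 1 - 0.25 / 1.6667 = 0.85 (torch.var is unbiased by default)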

forward(output, target)

Computes the R-squared for the output and target values

Parameters:

Name Type Description Default
output

Tensor / Numpy / List The prediction

required
target

Tensor / Numpy / List The corresponding labels

required
Source code in V3_6/src/super_gradients/training/losses/r_squared_loss.py
def forward(self, output, target):
    # FIXME - THIS NEEDS TO BE CHANGED SUCH THAT THIS CLASS INHERETS FROM _Loss (TAKE A LOOK AT YoLoV3DetectionLoss)
    """Computes the R-squared for the output and target values
    :param output: Tensor / Numpy / List
        The prediction
    :param target: Tensor / Numpy / List
        The corresponding lables
    """
    # Convert to tensor
    output = convert_to_tensor(output)
    target = convert_to_tensor(target)

    criterion_mse = nn.MSELoss()
    return 1 - criterion_mse(output, target).item() / torch.var(target).item()

RescoringLoss

Bases: nn.Module

Source code in V3_6/src/super_gradients/training/losses/rescoring_loss.py
@register_loss(name=Losses.RESCORING_LOSS, deprecated_name="rescoring_loss")
class RescoringLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, predictions: Tuple[Tensor, Tensor], targets):
        """

        :param predictions: Tuple of (poses, scores)
        :param targets: Target scores
        :return: KD loss between predicted scores and target scores
        """
        return torch.nn.functional.binary_cross_entropy_with_logits(predictions[1], targets)

forward(predictions, targets)

Parameters:

Name Type Description Default
predictions Tuple[Tensor, Tensor]

Tuple of (poses, scores)

required
targets

Target scores

required

Returns:

Type Description

KD loss between predicted scores and target scores

Source code in V3_6/src/super_gradients/training/losses/rescoring_loss.py
def forward(self, predictions: Tuple[Tensor, Tensor], targets):
    """

    :param predictions: Tuple of (poses, scores)
    :param targets: Target scores
    :return: KD loss between predicted scores and target scores
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(predictions[1], targets)
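
A minimal usage sketch; the shapes are illustrative assumptions, and only predictions[1] (the raw score logits) actually participates in the loss.

import torch

poses = torch.randn(8, 17, 3)       # ignored by the loss itself
pred_scores = torch.randn(8, 1)     # raw logits
target_scores = torch.rand(8, 1)    # soft targets in [0, 1]

loss = torch.nn.functional.binary_cross_entropy_with_logits(pred_scores, target_scores)
print(loss)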

SegKDLoss

Bases: nn.Module

Wrapper loss for semantic segmentation KD. This loss includes two loss components: ce_loss, i.e. CrossEntropyLoss, and KDLogitsLoss, i.e. ChannelWiseKnowledgeDistillationLoss.

Source code in V3_6/src/super_gradients/training/losses/seg_kd_loss.py
class SegKDLoss(nn.Module):
    """
    Wrapper loss for semantic segmentation KD.
    This loss includes two loss components, `ce_loss` i.e CrossEntropyLoss, and `KDLogitsLoss` i.e
    `ChannelWiseKnowledgeDistillationLoss`.
    """

    def __init__(self, kd_loss: nn.Module, ce_loss: nn.Module, weights: Union[tuple, list], kd_loss_weights: Union[tuple, list]):
        """
        :param kd_loss: knowledge distillation criteria, such as, ChannelWiseKnowledgeDistillationLoss.
         This loss should except as input a triplet of the predictions from the model with shape [B, C, H, W],
         the teacher model predictions with shape [B, C, H, W] and the target labels with shape [B, H, W].
        :param ce_loss: classification criteria, such as, CE, OHEM, MaskAttention, SL1, etc.
         This loss should except as input the predictions from the model with shape [B, C, H, W], and the target labels
         with shape [B, H, W].
        :param weights: lambda weights to apply upon each prediction map heads.
        :param kd_loss_weights: lambda weights to apply upon each criterion. 2 values are excepted as follows,
         [ce_loss_weight, kd_loss_weight].
        """
        super().__init__()
        self.kd_loss_weights = kd_loss_weights
        self.weights = weights

        self.kd_loss = kd_loss
        self.ce_loss = ce_loss

        self._validate_arguments()

    def _validate_arguments(self):
        # Check num of loss weights
        if len(self.kd_loss_weights) != 2:
            raise ValueError(f"kd_loss_weights is expected to be an iterable with size 2," f" found: {len(self.kd_loss_weights)}")

    def forward(self, preds: KDOutput, target: torch.Tensor):
        if not isinstance(preds, KDOutput):
            raise RuntimeError(
                "Predictions argument for `SegKDLoss` forward method is expected to be a `KDOutput` to"
                " include the predictions from both the student and the teacher models."
            )
        teacher_preds = preds.teacher_output
        student_preds = preds.student_output

        if isinstance(teacher_preds, torch.Tensor):
            teacher_preds = (teacher_preds,)
        if isinstance(student_preds, torch.Tensor):
            student_preds = (student_preds,)

        losses = []
        total_loss = 0
        # Main and auxiliaries feature maps losses
        for i in range(len(self.weights)):
            ce_loss = self.ce_loss(student_preds[i], target)
            cwd_loss = self.kd_loss(student_preds[i], teacher_preds[i], target)

            loss = self.kd_loss_weights[0] * ce_loss + self.kd_loss_weights[1] * cwd_loss
            total_loss += self.weights[i] * loss
            losses += [ce_loss, cwd_loss]

        losses.append(total_loss)

        return total_loss, torch.stack(losses, dim=0).detach()

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        component_names = []
        for i in range(len(self.weights)):
            component_names += [f"Head-{i}_CE_Loss", f"Head-{i}_KD_Loss"]
        component_names.append("Total_Loss")
        return component_names
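
A hedged sketch of how the two criteria are combined per head. A stand-in KD criterion (ToyKDLoss, a hypothetical name) with the expected (student, teacher, target) signature replaces ChannelWiseKnowledgeDistillationLoss, so only the wiring is shown.

import torch
import torch.nn as nn

class ToyKDLoss(nn.Module):
    def forward(self, student_logits, teacher_logits, target):
        # stand-in: MSE between student and teacher maps, ignoring `target`
        return nn.functional.mse_loss(student_logits, teacher_logits)

ce_loss, kd_loss = nn.CrossEntropyLoss(), ToyKDLoss()
weights, kd_loss_weights = [1.0], [1.0, 0.5]              # one head; [ce_weight, kd_weight]

student = torch.randn(2, 5, 16, 16, requires_grad=True)   # [B, C, H, W]
teacher = torch.randn(2, 5, 16, 16)
target = torch.randint(0, 5, (2, 16, 16))                 # [B, H, W]

# Per-head combination equivalent to the loop in SegKDLoss.forward:
loss = weights[0] * (kd_loss_weights[0] * ce_loss(student, target)
                     + kd_loss_weights[1] * kd_loss(student, teacher, target))
print(loss)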

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.

__init__(kd_loss, ce_loss, weights, kd_loss_weights)

Parameters:

Name Type Description Default
kd_loss nn.Module

knowledge distillation criterion, such as ChannelWiseKnowledgeDistillationLoss. This loss should expect as input a triplet: the student model predictions with shape [B, C, H, W], the teacher model predictions with shape [B, C, H, W], and the target labels with shape [B, H, W].

required
ce_loss nn.Module

classification criterion, such as CE, OHEM, MaskAttention, SL1, etc. This loss should expect as input the predictions from the model with shape [B, C, H, W] and the target labels with shape [B, H, W].

required
weights Union[tuple, list]

lambda weights to apply to each prediction map head.

required
kd_loss_weights Union[tuple, list]

lambda weights to apply to each criterion. 2 values are expected, as follows: [ce_loss_weight, kd_loss_weight].

required
Source code in V3_6/src/super_gradients/training/losses/seg_kd_loss.py
def __init__(self, kd_loss: nn.Module, ce_loss: nn.Module, weights: Union[tuple, list], kd_loss_weights: Union[tuple, list]):
    """
    :param kd_loss: knowledge distillation criteria, such as, ChannelWiseKnowledgeDistillationLoss.
     This loss should except as input a triplet of the predictions from the model with shape [B, C, H, W],
     the teacher model predictions with shape [B, C, H, W] and the target labels with shape [B, H, W].
    :param ce_loss: classification criteria, such as, CE, OHEM, MaskAttention, SL1, etc.
     This loss should except as input the predictions from the model with shape [B, C, H, W], and the target labels
     with shape [B, H, W].
    :param weights: lambda weights to apply upon each prediction map heads.
    :param kd_loss_weights: lambda weights to apply upon each criterion. 2 values are excepted as follows,
     [ce_loss_weight, kd_loss_weight].
    """
    super().__init__()
    self.kd_loss_weights = kd_loss_weights
    self.weights = weights

    self.kd_loss = kd_loss
    self.ce_loss = ce_loss

    self._validate_arguments()

ShelfNetOHEMLoss

Bases: OhemCELoss

Source code in V3_6/src/super_gradients/training/losses/shelfnet_ohem_loss.py
@register_loss(name=Losses.SHELFNET_OHEM_LOSS, deprecated_name="shelfnet_ohem_loss")
class ShelfNetOHEMLoss(OhemCELoss):
    def __init__(self, threshold: float = 0.7, mining_percent: float = 1e-4, ignore_lb: int = 255):
        """
        This loss is an extension of the Ohem (Online Hard Example Mining Cross Entropy) Loss.
        :param threshold: threshold to th hard example mining algorithm
        :param mining_percent: minimum percentage of total pixels for the hard example mining algorithm
        (taking only the largest) losses.
        Default is 1e-4, according to legacy settings, number of 400 pixels for typical input of (512x512) and batch of
         16.
        :param ignore_lb: targets label to be ignored
        """
        super().__init__(threshold=threshold, mining_percent=mining_percent, ignore_lb=ignore_lb)

    def forward(self, predictions_list: list, targets):
        losses = []
        for predictions in predictions_list:
            losses.append(super().forward(predictions, targets))
        total_loss = sum(losses)
        losses.append(total_loss)

        return total_loss, torch.stack(losses, dim=0).detach()

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["Loss1/4", "Loss1/8", "Loss1/16", "Loss"]

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.

__init__(threshold=0.7, mining_percent=0.0001, ignore_lb=255)

This loss is an extension of the Ohem (Online Hard Example Mining Cross Entropy) Loss.

Parameters:

Name Type Description Default
threshold float

threshold for the hard example mining algorithm

0.7
mining_percent float

minimum percentage of total pixels to keep for the hard example mining algorithm (taking only the largest losses). Default is 1e-4, per legacy settings, which corresponds to roughly 400 pixels for a typical input of (512x512) and a batch of 16.

0.0001
ignore_lb int

target label to be ignored

255
Source code in V3_6/src/super_gradients/training/losses/shelfnet_ohem_loss.py
def __init__(self, threshold: float = 0.7, mining_percent: float = 1e-4, ignore_lb: int = 255):
    """
    This loss is an extension of the Ohem (Online Hard Example Mining Cross Entropy) Loss.
    :param threshold: threshold to th hard example mining algorithm
    :param mining_percent: minimum percentage of total pixels for the hard example mining algorithm
    (taking only the largest) losses.
    Default is 1e-4, according to legacy settings, number of 400 pixels for typical input of (512x512) and batch of
     16.
    :param ignore_lb: targets label to be ignored
    """
    super().__init__(threshold=threshold, mining_percent=mining_percent, ignore_lb=ignore_lb)

ShelfNetSemanticEncodingLoss

Bases: nn.CrossEntropyLoss

2D Cross Entropy Loss with Auxiliary Loss

Source code in V3_6/src/super_gradients/training/losses/shelfnet_semantic_encoding_loss.py
@register_loss(name=Losses.SHELFNET_SE_LOSS, deprecated_name="shelfnet_se_loss")
class ShelfNetSemanticEncodingLoss(nn.CrossEntropyLoss):
    """2D Cross Entropy Loss with Auxilary Loss"""

    # FIXME - THIS LOSS SHOULD BE CHANGED TO SUPPORT APEX
    def __init__(self, se_weight=0.2, nclass=21, aux_weight=0.4, weight=None, ignore_index=-1):
        super().__init__(weight, None, ignore_index)
        self.nclass = nclass
        self.se_weight = se_weight
        self.aux_weight = aux_weight

        # FIXME - TEST CODE LOTEM, CHANGED IN ORDER TO WORK WITH apex.amp
        self.bcewithlogitsloss = nn.BCELoss(weight)

    def forward(self, logits, labels):
        pred1, se_pred, pred2 = logits

        batch = labels.size(0)
        se_target = Variable(torch.zeros(batch, self.nclass))
        # FIXME - THIS IS WHAT apex MIGHT BE FAILING TO WORK WITH
        for i in range(batch):
            hist = torch.histc(labels[i].cpu().data.float(), bins=self.nclass, min=0, max=self.nclass - 1)
            vect = hist > 0
            se_target[i] = vect

        loss1 = super().forward(pred1, labels)
        loss2 = super().forward(pred2, labels)
        loss3 = self.bcewithlogitsloss(torch.sigmoid(se_pred), se_target.data.cuda())  # FIXME - MAYBE CHANGE TO SIGMOID
        total_loss = loss1 + self.aux_weight * loss2 + self.se_weight * loss3
        losses = [loss1, loss2, loss3, total_loss]
        return total_loss, torch.stack(losses, dim=0).detach()

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["loss1", "loss2", "loss3", "total_loss"]

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.
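
A small illustration of the per-image "semantic encoding" target built in forward above: a multi-hot vector marking which of the nclass labels appear in the label map. The label values are made up.

import torch

nclass = 4
labels = torch.tensor([[0, 1, 1],
                       [3, 3, 0]])   # a single 2x3 label map
hist = torch.histc(labels.float(), bins=nclass, min=0, max=nclass - 1)
se_target = (hist > 0).float()
print(se_target)  # tensor([1., 1., 0., 1.]) -> classes 0, 1 and 3 are present, class 2 is not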

HardMiningCrossEntropyLoss

Bases: _Loss

L_cls = [CE of all positives] + [CE of the hardest backgrounds] where the second term is built from [neg_pos_ratio * positive pairs] background cells with the highest CE (the hardest background cells)

Source code in V3_6/src/super_gradients/training/losses/ssd_loss.py
class HardMiningCrossEntropyLoss(_Loss):
    """
    L_cls = [CE of all positives] + [CE of the hardest backgrounds]
    where the second term is built from [neg_pos_ratio * positive pairs] background cells with the highest CE
    (the hardest background cells)
    """

    def __init__(self, neg_pos_ratio: float):
        """
        :param neg_pos_ratio:   a ratio of negative samples to positive samples in the loss
                                (unlike positives, not all negatives will be used:
                                for each positive the [neg_pos_ratio] hardest negatives will be selected)
        """
        super().__init__()
        self.neg_pos_ratio = neg_pos_ratio
        self.ce = nn.CrossEntropyLoss(reduce=False)

    def forward(self, pred_labels, target_labels):
        mask = target_labels > 0  # not background
        pos_num = mask.sum(dim=1)

        # HARD NEGATIVE MINING
        con = self.ce(pred_labels, target_labels)

        # POSITIVE MASK WILL NOT BE SELECTED
        # set 0. loss for all positive objects, leave the loss where the object is background
        con_neg = con.clone()
        con_neg[mask] = 0
        # sort background cells by CE loss value (bigger_first)
        _, con_idx = con_neg.sort(dim=1, descending=True)
        # restore cells order, get each cell's order (rank) in CE loss sorting
        _, con_rank = con_idx.sort(dim=1)

        # NUMBER OF NEGATIVE THREE TIMES POSITIVE
        neg_num = torch.clamp(self.neg_pos_ratio * pos_num, max=mask.size(1)).unsqueeze(-1)
        # for each image into neg mask we'll take (3 * positive pairs) background objects with the highest CE
        neg_mask = con_rank < neg_num

        closs = (con * (mask.float() + neg_mask.float())).sum(dim=1)
        return closs
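
A toy walk-through of the double-sort "rank" trick used above to keep only the hardest negatives; the CE values are illustrative and the positives are assumed to be zeroed out already.

import torch

con_neg = torch.tensor([[0.2, 0.9, 0.0, 0.5, 0.1]])    # per-cell CE with positives set to 0
_, con_idx = con_neg.sort(dim=1, descending=True)      # cell indices ordered by decreasing loss
_, con_rank = con_idx.sort(dim=1)                      # rank of each cell in that ordering
print(con_rank)                                        # tensor([[2, 0, 4, 1, 3]])

neg_num = torch.tensor([[2]])                          # e.g. neg_pos_ratio=2 with one positive
print(con_rank < neg_num)                              # keeps cells 1 and 3, the two largest CE values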

__init__(neg_pos_ratio)

Parameters:

Name Type Description Default
neg_pos_ratio float

a ratio of negative samples to positive samples in the loss (unlike positives, not all negatives will be used: for each positive the [neg_pos_ratio] hardest negatives will be selected)

required
Source code in V3_6/src/super_gradients/training/losses/ssd_loss.py
def __init__(self, neg_pos_ratio: float):
    """
    :param neg_pos_ratio:   a ratio of negative samples to positive samples in the loss
                            (unlike positives, not all negatives will be used:
                            for each positive the [neg_pos_ratio] hardest negatives will be selected)
    """
    super().__init__()
    self.neg_pos_ratio = neg_pos_ratio
    self.ce = nn.CrossEntropyLoss(reduce=False)

SSDLoss

Bases: _Loss

Implements the loss as the sum of the following:
1. Confidence Loss: All labels, with hard negative mining
2. Localization Loss: Only on positive labels

L = (2 - alpha) * L_l1 + alpha * L_cls, where
  * L_cls is HardMiningCrossEntropyLoss
  * L_l1 = [SmoothL1Loss for all positives]

Source code in V3_6/src/super_gradients/training/losses/ssd_loss.py
@register_loss(name=Losses.SSD_LOSS, deprecated_name="ssd_loss")
class SSDLoss(_Loss):
    """
        Implements the loss as the sum of the followings:
        1. Confidence Loss: All labels, with hard negative mining
        2. Localization Loss: Only on positive labels

    L = (2 - alpha) * L_l1 + alpha * L_cls, where
        * L_cls is HardMiningCrossEntropyLoss
        * L_l1 = [SmoothL1Loss for all positives]
    """

    def __init__(self, dboxes: DefaultBoxes, alpha: float = 1.0, iou_thresh: float = 0.5, neg_pos_ratio: float = 3.0):
        """
        :param dboxes:          model anchors, shape [Num Grid Cells * Num anchors x 4]
        :param alpha:           a weighting factor between classification and regression loss
        :param iou_thresh:      a threshold for matching of anchors in each grid cell to GTs
                                (a match should have IoU > iou_thresh)
        :param neg_pos_ratio:   a ratio for HardMiningCrossEntropyLoss
        """
        super(SSDLoss, self).__init__()
        self.scale_xy = dboxes.scale_xy
        self.scale_wh = dboxes.scale_wh
        self.alpha = alpha
        self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), requires_grad=False)
        self.sl1_loss = nn.SmoothL1Loss(reduce=False)

        self.con_loss = HardMiningCrossEntropyLoss(neg_pos_ratio)
        self.iou_thresh = iou_thresh

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["smooth_l1", "closs", "Loss"]

    def _norm_relative_bbox(self, loc):
        """
        convert bbox locations into relative locations (relative to the dboxes)
        :param loc a tensor of shape [batch, 4, num_boxes]
        """
        gxy = (
            (loc[:, :2, :] - self.dboxes[:, :2, :])
            / self.dboxes[
                :,
                2:,
            ]
        ) / self.scale_xy
        gwh = (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log() / self.scale_wh
        return torch.cat((gxy, gwh), dim=1).contiguous()

    def match_dboxes(self, targets):
        """
        creates tensors with target boxes and labels for each dboxes, so with the same len as dboxes.

        * Each GT is assigned with a grid cell with the highest IoU, this creates a pair for each GT and some cells;
        * The rest of grid cells are assigned to a GT with the highest IoU, assuming it's > self.iou_thresh;
          If this condition is not met the grid cell is marked as background

        GT-wise: one to many
        Grid-cell-wise: one to one

        :param targets: a tensor containing the boxes for a single image;
                        shape [num_boxes, 6] (image_id, label, x, y, w, h)
        :return:        two tensors
                        boxes - shape of dboxes [4, num_dboxes] (x,y,w,h)
                        labels - sahpe [num_dboxes]
        """
        device = targets.device
        each_cell_target_locations = self.dboxes.data.clone().squeeze()
        each_cell_target_labels = torch.zeros((self.dboxes.data.shape[2])).to(device)

        if len(targets) > 0:
            target_boxes = targets[:, 2:]
            target_labels = targets[:, 1]
            ious = calculate_bbox_iou_matrix(target_boxes, self.dboxes.data.squeeze().T, x1y1x2y2=False)

            # one best GT for EACH cell (does not guarantee that all GTs will be used)
            best_target_per_cell, best_target_per_cell_index = ious.max(0)

            # one best grid cell (anchor in it) for EACH target
            best_cell_per_target, best_cell_per_target_index = ious.max(1)
            # make sure EACH target has a grid cell assigned
            best_target_per_cell_index[best_cell_per_target_index] = torch.arange(len(targets)).to(device)
            # 2. is higher than any IoU, so it is guaranteed to pass any IoU threshold
            # which ensures that the pairs selected for each target will be included in the mask below
            # while the threshold will only affect other grid cell anchors that aren't pre-assigned to any target
            best_target_per_cell[best_cell_per_target_index] = 2.0

            mask = best_target_per_cell > self.iou_thresh
            each_cell_target_locations[:, mask] = target_boxes[best_target_per_cell_index[mask]].T
            each_cell_target_labels[mask] = target_labels[best_target_per_cell_index[mask]] + 1

        return each_cell_target_locations, each_cell_target_labels

    def forward(self, predictions: Tuple, targets):
        """
        Compute the loss
            :param predictions - predictions tensor coming from the network,
            tuple with shapes ([Batch Size, 4, num_dboxes], [Batch Size, num_classes + 1, num_dboxes])
            were predictions have logprobs for background and other classes
            :param targets - targets for the batch. [num targets, 6] (index in batch, label, x,y,w,h)
        """
        if isinstance(predictions, tuple) and isinstance(predictions[1], tuple):
            # Calculate loss in a validation mode
            predictions = predictions[1]
        batch_target_locations = []
        batch_target_labels = []
        (ploc, plabel) = predictions
        targets = targets.to(self.dboxes.device)
        for i in range(ploc.shape[0]):
            target_locations, target_labels = self.match_dboxes(targets[targets[:, 0] == i])
            batch_target_locations.append(target_locations)
            batch_target_labels.append(target_labels)
        batch_target_locations = torch.stack(batch_target_locations)
        batch_target_labels = torch.stack(batch_target_labels).type(torch.long)

        mask = batch_target_labels > 0  # not background
        pos_num = mask.sum(dim=1)

        vec_gd = self._norm_relative_bbox(batch_target_locations)

        # SUM ON FOUR COORDINATES, AND MASK
        sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1)
        sl1 = (mask.float() * sl1).sum(dim=1)

        closs = self.con_loss(plabel, batch_target_labels)

        # AVOID NO OBJECT DETECTED
        total_loss = (2 - self.alpha) * sl1 + self.alpha * closs
        num_mask = (pos_num > 0).float()  # a mask with 0 for images that have no positive pairs at all
        pos_num = pos_num.float().clamp(min=1e-6)
        ret = (total_loss * num_mask / pos_num).mean(dim=0)  # normalize by the number of positive pairs

        return ret, torch.cat((sl1.mean().unsqueeze(0), closs.mean().unsqueeze(0), ret.unsqueeze(0))).detach()

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.

__init__(dboxes, alpha=1.0, iou_thresh=0.5, neg_pos_ratio=3.0)

Parameters:

Name Type Description Default
dboxes DefaultBoxes

model anchors, shape [Num Grid Cells * Num anchors x 4]

required
alpha float

a weighting factor between classification and regression loss

1.0
iou_thresh float

a threshold for matching of anchors in each grid cell to GTs (a match should have IoU > iou_thresh)

0.5
neg_pos_ratio float

a ratio for HardMiningCrossEntropyLoss

3.0
Source code in V3_6/src/super_gradients/training/losses/ssd_loss.py
def __init__(self, dboxes: DefaultBoxes, alpha: float = 1.0, iou_thresh: float = 0.5, neg_pos_ratio: float = 3.0):
    """
    :param dboxes:          model anchors, shape [Num Grid Cells * Num anchors x 4]
    :param alpha:           a weighting factor between classification and regression loss
    :param iou_thresh:      a threshold for matching of anchors in each grid cell to GTs
                            (a match should have IoU > iou_thresh)
    :param neg_pos_ratio:   a ratio for HardMiningCrossEntropyLoss
    """
    super(SSDLoss, self).__init__()
    self.scale_xy = dboxes.scale_xy
    self.scale_wh = dboxes.scale_wh
    self.alpha = alpha
    self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), requires_grad=False)
    self.sl1_loss = nn.SmoothL1Loss(reduce=False)

    self.con_loss = HardMiningCrossEntropyLoss(neg_pos_ratio)
    self.iou_thresh = iou_thresh

forward(predictions, targets)

Compute the loss.
:param predictions: predictions coming from the network, a tuple with shapes ([Batch Size, 4, num_dboxes], [Batch Size, num_classes + 1, num_dboxes]), where the class predictions hold logprobs for background and the other classes.
:param targets: targets for the batch, [num targets, 6] (index in batch, label, x, y, w, h)

Source code in V3_6/src/super_gradients/training/losses/ssd_loss.py
def forward(self, predictions: Tuple, targets):
    """
    Compute the loss
        :param predictions - predictions tensor coming from the network,
        tuple with shapes ([Batch Size, 4, num_dboxes], [Batch Size, num_classes + 1, num_dboxes])
        were predictions have logprobs for background and other classes
        :param targets - targets for the batch. [num targets, 6] (index in batch, label, x,y,w,h)
    """
    if isinstance(predictions, tuple) and isinstance(predictions[1], tuple):
        # Calculate loss in a validation mode
        predictions = predictions[1]
    batch_target_locations = []
    batch_target_labels = []
    (ploc, plabel) = predictions
    targets = targets.to(self.dboxes.device)
    for i in range(ploc.shape[0]):
        target_locations, target_labels = self.match_dboxes(targets[targets[:, 0] == i])
        batch_target_locations.append(target_locations)
        batch_target_labels.append(target_labels)
    batch_target_locations = torch.stack(batch_target_locations)
    batch_target_labels = torch.stack(batch_target_labels).type(torch.long)

    mask = batch_target_labels > 0  # not background
    pos_num = mask.sum(dim=1)

    vec_gd = self._norm_relative_bbox(batch_target_locations)

    # SUM ON FOUR COORDINATES, AND MASK
    sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1)
    sl1 = (mask.float() * sl1).sum(dim=1)

    closs = self.con_loss(plabel, batch_target_labels)

    # AVOID NO OBJECT DETECTED
    total_loss = (2 - self.alpha) * sl1 + self.alpha * closs
    num_mask = (pos_num > 0).float()  # a mask with 0 for images that have no positive pairs at all
    pos_num = pos_num.float().clamp(min=1e-6)
    ret = (total_loss * num_mask / pos_num).mean(dim=0)  # normalize by the number of positive pairs

    return ret, torch.cat((sl1.mean().unsqueeze(0), closs.mean().unsqueeze(0), ret.unsqueeze(0))).detach()
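
A hedged sketch of the final normalization step above: images with no positive matches contribute zero, and the remaining per-image losses are divided by their number of positive cells. The numbers are illustrative.

import torch

total_loss = torch.tensor([12.0, 3.0, 7.0])   # per-image combined loss
pos_num = torch.tensor([4, 0, 2])             # positive (non-background) cells per image

num_mask = (pos_num > 0).float()
pos_num = pos_num.float().clamp(min=1e-6)
ret = (total_loss * num_mask / pos_num).mean(dim=0)
print(ret)  # (12/4 + 0 + 7/2) / 3 = 2.1667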

match_dboxes(targets)

Creates tensors with a target box and label for each dbox, i.e. with the same length as dboxes.

  • Each GT is assigned the grid cell with the highest IoU; this creates a pair for each GT and some cells;
  • The rest of the grid cells are assigned to the GT with the highest IoU, provided it is > self.iou_thresh; if this condition is not met, the grid cell is marked as background.

GT-wise: one to many. Grid-cell-wise: one to one.

Parameters:

Name Type Description Default
targets

a tensor containing the boxes for a single image; shape [num_boxes, 6] (image_id, label, x, y, w, h)

required

Returns:

Type Description

two tensors: boxes - shaped like dboxes, [4, num_dboxes] (x, y, w, h); labels - shape [num_dboxes]

Source code in V3_6/src/super_gradients/training/losses/ssd_loss.py
def match_dboxes(self, targets):
    """
    creates tensors with target boxes and labels for each dboxes, so with the same len as dboxes.

    * Each GT is assigned with a grid cell with the highest IoU, this creates a pair for each GT and some cells;
    * The rest of grid cells are assigned to a GT with the highest IoU, assuming it's > self.iou_thresh;
      If this condition is not met the grid cell is marked as background

    GT-wise: one to many
    Grid-cell-wise: one to one

    :param targets: a tensor containing the boxes for a single image;
                    shape [num_boxes, 6] (image_id, label, x, y, w, h)
    :return:        two tensors
                    boxes - shape of dboxes [4, num_dboxes] (x,y,w,h)
                    labels - sahpe [num_dboxes]
    """
    device = targets.device
    each_cell_target_locations = self.dboxes.data.clone().squeeze()
    each_cell_target_labels = torch.zeros((self.dboxes.data.shape[2])).to(device)

    if len(targets) > 0:
        target_boxes = targets[:, 2:]
        target_labels = targets[:, 1]
        ious = calculate_bbox_iou_matrix(target_boxes, self.dboxes.data.squeeze().T, x1y1x2y2=False)

        # one best GT for EACH cell (does not guarantee that all GTs will be used)
        best_target_per_cell, best_target_per_cell_index = ious.max(0)

        # one best grid cell (anchor in it) for EACH target
        best_cell_per_target, best_cell_per_target_index = ious.max(1)
        # make sure EACH target has a grid cell assigned
        best_target_per_cell_index[best_cell_per_target_index] = torch.arange(len(targets)).to(device)
        # 2. is higher than any IoU, so it is guaranteed to pass any IoU threshold
        # which ensures that the pairs selected for each target will be included in the mask below
        # while the threshold will only affect other grid cell anchors that aren't pre-assigned to any target
        best_target_per_cell[best_cell_per_target_index] = 2.0

        mask = best_target_per_cell > self.iou_thresh
        each_cell_target_locations[:, mask] = target_boxes[best_target_per_cell_index[mask]].T
        each_cell_target_labels[mask] = target_labels[best_target_per_cell_index[mask]] + 1

    return each_cell_target_locations, each_cell_target_labels

DetailAggregateModule

Bases: nn.Module

DetailAggregateModule creates the ground-truth spatial details map. Given ground-truth segmentation masks and using laplacian kernels, this module creates feature maps with special attention to class edges, a.k.a. details.

Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
class DetailAggregateModule(nn.Module):
    """
    DetailAggregateModule to create ground-truth spatial details map. Given ground-truth segmentation masks and using
     laplacian kernels this module create feature-maps with special attention to classes edges aka details.
    """

    _LAPLACIAN_KERNEL = [-1, -1, -1, -1, 8, -1, -1, -1, -1]
    _INITIAL_FUSE_KERNEL = [[6.0 / 10], [3.0 / 10], [1.0 / 10]]

    def __init__(self, num_classes: int, ignore_label: int, detail_threshold: float = 1.0, learnable_fusing_kernel: bool = True):
        """
        :param detail_threshold: threshold to define a pixel as edge after laplacian. must be a value between 1 and 8,
            lower value for smooth edges, high value for fine edges.
        :param learnable_fusing_kernel: whether the 1x1 conv map of strided maps is learnable or not.
        """
        super().__init__()
        assert 1 <= detail_threshold <= 8, f"Detail threshold must be a value between 1 and 8, found: {detail_threshold}"

        self.device = None
        self.detail_threshold = detail_threshold
        self.num_classes = num_classes
        self.ignore_label = ignore_label

        # laplacian dw-convolution, each channel is a class label. apply laplacian filter once for each channel.
        self.laplacian_kernel = torch.tensor(self._LAPLACIAN_KERNEL, dtype=torch.float32).reshape(1, 1, 3, 3).expand(num_classes, 1, 3, 3).requires_grad_(False)
        # init param for 1x1 conv of strided gaussian feature maps.
        self.fuse_kernel = torch.tensor(self._INITIAL_FUSE_KERNEL, dtype=torch.float32).reshape(1, 3, 1, 1).requires_grad_(learnable_fusing_kernel)
        if learnable_fusing_kernel:
            self.fuse_kernel = torch.nn.Parameter(self.fuse_kernel)

    def forward(self, gt_masks: torch.Tensor):
        if self.device is None:
            self._set_kernels_to_device(gt_masks.device)
        if self.num_classes > 1:
            one_hot = to_one_hot(gt_masks, self.num_classes, self.ignore_label).float()
        else:
            one_hot = gt_masks.unsqueeze(1).float()
        # create binary detail maps using filters withs strides of 1, 2 and 4.
        boundary_targets = F.conv2d(one_hot, self.laplacian_kernel, stride=1, padding=1, groups=self.num_classes)
        boundary_targets_x2 = F.conv2d(one_hot, self.laplacian_kernel, stride=2, padding=1, groups=self.num_classes)
        boundary_targets_x4 = F.conv2d(one_hot, self.laplacian_kernel, stride=4, padding=1, groups=self.num_classes)

        boundary_targets = self._to_one_channel_binary(boundary_targets, self.detail_threshold)
        boundary_targets_x2 = self._to_one_channel_binary(boundary_targets_x2, self.detail_threshold)
        boundary_targets_x4 = self._to_one_channel_binary(boundary_targets_x4, self.detail_threshold)

        boundary_targets_x4 = F.interpolate(boundary_targets_x4, boundary_targets.shape[2:], mode="nearest")
        boundary_targets_x2 = F.interpolate(boundary_targets_x2, boundary_targets.shape[2:], mode="nearest")

        boundary_targets = torch.cat((boundary_targets, boundary_targets_x2, boundary_targets_x4), dim=1)

        boundary_targets = F.conv2d(boundary_targets, self.fuse_kernel)
        boundary_targets = self._to_one_channel_binary(boundary_targets, 0.3)

        return boundary_targets

    def _set_kernels_to_device(self, device: str):
        self.device = device
        self.laplacian_kernel = self.laplacian_kernel.to(device)
        self.fuse_kernel = self.fuse_kernel.to(device)

    @staticmethod
    def _to_one_channel_binary(x: torch.Tensor, threshold: float):
        """
        Flatten channels, and turn to binary tensor. if at least one pixel class is above threshold, flatten value is 1,
        'or' operator.
        """
        x = x.max(dim=1, keepdim=True)[0]
        x[x < threshold] = 0
        x[x >= threshold] = 1
        return x
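
A toy sketch of the edge-extraction idea above: convolving a binary mask with the 3x3 laplacian kernel responds only near class boundaries, and thresholding yields the binary detail map. The mask is a made-up 4x4 square.

import torch
import torch.nn.functional as F

laplacian = torch.tensor([-1., -1., -1., -1., 8., -1., -1., -1., -1.]).reshape(1, 1, 3, 3)
mask = torch.zeros(1, 1, 8, 8)
mask[:, :, 2:6, 2:6] = 1.0                      # a 4x4 "object"

edges = F.conv2d(mask, laplacian, padding=1)
binary_edges = (edges >= 1.0).float()           # detail_threshold = 1.0
print(binary_edges[0, 0])                       # 1s on the border ring of the square, 0 inside and outside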

__init__(num_classes, ignore_label, detail_threshold=1.0, learnable_fusing_kernel=True)

Parameters:

Name Type Description Default
detail_threshold float

threshold to define a pixel as an edge after the laplacian filter. Must be a value between 1 and 8; a lower value for smooth edges, a higher value for fine edges.

1.0
learnable_fusing_kernel bool

whether the 1x1 conv map of strided maps is learnable or not.

True
Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
def __init__(self, num_classes: int, ignore_label: int, detail_threshold: float = 1.0, learnable_fusing_kernel: bool = True):
    """
    :param detail_threshold: threshold to define a pixel as edge after laplacian. must be a value between 1 and 8,
        lower value for smooth edges, high value for fine edges.
    :param learnable_fusing_kernel: whether the 1x1 conv map of strided maps is learnable or not.
    """
    super().__init__()
    assert 1 <= detail_threshold <= 8, f"Detail threshold must be a value between 1 and 8, found: {detail_threshold}"

    self.device = None
    self.detail_threshold = detail_threshold
    self.num_classes = num_classes
    self.ignore_label = ignore_label

    # laplacian dw-convolution, each channel is a class label. apply laplacian filter once for each channel.
    self.laplacian_kernel = torch.tensor(self._LAPLACIAN_KERNEL, dtype=torch.float32).reshape(1, 1, 3, 3).expand(num_classes, 1, 3, 3).requires_grad_(False)
    # init param for 1x1 conv of strided gaussian feature maps.
    self.fuse_kernel = torch.tensor(self._INITIAL_FUSE_KERNEL, dtype=torch.float32).reshape(1, 3, 1, 1).requires_grad_(learnable_fusing_kernel)
    if learnable_fusing_kernel:
        self.fuse_kernel = torch.nn.Parameter(self.fuse_kernel)

DetailLoss

Bases: _Loss

STDC DetailLoss, applied on detail features from the higher-resolution branch and the ground-truth details map. A weighted combination of BCE loss and binary Dice loss.

Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
class DetailLoss(_Loss):
    """
    STDC DetailLoss applied on  details features from higher resolution and ground-truth details map.
    Loss combination of BCE loss and BinaryDice loss
    """

    def __init__(self, weights: list = [1.0, 1.0]):
        """
        :param weights: weight to apply for each part of the loss contributions, [BCE, Dice] respectively.
        """
        super().__init__()
        assert len(weights) == 2, f"Only 2 weight elements are required for BCE-Dice loss combo, found: {len(weights)}"
        self.weights = weights
        self.bce_with_logits = nn.BCEWithLogitsLoss()
        self.dice_loss = BinaryDiceLoss(apply_sigmoid=True)

    def forward(self, detail_out: torch.Tensor, detail_target: torch.Tensor):
        """
        :param detail_out: predicted detail map.
        :param detail_target: ground-truth detail loss, output of DetailAggregateModule.
        """
        bce_loss = self.bce_with_logits(detail_out, detail_target)
        dice_loss = self.dice_loss(detail_out, detail_target)
        return self.weights[0] * bce_loss + self.weights[1] * dice_loss

__init__(weights=[1.0, 1.0])

Parameters:

Name Type Description Default
weights list

weight to apply for each part of the loss contributions, [BCE, Dice] respectively.

[1.0, 1.0]
Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
def __init__(self, weights: list = [1.0, 1.0]):
    """
    :param weights: weight to apply for each part of the loss contributions, [BCE, Dice] respectively.
    """
    super().__init__()
    assert len(weights) == 2, f"Only 2 weight elements are required for BCE-Dice loss combo, found: {len(weights)}"
    self.weights = weights
    self.bce_with_logits = nn.BCEWithLogitsLoss()
    self.dice_loss = BinaryDiceLoss(apply_sigmoid=True)

forward(detail_out, detail_target)

Parameters:

Name Type Description Default
detail_out torch.Tensor

predicted detail map.

required
detail_target torch.Tensor

ground-truth detail map, the output of DetailAggregateModule.

required
Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
def forward(self, detail_out: torch.Tensor, detail_target: torch.Tensor):
    """
    :param detail_out: predicted detail map.
    :param detail_target: ground-truth detail loss, output of DetailAggregateModule.
    """
    bce_loss = self.bce_with_logits(detail_out, detail_target)
    dice_loss = self.dice_loss(detail_out, detail_target)
    return self.weights[0] * bce_loss + self.weights[1] * dice_loss

STDCLoss

Bases: _Loss

Loss class of STDC-Seg training.

Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
@register_loss(name=Losses.STDC_LOSS, deprecated_name="stdc_loss")
class STDCLoss(_Loss):
    """
    Loss class of STDC-Seg training.
    """

    def __init__(
        self,
        num_classes: int,
        threshold: float = 0.7,
        num_aux_heads: int = 2,
        num_detail_heads: int = 1,
        weights: Union[tuple, list] = (1, 1, 1, 1),
        detail_weights: Union[tuple, list] = (1, 1),
        mining_percent: float = 0.1,
        detail_threshold: float = 1.0,
        learnable_fusing_kernel: bool = True,
        ignore_index: int = None,
        ohem_criteria: OhemLoss = None,
    ):
        """
        :param threshold: Online hard-mining probability threshold.
        :param num_aux_heads: num of auxiliary heads.
        :param num_detail_heads: num of detail heads.
        :param weights: Loss lambda weights.
        :param detail_weights: weights for (Dice, BCE) losses parts in DetailLoss.
        :param mining_percent: mining percentage.
        :param detail_threshold: detail threshold to create binary details features in DetailLoss.
        :param learnable_fusing_kernel: whether DetailAggregateModule params are learnable or not.
        :param ohem_criteria: OhemLoss criterion component of STDC. When none is given, it will be derrived according
         to num_classes (i.e OhemCELoss if num_classes > 1 and OhemBCELoss otherwise).
        """
        super().__init__()

        assert len(weights) == num_aux_heads + num_detail_heads + 1, "Lambda loss weights must be in same size as loss items."

        self.weights = weights
        self.use_detail = num_detail_heads > 0

        self.num_aux_heads = num_aux_heads
        self.num_detail_heads = num_detail_heads

        if self.use_detail:
            self.detail_module = DetailAggregateModule(
                num_classes=num_classes, detail_threshold=detail_threshold, ignore_label=ignore_index, learnable_fusing_kernel=learnable_fusing_kernel
            )
            self.detail_loss = DetailLoss(weights=detail_weights)

        if ohem_criteria is None:
            if num_classes > 1:
                ohem_criteria = OhemCELoss(threshold=threshold, mining_percent=mining_percent, ignore_lb=ignore_index)
            else:
                ohem_criteria = OhemBCELoss(threshold=threshold, mining_percent=mining_percent)

        self.ce_ohem = ohem_criteria
        self.num_classes = num_classes

    @property
    def component_names(self):
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["main_loss", "aux_loss1", "aux_loss2", "detail_loss", "loss"]

    def forward(self, preds: Tuple[torch.Tensor], target: torch.Tensor):
        """
        :param preds: Model output predictions, must be in the followed format:
         [Main-feats, Aux-feats[0], ..., Aux-feats[num_auxs-1], Detail-feats[0], ..., Detail-feats[num_details-1]
        """
        assert (
            len(preds) == self.num_aux_heads + self.num_detail_heads + 1
        ), f"Wrong num of predictions tensors for STDC loss, expected {self.num_aux_heads + self.num_detail_heads + 1} found {len(preds)}"
        losses = []
        total_loss = 0

        # classification and auxiliary loss
        for i in range(0, 1 + self.num_aux_heads):
            ce_loss = self.ce_ohem(preds[i], target)
            total_loss += ce_loss * self.weights[i]
            losses.append(ce_loss)

        # detail heads loss
        if self.use_detail:
            gt_binary_mask = self.detail_module(target)
            for i in range(1 + self.num_aux_heads, len(preds)):
                detail_loss = self.detail_loss(preds[i], gt_binary_mask)
                total_loss += self.weights[i] * detail_loss
                losses.append(detail_loss)

        losses.append(total_loss)

        return total_loss, torch.stack(losses, dim=0).detach()

    def get_train_named_params(self):
        """
        Expose DetailAggregateModule learnable parameters to be passed to the optimizer.
        """
        if self.use_detail:
            return list(self.detail_module.named_parameters())

component_names property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.
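
A hedged sketch of the weighted-sum composition performed in forward for the default configuration (1 main head, 2 auxiliary heads, 1 detail head, weights (1, 1, 1, 1)); the individual loss values are illustrative placeholders.

import torch

weights = (1, 1, 1, 1)
ce_losses = [torch.tensor(0.9), torch.tensor(1.1), torch.tensor(1.0)]   # main + 2 aux heads
detail_losses = [torch.tensor(0.4)]                                     # detail head

total = sum(w * l for w, l in zip(weights, ce_losses + detail_losses))
components = torch.stack(ce_losses + detail_losses + [total])
print(components)  # ordered as component_names: main_loss, aux_loss1, aux_loss2, detail_loss, loss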

__init__(num_classes, threshold=0.7, num_aux_heads=2, num_detail_heads=1, weights=(1, 1, 1, 1), detail_weights=(1, 1), mining_percent=0.1, detail_threshold=1.0, learnable_fusing_kernel=True, ignore_index=None, ohem_criteria=None)

Parameters:

Name Type Description Default
threshold float

Online hard-mining probability threshold.

0.7
num_aux_heads int

num of auxiliary heads.

2
num_detail_heads int

num of detail heads.

1
weights Union[tuple, list]

Loss lambda weights.

(1, 1, 1, 1)
detail_weights Union[tuple, list]

weights for the (BCE, Dice) loss parts in DetailLoss (same order as the DetailLoss weights argument).

(1, 1)
mining_percent float

mining percentage.

0.1
detail_threshold float

detail threshold to create binary details features in DetailLoss.

1.0
learnable_fusing_kernel bool

whether DetailAggregateModule params are learnable or not.

True
ohem_criteria OhemLoss

OhemLoss criterion component of STDC. When none is given, it will be derived according to num_classes (i.e. OhemCELoss if num_classes > 1 and OhemBCELoss otherwise).

None
Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
def __init__(
    self,
    num_classes: int,
    threshold: float = 0.7,
    num_aux_heads: int = 2,
    num_detail_heads: int = 1,
    weights: Union[tuple, list] = (1, 1, 1, 1),
    detail_weights: Union[tuple, list] = (1, 1),
    mining_percent: float = 0.1,
    detail_threshold: float = 1.0,
    learnable_fusing_kernel: bool = True,
    ignore_index: int = None,
    ohem_criteria: OhemLoss = None,
):
    """
    :param threshold: Online hard-mining probability threshold.
    :param num_aux_heads: num of auxiliary heads.
    :param num_detail_heads: num of detail heads.
    :param weights: Loss lambda weights.
    :param detail_weights: weights for (Dice, BCE) losses parts in DetailLoss.
    :param mining_percent: mining percentage.
    :param detail_threshold: detail threshold to create binary details features in DetailLoss.
    :param learnable_fusing_kernel: whether DetailAggregateModule params are learnable or not.
    :param ohem_criteria: OhemLoss criterion component of STDC. When none is given, it will be derrived according
     to num_classes (i.e OhemCELoss if num_classes > 1 and OhemBCELoss otherwise).
    """
    super().__init__()

    assert len(weights) == num_aux_heads + num_detail_heads + 1, "Lambda loss weights must be in same size as loss items."

    self.weights = weights
    self.use_detail = num_detail_heads > 0

    self.num_aux_heads = num_aux_heads
    self.num_detail_heads = num_detail_heads

    if self.use_detail:
        self.detail_module = DetailAggregateModule(
            num_classes=num_classes, detail_threshold=detail_threshold, ignore_label=ignore_index, learnable_fusing_kernel=learnable_fusing_kernel
        )
        self.detail_loss = DetailLoss(weights=detail_weights)

    if ohem_criteria is None:
        if num_classes > 1:
            ohem_criteria = OhemCELoss(threshold=threshold, mining_percent=mining_percent, ignore_lb=ignore_index)
        else:
            ohem_criteria = OhemBCELoss(threshold=threshold, mining_percent=mining_percent)

    self.ce_ohem = ohem_criteria
    self.num_classes = num_classes

forward(preds, target)

Parameters:

Name Type Description Default
preds Tuple[torch.Tensor]

Model output predictions, must be in the followed format: [Main-feats, Aux-feats[0], ..., Aux-feats[num_auxs-1], Detail-feats[0], ..., Detail-feats[num_details-1]

required
Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
def forward(self, preds: Tuple[torch.Tensor], target: torch.Tensor):
    """
    :param preds: Model output predictions, must be in the followed format:
     [Main-feats, Aux-feats[0], ..., Aux-feats[num_auxs-1], Detail-feats[0], ..., Detail-feats[num_details-1]
    """
    assert (
        len(preds) == self.num_aux_heads + self.num_detail_heads + 1
    ), f"Wrong num of predictions tensors for STDC loss, expected {self.num_aux_heads + self.num_detail_heads + 1} found {len(preds)}"
    losses = []
    total_loss = 0

    # classification and auxiliary loss
    for i in range(0, 1 + self.num_aux_heads):
        ce_loss = self.ce_ohem(preds[i], target)
        total_loss += ce_loss * self.weights[i]
        losses.append(ce_loss)

    # detail heads loss
    if self.use_detail:
        gt_binary_mask = self.detail_module(target)
        for i in range(1 + self.num_aux_heads, len(preds)):
            detail_loss = self.detail_loss(preds[i], gt_binary_mask)
            total_loss += self.weights[i] * detail_loss
            losses.append(detail_loss)

    losses.append(total_loss)

    return total_loss, torch.stack(losses, dim=0).detach()

get_train_named_params()

Expose DetailAggregateModule learnable parameters to be passed to the optimizer.

Source code in V3_6/src/super_gradients/training/losses/stdc_loss.py
def get_train_named_params(self):
    """
    Expose DetailAggregateModule learnable parameters to be passed to the optimizer.
    """
    if self.use_detail:
        return list(self.detail_module.named_parameters())

AbstarctSegmentationStructureLoss

Bases: _Loss, ABC

Abstract computation of a structure loss between two tensors. It supports both multi-class and binary tasks.

Source code in V3_6/src/super_gradients/training/losses/structure_loss.py
class AbstarctSegmentationStructureLoss(_Loss, ABC):
    """
    Abstract computation of structure loss between two tensors, It can support both multi-classes and binary tasks.
    """

    def __init__(
        self,
        apply_softmax: bool = True,
        ignore_index: int = None,
        smooth: float = 1.0,
        eps: float = 1e-5,
        reduce_over_batches: bool = False,
        generalized_metric: bool = False,
        weight: Optional[torch.Tensor] = None,
        reduction: Union[LossReduction, str] = "mean",
    ):
        """
        :param apply_softmax: Whether to apply softmax to the predictions.
        :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the metric
            coefficient is to 1, which can be used as a regularization effect.
            As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
        :param eps: epsilon value to avoid inf.
        :param reduce_over_batches: Whether to average metric over the batch axis if set True,
         default is `False` to average over the classes axis.
        :param generalized_metric: Whether to apply normalization by the volume of each class.
        :param weight: a manual rescaling weight given to each class. If given, it has to be a Tensor of size `C`.
        :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
            `none`: no reduction will be applied.
            `mean`: the sum of the output will be divided by the number of elements in the output.
            `sum`: the output will be summed.
            Default: `mean`
        """
        super().__init__(reduction=reduction)
        self.ignore_index = ignore_index
        self.apply_softmax = apply_softmax
        self.eps = eps
        self.smooth = smooth
        self.reduce_over_batches = reduce_over_batches
        self.generalized_metric = generalized_metric
        self.weight = weight
        if self.generalized_metric:
            assert self.weight is None, "Cannot use structured Loss with weight classes and generalized normalization"
            if self.eps > 1e-12:
                logger.warning("When using GeneralizedLoss, it is recommended to use eps below 1e-12, to not affect" "small values normalized terms.")
            if self.smooth != 0:
                logger.warning("When using GeneralizedLoss, it is recommended to set smooth value as 0.")

    @abstractmethod
    def _calc_numerator_denominator(self, labels_one_hot, predict) -> (torch.Tensor, torch.Tensor):
        """
        All base classes must implement this function.
        Return: 2 tensor of shape [BS, num_classes, img_width, img_height].
        """
        raise NotImplementedError()

    @abstractmethod
    def _calc_loss(self, numerator, denominator) -> torch.Tensor:
        """
        All base classes must implement this function.
        Return a tensors of shape [BS] if self.reduce_over_batches else [num_classes].
        """
        raise NotImplementedError()

    def forward(self, predict, target):
        if self.apply_softmax:
            predict = torch.softmax(predict, dim=1)
        # target to one hot format
        if target.size() == predict.size():
            labels_one_hot = target
        elif target.dim() == 3:  # if target tensor is in class indexes format.
            if predict.size(1) == 1 and self.ignore_index is None:  # if one class prediction task
                labels_one_hot = target.unsqueeze(1)
            else:
                labels_one_hot = to_one_hot(target, num_classes=predict.shape[1], ignore_index=self.ignore_index)
        else:
            raise AssertionError(
                f"Mismatch of target shape: {target.size()} and prediction shape: {predict.size()},"
                f" target must be [NxWxH] tensor for to_one_hot conversion"
                f" or to have the same num of channels like prediction tensor"
            )

        reduce_spatial_dims = list(range(2, len(predict.shape)))
        reduce_dims = [1] + reduce_spatial_dims if self.reduce_over_batches else [0] + reduce_spatial_dims

        # Calculate the numerator and denominator of the chosen metric
        numerator, denominator = self._calc_numerator_denominator(labels_one_hot, predict)

        # exclude ignore labels from numerator and denominator, false positive predicted on ignore samples
        # are not included in the total calculation.
        if self.ignore_index is not None:
            valid_mask = target.ne(self.ignore_index).unsqueeze(1).expand_as(denominator)
            numerator *= valid_mask
            denominator *= valid_mask

        numerator = torch.sum(numerator, dim=reduce_dims)
        denominator = torch.sum(denominator, dim=reduce_dims)

        if self.generalized_metric:
            weights = 1.0 / (torch.sum(labels_one_hot, dim=reduce_dims) ** 2)
            # if some classes are not in batch, weights will be inf.
            infs = torch.isinf(weights)
            weights[infs] = 0.0
            numerator *= weights
            denominator *= weights

        # Calculate the loss of the chosen metric
        losses = self._calc_loss(numerator, denominator)
        if self.weight is not None:
            losses *= self.weight
        return apply_reduce(losses, reduction=self.reduction)
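
A concrete subclass only has to supply the per-pixel numerator/denominator terms and the final ratio; forward() takes care of softmax, one-hot conversion, ignore masking and reduction. A minimal, hypothetical Dice-style subclass, shown purely to illustrate the two abstract hooks (the smoothing formula is an assumption, not copied from the library):

from super_gradients.training.losses.structure_loss import AbstarctSegmentationStructureLoss  # assumed import path


class DiceLikeLoss(AbstarctSegmentationStructureLoss):
    """Hypothetical subclass used only for illustration."""

    def _calc_numerator_denominator(self, labels_one_hot, predict):
        # Both inputs are [BS, num_classes, H, W]; return the element-wise soft-Dice terms.
        numerator = 2.0 * labels_one_hot * predict
        denominator = labels_one_hot + predict
        return numerator, denominator

    def _calc_loss(self, numerator, denominator):
        # numerator/denominator arrive here already summed over the reduce dims by forward().
        return 1.0 - (numerator + self.smooth) / (denominator + self.smooth + self.eps)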

__init__(apply_softmax=True, ignore_index=None, smooth=1.0, eps=1e-05, reduce_over_batches=False, generalized_metric=False, weight=None, reduction='mean')

Parameters:

Name Type Description Default
apply_softmax bool

Whether to apply softmax to the predictions.

True
smooth float

Laplace smoothing, also known as additive smoothing. The larger the smooth value, the closer the metric coefficient is to 1, which can serve as a regularization effect. As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895

1.0
eps float

epsilon value to avoid inf.

1e-05
reduce_over_batches bool

If True, the metric is averaged over the batch axis; the default is False, which averages over the class axis.

False
generalized_metric bool

Whether to apply normalization by the volume of each class.

False
weight Optional[torch.Tensor]

a manual rescaling weight given to each class. If given, it has to be a Tensor of size C.

None
reduction Union[LossReduction, str]

Specifies the reduction to apply to the output: none | mean | sum. none: no reduction will be applied. mean: the sum of the output will be divided by the number of elements in the output. sum: the output will be summed. Default: mean

'mean'
Source code in V3_6/src/super_gradients/training/losses/structure_loss.py
def __init__(
    self,
    apply_softmax: bool = True,
    ignore_index: int = None,
    smooth: float = 1.0,
    eps: float = 1e-5,
    reduce_over_batches: bool = False,
    generalized_metric: bool = False,
    weight: Optional[torch.Tensor] = None,
    reduction: Union[LossReduction, str] = "mean",
):
    """
    :param apply_softmax: Whether to apply softmax to the predictions.
    :param smooth: laplace smoothing, also known as additive smoothing. The larger smooth value is, closer the metric
        coefficient is to 1, which can be used as a regularization effect.
        As mentioned in: https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
    :param eps: epsilon value to avoid inf.
    :param reduce_over_batches: Whether to average metric over the batch axis if set True,
     default is `False` to average over the classes axis.
    :param generalized_metric: Whether to apply normalization by the volume of each class.
    :param weight: a manual rescaling weight given to each class. If given, it has to be a Tensor of size `C`.
    :param reduction: Specifies the reduction to apply to the output: `none` | `mean` | `sum`.
        `none`: no reduction will be applied.
        `mean`: the sum of the output will be divided by the number of elements in the output.
        `sum`: the output will be summed.
        Default: `mean`
    """
    super().__init__(reduction=reduction)
    self.ignore_index = ignore_index
    self.apply_softmax = apply_softmax
    self.eps = eps
    self.smooth = smooth
    self.reduce_over_batches = reduce_over_batches
    self.generalized_metric = generalized_metric
    self.weight = weight
    if self.generalized_metric:
        assert self.weight is None, "Cannot use structured Loss with weight classes and generalized normalization"
        if self.eps > 1e-12:
            logger.warning("When using GeneralizedLoss, it is recommended to use eps below 1e-12, to not affect" "small values normalized terms.")
        if self.smooth != 0:
            logger.warning("When using GeneralizedLoss, it is recommended to set smooth value as 0.")

CIoULoss

Bases: nn.Module

Complete IoU loss

Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
class CIoULoss(nn.Module):
    """
    Complete IoU loss
    """

    def __init__(self, eps: float = 1e-10, reduction: str = "none"):
        """
        :param eps:         epsilon to avoid divide by zero, default as 1e-10
        :param reduction:   Options are "none", "mean" and "sum". default as none
        """

        if reduction not in ("none", "mean", "sum"):
            raise ValueError(f"reduction must be one of 'none', 'mean', 'sum', but got {reduction}")
        super().__init__()
        self.eps = eps
        self.reduction = reduction

    def forward(self, predictions: Tensor, targets: Tensor, loc_weights: Optional[Tensor] = None) -> Tensor:
        """
        :param predictions: Predicted boxes in xyxy format of [D0, D1,...Di, 4] shape
        :param targets:     Target boxes in xyxy format of [D0, D1,...Di, 4] shape
        :param loc_weights: Optional tensor of [D0, D1,...Di] shape with weights for each prediction
        :return:            CIOU loss
        """
        loss = bbox_ciou_loss(predictions, targets, eps=self.eps)
        if loc_weights is not None:
            loss = loss * loc_weights
        if self.reduction == "sum":
            loss = torch.sum(loss)
        elif self.reduction == "mean":
            loss = torch.mean(loss)
        return loss
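
A short usage sketch with dummy boxes (the box values and import path are illustrative):

import torch
from super_gradients.training.losses.yolo_nas_pose_loss import CIoULoss  # assumed import path, matching the source reference above

criterion = CIoULoss(reduction="mean")

pred_boxes = torch.tensor([[10.0, 10.0, 50.0, 60.0],
                           [20.0, 20.0, 40.0, 45.0]])    # xyxy
target_boxes = torch.tensor([[12.0, 8.0, 48.0, 62.0],
                             [22.0, 18.0, 44.0, 40.0]])  # xyxy
weights = torch.tensor([1.0, 0.5])                       # optional per-box weights

loss = criterion(pred_boxes, target_boxes, loc_weights=weights)  # scalar, because reduction="mean"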

__init__(eps=1e-10, reduction='none')

Parameters:

Name Type Description Default
eps float

epsilon to avoid divide by zero, default as 1e-10

1e-10
reduction str

Options are "none", "mean" and "sum". default as none

'none'
Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
def __init__(self, eps: float = 1e-10, reduction: str = "none"):
    """
    :param eps:         epsilon to avoid divide by zero, default as 1e-10
    :param reduction:   Options are "none", "mean" and "sum". default as none
    """

    if reduction not in ("none", "mean", "sum"):
        raise ValueError(f"reduction must be one of 'none', 'mean', 'sum', but got {reduction}")
    super().__init__()
    self.eps = eps
    self.reduction = reduction

forward(predictions, targets, loc_weights=None)

Parameters:

Name Type Description Default
predictions Tensor

Predicted boxes in xyxy format of [D0, D1,...Di, 4] shape

required
targets Tensor

Target boxes in xyxy format of [D0, D1,...Di, 4] shape

required
loc_weights Optional[Tensor]

Optional tensor of [D0, D1,...Di] shape with weights for each prediction

None

Returns:

Type Description
Tensor

CIOU loss

Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
def forward(self, predictions: Tensor, targets: Tensor, loc_weights: Optional[Tensor] = None) -> Tensor:
    """
    :param predictions: Predicted boxes in xyxy format of [D0, D1,...Di, 4] shape
    :param targets:     Target boxes in xyxy format of [D0, D1,...Di, 4] shape
    :param loc_weights: Optional tensor of [D0, D1,...Di] shape with weights for each prediction
    :return:            CIOU loss
    """
    loss = bbox_ciou_loss(predictions, targets, eps=self.eps)
    if loc_weights is not None:
        loss = loss * loc_weights
    if self.reduction == "sum":
        loss = torch.sum(loss)
    elif self.reduction == "mean":
        loss = torch.mean(loss)
    return loss

YoloNASPoseLoss

Bases: nn.Module

Loss for training YoloNASPose model

Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
@register_loss(Losses.YOLONAS_POSE_LOSS)
class YoloNASPoseLoss(nn.Module):
    """
    Loss for training YoloNASPose model
    """

    def __init__(
        self,
        oks_sigmas: Union[List[float], np.ndarray, Tensor],
        classification_loss_type: str = "focal",
        regression_iou_loss_type: str = "ciou",
        classification_loss_weight: float = 1.0,
        iou_loss_weight: float = 2.5,
        dfl_loss_weight: float = 0.5,
        pose_cls_loss_weight: float = 1.0,
        pose_reg_loss_weight: float = 1.0,
        pose_classification_loss_type: str = "bce",
        bbox_assigner_topk: int = 13,
        bbox_assigned_alpha: float = 1.0,
        bbox_assigned_beta: float = 6.0,
        assigner_multiply_by_pose_oks: bool = False,
        rescale_pose_loss_with_assigned_score: bool = False,
        average_losses_in_ddp: bool = False,
    ):
        """
        :param oks_sigmas:                 OKS sigmas for pose estimation. Array of [Num Keypoints].
        :param classification_loss_type:   Classification loss type. One of "focal" or "bce"
        :param regression_iou_loss_type:   Regression IoU loss type. One of "giou" or "ciou"
        :param classification_loss_weight: Classification loss weight
        :param iou_loss_weight:            IoU loss weight
        :param dfl_loss_weight:            DFL loss weight
        :param pose_cls_loss_weight:       Pose classification loss weight
        :param pose_reg_loss_weight:       Pose regression loss weight
        :param average_losses_in_ddp:      Whether to average losses in DDP mode. In theory, enabling this option
                                           should have the positive impact on model accuracy since it would smooth out
                                           influence of batches with small number of objects.
                                           However, it needs to be proven empirically.
        """
        super().__init__()
        self.classification_loss_type = classification_loss_type
        self.classification_loss_weight = classification_loss_weight
        self.dfl_loss_weight = dfl_loss_weight
        self.iou_loss_weight = iou_loss_weight

        self.iou_loss = {"giou": GIoULoss, "ciou": CIoULoss}[regression_iou_loss_type]()
        self.num_keypoints = len(oks_sigmas)
        self.num_classes = 1  # We have only one class in pose estimation task
        self.oks_sigmas = torch.tensor(oks_sigmas)
        self.pose_cls_loss_weight = pose_cls_loss_weight
        self.pose_reg_loss_weight = pose_reg_loss_weight
        self.assigner = YoloNASPoseTaskAlignedAssigner(
            sigmas=self.oks_sigmas,
            topk=bbox_assigner_topk,
            alpha=bbox_assigned_alpha,
            beta=bbox_assigned_beta,
            multiply_by_pose_oks=assigner_multiply_by_pose_oks,
        )
        self.pose_classification_loss_type = pose_classification_loss_type
        self.rescale_pose_loss_with_assigned_score = rescale_pose_loss_with_assigned_score
        self.average_losses_in_ddp = average_losses_in_ddp

    @torch.no_grad()
    def _unpack_flat_targets(self, targets: Tuple[Tensor, Tensor, Tensor], batch_size: int) -> Mapping[str, torch.Tensor]:
        """
        Convert targets to PPYoloE-compatible format since it's the easiest (not the cleanest) way to
        have PP Yolo training & metrics computed

        :param targets: Tuple (boxes, joints, crowd)
                        - boxes: [N, 5] (batch_index, x1, y1, x2, y2)
                        - joints: [N, num_joints, 4] (batch_index, x, y, visibility)
                        - crowd: [N, 2] (batch_index, is_crowd)
        :return:        (Dictionary [str,Tensor]) with keys:
                        - gt_class: (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
                        - gt_bbox: (Tensor, float32): Ground truth bboxes, shape(B, n, 4) in XYXY format
                        - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
        """
        target_boxes, target_joints, target_iscrowd = targets

        image_index = target_boxes[:, 0]
        gt_bbox = target_boxes[:, 1:5]

        per_image_class = []
        per_image_bbox = []
        per_image_pad_mask = []
        per_image_targets = undo_flat_collate_tensors_with_batch_index(target_joints, batch_size)
        per_image_crowds = undo_flat_collate_tensors_with_batch_index(target_iscrowd, batch_size)

        max_boxes = 0
        for i in range(batch_size):
            mask = image_index == i

            image_bboxes = gt_bbox[mask, :]
            valid_bboxes = image_bboxes.sum(dim=1, keepdims=True) > 0

            per_image_bbox.append(image_bboxes)
            per_image_pad_mask.append(valid_bboxes)
            # Since for pose estimation we have only one class, we can just fill it with zeros
            per_image_class.append(torch.zeros((len(image_bboxes), 1), dtype=torch.long, device=target_boxes.device))

            max_boxes = max(max_boxes, mask.sum().item())

        for i in range(batch_size):
            elements_to_pad = max_boxes - len(per_image_bbox[i])
            padding_left = 0
            padding_right = 0
            padding_top = 0
            padding_bottom = elements_to_pad
            pad = padding_left, padding_right, padding_top, padding_bottom
            per_image_class[i] = F.pad(per_image_class[i], pad, mode="constant", value=0)
            per_image_bbox[i] = F.pad(per_image_bbox[i], pad, mode="constant", value=0)
            per_image_pad_mask[i] = F.pad(per_image_pad_mask[i], pad, mode="constant", value=0)
            per_image_targets[i] = F.pad(per_image_targets[i], (0, 0) + pad, mode="constant", value=0)
            per_image_crowds[i] = F.pad(per_image_crowds[i], pad, mode="constant", value=0)

        new_targets = {
            "gt_class": torch.stack(per_image_class, dim=0),
            "gt_bbox": torch.stack(per_image_bbox, dim=0),
            "pad_gt_mask": torch.stack(per_image_pad_mask, dim=0),
            "gt_poses": torch.stack(per_image_targets, dim=0),
            "gt_crowd": torch.stack(per_image_crowds, dim=0),
        }
        return new_targets

    def forward(
        self,
        outputs: Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]],
        targets: Tuple[Tensor, Tensor, Tensor],
    ) -> Tuple[Tensor, Tensor]:
        """
        :param outputs: Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor
        :param targets: A tuple of (boxes, joints, crowd) tensors where
                        - boxes: [N, 5] (batch_index, x1, y1, x2, y2)
                        - joints: [N, num_joints, 4] (batch_index, x, y, visibility)
                        - crowd: [N, 2] (batch_index, is_crowd)
        :return:        Tuple of two tensors where first element is main loss for backward and
                        second element is stacked tensor of all individual losses
        """
        _, predictions = outputs

        (
            pred_scores,
            pred_distri,
            pred_pose_coords,  # [B, Anchors, C, 2]
            pred_pose_logits,  # [B, Anchors, C]
            anchors,
            anchor_points,
            num_anchors_list,
            stride_tensor,
        ) = predictions

        targets = self._unpack_flat_targets(targets, batch_size=pred_scores.size(0))

        anchor_points_s = anchor_points / stride_tensor
        pred_bboxes, reg_max = self._bbox_decode(anchor_points_s, pred_distri)

        gt_labels = targets["gt_class"]
        gt_bboxes = targets["gt_bbox"]
        gt_poses = targets["gt_poses"]
        gt_crowd = targets["gt_crowd"]
        pad_gt_mask = targets["pad_gt_mask"]

        # label assignment
        assign_result = self.assigner(
            pred_scores=pred_scores.detach().sigmoid(),  # Pred scores are logits on training for numerical stability
            pred_bboxes=pred_bboxes.detach() * stride_tensor,
            pred_pose_coords=pred_pose_coords.detach(),
            anchor_points=anchor_points,
            gt_labels=gt_labels,
            gt_bboxes=gt_bboxes,
            gt_poses=gt_poses,
            gt_crowd=gt_crowd,
            pad_gt_mask=pad_gt_mask,
            bg_index=self.num_classes,
        )

        assigned_scores = assign_result.assigned_scores

        # cls loss
        if self.classification_loss_type == "focal":
            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=-1)
        elif self.classification_loss_type == "bce":
            loss_cls = torch.nn.functional.binary_cross_entropy_with_logits(pred_scores, assigned_scores, reduction="sum")
        else:
            raise ValueError(f"Unknown classification loss type: {self.classification_loss_type}")

        assigned_scores_sum = assigned_scores.sum()
        if self.average_losses_in_ddp and is_distributed():
            torch.distributed.all_reduce(assigned_scores_sum, op=torch.distributed.ReduceOp.SUM)
            assigned_scores_sum /= get_world_size()
        assigned_scores_sum = torch.clip(assigned_scores_sum, min=1.0)
        loss_cls /= assigned_scores_sum

        loss_iou, loss_dfl, loss_pose_cls, loss_pose_reg = self._bbox_loss(
            pred_distri,
            pred_bboxes,
            pred_pose_coords=pred_pose_coords,
            pred_pose_logits=pred_pose_logits,
            stride_tensor=stride_tensor,
            anchor_points=anchor_points_s,
            assign_result=assign_result,
            assigned_scores_sum=assigned_scores_sum,
            reg_max=reg_max,
        )

        loss_cls = loss_cls * self.classification_loss_weight
        loss_iou = loss_iou * self.iou_loss_weight
        loss_dfl = loss_dfl * self.dfl_loss_weight
        loss_pose_cls = loss_pose_cls * self.pose_cls_loss_weight
        loss_pose_reg = loss_pose_reg * self.pose_reg_loss_weight

        loss = loss_cls + loss_iou + loss_dfl + loss_pose_cls + loss_pose_reg
        log_losses = torch.stack([loss_cls.detach(), loss_iou.detach(), loss_dfl.detach(), loss_pose_cls.detach(), loss_pose_reg.detach(), loss.detach()])

        return loss, log_losses

    @property
    def component_names(self):
        return ["loss_cls", "loss_iou", "loss_dfl", "loss_pose_cls", "loss_pose_reg", "loss"]

    def _df_loss(self, pred_dist: Tensor, target: Tensor) -> Tensor:
        target_left = target.long()
        target_right = target_left + 1
        weight_left = target_right.float() - target
        weight_right = 1 - weight_left

        # [B,L,C] -> [B,C,L] to make compatible with torch.nn.functional.cross_entropy
        # which expects channel dim to be at index 1
        pred_dist = torch.moveaxis(pred_dist, -1, 1)

        loss_left = torch.nn.functional.cross_entropy(pred_dist, target_left, reduction="none") * weight_left
        loss_right = torch.nn.functional.cross_entropy(pred_dist, target_right, reduction="none") * weight_right
        return (loss_left + loss_right).mean(dim=-1, keepdim=True)

    def _keypoint_loss(
        self,
        predicted_coords: Tensor,
        target_coords: Tensor,
        predicted_logits: Tensor,
        target_visibility: Tensor,
        area: Tensor,
        sigmas: Tensor,
        assigned_scores: Optional[Tensor] = None,
        assigned_scores_sum: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor]:
        """

        :param predicted_coords:  [Num Instances, Num Joints, 2] - (x, y)
        :param target_coords:     [Num Instances, Num Joints, 2] - (x, y)
        :param predicted_logits:  [Num Instances, Num Joints, 1] - Logits for each joint
        :param target_visibility: [Num Instances, Num Joints, 1] - Visibility of each joint
        :param sigmas:            [Num Joints] - Sigma for each joint
        :param area:              [Num Instances, 1] - Area of the corresponding bounding box
        :return:                  Tuple of (regression loss, classification loss)
                                  - regression loss [Num Instances, 1]
                                  - classification loss [Num Instances, 1]
        """
        sigmas = sigmas.reshape([1, -1, 1])
        area = area.reshape([-1, 1, 1])

        visible_targets_mask: Tensor = (target_visibility > 0).float()  # [Num Instances, Num Joints, 1]

        d = ((predicted_coords - target_coords) ** 2).sum(dim=-1, keepdim=True)  # [Num Instances, Num Joints, 1]
        e = d / (2 * sigmas) ** 2 / (area + 1e-9) / 2  # [Num Instances, Num Joints, 1]
        regression_loss_unreduced = 1 - torch.exp(-e)  # [Num Instances, Num Joints, 1]

        regression_loss_reduced = (regression_loss_unreduced * visible_targets_mask).sum(dim=1, keepdim=False) / (
            visible_targets_mask.sum(dim=1, keepdim=False) + 1e-9
        )  # [Num Instances, 1]

        if self.pose_classification_loss_type == "bce":
            classification_loss = torch.nn.functional.binary_cross_entropy_with_logits(predicted_logits, visible_targets_mask, reduction="none").mean(dim=1)
        elif self.pose_classification_loss_type == "focal":
            classification_loss = self._focal_loss(predicted_logits, visible_targets_mask, alpha=0.25, gamma=2.0, reduction="none").mean(dim=1)
        else:
            raise ValueError(f"Unsupported pose classification loss type {self.pose_classification_loss_type}")

        if assigned_scores is None:
            classification_loss = classification_loss.mean()
            regression_loss = regression_loss_reduced.mean()
        else:
            classification_loss = (classification_loss * assigned_scores).sum() / assigned_scores_sum
            regression_loss = (regression_loss_reduced * assigned_scores).sum() / assigned_scores_sum

        return regression_loss, classification_loss

    def _xyxy_box_area(self, boxes):
        """
        :param boxes: [..., 4] (x1, y1, x2, y2)
        :return: [...,1]
        """
        area = (boxes[..., 2:4] - boxes[..., 0:2]).prod(dim=-1, keepdim=True)
        return area

    def _bbox_loss(
        self,
        pred_dist,
        pred_bboxes,
        pred_pose_coords,
        pred_pose_logits,
        stride_tensor,
        anchor_points,
        assign_result: YoloNASPoseYoloNASPoseBoxesAssignmentResult,
        assigned_scores_sum,
        reg_max: int,
    ):
        # select positive samples mask that are not crowd and not background
        # loss ALWAYS respect the crowd targets by excluding them from contributing to the loss
        # if you want to train WITH crowd targets, mark them as non-crowd on dataset level
        mask_positive = (assign_result.assigned_labels != self.num_classes) * assign_result.assigned_crowd.eq(0)
        num_pos = mask_positive.sum()
        assigned_bboxes_divided_by_stride = assign_result.assigned_bboxes / stride_tensor

        # pos/neg loss
        if num_pos > 0:
            # l1 + iou
            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])

            pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = torch.masked_select(assigned_bboxes_divided_by_stride, bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos_image_coord = torch.masked_select(assign_result.assigned_bboxes, bbox_mask).reshape([-1, 4])

            bbox_weight = torch.masked_select(assign_result.assigned_scores.sum(-1), mask_positive).unsqueeze(-1)

            loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight
            loss_iou = loss_iou.sum() / assigned_scores_sum

            dist_mask = mask_positive.unsqueeze(-1).tile([1, 1, (reg_max + 1) * 4])
            pred_dist_pos = torch.masked_select(pred_dist, dist_mask).reshape([-1, 4, reg_max + 1])
            assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes_divided_by_stride, reg_max)
            assigned_ltrb_pos = torch.masked_select(assigned_ltrb, bbox_mask).reshape([-1, 4])
            loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos) * bbox_weight
            loss_dfl = loss_dfl.sum() / assigned_scores_sum

            # Do not divide poses by stride since this would skew the loss and make sigmas incorrect
            pred_pose_coords = pred_pose_coords[mask_positive]
            pred_pose_logits = pred_pose_logits[mask_positive].unsqueeze(-1)  # To make [Num Instances, Num Joints, 1]

            gt_pose_coords = assign_result.assigned_poses[..., 0:2][mask_positive]
            gt_pose_visibility = assign_result.assigned_poses[mask_positive][:, :, 2:3]

            area = self._xyxy_box_area(assigned_bboxes_pos_image_coord).reshape([-1, 1]) * 0.53
            loss_pose_reg, loss_pose_cls = self._keypoint_loss(
                predicted_coords=pred_pose_coords,
                target_coords=gt_pose_coords,
                predicted_logits=pred_pose_logits,
                target_visibility=gt_pose_visibility,
                assigned_scores=bbox_weight if self.rescale_pose_loss_with_assigned_score else None,
                assigned_scores_sum=assigned_scores_sum if self.rescale_pose_loss_with_assigned_score else None,
                area=area,
                sigmas=self.oks_sigmas.to(pred_pose_logits.device),
            )
        else:
            loss_iou = torch.zeros([], device=pred_bboxes.device)
            loss_dfl = torch.zeros([], device=pred_bboxes.device)
            loss_pose_cls = torch.zeros([], device=pred_bboxes.device)
            loss_pose_reg = torch.zeros([], device=pred_bboxes.device)

        return loss_iou, loss_dfl, loss_pose_cls, loss_pose_reg

    def _bbox_decode(self, anchor_points: Tensor, pred_dist: Tensor) -> Tuple[Tensor, int]:
        """
        Decode predicted bounding boxes using anchor points and predicted distribution
        :param anchor_points: Anchor locations (center for each point) of [B, L, 2] shape
        :param pred_dist:     Predicted offset distributions of [B, L, 4 * (reg_max + 1)] shape
        :return:              Decoded bounding boxes (XYXY format) of [B, L, 4] shape and reg_max
        """
        b, l, *_ = pred_dist.size()
        pred_dist = torch.softmax(pred_dist.reshape([b, l, 4, -1]), dim=-1)

        reg_max = pred_dist.size(-1) - 1
        proj_conv = torch.linspace(0, reg_max, reg_max + 1, device=pred_dist.device).reshape([1, reg_max + 1, 1, 1])

        pred_dist = torch.nn.functional.conv2d(pred_dist.permute(0, 3, 1, 2), proj_conv).squeeze(1)
        return batch_distance2bbox(anchor_points, pred_dist), reg_max

    def _bbox2distance(self, points, bbox, reg_max):
        x1y1, x2y2 = torch.split(bbox, 2, -1)
        lt = points - x1y1
        rb = x2y2 - points
        return torch.cat([lt, rb], dim=-1).clip(0, reg_max - 0.01)

    @staticmethod
    def _focal_loss(pred_logits: Tensor, label: Tensor, alpha=0.25, gamma=2.0, reduction="sum") -> Tensor:
        pred_score = pred_logits.sigmoid()
        weight = torch.abs(pred_score - label).pow(gamma)
        if alpha > 0:
            alpha_t = alpha * label + (1 - alpha) * (1 - label)
            weight *= alpha_t
        # This is same, but binary_cross_entropy_with_logits is faster
        # loss = -weight * (label * torch.nn.functional.logsigmoid(pred_logits) + (1 - label) * torch.nn.functional.logsigmoid(-pred_logits))
        loss = weight * torch.nn.functional.binary_cross_entropy_with_logits(pred_logits, label, reduction="none")

        if reduction == "sum":
            loss = loss.sum()
        elif reduction == "mean":
            loss = loss.mean()
        elif reduction == "none":
            pass
        else:
            raise ValueError(f"Unsupported reduction type {reduction}")
        return loss
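
A construction sketch for a 17-keypoint setup. The sigma values below are placeholders, and the forward call is left commented out because building valid outputs/targets requires the YoloNASPose head and the flat-collation dataloader described in the docstrings:

from super_gradients.training.losses.yolo_nas_pose_loss import YoloNASPoseLoss  # assumed import path

criterion = YoloNASPoseLoss(
    oks_sigmas=[0.1] * 17,  # placeholder: use your dataset's per-keypoint OKS sigmas
    classification_loss_type="focal",
    regression_iou_loss_type="ciou",
)

# outputs: the (decoded, raw) tuple produced by the model's forward pass
# targets: flat-collated (boxes, joints, crowd) tensors, as documented in forward()
# loss, log_losses = criterion(outputs, targets)
# loss.backward()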

__init__(oks_sigmas, classification_loss_type='focal', regression_iou_loss_type='ciou', classification_loss_weight=1.0, iou_loss_weight=2.5, dfl_loss_weight=0.5, pose_cls_loss_weight=1.0, pose_reg_loss_weight=1.0, pose_classification_loss_type='bce', bbox_assigner_topk=13, bbox_assigned_alpha=1.0, bbox_assigned_beta=6.0, assigner_multiply_by_pose_oks=False, rescale_pose_loss_with_assigned_score=False, average_losses_in_ddp=False)

Parameters:

Name Type Description Default
oks_sigmas Union[List[float], np.ndarray, Tensor]

OKS sigmas for pose estimation. Array of [Num Keypoints].

required
classification_loss_type str

Classification loss type. One of "focal" or "bce"

'focal'
regression_iou_loss_type str

Regression IoU loss type. One of "giou" or "ciou"

'ciou'
classification_loss_weight float

Classification loss weight

1.0
iou_loss_weight float

IoU loss weight

2.5
dfl_loss_weight float

DFL loss weight

0.5
pose_cls_loss_weight float

Pose classification loss weight

1.0
pose_reg_loss_weight float

Pose regression loss weight

1.0
average_losses_in_ddp bool

Whether to average losses in DDP mode. In theory, enabling this option should have a positive impact on model accuracy, since it would smooth out the influence of batches with a small number of objects. However, this has yet to be proven empirically.

False
Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
def __init__(
    self,
    oks_sigmas: Union[List[float], np.ndarray, Tensor],
    classification_loss_type: str = "focal",
    regression_iou_loss_type: str = "ciou",
    classification_loss_weight: float = 1.0,
    iou_loss_weight: float = 2.5,
    dfl_loss_weight: float = 0.5,
    pose_cls_loss_weight: float = 1.0,
    pose_reg_loss_weight: float = 1.0,
    pose_classification_loss_type: str = "bce",
    bbox_assigner_topk: int = 13,
    bbox_assigned_alpha: float = 1.0,
    bbox_assigned_beta: float = 6.0,
    assigner_multiply_by_pose_oks: bool = False,
    rescale_pose_loss_with_assigned_score: bool = False,
    average_losses_in_ddp: bool = False,
):
    """
    :param oks_sigmas:                 OKS sigmas for pose estimation. Array of [Num Keypoints].
    :param classification_loss_type:   Classification loss type. One of "focal" or "bce"
    :param regression_iou_loss_type:   Regression IoU loss type. One of "giou" or "ciou"
    :param classification_loss_weight: Classification loss weight
    :param iou_loss_weight:            IoU loss weight
    :param dfl_loss_weight:            DFL loss weight
    :param pose_cls_loss_weight:       Pose classification loss weight
    :param pose_reg_loss_weight:       Pose regression loss weight
    :param average_losses_in_ddp:      Whether to average losses in DDP mode. In theory, enabling this option
                                       should have the positive impact on model accuracy since it would smooth out
                                       influence of batches with small number of objects.
                                       However, it needs to be proven empirically.
    """
    super().__init__()
    self.classification_loss_type = classification_loss_type
    self.classification_loss_weight = classification_loss_weight
    self.dfl_loss_weight = dfl_loss_weight
    self.iou_loss_weight = iou_loss_weight

    self.iou_loss = {"giou": GIoULoss, "ciou": CIoULoss}[regression_iou_loss_type]()
    self.num_keypoints = len(oks_sigmas)
    self.num_classes = 1  # We have only one class in pose estimation task
    self.oks_sigmas = torch.tensor(oks_sigmas)
    self.pose_cls_loss_weight = pose_cls_loss_weight
    self.pose_reg_loss_weight = pose_reg_loss_weight
    self.assigner = YoloNASPoseTaskAlignedAssigner(
        sigmas=self.oks_sigmas,
        topk=bbox_assigner_topk,
        alpha=bbox_assigned_alpha,
        beta=bbox_assigned_beta,
        multiply_by_pose_oks=assigner_multiply_by_pose_oks,
    )
    self.pose_classification_loss_type = pose_classification_loss_type
    self.rescale_pose_loss_with_assigned_score = rescale_pose_loss_with_assigned_score
    self.average_losses_in_ddp = average_losses_in_ddp

forward(outputs, targets)

Parameters:

Name Type Description Default
outputs Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]]

Tuple whose second element is the raw predictions tuple (pred_scores, pred_distri, pred_pose_coords, pred_pose_logits, anchors, anchor_points, num_anchors_list, stride_tensor), as unpacked in the source below

required
targets Tuple[Tensor, Tensor, Tensor]

A tuple of (boxes, joints, crowd) tensors where
- boxes: [N, 5] (batch_index, x1, y1, x2, y2)
- joints: [N, num_joints, 4] (batch_index, x, y, visibility)
- crowd: [N, 2] (batch_index, is_crowd)

required

Returns:

Type Description
Tuple[Tensor, Tensor]

Tuple of two tensors, where the first element is the main loss for backward and the second is a stacked tensor of all the individual loss components

Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
def forward(
    self,
    outputs: Tuple[Tuple[Tensor, Tensor, Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]],
    targets: Tuple[Tensor, Tensor, Tensor],
) -> Tuple[Tensor, Tensor]:
    """
    :param outputs: Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor
    :param targets: A tuple of (boxes, joints, crowd) tensors where
                    - boxes: [N, 5] (batch_index, x1, y1, x2, y2)
                    - joints: [N, num_joints, 4] (batch_index, x, y, visibility)
                    - crowd: [N, 2] (batch_index, is_crowd)
    :return:        Tuple of two tensors where first element is main loss for backward and
                    second element is stacked tensor of all individual losses
    """
    _, predictions = outputs

    (
        pred_scores,
        pred_distri,
        pred_pose_coords,  # [B, Anchors, C, 2]
        pred_pose_logits,  # [B, Anchors, C]
        anchors,
        anchor_points,
        num_anchors_list,
        stride_tensor,
    ) = predictions

    targets = self._unpack_flat_targets(targets, batch_size=pred_scores.size(0))

    anchor_points_s = anchor_points / stride_tensor
    pred_bboxes, reg_max = self._bbox_decode(anchor_points_s, pred_distri)

    gt_labels = targets["gt_class"]
    gt_bboxes = targets["gt_bbox"]
    gt_poses = targets["gt_poses"]
    gt_crowd = targets["gt_crowd"]
    pad_gt_mask = targets["pad_gt_mask"]

    # label assignment
    assign_result = self.assigner(
        pred_scores=pred_scores.detach().sigmoid(),  # Pred scores are logits on training for numerical stability
        pred_bboxes=pred_bboxes.detach() * stride_tensor,
        pred_pose_coords=pred_pose_coords.detach(),
        anchor_points=anchor_points,
        gt_labels=gt_labels,
        gt_bboxes=gt_bboxes,
        gt_poses=gt_poses,
        gt_crowd=gt_crowd,
        pad_gt_mask=pad_gt_mask,
        bg_index=self.num_classes,
    )

    assigned_scores = assign_result.assigned_scores

    # cls loss
    if self.classification_loss_type == "focal":
        loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=-1)
    elif self.classification_loss_type == "bce":
        loss_cls = torch.nn.functional.binary_cross_entropy_with_logits(pred_scores, assigned_scores, reduction="sum")
    else:
        raise ValueError(f"Unknown classification loss type: {self.classification_loss_type}")

    assigned_scores_sum = assigned_scores.sum()
    if self.average_losses_in_ddp and is_distributed():
        torch.distributed.all_reduce(assigned_scores_sum, op=torch.distributed.ReduceOp.SUM)
        assigned_scores_sum /= get_world_size()
    assigned_scores_sum = torch.clip(assigned_scores_sum, min=1.0)
    loss_cls /= assigned_scores_sum

    loss_iou, loss_dfl, loss_pose_cls, loss_pose_reg = self._bbox_loss(
        pred_distri,
        pred_bboxes,
        pred_pose_coords=pred_pose_coords,
        pred_pose_logits=pred_pose_logits,
        stride_tensor=stride_tensor,
        anchor_points=anchor_points_s,
        assign_result=assign_result,
        assigned_scores_sum=assigned_scores_sum,
        reg_max=reg_max,
    )

    loss_cls = loss_cls * self.classification_loss_weight
    loss_iou = loss_iou * self.iou_loss_weight
    loss_dfl = loss_dfl * self.dfl_loss_weight
    loss_pose_cls = loss_pose_cls * self.pose_cls_loss_weight
    loss_pose_reg = loss_pose_reg * self.pose_reg_loss_weight

    loss = loss_cls + loss_iou + loss_dfl + loss_pose_cls + loss_pose_reg
    log_losses = torch.stack([loss_cls.detach(), loss_iou.detach(), loss_dfl.detach(), loss_pose_cls.detach(), loss_pose_reg.detach(), loss.detach()])

    return loss, log_losses
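
The second returned tensor stacks the individual terms in the order given by component_names, so they can be logged by name. A tiny illustration with made-up values in that layout:

import torch

component_names = ["loss_cls", "loss_iou", "loss_dfl", "loss_pose_cls", "loss_pose_reg", "loss"]  # == criterion.component_names
log_losses = torch.tensor([0.8, 0.3, 0.1, 0.5, 0.4, 2.1])  # illustrative values only

for name, value in zip(component_names, log_losses):
    print(f"{name}: {value.item():.4f}")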

YoloNASPoseTaskAlignedAssigner

Bases: nn.Module

Task-aligned assigner repurposed from YoloNAS for pose estimation task

This class is almost identical to TaskAlignedAssigner, but it also assigns poses. Unlike in object detection, where the assigned scores are the product of IoU and class confidence, in pose estimation the final assignment score is the product of pose OKS and bbox IoU. This was empirically found to provide superior performance to the original approach.

Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
class YoloNASPoseTaskAlignedAssigner(nn.Module):
    """
    Task-aligned assigner repurposed from YoloNAS for pose estimation task

    This class is almost identical to TaskAlignedAssigner, but it also assigns poses and unlike in
    object detection where assigned scores are product of IoU and class confidence, in pose estimation
    final assignment score is product of pose OKS and bbox IoU. This was empirically found to provide
    superior performance that the original approach.
    """

    def __init__(self, sigmas: Tensor, topk: int = 13, alpha: float = 1.0, beta=6.0, eps=1e-9, multiply_by_pose_oks: bool = False):
        """

        :param sigmas:               Sigmas for OKS calculation
        :param topk:                 Maximum number of anchors that is selected for each gt box
        :param alpha:                Power factor for class probabilities of predicted boxes (Used compute alignment metric)
        :param beta:                 Power factor for IoU score of predicted boxes (Used compute alignment metric)
        :param eps:                  Small constant for numerical stability
        :param multiply_by_pose_oks: Whether to multiply alignment metric by pose OKS
        """
        super().__init__()
        self.topk = topk
        self.alpha = alpha
        self.beta = beta
        self.eps = eps
        self.sigmas = sigmas
        self.multiply_by_pose_oks = multiply_by_pose_oks

    @torch.no_grad()
    def forward(
        self,
        pred_scores: Tensor,
        pred_bboxes: Tensor,
        pred_pose_coords: Tensor,
        anchor_points: Tensor,
        gt_labels: Tensor,
        gt_bboxes: Tensor,
        gt_poses: Tensor,
        gt_crowd: Tensor,
        pad_gt_mask: Tensor,
        bg_index: int,
    ) -> YoloNASPoseYoloNASPoseBoxesAssignmentResult:
        """
        This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py

        The assignment is done in following steps
        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
        2. select top-k bbox as candidates for each gt
        3. limit the positive sample's center in gt (because the anchor-free detector
           only can predict positive distance)
        4. if an anchor box is assigned to multiple gts, the one with the
           highest iou will be selected.

        :param pred_scores: Tensor (float32): predicted class probability, shape(B, L, C)
        :param pred_bboxes: Tensor (float32): predicted bounding boxes, shape(B, L, 4)
        :param pred_pose_coords: Tensor (float32): predicted poses, shape(B, L, Num Keypoints, 2)
        :param anchor_points: Tensor (float32): pre-defined anchors, shape(L, 2), xy format
        :param gt_labels: Tensor (int64|int32): Label of gt_bboxes, shape(B, n, 1)
        :param gt_bboxes: Tensor (float32): Ground truth bboxes, shape(B, n, 4)
        :param gt_poses: Tensor (float32): Ground truth poses, shape(B, n, Num Keypoints, 3)
        :param gt_crowd: Tensor (int): Whether the gt is crowd, shape(B, n, 1)
        :param pad_gt_mask: Tensor (float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
        :param bg_index: int ( background index
        :param gt_scores: Tensor (one, float32) Score of gt_bboxes, shape(B, n, 1)
        :return:
            - assigned_labels, Tensor of shape (B, L)
            - assigned_bboxes, Tensor of shape (B, L, 4)
            - assigned_scores, Tensor of shape (B, L, C)
        """
        assert pred_scores.ndim == pred_bboxes.ndim
        assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3

        batch_size, num_anchors, num_classes = pred_scores.shape
        _, _, num_keypoints, _ = pred_pose_coords.shape
        _, num_max_boxes, _ = gt_bboxes.shape

        # negative batch
        if num_max_boxes == 0:
            assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=gt_labels.device)
            assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=gt_labels.device)
            assigned_poses = torch.zeros([batch_size, num_anchors, num_keypoints, 3], device=gt_labels.device)
            assigned_scores = torch.zeros([batch_size, num_anchors, num_classes], device=gt_labels.device)
            assigned_gt_index = torch.zeros([batch_size, num_anchors], dtype=torch.long, device=gt_labels.device)
            assigned_crowd = torch.zeros([batch_size, num_anchors], dtype=torch.bool, device=gt_labels.device)

            return YoloNASPoseYoloNASPoseBoxesAssignmentResult(
                assigned_labels=assigned_labels,
                assigned_bboxes=assigned_bboxes,
                assigned_scores=assigned_scores,
                assigned_gt_index=assigned_gt_index,
                assigned_poses=assigned_poses,
                assigned_crowd=assigned_crowd,
            )

        # compute iou between gt and pred bbox, [B, n, L]
        ious = batch_iou_similarity(gt_bboxes, pred_bboxes)

        if self.multiply_by_pose_oks:
            pose_oks = batch_pose_oks(gt_poses, pred_pose_coords, gt_bboxes, self.sigmas.to(pred_pose_coords.device))
            ious = ious * pose_oks

        # gather pred bboxes class score
        pred_scores = torch.permute(pred_scores, [0, 2, 1])
        batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
        gt_labels_ind = torch.stack([batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], dim=-1)

        bbox_cls_scores = pred_scores[gt_labels_ind[..., 0], gt_labels_ind[..., 1]]

        # compute alignment metrics, [B, n, L]
        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(self.beta)

        # check the positive sample's center in gt, [B, n, L]
        is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)

        # select topk largest alignment metrics pred bbox as candidates
        # for each gt, [B, n, L]
        is_in_topk = gather_topk_anchors(alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)

        # select positive sample, [B, n, L]
        mask_positive = is_in_topk * is_in_gts * pad_gt_mask

        # if an anchor box is assigned to multiple gts,
        # the one with the highest iou will be selected, [B, n, L]
        mask_positive_sum = mask_positive.sum(dim=-2)
        if mask_positive_sum.max() > 1:
            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
            is_max_iou = compute_max_iou_anchor(ious)
            mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
            mask_positive_sum = mask_positive.sum(dim=-2)
        assigned_gt_index = mask_positive.argmax(dim=-2)

        # assigned target
        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
        assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
        assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))

        assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])

        assigned_poses = gt_poses.reshape([-1, num_keypoints, 3])[assigned_gt_index.flatten(), :]
        assigned_poses = assigned_poses.reshape([batch_size, num_anchors, num_keypoints, 3])

        assigned_scores = torch.nn.functional.one_hot(assigned_labels, num_classes + 1)
        ind = list(range(num_classes + 1))
        ind.remove(bg_index)
        assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device, dtype=torch.long), dim=-1)
        # rescale alignment metrics
        alignment_metrics *= mask_positive
        max_metrics_per_instance = alignment_metrics.max(dim=-1, keepdim=True).values
        max_ious_per_instance = (ious * mask_positive).max(dim=-1, keepdim=True).values
        alignment_metrics = alignment_metrics / (max_metrics_per_instance + self.eps) * max_ious_per_instance
        alignment_metrics = alignment_metrics.max(dim=-2).values.unsqueeze(-1)
        assigned_scores = assigned_scores * alignment_metrics

        # respect crowd
        assigned_crowd = torch.gather(gt_crowd.flatten(), index=assigned_gt_index.flatten(), dim=0)
        assigned_crowd = assigned_crowd.reshape([batch_size, num_anchors])
        assigned_scores = assigned_scores * assigned_crowd.eq(0).unsqueeze(-1)

        return YoloNASPoseYoloNASPoseBoxesAssignmentResult(
            assigned_labels=assigned_labels,
            assigned_bboxes=assigned_bboxes,
            assigned_scores=assigned_scores,
            assigned_poses=assigned_poses,
            assigned_gt_index=assigned_gt_index,
            assigned_crowd=assigned_crowd,
        )
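
The loss builds this assigner internally, so constructing one by hand is mainly useful for inspection or debugging. A construction sketch (sigma values are placeholders; the commented call shows the expected argument order from the forward() signature):

import torch
from super_gradients.training.losses.yolo_nas_pose_loss import YoloNASPoseTaskAlignedAssigner  # assumed import path

assigner = YoloNASPoseTaskAlignedAssigner(
    sigmas=torch.full((17,), 0.1),  # placeholder per-keypoint OKS sigmas
    topk=13,
    alpha=1.0,
    beta=6.0,
    multiply_by_pose_oks=True,  # fold pose OKS into the IoU term of the alignment metric
)
# assign_result = assigner(pred_scores, pred_bboxes, pred_pose_coords, anchor_points,
#                          gt_labels, gt_bboxes, gt_poses, gt_crowd, pad_gt_mask, bg_index=1)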

__init__(sigmas, topk=13, alpha=1.0, beta=6.0, eps=1e-09, multiply_by_pose_oks=False)

Parameters:

Name Type Description Default
sigmas Tensor

Sigmas for OKS calculation

required
topk int

Maximum number of anchors that is selected for each gt box

13
alpha float

Power factor for class probabilities of predicted boxes (used to compute the alignment metric)

1.0
beta

Power factor for IoU score of predicted boxes (used to compute the alignment metric)

6.0
eps

Small constant for numerical stability

1e-09
multiply_by_pose_oks bool

Whether to multiply alignment metric by pose OKS

False
Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
def __init__(self, sigmas: Tensor, topk: int = 13, alpha: float = 1.0, beta=6.0, eps=1e-9, multiply_by_pose_oks: bool = False):
    """

    :param sigmas:               Sigmas for OKS calculation
    :param topk:                 Maximum number of anchors that is selected for each gt box
    :param alpha:                Power factor for class probabilities of predicted boxes (Used compute alignment metric)
    :param beta:                 Power factor for IoU score of predicted boxes (Used compute alignment metric)
    :param eps:                  Small constant for numerical stability
    :param multiply_by_pose_oks: Whether to multiply alignment metric by pose OKS
    """
    super().__init__()
    self.topk = topk
    self.alpha = alpha
    self.beta = beta
    self.eps = eps
    self.sigmas = sigmas
    self.multiply_by_pose_oks = multiply_by_pose_oks

forward(pred_scores, pred_bboxes, pred_pose_coords, anchor_points, gt_labels, gt_bboxes, gt_poses, gt_crowd, pad_gt_mask, bg_index)

This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py

The assignment is done in the following steps:
1. Compute the alignment metric between all bboxes (across all pyramid levels) and each gt.
2. Select the top-k bboxes as candidates for each gt.
3. Limit the positive samples' centers to lie inside the gt (because the anchor-free detector can only predict positive distances).
4. If an anchor box is assigned to multiple gts, the one with the highest IoU is selected.

Parameters:

Name Type Description Default
pred_scores Tensor

Tensor (float32): predicted class probability, shape(B, L, C)

required
pred_bboxes Tensor

Tensor (float32): predicted bounding boxes, shape(B, L, 4)

required
pred_pose_coords Tensor

Tensor (float32): predicted poses, shape(B, L, Num Keypoints, 2)

required
anchor_points Tensor

Tensor (float32): pre-defined anchors, shape(L, 2), xy format

required
gt_labels Tensor

Tensor (int64|int32): Label of gt_bboxes, shape(B, n, 1)

required
gt_bboxes Tensor

Tensor (float32): Ground truth bboxes, shape(B, n, 4)

required
gt_poses Tensor

Tensor (float32): Ground truth poses, shape(B, n, Num Keypoints, 3)

required
gt_crowd Tensor

Tensor (int): Whether the gt is crowd, shape(B, n, 1)

required
pad_gt_mask Tensor

Tensor (float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)

required
bg_index int

Background index

required
gt_scores

Tensor (one, float32) Score of gt_bboxes, shape(B, n, 1)

required

Returns:

Type Description
YoloNASPoseYoloNASPoseBoxesAssignmentResult
- assigned_labels: Tensor of shape (B, L)
- assigned_bboxes: Tensor of shape (B, L, 4)
- assigned_scores: Tensor of shape (B, L, C)
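
The alignment metric from step 1 reduces to a single line of the listing below. Restated in isolation with random stand-in tensors of the documented shapes (names follow the forward() arguments; alpha and beta are the constructor parameters):

import torch

B, n, L = 2, 8, 1000           # batch size, max gt boxes, total anchors (illustrative)
alpha, beta = 1.0, 6.0         # constructor defaults

ious = torch.rand(B, n, L)             # stand-in for batch_iou_similarity(gt_bboxes, pred_bboxes)
bbox_cls_scores = torch.rand(B, n, L)  # stand-in for the gathered per-gt class scores

alignment_metrics = bbox_cls_scores.pow(alpha) * ious.pow(beta)  # [B, n, L]

Since both factors lie in [0, 1], a larger beta makes the metric more sensitive to localization (and, with multiply_by_pose_oks, pose) quality, while alpha plays the same role for classification confidence.
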
Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
@torch.no_grad()
def forward(
    self,
    pred_scores: Tensor,
    pred_bboxes: Tensor,
    pred_pose_coords: Tensor,
    anchor_points: Tensor,
    gt_labels: Tensor,
    gt_bboxes: Tensor,
    gt_poses: Tensor,
    gt_crowd: Tensor,
    pad_gt_mask: Tensor,
    bg_index: int,
) -> YoloNASPoseYoloNASPoseBoxesAssignmentResult:
    """
    This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py

    The assignment is done in following steps
    1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
    2. select top-k bbox as candidates for each gt
    3. limit the positive sample's center in gt (because the anchor-free detector
       only can predict positive distance)
    4. if an anchor box is assigned to multiple gts, the one with the
       highest iou will be selected.

    :param pred_scores: Tensor (float32): predicted class probability, shape(B, L, C)
    :param pred_bboxes: Tensor (float32): predicted bounding boxes, shape(B, L, 4)
    :param pred_pose_coords: Tensor (float32): predicted poses, shape(B, L, Num Keypoints, 2)
    :param anchor_points: Tensor (float32): pre-defined anchors, shape(L, 2), xy format
    :param gt_labels: Tensor (int64|int32): Label of gt_bboxes, shape(B, n, 1)
    :param gt_bboxes: Tensor (float32): Ground truth bboxes, shape(B, n, 4)
    :param gt_poses: Tensor (float32): Ground truth poses, shape(B, n, Num Keypoints, 3)
    :param gt_crowd: Tensor (int): Whether the gt is crowd, shape(B, n, 1)
    :param pad_gt_mask: Tensor (float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
    :param bg_index: Background index
    :param gt_scores: Tensor (float32): Score of gt_bboxes, shape(B, n, 1)
    :return:
        - assigned_labels, Tensor of shape (B, L)
        - assigned_bboxes, Tensor of shape (B, L, 4)
        - assigned_scores, Tensor of shape (B, L, C)
    """
    assert pred_scores.ndim == pred_bboxes.ndim
    assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3

    batch_size, num_anchors, num_classes = pred_scores.shape
    _, _, num_keypoints, _ = pred_pose_coords.shape
    _, num_max_boxes, _ = gt_bboxes.shape

    # negative batch
    if num_max_boxes == 0:
        assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=gt_labels.device)
        assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=gt_labels.device)
        assigned_poses = torch.zeros([batch_size, num_anchors, num_keypoints, 3], device=gt_labels.device)
        assigned_scores = torch.zeros([batch_size, num_anchors, num_classes], device=gt_labels.device)
        assigned_gt_index = torch.zeros([batch_size, num_anchors], dtype=torch.long, device=gt_labels.device)
        assigned_crowd = torch.zeros([batch_size, num_anchors], dtype=torch.bool, device=gt_labels.device)

        return YoloNASPoseYoloNASPoseBoxesAssignmentResult(
            assigned_labels=assigned_labels,
            assigned_bboxes=assigned_bboxes,
            assigned_scores=assigned_scores,
            assigned_gt_index=assigned_gt_index,
            assigned_poses=assigned_poses,
            assigned_crowd=assigned_crowd,
        )

    # compute iou between gt and pred bbox, [B, n, L]
    ious = batch_iou_similarity(gt_bboxes, pred_bboxes)

    if self.multiply_by_pose_oks:
        pose_oks = batch_pose_oks(gt_poses, pred_pose_coords, gt_bboxes, self.sigmas.to(pred_pose_coords.device))
        ious = ious * pose_oks

    # gather pred bboxes class score
    pred_scores = torch.permute(pred_scores, [0, 2, 1])
    batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
    gt_labels_ind = torch.stack([batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], dim=-1)

    bbox_cls_scores = pred_scores[gt_labels_ind[..., 0], gt_labels_ind[..., 1]]

    # compute alignment metrics, [B, n, L]
    alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(self.beta)

    # check the positive sample's center in gt, [B, n, L]
    is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)

    # select topk largest alignment metrics pred bbox as candidates
    # for each gt, [B, n, L]
    is_in_topk = gather_topk_anchors(alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)

    # select positive sample, [B, n, L]
    mask_positive = is_in_topk * is_in_gts * pad_gt_mask

    # if an anchor box is assigned to multiple gts,
    # the one with the highest iou will be selected, [B, n, L]
    mask_positive_sum = mask_positive.sum(dim=-2)
    if mask_positive_sum.max() > 1:
        mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
        is_max_iou = compute_max_iou_anchor(ious)
        mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
        mask_positive_sum = mask_positive.sum(dim=-2)
    assigned_gt_index = mask_positive.argmax(dim=-2)

    # assigned target
    assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
    assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
    assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
    assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))

    assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
    assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])

    assigned_poses = gt_poses.reshape([-1, num_keypoints, 3])[assigned_gt_index.flatten(), :]
    assigned_poses = assigned_poses.reshape([batch_size, num_anchors, num_keypoints, 3])

    assigned_scores = torch.nn.functional.one_hot(assigned_labels, num_classes + 1)
    ind = list(range(num_classes + 1))
    ind.remove(bg_index)
    assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device, dtype=torch.long), dim=-1)
    # rescale alignment metrics
    alignment_metrics *= mask_positive
    max_metrics_per_instance = alignment_metrics.max(dim=-1, keepdim=True).values
    max_ious_per_instance = (ious * mask_positive).max(dim=-1, keepdim=True).values
    alignment_metrics = alignment_metrics / (max_metrics_per_instance + self.eps) * max_ious_per_instance
    alignment_metrics = alignment_metrics.max(dim=-2).values.unsqueeze(-1)
    assigned_scores = assigned_scores * alignment_metrics

    # respect crowd
    assigned_crowd = torch.gather(gt_crowd.flatten(), index=assigned_gt_index.flatten(), dim=0)
    assigned_crowd = assigned_crowd.reshape([batch_size, num_anchors])
    assigned_scores = assigned_scores * assigned_crowd.eq(0).unsqueeze(-1)

    return YoloNASPoseYoloNASPoseBoxesAssignmentResult(
        assigned_labels=assigned_labels,
        assigned_bboxes=assigned_bboxes,
        assigned_scores=assigned_scores,
        assigned_poses=assigned_poses,
        assigned_gt_index=assigned_gt_index,
        assigned_crowd=assigned_crowd,
    )

YoloNASPoseYoloNASPoseBoxesAssignmentResult dataclass

This dataclass stores the result of assigning predicted boxes to ground truth boxes for the YoloNASPose model. It is produced by YoloNASPoseTaskAlignedAssigner and is used by YoloNASPoseLoss to compute the loss.

For all fields, first dimension is batch dimension, second dimension is number of anchors.

Parameters:

Name Type Description Default
assigned_labels Tensor

Tensor of shape (B, L) - Assigned gt labels for each anchor location

required
assigned_bboxes Tensor

Tensor of shape (B, L, 4) - Assigned groundtruth boxes in XYXY format for each anchor location

required
assigned_scores Tensor

Tensor of shape (B, L, C) - Assigned scores for each anchor location

required
assigned_poses Tensor

Tensor of shape (B, L, 17, 3) - Assigned groundtruth poses for each anchor location

required
assigned_gt_index Tensor

Tensor of shape (B, L) - Index of assigned groundtruth box for each anchor location

required
assigned_crowd Tensor

Tensor of shape (B, L) - Whether the assigned groundtruth box is crowd

required
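For orientation, a hypothetical by-hand construction of this result with the shapes listed above (B=2 images, L=4 anchors, C=1 class, 17 keypoints). In practice the object is built by YoloNASPoseTaskAlignedAssigner; the import path below is assumed to match the source file shown underneath:

import torch
from super_gradients.training.losses.yolo_nas_pose_loss import YoloNASPoseYoloNASPoseBoxesAssignmentResult

B, L, C, K = 2, 4, 1, 17
result = YoloNASPoseYoloNASPoseBoxesAssignmentResult(
    assigned_labels=torch.zeros(B, L, dtype=torch.long),
    assigned_bboxes=torch.zeros(B, L, 4),
    assigned_poses=torch.zeros(B, L, K, 3),
    assigned_scores=torch.zeros(B, L, C),
    assigned_gt_index=torch.zeros(B, L, dtype=torch.long),
    assigned_crowd=torch.zeros(B, L, dtype=torch.bool),
)
print(result.assigned_poses.shape)  # torch.Size([2, 4, 17, 3])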
Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
@dataclasses.dataclass
class YoloNASPoseYoloNASPoseBoxesAssignmentResult:
    """
    This dataclass stores the result of assigning predicted boxes to ground truth boxes for the YoloNASPose model.
    It is produced by YoloNASPoseTaskAlignedAssigner and is used by YoloNASPoseLoss to compute the loss.

    For all fields, first dimension is batch dimension, second dimension is number of anchors.

    :param assigned_labels: Tensor of shape (B, L) - Assigned gt labels for each anchor location
    :param assigned_bboxes: Tensor of shape (B, L, 4) - Assigned groundtruth boxes in XYXY format for each anchor location
    :param assigned_scores: Tensor of shape (B, L, C) - Assigned scores for each anchor location
    :param assigned_poses: Tensor of shape (B, L, 17, 3) - Assigned groundtruth poses for each anchor location
    :param assigned_gt_index: Tensor of shape (B, L) - Index of assigned groundtruth box for each anchor location
    :param assigned_crowd: Tensor of shape (B, L) - Whether the assigned groundtruth box is crowd
    """

    assigned_labels: Tensor
    assigned_bboxes: Tensor
    assigned_poses: Tensor
    assigned_scores: Tensor
    assigned_gt_index: Tensor
    assigned_crowd: Tensor

batch_pose_oks(gt_keypoints, pred_keypoints, gt_bboxes_xyxy, sigmas, eps=1e-09)

Calculate batched OKS (Object Keypoint Similarity) between two sets of keypoints.

Parameters:

Name Type Description Default
gt_keypoints torch.Tensor

Joints with the shape [N, M1, Num Joints, 3]

required
gt_bboxes_xyxy torch.Tensor

Array of bboxes with the shape [N, M1, 4] in XYXY format

required
pred_keypoints torch.Tensor

Joints with the shape [N, M2, Num Joints, 2]

required
sigmas torch.Tensor

Sigmas with the shape [Num Joints]

required
eps float

Small constant for numerical stability

required

Returns:

Type Description
Tensor

OKS between gt_keypoints and pred_keypoints with the shape [N, M1, M2]
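The per-keypoint similarity reduces to exp(-d² / (2 · area · (2σ)²)), averaged over the visible ground-truth joints. A tiny single-pose sketch of that formula with made-up numbers (it mirrors the batched source below, but is not taken from it):

import torch

# One gt pose and one predicted pose with 3 keypoints (toy data)
sigmas = torch.tensor([0.026, 0.025, 0.035])           # per-keypoint COCO-style sigmas
gt = torch.tensor([[10.0, 10.0, 2.0],                   # x, y, visibility
                   [20.0, 12.0, 1.0],
                   [30.0, 15.0, 0.0]])                  # invisible -> ignored
pred = torch.tensor([[11.0, 10.5],
                     [19.0, 13.0],
                     [28.0, 16.0]])

area = 40.0 * 60.0 * 0.53                               # bbox area scaled by the 0.53 heuristic
d2 = ((gt[:, :2] - pred) ** 2).sum(-1)                  # squared distance per keypoint
e = d2 / ((2 * sigmas) ** 2) / area / 2
oks_per_kpt = torch.exp(-e)

visible = gt[:, 2] > 0
oks = (oks_per_kpt * visible).sum() / visible.sum().clamp(min=1)
print(float(oks))                                       # a scalar OKS in (0, 1]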

Source code in V3_6/src/super_gradients/training/losses/yolo_nas_pose_loss.py
def batch_pose_oks(gt_keypoints: torch.Tensor, pred_keypoints: torch.Tensor, gt_bboxes_xyxy: torch.Tensor, sigmas: torch.Tensor, eps: float = 1e-9) -> Tensor:
    """
    Calculate batched OKS (Object Keypoint Similarity) between two sets of keypoints.

    :param gt_keypoints:   Joints with the shape [N, M1, Num Joints, 3]
    :param gt_bboxes_xyxy: Array of bboxes with the shape [N, M1, 4] in XYXY format
    :param pred_keypoints: Joints with the shape [N, M2, Num Joints, 2]
    :param sigmas:         Sigmas with the shape [Num Joints]
    :param eps:            Small constant for numerical stability
    :return iou:           OKS between gt_keypoints and pred_keypoints with the shape [N, M1, M2]
    """

    joints1_xy = gt_keypoints[:, :, :, 0:2].unsqueeze(2)  # [N, M1, 1, Num Joints, 2]
    joints2_xy = pred_keypoints[:, :, :, 0:2].unsqueeze(1)  # [N, 1, M2, Num Joints, 2]

    d = ((joints1_xy - joints2_xy) ** 2).sum(dim=-1, keepdim=False)  # [N, M1, M2, Num Joints]

    # Infer pose area from bbox area * 0.53 (COCO heuristic)
    area = (gt_bboxes_xyxy[:, :, 2] - gt_bboxes_xyxy[:, :, 0]) * (gt_bboxes_xyxy[:, :, 3] - gt_bboxes_xyxy[:, :, 1]) * 0.53  # [N, M1]
    area = area[:, :, None, None]  # [N, M1, 1, 1]
    sigmas = sigmas.reshape([1, 1, 1, -1])  # [1, 1, 1, Num Keypoints]

    e: Tensor = d / (2 * sigmas) ** 2 / (area + eps) / 2
    oks = torch.exp(-e)  # [N, M1, M2, Num Keypoints]

    joints1_visiblity = gt_keypoints[:, :, :, 2].gt(0).float().unsqueeze(2)  # [N, M1, 1, Num Keypoints]
    num_visible_joints = joints1_visiblity.sum(dim=-1, keepdim=False)  # [N, M1, M2]
    mean_oks = (oks * joints1_visiblity).sum(dim=-1, keepdim=False) / (num_visible_joints + eps)  # [N, M1, M2]

    return mean_oks

Based on https://github.com/Megvii-BaseDetection/YOLOX (Apache-2.0 license)

IOUloss

Bases: nn.Module

IoU loss with the following supported loss types:

Parameters:

Name Type Description Default
reduction str

One of ["mean", "sum", "none"] reduction to apply to the computed loss (Default="none")

'none'
loss_type str

One of ["iou", "giou"] where: * 'iou' for (1 - iou^2) * 'giou' according to "Generalized Intersection over Union: A Metric and A Loss for Bounding Box Regression" (1 - giou), where giou = iou - (cover_box - union_box)/cover_box

'iou'
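Boxes are expected in (cx, cy, w, h) format, since the forward pass converts them to corners via ±w/2 and ±h/2. A short usage sketch with toy boxes, assuming IOUloss is imported from the module shown in the source path below:

import torch
from super_gradients.training.losses.yolox_loss import IOUloss

criterion = IOUloss(reduction="mean", loss_type="giou")

# Boxes in (cx, cy, w, h) format, one prediction per target
pred = torch.tensor([[50.0, 50.0, 20.0, 20.0],
                     [10.0, 10.0, 8.0, 6.0]])
target = torch.tensor([[52.0, 49.0, 22.0, 18.0],
                       [30.0, 30.0, 8.0, 6.0]])   # second pair does not overlap

loss = criterion(pred, target)
print(loss)   # mean of (1 - giou) over the two pairs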
Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
class IOUloss(nn.Module):
    """
    IoU loss with the following supported loss types:
    :param reduction: One of ["mean", "sum", "none"] reduction to apply to the computed loss (Default="none")
    :param loss_type: One of ["iou", "giou"] where:
            * 'iou' for
                (1 - iou^2)
            * 'giou' according to "Generalized Intersection over Union: A Metric and A Loss for Bounding Box Regression"
                (1 - giou), where giou = iou - (cover_box - union_box)/cover_box
    """

    def __init__(self, reduction: str = "none", loss_type: str = "iou"):
        super(IOUloss, self).__init__()
        self._validate_args(loss_type, reduction)
        self.reduction = reduction
        self.loss_type = loss_type

    @staticmethod
    def _validate_args(loss_type, reduction):
        supported_losses = ["iou", "giou"]
        supported_reductions = ["mean", "sum", "none"]
        if loss_type not in supported_losses:
            raise ValueError("Illegal loss_type value: " + loss_type + ", expected one of: " + str(supported_losses))
        if reduction not in supported_reductions:
            raise ValueError("Illegal reduction value: " + reduction + ", expected one of: " + str(supported_reductions))

    def forward(self, pred, target):
        assert pred.shape[0] == target.shape[0]

        pred = pred.view(-1, 4)
        target = target.view(-1, 4)
        tl = torch.max((pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2))
        br = torch.min((pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2))

        area_p = torch.prod(pred[:, 2:], 1)
        area_g = torch.prod(target[:, 2:], 1)

        en = (tl < br).type(tl.type()).prod(dim=1)
        area_i = torch.prod(br - tl, 1) * en
        area_u = area_p + area_g - area_i
        iou = (area_i) / (area_u + 1e-16)

        if self.loss_type == "iou":
            loss = 1 - iou**2
        elif self.loss_type == "giou":
            c_tl = torch.min((pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2))
            c_br = torch.max((pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2))
            area_c = torch.prod(c_br - c_tl, 1)
            giou = iou - (area_c - area_u) / area_c.clamp(1e-16)
            loss = 1 - giou.clamp(min=-1.0, max=1.0)

        if self.reduction == "mean":
            loss = loss.mean()
        elif self.reduction == "sum":
            loss = loss.sum()

        return loss

YoloXDetectionLoss

Bases: _Loss

Calculate YOLOX loss: L = L_objectness + L_iou + L_classification + 1[use_l1]*L_l1

where:
  • L_iou, L_classification and L_l1 are calculated only between cells and targets that suit them;
  • L_objectness is calculated for all cells.

L_classification:
    for cells that have suitable ground truths in their grid locations add BCEs
    to force a prediction of IoU with a GT in a multi-label way
    Coef: 1.
L_iou:
    for cells that have suitable ground truths in their grid locations
    add (1 - IoU^2), IoU between a predicted box and each GT box, force maximum IoU
    Coef: 5.
L_l1:
    for cells that have suitable ground truths in their grid locations
    l1 distance between the logits and GTs in “logits” format (the inverse of “logits to predictions” ops)
    Coef: 1[use_l1]
L_objectness:
    for each cell add BCE with a label of 1 if there is GT assigned to the cell
    Coef: 1

Parameters:

Name Type Description Default
strides List[int]

List of Yolo levels output grid sizes (i.e [8, 16, 32]).

required
num_classes int

Number of classes.

required
use_l1 bool

Controls the L_l1 Coef as discussed above (default=False).

False
center_sampling_radius float

Sampling radius used for center sampling when creating the fg mask (default=2.5).

2.5
iou_type str

IoU loss type, one of ["iou", "giou"] (default="iou").

'iou'
iou_weight float

Weight to apply to the iou loss term.

5.0
obj_weight float

Weight to apply to the obj loss term.

1.0
cls_weight float

Weight to apply to the cls loss term.

1.0
cls_pos_weight Optional[torch.Tensor]

Class weights for the cls loss. Passed on to torch.nn.BCEWithLogitsLoss

None
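A minimal construction sketch based on the constructor shown below; the strides, class count and target rows are illustrative:

import torch
from super_gradients.training.losses.yolox_loss import YoloXDetectionLoss

loss_fn = YoloXDetectionLoss(
    strides=[8, 16, 32],      # output strides of the three detection levels
    num_classes=80,
    use_l1=False,
    iou_type="giou",
)

# targets: one row per ground-truth box -> (image_id_in_batch, class, cx, cy, w, h)
targets = torch.tensor([
    [0, 1, 120.0, 80.0, 40.0, 60.0],
    [0, 5, 300.0, 200.0, 80.0, 90.0],
    [1, 1, 64.0, 64.0, 32.0, 32.0],
])
# loss, loss_items = loss_fn(model_output, targets)   # model_output: list of per-level prediction maps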
Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
@register_loss(name=Losses.YOLOX_LOSS, deprecated_name="yolox_loss")
class YoloXDetectionLoss(_Loss):
    """
    Calculate YOLOX loss:
    L = L_objectness + L_iou + L_classification + 1[use_l1]*L_l1

    where:
        * L_iou, L_classification and L_l1 are calculated only between cells and targets that suit them;
        * L_objectness is calculated for all cells.

        L_classification:
            for cells that have suitable ground truths in their grid locations add BCEs
            to force a prediction of IoU with a GT in a multi-label way
            Coef: 1.
        L_iou:
            for cells that have suitable ground truths in their grid locations
            add (1 - IoU^2), IoU between a predicted box and each GT box, force maximum IoU
            Coef: 5.
        L_l1:
            for cells that have suitable ground truths in their grid locations
            l1 distance between the logits and GTs in “logits” format (the inverse of “logits to predictions” ops)
            Coef: 1[use_l1]
        L_objectness:
            for each cell add BCE with a label of 1 if there is GT assigned to the cell
            Coef: 1

    :param strides:                 List of Yolo levels output grid sizes (i.e [8, 16, 32]).
    :param num_classes:             Number of classes.
    :param use_l1:                  Controls the L_l1 Coef as discussed above (default=False).
    :param center_sampling_radius:  Sampling radius used for center sampling when creating the fg mask (default=2.5).
    :param iou_type:                IoU loss type, one of ["iou","giou"] (default="iou").
    :param iou_weight:              Weight to apply to the iou loss term.
    :param obj_weight:              Weight to apply to the obj loss term.
    :param cls_weight:              Weight to apply to the cls loss term.
    :param cls_pos_weight:          Class weights for the cls loss. Passed on to torch.nn.BCEWithLogitsLoss
    """

    def __init__(
        self,
        strides: List[int],
        num_classes: int,
        use_l1: bool = False,
        center_sampling_radius: float = 2.5,
        iou_type: str = "iou",
        iou_weight: float = 5.0,
        obj_weight: float = 1.0,
        cls_weight: float = 1.0,
        cls_pos_weight: Optional[torch.Tensor] = None,
    ):
        super().__init__()
        self.grids = [torch.zeros(1)] * len(strides)
        self.strides = strides
        self.num_classes = num_classes

        self.center_sampling_radius = center_sampling_radius
        self.use_l1 = use_l1
        self.l1_loss = nn.L1Loss(reduction="none")
        self.obj_bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none")
        self.cls_bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none", pos_weight=cls_pos_weight)
        self.iou_loss = IOUloss(reduction="none", loss_type=iou_type)

        self.iou_weight = 5.0 if iou_weight is None else iou_weight
        self.obj_weight = 1.0 if obj_weight is None else obj_weight
        self.cls_weight = 1.0 if cls_weight is None else cls_weight

    @property
    def component_names(self) -> List[str]:
        """
        Component names for logging during training.
        These correspond to 2nd item in the tuple returned in self.forward(...).
        See super_gradients.Trainer.train() docs for more info.
        """
        return ["iou", "obj", "cls", "l1", "num_fg", "Loss"]

    def forward(self, model_output: Union[list, Tuple[torch.Tensor, List]], targets: torch.Tensor):
        """
        :param model_output: Union[list, Tuple[torch.Tensor, List]]:
             When list-
              output from all Yolo levels, each of shape [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]
             And when tuple- the second item is the described list (first item is discarded)

        :param targets: torch.Tensor: Num_targets x (4 + 2)], values on dim 1 are: image id in a batch, class, box x y w h

        :return: loss, all losses separately in a detached tensor
        """
        if isinstance(model_output, tuple) and len(model_output) == 2:
            # in test/eval mode the Yolo model outputs a tuple where the second item is the raw predictions
            _, predictions = model_output
        else:
            predictions = model_output

        return self._compute_loss(predictions, targets)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """
        Creates a tensor of xy coordinates of size (1,1,nx,ny,2)

        :param nx: int: cells along x axis (default=20)
        :param ny: int: cells along the y axis (default=20)
        :return: torch.tensor of xy coordinates of size (1,1,nx,ny,2)
        """
        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)], indexing="ij")
        else:
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

    def _compute_loss(self, predictions: List[torch.Tensor], targets: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :param predictions:     output from all Yolo levels, each of shape
                                [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]
        :param targets:         [Num_targets x (4 + 2)], values on dim 1 are: image id in a batch, class, box x y w h

        :return:                loss, all losses separately in a detached tensor
        """
        x_shifts, y_shifts, expanded_strides, transformed_outputs, raw_outputs = self.prepare_predictions(predictions)

        bbox_preds = transformed_outputs[:, :, :4]  # [batch, n_anchors_all, 4]
        obj_preds = transformed_outputs[:, :, 4:5]  # [batch, n_anchors_all, 1]
        cls_preds = transformed_outputs[:, :, 5:]  # [batch, n_anchors_all, n_cls]

        # calculate targets
        total_num_anchors = transformed_outputs.shape[1]
        cls_targets = []
        reg_targets = []
        l1_targets = []
        obj_targets = []
        fg_masks = []

        num_fg, num_gts = 0.0, 0.0

        for image_idx in range(transformed_outputs.shape[0]):
            labels_im = targets[targets[:, 0] == image_idx]
            num_gt = labels_im.shape[0]
            num_gts += num_gt
            if num_gt == 0:
                cls_target = transformed_outputs.new_zeros((0, self.num_classes))
                reg_target = transformed_outputs.new_zeros((0, 4))
                l1_target = transformed_outputs.new_zeros((0, 4))
                obj_target = transformed_outputs.new_zeros((total_num_anchors, 1))
                fg_mask = transformed_outputs.new_zeros(total_num_anchors).bool()
            else:
                # GT boxes to image coordinates
                gt_bboxes_per_image = labels_im[:, 2:6].clone()
                gt_classes = labels_im[:, 1]
                bboxes_preds_per_image = bbox_preds[image_idx]

                try:
                    # assign cells to ground truths, at most one GT per cell
                    gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments(
                        image_idx,
                        num_gt,
                        total_num_anchors,
                        gt_bboxes_per_image,
                        gt_classes,
                        bboxes_preds_per_image,
                        expanded_strides,
                        x_shifts,
                        y_shifts,
                        cls_preds,
                        obj_preds,
                    )

                # TODO: CHECK IF ERROR IS CUDA OUT OF MEMORY
                except RuntimeError:
                    logging.error(
                        "OOM RuntimeError is raised due to the huge memory cost during label assignment. \
                                   CPU mode is applied in this batch. If you want to avoid this issue, \
                                   try to reduce the batch size or image size."
                    )
                    torch.cuda.empty_cache()
                    gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments(
                        image_idx,
                        num_gt,
                        total_num_anchors,
                        gt_bboxes_per_image,
                        gt_classes,
                        bboxes_preds_per_image,
                        expanded_strides,
                        x_shifts,
                        y_shifts,
                        cls_preds,
                        obj_preds,
                        "cpu",
                    )

                torch.cuda.empty_cache()
                num_fg += num_fg_img

                cls_target = F.one_hot(gt_matched_classes.to(torch.int64), self.num_classes) * pred_ious_this_matching.unsqueeze(-1)
                obj_target = fg_mask.unsqueeze(-1)
                reg_target = gt_bboxes_per_image[matched_gt_inds]
                if self.use_l1:
                    l1_target = self.get_l1_target(
                        transformed_outputs.new_zeros((num_fg_img, 4)),
                        gt_bboxes_per_image[matched_gt_inds],
                        expanded_strides[0][fg_mask],
                        x_shifts=x_shifts[0][fg_mask],
                        y_shifts=y_shifts[0][fg_mask],
                    )

            # collect targets for all loss terms over the whole batch
            cls_targets.append(cls_target)
            reg_targets.append(reg_target)
            obj_targets.append(obj_target.to(transformed_outputs.dtype))
            fg_masks.append(fg_mask)
            if self.use_l1:
                l1_targets.append(l1_target)

        # concat all targets over the batch (get rid of batch dim)
        cls_targets = torch.cat(cls_targets, 0)
        reg_targets = torch.cat(reg_targets, 0)
        obj_targets = torch.cat(obj_targets, 0)
        fg_masks = torch.cat(fg_masks, 0)
        if self.use_l1:
            l1_targets = torch.cat(l1_targets, 0)

        num_fg = max(num_fg, 1)
        # loss terms divided by the total number of foregrounds
        loss_iou = self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets).sum() / num_fg
        loss_obj = self.obj_bcewithlog_loss(obj_preds.view(-1, 1), obj_targets).sum() / num_fg
        loss_cls = self.cls_bcewithlog_loss(cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets).sum() / num_fg
        if self.use_l1:
            loss_l1 = self.l1_loss(raw_outputs.view(-1, 4)[fg_masks], l1_targets).sum() / num_fg
        else:
            loss_l1 = 0.0

        loss = self.iou_weight * loss_iou + self.obj_weight * loss_obj + self.cls_weight * loss_cls + loss_l1

        return (
            loss,
            torch.cat(
                (
                    loss_iou.unsqueeze(0),
                    loss_obj.unsqueeze(0),
                    loss_cls.unsqueeze(0),
                    torch.tensor(loss_l1).unsqueeze(0).to(loss.device),
                    torch.tensor(num_fg / max(num_gts, 1)).unsqueeze(0).to(loss.device),
                    loss.unsqueeze(0),
                )
            ).detach(),
        )

    def prepare_predictions(self, predictions: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Convert raw outputs of the network into a format that merges outputs from all levels
        :param predictions:     output from all Yolo levels, each of shape
                                [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]
        :return:    5 tensors representing predictions:
                        * x_shifts: shape [1 x * num_cells x 1],
                          where num_cells = grid1X * grid1Y + grid2X * grid2Y + grid3X * grid3Y,
                          x coordinate on the grid cell the prediction is coming from
                        * y_shifts: shape [1 x  num_cells x 1],
                          y coordinate on the grid cell the prediction is coming from
                        * expanded_strides: shape [1 x num_cells x 1],
                          stride of the output grid the prediction is coming from
                        * transformed_outputs: shape [batch_size x num_cells x (num_classes + 5)],
                          predictions with boxes in real coordinates and logprobabilities
                        * raw_outputs: shape [batch_size x num_cells x (num_classes + 5)],
                          raw predictions with boxes and confidences as logits

        """
        raw_outputs = []
        transformed_outputs = []
        x_shifts = []
        y_shifts = []
        expanded_strides = []
        for k, output in enumerate(predictions):
            batch_size, num_anchors, h, w, num_outputs = output.shape

            # IN FIRST PASS CREATE GRIDS ACCORDING TO OUTPUT SHAPE (BATCH,1,IMAGE_H/STRIDE,IMAGE_2/STRIDE,NUM_CLASSES+5)
            if self.grids[k].shape[2:4] != output.shape[2:4]:
                self.grids[k] = self._make_grid(w, h).type_as(output)

            # e.g. [batch_size, 1, 28, 28, 85] -> [batch_size, 784, 85]
            output_raveled = output.reshape(batch_size, num_anchors * h * w, num_outputs)
            # e.g [1, 784, 2]
            grid_raveled = self.grids[k].view(1, num_anchors * h * w, 2)
            if self.use_l1:
                # e.g [1, 784, 4]
                raw_outputs.append(output_raveled[:, :, :4].clone())

            # box logits to coordinates
            centers = (output_raveled[..., :2] + grid_raveled) * self.strides[k]
            wh = torch.exp(output_raveled[..., 2:4]) * self.strides[k]
            classes = output_raveled[..., 4:]
            output_raveled = torch.cat([centers, wh, classes], -1)

            # outputs with boxes in real coordinates, probs as logits
            transformed_outputs.append(output_raveled)
            # x cell coordinates of all 784 predictions, 0, 0, 0, ..., 1, 1, 1, ...
            x_shifts.append(grid_raveled[:, :, 0])
            # y cell coordinates of all 784 predictions, 0, 1, 2, ..., 0, 1, 2, ...
            y_shifts.append(grid_raveled[:, :, 1])
            # e.g. [1, 784, stride of this level (one of [8, 16, 32])]
            expanded_strides.append(torch.zeros(1, grid_raveled.shape[1]).fill_(self.strides[k]).type_as(output))

        # all 4 below have shapes of [batch_size , num_cells, num_values_pre_cell]
        # where num_anchors * num_cells is e.g. 1 * (28 * 28 + 14 * 14 + 17 * 17)
        transformed_outputs = torch.cat(transformed_outputs, 1)
        x_shifts = torch.cat(x_shifts, 1)
        y_shifts = torch.cat(y_shifts, 1)
        expanded_strides = torch.cat(expanded_strides, 1)
        if self.use_l1:
            raw_outputs = torch.cat(raw_outputs, 1)

        return x_shifts, y_shifts, expanded_strides, transformed_outputs, raw_outputs

    def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8):
        """
        :param l1_target:   tensor of zeros of shape [Num_cell_gt_pairs x 4]
        :param gt:          targets in coordinates [Num_cell_gt_pairs x (4 + 1 + num_classes)]

        :return:            targets in the format corresponding to logits
        """
        l1_target[:, 0] = gt[:, 0] / stride - x_shifts
        l1_target[:, 1] = gt[:, 1] / stride - y_shifts
        l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps)
        l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps)
        return l1_target

    @torch.no_grad()
    def get_assignments(
        self,
        image_idx,
        num_gt,
        total_num_anchors,
        gt_bboxes_per_image,
        gt_classes,
        bboxes_preds_per_image,
        expanded_strides,
        x_shifts,
        y_shifts,
        cls_preds,
        obj_preds,
        mode="gpu",
        ious_loss_cost_coeff=3.0,
        outside_boxes_and_center_cost_coeff=100000.0,
    ):
        """
        Match cells to ground truth:
            * at most 1 GT per cell
            * dynamic number of cells per GT

        :param outside_boxes_and_center_cost_coeff: float: Cost coefficient for cells outside the center radius and bbox of gts in dynamic
         matching (default=100000).
        :param ious_loss_cost_coeff: float: Cost coefficient for iou loss in dynamic matching (default=3).
        :param image_idx: int: Image index in batch.
        :param num_gt: int: Number of ground truth targets in the image.
        :param total_num_anchors: int: Total number of possible bboxes = sum of all grid cells.
        :param gt_bboxes_per_image: torch.Tensor: Tensor of gt bboxes for the image, shape: (num_gt, 4).
        :param gt_classes: torch.Tensor: Tensor of the gt classes in the image, shape: (num_gt,).
        :param bboxes_preds_per_image: torch.Tensor: Predicted bboxes in the image, shape: (num_preds, 4).
        :param expanded_strides: torch.Tensor: Stride of the output grid the prediction is coming from,
            shape (1 x num_cells x 1).
        :param x_shifts: torch.Tensor: X's in cell coordinates, shape (1,num_cells,1).
        :param y_shifts: torch.Tensor: Y's in cell coordinates, shape (1,num_cells,1).
        :param cls_preds: torch.Tensor: Class predictions in all cells, shape (batch_size, num_cells).
        :param obj_preds: torch.Tensor: Objectness predictions in all cells, shape (batch_size, num_cells).
        :param mode: str: One of ["gpu","cpu"], controls the device on which the assignment is computed (default="gpu")

        """
        if mode == "cpu":
            print("------------CPU Mode for This Batch-------------")
            gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
            bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
            gt_classes = gt_classes.cpu().float()
            expanded_strides = expanded_strides.cpu().float()
            x_shifts = x_shifts.cpu()
            y_shifts = y_shifts.cpu()

        # create a mask for foreground cells
        fg_mask, is_in_boxes_and_center = self.get_in_boxes_info(gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt)

        bboxes_preds_per_image = bboxes_preds_per_image[fg_mask]
        cls_preds_ = cls_preds[image_idx][fg_mask]
        obj_preds_ = obj_preds[image_idx][fg_mask]
        num_in_boxes_anchor = bboxes_preds_per_image.shape[0]

        if mode == "cpu":
            gt_bboxes_per_image = gt_bboxes_per_image.cpu()
            bboxes_preds_per_image = bboxes_preds_per_image.cpu()

        # calculate cost between all foregrounds and all ground truths (used only for matching)
        pair_wise_ious = calculate_bbox_iou_matrix(gt_bboxes_per_image, bboxes_preds_per_image, x1y1x2y2=False)
        gt_cls_per_image = F.one_hot(gt_classes.to(torch.int64), self.num_classes)
        gt_cls_per_image = gt_cls_per_image.float().unsqueeze(1).repeat(1, num_in_boxes_anchor, 1)
        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)

        if mode == "cpu":
            cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()

        with torch.cuda.amp.autocast(enabled=False):
            cls_preds_ = cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() * obj_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            pair_wise_cls_loss = F.binary_cross_entropy(cls_preds_.sqrt_(), gt_cls_per_image, reduction="none").sum(-1)
        del cls_preds_

        cost = pair_wise_cls_loss + ious_loss_cost_coeff * pair_wise_ious_loss + outside_boxes_and_center_cost_coeff * (~is_in_boxes_and_center)

        # further filter foregrounds: create pairs between cells and ground truth, based on cost and IoUs
        num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
        # discard tensors related to cost
        del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss

        if mode == "cpu":
            gt_matched_classes = gt_matched_classes.cuda()
            fg_mask = fg_mask.cuda()
            pred_ious_this_matching = pred_ious_this_matching.cuda()
            matched_gt_inds = matched_gt_inds.cuda()

        return gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg

    def get_in_boxes_info(self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt):
        """
        Create a mask for all cells, mask in only foreground: cells that have a center located:
            * within a GT box;
            OR
            * within a fixed radius around a GT box (center sampling);

        :param num_gt: int: Number of ground truth targets in the image.
        :param total_num_anchors: int: Sum of all grid cells.
        :param gt_bboxes_per_image: torch.Tensor: Tensor of gt bboxes for  the image, shape: (num_gt, 4).
        :param expanded_strides: torch.Tensor: Stride of the output grid the prediction is coming from,
            shape (1 x num_cells x 1).
        :param x_shifts: torch.Tensor: X's in cell coordinates, shape (1,num_cells,1).
        :param y_shifts: torch.Tensor: Y's in cell coordinates, shape (1,num_cells,1).

        :return is_in_boxes_anchor, is_in_boxes_and_center
            where:
             - is_in_boxes_anchor masks the cells that their cell center is  inside a gt bbox and within
                self.center_sampling_radius cells away, without reduction (i.e shape=(num_gts, num_fgs))
             - is_in_boxes_and_center masks the cells that their center is either inside a gt bbox or within
                self.center_sampling_radius cells away, shape (num_fgs)
        """

        expanded_strides_per_image = expanded_strides[0]

        # cell coordinates, shape [n_predictions] -> repeated to [n_gts, n_predictions]
        x_shifts_per_image = x_shifts[0] * expanded_strides_per_image
        y_shifts_per_image = y_shifts[0] * expanded_strides_per_image
        x_centers_per_image = (x_shifts_per_image + 0.5 * expanded_strides_per_image).unsqueeze(0).repeat(num_gt, 1)
        y_centers_per_image = (y_shifts_per_image + 0.5 * expanded_strides_per_image).unsqueeze(0).repeat(num_gt, 1)

        # FIND CELL CENTERS THAT ARE WITHIN GROUND TRUTH BOXES

        # ground truth boxes, shape [n_gts] -> repeated to [n_gts, n_predictions]
        # from (c1, c2, w, h) to left, right, top, bottom
        gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0] - 0.5 * gt_bboxes_per_image[:, 2]).unsqueeze(1).repeat(1, total_num_anchors)
        gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0] + 0.5 * gt_bboxes_per_image[:, 2]).unsqueeze(1).repeat(1, total_num_anchors)
        gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1] - 0.5 * gt_bboxes_per_image[:, 3]).unsqueeze(1).repeat(1, total_num_anchors)
        gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1] + 0.5 * gt_bboxes_per_image[:, 3]).unsqueeze(1).repeat(1, total_num_anchors)

        # check which cell centers lay within the ground truth boxes
        b_l = x_centers_per_image - gt_bboxes_per_image_l  # x - l > 0 when l is on the left of x
        b_r = gt_bboxes_per_image_r - x_centers_per_image
        b_t = y_centers_per_image - gt_bboxes_per_image_t
        b_b = gt_bboxes_per_image_b - y_centers_per_image
        bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)  # shape [n_gts, n_predictions]

        # to claim that a cell center is inside a gt box all 4 differences calculated above should be positive
        is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0  # shape [n_gts, n_predictions]
        is_in_boxes_all = is_in_boxes.sum(dim=0) > 0  # shape [n_predictions], whether a cell is inside at least one gt

        # FIND CELL CENTERS THAT ARE WITHIN +- self.center_sampling_radius CELLS FROM GROUND TRUTH BOXES CENTERS

        # define fake boxes: instead of ground truth boxes step +- self.center_sampling_radius from their centers
        gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) - self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)
        gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) + self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)
        gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) - self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)
        gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) + self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)

        c_l = x_centers_per_image - gt_bboxes_per_image_l
        c_r = gt_bboxes_per_image_r - x_centers_per_image
        c_t = y_centers_per_image - gt_bboxes_per_image_t
        c_b = gt_bboxes_per_image_b - y_centers_per_image
        center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
        is_in_centers = center_deltas.min(dim=-1).values > 0.0
        is_in_centers_all = is_in_centers.sum(dim=0) > 0

        # in boxes OR in centers
        is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all

        # in boxes AND in centers, preserving a shape [num_GTs x num_FGs]
        is_in_boxes_and_center = is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
        return is_in_boxes_anchor, is_in_boxes_and_center

    def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
        """
        :param cost:            pairwise cost, [num_GTs x num_FGs]
        :param pair_wise_ious:  pairwise IoUs, [num_GTs x num_FGs]
        :param gt_classes:      class of each GT
        :param num_gt:          number of GTs

        :return num_fg, (number of foregrounds)
                gt_matched_classes, (the classes that have been matched with fgs)
                pred_ious_this_matching
                matched_gt_inds
        """
        # create a matrix with shape [num_GTs x num_FGs]
        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)

        # for each GT get a dynamic k of foregrounds with a minimum cost: k = int(sum[top 10 IoUs])
        ious_in_boxes_matrix = pair_wise_ious
        n_candidate_k = min(10, ious_in_boxes_matrix.size(1))
        topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1)
        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
        dynamic_ks = dynamic_ks.tolist()
        for gt_idx in range(num_gt):
            try:
                _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx], largest=False)
            except Exception:
                logger.warning("cost[gt_idx]: " + str(cost[gt_idx]) + " dynamic_ks[gt_idx]L " + str(dynamic_ks[gt_idx]))
            matching_matrix[gt_idx][pos_idx] = 1

        del topk_ious, dynamic_ks, pos_idx

        # leave at most one GT per foreground, choose the one with the smallest cost
        anchor_matching_gt = matching_matrix.sum(0)
        if (anchor_matching_gt > 1).sum() > 0:
            _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
            matching_matrix[:, anchor_matching_gt > 1] *= 0
            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1

        fg_mask_inboxes = matching_matrix.sum(0) > 0
        num_fg = fg_mask_inboxes.sum().item()

        fg_mask[fg_mask.clone()] = fg_mask_inboxes

        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
        gt_matched_classes = gt_classes[matched_gt_inds]

        pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[fg_mask_inboxes]
        return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds

component_names: List[str] property

Component names for logging during training. These correspond to 2nd item in the tuple returned in self.forward(...). See super_gradients.Trainer.train() docs for more info.

dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)

Parameters:

Name Type Description Default
cost

pairwise cost, [num_GTs x num_FGs]

required
pair_wise_ious

pairwise IoUs, [num_GTs x num_FGs]

required
gt_classes

class of each GT

required
num_gt

number of GTs

required
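The "dynamic k" is simply the clamped sum of each ground truth's top-10 IoUs. A compact sketch of the selection rule on random cost/IoU matrices (sizes and values are illustrative, not the real inputs):

import torch

num_gt, num_fg = 3, 12
ious = torch.rand(num_gt, num_fg)          # pairwise IoUs, [num_GTs x num_FGs]
cost = torch.rand(num_gt, num_fg)          # pairwise matching cost, [num_GTs x num_FGs]

matching = torch.zeros_like(cost, dtype=torch.uint8)

# k per GT = int(sum of its top-10 IoUs), at least 1
n_candidates = min(10, num_fg)
topk_ious, _ = torch.topk(ious, n_candidates, dim=1)
dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)

# for each GT keep its k lowest-cost foreground cells
for gt_idx in range(num_gt):
    _, pos_idx = torch.topk(cost[gt_idx], k=int(dynamic_ks[gt_idx]), largest=False)
    matching[gt_idx, pos_idx] = 1

# a cell claimed by several GTs keeps only the cheapest one
claimed = matching.sum(0) > 1
if claimed.any():
    cheapest_gt = cost[:, claimed].argmin(0)
    matching[:, claimed] = 0
    matching[cheapest_gt, claimed] = 1

print(matching)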
Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
    """
    :param cost:            pairwise cost, [num_GTs x num_FGs]
    :param pair_wise_ious:  pairwise IoUs, [num_GTs x num_FGs]
    :param gt_classes:      class of each GT
    :param num_gt:          number of GTs

    :return num_fg, (number of foregrounds)
            gt_matched_classes, (the classes that have been matched with fgs)
            pred_ious_this_matching
            matched_gt_inds
    """
    # create a matrix with shape [num_GTs x num_FGs]
    matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)

    # for each GT get a dynamic k of foregrounds with a minimum cost: k = int(sum[top 10 IoUs])
    ious_in_boxes_matrix = pair_wise_ious
    n_candidate_k = min(10, ious_in_boxes_matrix.size(1))
    topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1)
    dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
    dynamic_ks = dynamic_ks.tolist()
    for gt_idx in range(num_gt):
        try:
            _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx], largest=False)
        except Exception:
            logger.warning("cost[gt_idx]: " + str(cost[gt_idx]) + " dynamic_ks[gt_idx]: " + str(dynamic_ks[gt_idx]))
        matching_matrix[gt_idx][pos_idx] = 1

    del topk_ious, dynamic_ks, pos_idx

    # leave at most one GT per foreground, choose the one with the smallest cost
    anchor_matching_gt = matching_matrix.sum(0)
    if (anchor_matching_gt > 1).sum() > 0:
        _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
        matching_matrix[:, anchor_matching_gt > 1] *= 0
        matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1

    fg_mask_inboxes = matching_matrix.sum(0) > 0
    num_fg = fg_mask_inboxes.sum().item()

    fg_mask[fg_mask.clone()] = fg_mask_inboxes

    matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
    gt_matched_classes = gt_classes[matched_gt_inds]

    pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[fg_mask_inboxes]
    return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds

forward(model_output, targets)

Parameters:

Name Type Description Default
model_output Union[list, Tuple[torch.Tensor, List]]

Union[list, Tuple[torch.Tensor, List]]: When a list: output from all Yolo levels, each of shape [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]. When a tuple: the second item is the described list (the first item is discarded).

required
targets torch.Tensor

torch.Tensor: [Num_targets x (4 + 2)], values on dim 1 are: image id in a batch, class, box x y w h

required

Returns:

Type Description

loss, all losses separately in a detached tensor
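The returned tuple can be unpacked as below; the breakdown order follows component_names. Here loss_fn, model_output and targets stand for the hypothetical objects from the construction sketch earlier on this page:

# loss_fn, model_output, targets: see the construction sketch above (hypothetical objects)
loss, loss_items = loss_fn(model_output, targets)
iou_term, obj_term, cls_term, l1_term, fg_ratio, total = loss_items   # order: ["iou", "obj", "cls", "l1", "num_fg", "Loss"]
loss.backward()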

Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
def forward(self, model_output: Union[list, Tuple[torch.Tensor, List]], targets: torch.Tensor):
    """
    :param model_output: Union[list, Tuple[torch.Tensor, List]]:
         When list-
          output from all Yolo levels, each of shape [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]
         And when tuple- the second item is the described list (first item is discarded)

    :param targets: torch.Tensor: Num_targets x (4 + 2)], values on dim 1 are: image id in a batch, class, box x y w h

    :return: loss, all losses separately in a detached tensor
    """
    if isinstance(model_output, tuple) and len(model_output) == 2:
        # in test/eval mode the Yolo model outputs a tuple where the second item is the raw predictions
        _, predictions = model_output
    else:
        predictions = model_output

    return self._compute_loss(predictions, targets)

get_assignments(image_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes, bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts, cls_preds, obj_preds, mode='gpu', ious_loss_cost_coeff=3.0, outside_boxes_and_center_cost_coeff=100000.0)

Match cells to ground truth:
  • at most 1 GT per cell
  • dynamic number of cells per GT

Parameters:

Name Type Description Default
outside_boxes_and_center_cost_coeff

float: Cost coefficient for cells outside the center radius and bbox of gts in dynamic matching (default=100000).

100000.0
ious_loss_cost_coeff

float: Cost coefficient for iou loss in dynamic matching (default=3).

3.0
image_idx

int: Image index in batch.

required
num_gt

int: Number of ground truth targets in the image.

required
total_num_anchors

int: Total number of possible bboxes = sum of all grid cells.

required
gt_bboxes_per_image

torch.Tensor: Tensor of gt bboxes for the image, shape: (num_gt, 4).

required
gt_classes

torch.Tensor: Tensor of the gt classes in the image, shape: (num_gt,).

required
bboxes_preds_per_image

torch.Tensor: Predicted bboxes in the image, shape: (num_preds, 4).

required
expanded_strides

torch.Tensor: Stride of the output grid the prediction is coming from, shape (1 x num_cells x 1).

required
x_shifts

torch.Tensor: X's in cell coordinates, shape (1,num_cells,1).

required
y_shifts

torch.Tensor: Y's in cell coordinates, shape (1,num_cells,1).

required
cls_preds

torch.Tensor: Class predictions in all cells, shape (batch_size, num_cells).

required
obj_preds

torch.Tensor: Objectness predictions in all cells, shape (batch_size, num_cells).

required
mode

str: One of ["gpu","cpu"], Controls the device the assignment operation should be taken place on (deafult="gpu")

'gpu'
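The matching cost combines a pairwise classification BCE, a scaled -log(IoU) term, and a very large penalty for cells outside both the box and the center radius. A toy reconstruction of that cost using the documented default coefficients (shapes and random values are illustrative):

import torch
import torch.nn.functional as F

num_gt, num_fg, num_classes = 2, 5, 3
pair_wise_ious = torch.rand(num_gt, num_fg).clamp(min=0.05)
cls_obj = torch.rand(num_gt, num_fg, num_classes)          # sigmoid(cls) * sigmoid(obj), already in [0, 1]
gt_cls = F.one_hot(torch.tensor([0, 2]), num_classes).float().unsqueeze(1).repeat(1, num_fg, 1)
in_boxes_and_center = torch.rand(num_gt, num_fg) > 0.5

pair_wise_cls_loss = F.binary_cross_entropy(cls_obj.sqrt(), gt_cls, reduction="none").sum(-1)
pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)

cost = pair_wise_cls_loss + 3.0 * pair_wise_ious_loss + 100000.0 * (~in_boxes_and_center)
print(cost.shape)   # torch.Size([2, 5])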
Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
@torch.no_grad()
def get_assignments(
    self,
    image_idx,
    num_gt,
    total_num_anchors,
    gt_bboxes_per_image,
    gt_classes,
    bboxes_preds_per_image,
    expanded_strides,
    x_shifts,
    y_shifts,
    cls_preds,
    obj_preds,
    mode="gpu",
    ious_loss_cost_coeff=3.0,
    outside_boxes_and_center_cost_coeff=100000.0,
):
    """
    Match cells to ground truth:
        * at most 1 GT per cell
        * dynamic number of cells per GT

    :param outside_boxes_and_center_cost_coeff: float: Cost coefficient for cells outside the center radius and bbox of gts in dynamic
     matching (default=100000).
    :param ious_loss_cost_coeff: float: Cost coefficient for iou loss in dynamic matching (default=3).
    :param image_idx: int: Image index in batch.
    :param num_gt: int: Number of ground truth targets in the image.
    :param total_num_anchors: int: Total number of possible bboxes = sum of all grid cells.
    :param gt_bboxes_per_image: torch.Tensor: Tensor of gt bboxes for the image, shape: (num_gt, 4).
    :param gt_classes: torch.Tensor: Tensor of the gt classes in the image, shape: (num_gt,).
    :param bboxes_preds_per_image: torch.Tensor: Predicted bboxes in the image, shape: (num_preds, 4).
    :param expanded_strides: torch.Tensor: Stride of the output grid the prediction is coming from,
        shape (1 x num_cells x 1).
    :param x_shifts: torch.Tensor: X's in cell coordinates, shape (1,num_cells,1).
    :param y_shifts: torch.Tensor: Y's in cell coordinates, shape (1,num_cells,1).
    :param cls_preds: torch.Tensor: Class predictions in all cells, shape (batch_size, num_cells).
    :param obj_preds: torch.Tensor: Objectness predictions in all cells, shape (batch_size, num_cells).
    :param mode: str: One of ["gpu","cpu"], controls the device on which the assignment is computed (default="gpu")

    """
    if mode == "cpu":
        print("------------CPU Mode for This Batch-------------")
        gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
        bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
        gt_classes = gt_classes.cpu().float()
        expanded_strides = expanded_strides.cpu().float()
        x_shifts = x_shifts.cpu()
        y_shifts = y_shifts.cpu()

    # create a mask for foreground cells
    fg_mask, is_in_boxes_and_center = self.get_in_boxes_info(gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt)

    bboxes_preds_per_image = bboxes_preds_per_image[fg_mask]
    cls_preds_ = cls_preds[image_idx][fg_mask]
    obj_preds_ = obj_preds[image_idx][fg_mask]
    num_in_boxes_anchor = bboxes_preds_per_image.shape[0]

    if mode == "cpu":
        gt_bboxes_per_image = gt_bboxes_per_image.cpu()
        bboxes_preds_per_image = bboxes_preds_per_image.cpu()

    # calculate cost between all foregrounds and all ground truths (used only for matching)
    pair_wise_ious = calculate_bbox_iou_matrix(gt_bboxes_per_image, bboxes_preds_per_image, x1y1x2y2=False)
    gt_cls_per_image = F.one_hot(gt_classes.to(torch.int64), self.num_classes)
    gt_cls_per_image = gt_cls_per_image.float().unsqueeze(1).repeat(1, num_in_boxes_anchor, 1)
    pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)

    if mode == "cpu":
        cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()

    with torch.cuda.amp.autocast(enabled=False):
        cls_preds_ = cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() * obj_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
        pair_wise_cls_loss = F.binary_cross_entropy(cls_preds_.sqrt_(), gt_cls_per_image, reduction="none").sum(-1)
    del cls_preds_

    cost = pair_wise_cls_loss + ious_loss_cost_coeff * pair_wise_ious_loss + outside_boxes_and_center_cost_coeff * (~is_in_boxes_and_center)

    # further filter foregrounds: create pairs between cells and ground truth, based on cost and IoUs
    num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
    # discard tensors related to cost
    del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss

    if mode == "cpu":
        gt_matched_classes = gt_matched_classes.cuda()
        fg_mask = fg_mask.cuda()
        pred_ious_this_matching = pred_ious_this_matching.cuda()
        matched_gt_inds = matched_gt_inds.cuda()

    return gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg

get_in_boxes_info(gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt)

Create a mask for all cells, mask in only foreground: cells that have a center located:
  • within a GT box;
  OR
  • within a fixed radius around a GT box (center sampling).

Parameters:

Name Type Description Default
num_gt

int: Number of ground truth targets in the image.

required
total_num_anchors

int: Sum of all grid cells.

required
gt_bboxes_per_image

torch.Tensor: Tensor of gt bboxes for the image, shape: (num_gt, 4).

required
expanded_strides

torch.Tensor: Stride of the output grid the prediction is coming from, shape (1 x num_cells x 1).

required
x_shifts

torch.Tensor: X's in cell coordinates, shape (1,num_cells,1).

required
y_shifts

torch.Tensor: Y's in cell coordinates, shape (1,num_cells,1).

required

Returns:

Type Description
  • is_in_boxes_anchor masks the cells whose center is either inside a gt bbox or within self.center_sampling_radius cells of a gt bbox center, reduced over GTs (shape (num_fgs,))
  • is_in_boxes_and_center masks, among those cells, the ones whose center is both inside a gt bbox and within self.center_sampling_radius cells of its center, without reduction (shape (num_gts, num_fgs))
Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
def get_in_boxes_info(self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt):
    """
    Create a mask for all cells, mask in only foreground: cells that have a center located:
        * within a GT box;
        OR
        * within a fixed radius around a GT box (center sampling);

    :param num_gt: int: Number of ground truth targets in the image.
    :param total_num_anchors: int: Sum of all grid cells.
    :param gt_bboxes_per_image: torch.Tensor: Tensor of gt bboxes for the image, shape: (num_gt, 4).
    :param expanded_strides: torch.Tensor: Stride of the output grid the prediction is coming from,
        shape (1 x num_cells x 1).
    :param x_shifts: torch.Tensor: X's in cell coordinates, shape (1,num_cells,1).
    :param y_shifts: torch.Tensor: Y's in cell coordinates, shape (1,num_cells,1).

    :return is_in_boxes_anchor, is_in_boxes_and_center
        where:
         - is_in_boxes_anchor masks the cells whose center is either inside a gt bbox or within
            self.center_sampling_radius cells of a gt bbox center, reduced over GTs (i.e. shape=(num_fgs,))
         - is_in_boxes_and_center masks, among those cells, the ones whose center is both inside a gt bbox
            and within self.center_sampling_radius cells of its center, without reduction (i.e. shape=(num_gts, num_fgs))
    """

    expanded_strides_per_image = expanded_strides[0]

    # cell coordinates, shape [n_predictions] -> repeated to [n_gts, n_predictions]
    x_shifts_per_image = x_shifts[0] * expanded_strides_per_image
    y_shifts_per_image = y_shifts[0] * expanded_strides_per_image
    x_centers_per_image = (x_shifts_per_image + 0.5 * expanded_strides_per_image).unsqueeze(0).repeat(num_gt, 1)
    y_centers_per_image = (y_shifts_per_image + 0.5 * expanded_strides_per_image).unsqueeze(0).repeat(num_gt, 1)

    # FIND CELL CENTERS THAT ARE WITHIN GROUND TRUTH BOXES

    # ground truth boxes, shape [n_gts] -> repeated to [n_gts, n_predictions]
    # from (c1, c2, w, h) to left, right, top, bottom
    gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0] - 0.5 * gt_bboxes_per_image[:, 2]).unsqueeze(1).repeat(1, total_num_anchors)
    gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0] + 0.5 * gt_bboxes_per_image[:, 2]).unsqueeze(1).repeat(1, total_num_anchors)
    gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1] - 0.5 * gt_bboxes_per_image[:, 3]).unsqueeze(1).repeat(1, total_num_anchors)
    gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1] + 0.5 * gt_bboxes_per_image[:, 3]).unsqueeze(1).repeat(1, total_num_anchors)

    # check which cell centers lay within the ground truth boxes
    b_l = x_centers_per_image - gt_bboxes_per_image_l  # x - l > 0 when l is to the left of x
    b_r = gt_bboxes_per_image_r - x_centers_per_image
    b_t = y_centers_per_image - gt_bboxes_per_image_t
    b_b = gt_bboxes_per_image_b - y_centers_per_image
    bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)  # shape [n_gts, n_predictions, 4]

    # to claim that a cell center is inside a gt box all 4 differences calculated above should be positive
    is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0  # shape [n_gts, n_predictions]
    is_in_boxes_all = is_in_boxes.sum(dim=0) > 0  # shape [n_predictions], whether a cell is inside at least one gt

    # FIND CELL CENTERS THAT ARE WITHIN +- self.center_sampling_radius CELLS FROM GROUND TRUTH BOXES CENTERS

    # define fake boxes: instead of ground truth boxes step +- self.center_sampling_radius from their centers
    gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
        1, total_num_anchors
    ) - self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)
    gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
        1, total_num_anchors
    ) + self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)
    gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
        1, total_num_anchors
    ) - self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)
    gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
        1, total_num_anchors
    ) + self.center_sampling_radius * expanded_strides_per_image.unsqueeze(0)

    c_l = x_centers_per_image - gt_bboxes_per_image_l
    c_r = gt_bboxes_per_image_r - x_centers_per_image
    c_t = y_centers_per_image - gt_bboxes_per_image_t
    c_b = gt_bboxes_per_image_b - y_centers_per_image
    center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
    is_in_centers = center_deltas.min(dim=-1).values > 0.0
    is_in_centers_all = is_in_centers.sum(dim=0) > 0

    # in boxes OR in centers
    is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all

    # in boxes AND in centers, preserving a shape [num_GTs x num_FGs]
    is_in_boxes_and_center = is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
    return is_in_boxes_anchor, is_in_boxes_and_center
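
To make the two masks concrete, here is a small sketch of the same geometry on toy data (two GT boxes in (cx, cy, w, h) format, three cell centers, a single stride level; all names and numbers are illustrative only, not library API):

import torch

gt_boxes = torch.tensor([[50.0, 50.0, 40.0, 40.0],
                         [200.0, 200.0, 20.0, 20.0]])       # (cx, cy, w, h)
cell_centers = torch.tensor([[48.0, 52.0], [120.0, 120.0], [204.0, 198.0]])
stride = torch.tensor([8.0, 8.0, 8.0])
center_sampling_radius = 2.5

cx, cy = gt_boxes[:, 0:1], gt_boxes[:, 1:2]
half_w, half_h = gt_boxes[:, 2:3] / 2, gt_boxes[:, 3:4] / 2
x, y = cell_centers[:, 0], cell_centers[:, 1]

# shape [num_gt, num_cells]: is the cell center inside the GT box?
is_in_boxes = (x > cx - half_w) & (x < cx + half_w) & (y > cy - half_h) & (y < cy + half_h)

# shape [num_gt, num_cells]: is it within +- radius * stride of the GT center?
r = center_sampling_radius * stride
is_in_centers = (x > cx - r) & (x < cx + r) & (y > cy - r) & (y < cy + r)

is_in_boxes_anchor = (is_in_boxes | is_in_centers).any(dim=0)              # [num_cells]
is_in_boxes_and_center = (is_in_boxes & is_in_centers)[:, is_in_boxes_anchor]
print(is_in_boxes_anchor, is_in_boxes_and_center.shape)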

get_l1_target(l1_target, gt, stride, x_shifts, y_shifts, eps=1e-08)

Parameters:

Name Type Description Default
l1_target

tensor of zeros of shape [Num_cell_gt_pairs x 4]

required
gt

targets in coordinates [Num_cell_gt_pairs x (4 + 1 + num_classes)]

required

Returns:

Type Description

targets in the format corresponding to logits

Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8):
    """
    :param l1_target:   tensor of zeros of shape [Num_cell_gt_pairs x 4]
    :param gt:          targets in coordinates [Num_cell_gt_pairs x (4 + 1 + num_classes)]

    :return:            targets in the format corresponding to logits
    """
    l1_target[:, 0] = gt[:, 0] / stride - x_shifts
    l1_target[:, 1] = gt[:, 1] / stride - y_shifts
    l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps)
    l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps)
    return l1_target
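
Although the docstring only documents l1_target and gt, the transform is simply the inverse of the box decoding used in prepare_predictions. A minimal sketch on made-up values (one GT box matched to one cell, stride 8):

import torch

stride = 8.0
x_shift, y_shift = 12.0, 7.0                      # cell coordinates of the matched cell
gt = torch.tensor([[100.0, 60.0, 32.0, 48.0]])    # cx, cy, w, h in pixels

l1_target = torch.zeros((1, 4))
l1_target[:, 0] = gt[:, 0] / stride - x_shift          # cell-relative cx offset
l1_target[:, 1] = gt[:, 1] / stride - y_shift          # cell-relative cy offset
l1_target[:, 2] = torch.log(gt[:, 2] / stride + 1e-8)  # log-space width
l1_target[:, 3] = torch.log(gt[:, 3] / stride + 1e-8)  # log-space height

# Decoding with the same ops used in prepare_predictions recovers the box:
cx = (l1_target[:, 0] + x_shift) * stride
wh = torch.exp(l1_target[:, 2:4]) * stride
print(cx, wh)  # ~100.0 and ~[32.0, 48.0]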

prepare_predictions(predictions)

Convert raw outputs of the network into a format that merges outputs from all levels

Parameters:

Name Type Description Default
predictions List[torch.Tensor]

output from all Yolo levels, each of shape [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]

required

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]

5 tensors representing predictions:
  • x_shifts: shape [1 x num_cells x 1], where num_cells = grid1X * grid1Y + grid2X * grid2Y + grid3X * grid3Y, x coordinate on the grid cell the prediction is coming from
  • y_shifts: shape [1 x num_cells x 1], y coordinate on the grid cell the prediction is coming from
  • expanded_strides: shape [1 x num_cells x 1], stride of the output grid the prediction is coming from
  • transformed_outputs: shape [batch_size x num_cells x (num_classes + 5)], predictions with boxes in real coordinates and log probabilities
  • raw_outputs: shape [batch_size x num_cells x (num_classes + 5)], raw predictions with boxes and confidences as logits

Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
def prepare_predictions(self, predictions: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Convert raw outputs of the network into a format that merges outputs from all levels
    :param predictions:     output from all Yolo levels, each of shape
                            [Batch x 1 x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]
    :return:    5 tensors representing predictions:
                    * x_shifts: shape [1 x num_cells x 1],
                      where num_cells = grid1X * grid1Y + grid2X * grid2Y + grid3X * grid3Y,
                      x coordinate on the grid cell the prediction is coming from
                    * y_shifts: shape [1 x num_cells x 1],
                      y coordinate on the grid cell the prediction is coming from
                    * expanded_strides: shape [1 x num_cells x 1],
                      stride of the output grid the prediction is coming from
                    * transformed_outputs: shape [batch_size x num_cells x (num_classes + 5)],
                      predictions with boxes in real coordinates and log probabilities
                    * raw_outputs: shape [batch_size x num_cells x (num_classes + 5)],
                      raw predictions with boxes and confidences as logits

    """
    raw_outputs = []
    transformed_outputs = []
    x_shifts = []
    y_shifts = []
    expanded_strides = []
    for k, output in enumerate(predictions):
        batch_size, num_anchors, h, w, num_outputs = output.shape

        # IN FIRST PASS CREATE GRIDS ACCORDING TO OUTPUT SHAPE (BATCH,1,IMAGE_H/STRIDE,IMAGE_W/STRIDE,NUM_CLASSES+5)
        if self.grids[k].shape[2:4] != output.shape[2:4]:
            self.grids[k] = self._make_grid(w, h).type_as(output)

        # e.g. [batch_size, 1, 28, 28, 85] -> [batch_size, 784, 85]
        output_raveled = output.reshape(batch_size, num_anchors * h * w, num_outputs)
        # e.g [1, 784, 2]
        grid_raveled = self.grids[k].view(1, num_anchors * h * w, 2)
        if self.use_l1:
            # e.g [1, 784, 4]
            raw_outputs.append(output_raveled[:, :, :4].clone())

        # box logits to coordinates
        centers = (output_raveled[..., :2] + grid_raveled) * self.strides[k]
        wh = torch.exp(output_raveled[..., 2:4]) * self.strides[k]
        classes = output_raveled[..., 4:]
        output_raveled = torch.cat([centers, wh, classes], -1)

        # outputs with boxes in real coordinates, probs as logits
        transformed_outputs.append(output_raveled)
        # x cell coordinates of all 784 predictions, 0, 0, 0, ..., 1, 1, 1, ...
        x_shifts.append(grid_raveled[:, :, 0])
        # y cell coordinates of all 784 predictions, 0, 1, 2, ..., 0, 1, 2, ...
        y_shifts.append(grid_raveled[:, :, 1])
        # e.g. shape [1, 784], filled with the stride of this level (one of [8, 16, 32])
        expanded_strides.append(torch.zeros(1, grid_raveled.shape[1]).fill_(self.strides[k]).type_as(output))

    # all 4 below have shapes of [batch_size , num_cells, num_values_pre_cell]
    # where num_anchors * num_cells is e.g. 1 * (28 * 28 + 14 * 14 + 17 * 17)
    transformed_outputs = torch.cat(transformed_outputs, 1)
    x_shifts = torch.cat(x_shifts, 1)
    y_shifts = torch.cat(y_shifts, 1)
    expanded_strides = torch.cat(expanded_strides, 1)
    if self.use_l1:
        raw_outputs = torch.cat(raw_outputs, 1)

    return x_shifts, y_shifts, expanded_strides, transformed_outputs, raw_outputs
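
As a rough, self-contained sketch of what the loop above does to a single level, assuming a toy output of shape [batch, 1, H, W, num_classes + 5] and a stride of 32 (the grid construction stands in for the _make_grid helper, whose exact ordering is not shown here):

import torch

batch, h, w, num_classes, stride = 2, 4, 4, 3, 32
output = torch.randn(batch, 1, h, w, num_classes + 5)

# (x, y) cell coordinates for every prediction of this level, shape [1, h*w, 2]
xs = torch.arange(w).repeat(h)
ys = torch.arange(h).repeat_interleave(w)
grid_raveled = torch.stack((xs, ys), dim=1).view(1, h * w, 2).float()

# flatten the level: [batch, 1, h, w, c] -> [batch, h*w, c]
output_raveled = output.reshape(batch, h * w, num_classes + 5)

# box logits -> real coordinates, the same ops as in the loop above
centers = (output_raveled[..., :2] + grid_raveled) * stride
wh = torch.exp(output_raveled[..., 2:4]) * stride
transformed_outputs = torch.cat([centers, wh, output_raveled[..., 4:]], dim=-1)

x_shifts, y_shifts = grid_raveled[:, :, 0], grid_raveled[:, :, 1]
expanded_strides = torch.full((1, h * w), float(stride))
print(transformed_outputs.shape, x_shifts.shape, expanded_strides.shape)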

YoloXFastDetectionLoss

Bases: YoloXDetectionLoss

A completely new implementation of YOLOX loss. This is NOT an equivalent implementation to the regular yolox loss.

  • Completely avoids using loops compared to the nested loops in the original implementation. As a result it runs much faster (the speedup depends on the type of GPUs, their count, the batch size, etc.).
  • The tensor format is very different from the original implementation: tensors contain image ids, ground truth ids and anchor ids as values to support variable-length data.
  • There are differences in terms of the algorithm itself (see the dynamic-k sketch after this list):
    1. When computing a dynamic k for a ground truth, the original implementation considers the sum of the top 10 predictions sorted by IoU among the initial foregrounds of any ground truth in the image, while this implementation considers only the initial foregrounds of that particular ground truth. To compensate for that difference, the dynamic_ks_bias hyperparameter makes the dynamic ks larger.
    2. When computing the k matched detections for a ground truth, the original implementation considers the initial foregrounds of any ground truth in the image as candidates, while this implementation considers only the initial foregrounds of that particular ground truth as candidates. We believe that this difference is minor.
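
A hedged sketch of the dynamic-k idea referenced above, for a single ground truth (the values are made up; the real implementation works on flattened candidate tensors via _compute_dynamic_ks below):

import torch

# Candidate IoUs between one GT and its initial foreground cells.
candidate_ious = torch.tensor([0.71, 0.65, 0.60, 0.44, 0.31, 0.12, 0.08])
dynamic_ks_bias = 1.1

# Sum the top-10 IoUs, scale by dynamic_ks_bias, clamp to at least 1.
topk = candidate_ious.topk(min(10, candidate_ious.numel())).values
dynamic_k = int((topk.sum() * dynamic_ks_bias).long().clamp(min=1))
print(dynamic_k)  # 3 -> this GT is matched to its 3 lowest-cost candidates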

Parameters:

Name Type Description Default
dynamic_ks_bias

hyperparameter to compensate for the discrepancies between the regular loss and this loss.

1.1
sync_num_fgs

Sync the number of foregrounds across processes. Can be used for DDP training.

False
obj_loss_fix

Divide by the total number of anchors instead of the number of matching foregrounds. Can be used for the objectness loss.

False
Source code in V3_6/src/super_gradients/training/losses/yolox_loss.py
@register_loss(name=Losses.YOLOX_FAST_LOSS, deprecated_name="yolox_fast_loss")
class YoloXFastDetectionLoss(YoloXDetectionLoss):
    """
    A completely new implementation of YOLOX loss.
    This is NOT an equivalent implementation to the regular yolox loss.

    * Completely avoids using loops compared to the nested loops in the original implementation.
        As a result runs much faster (speedup depends on the type of GPUs, their count, the batch size, etc.).
    * The tensor format is very different from the original implementation.
        Tensors contain image ids, ground truth ids and anchor ids as values to support variable length data.
    * There are differences in terms of the algorithm itself:
    1. When computing a dynamic k for a ground truth,
        in the original implementation they consider the sum of top 10 predictions sorted by ious among the initial
        foregrounds of any ground truth in the image,
        while in our implementation we consider only the initial foreground of that particular ground truth.
        To compensate for that difference we introduce the dynamic_ks_bias hyperparameter which makes the dynamic ks larger.
    2. When computing the k matched detections for a ground truth,
        in the original implementation they consider the initial foregrounds of any ground truth in the image as candidates,
        while in our implementation we consider only the initial foreground of that particular ground truth as candidates.
        We believe that this difference is minor.

    :param dynamic_ks_bias: hyperparameter to compensate for the discrepancies between the regular loss and this loss.
    :param sync_num_fgs:    sync num of fgs.
                            Can be used for DDP training.
    :param obj_loss_fix:    divide by the total number of anchors instead of the number of matching fgs.
                            Can be used for objectness loss.
    """

    def __init__(
        self,
        strides,
        num_classes,
        use_l1=False,
        center_sampling_radius=2.5,
        iou_type="iou",
        iou_weight: float = 5.0,
        obj_weight: float = 1.0,
        cls_weight: float = 1.0,
        cls_pos_weight: Optional[torch.Tensor] = None,
        dynamic_ks_bias=1.1,
        sync_num_fgs=False,
        obj_loss_fix=False,
    ):
        super().__init__(
            strides=strides,
            num_classes=num_classes,
            use_l1=use_l1,
            center_sampling_radius=center_sampling_radius,
            iou_type=iou_type,
            iou_weight=iou_weight,
            obj_weight=obj_weight,
            cls_weight=cls_weight,
            cls_pos_weight=cls_pos_weight,
        )

        self.dynamic_ks_bias = dynamic_ks_bias
        self.sync_num_fgs = sync_num_fgs
        self.obj_loss_fix = obj_loss_fix

    def _compute_loss(self, predictions: List[torch.Tensor], targets: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        L = L_objectness + L_iou + L_classification + 1[no_aug_epoch]*L_l1
        where:
            * L_iou, L_classification and L_l1 are calculated only between cells and targets that suit them;
            * L_objectness is calculated for all cells.

        L_classification:
            for cells that have suitable ground truths in their grid locations add BCEs
            to force a prediction of IoU with a GT in a multi-label way
            Coef: self.cls_weight (default: 1).
        L_iou:
            for cells that have suitable ground truths in their grid locations
            add (1 - IoU^2), IoU between a predicted box and each GT box, force maximum IoU
            Coef: self.iou_weight (default: 5).
        L_l1:
            for cells that have suitable ground truths in their grid locations
            l1 distance between the logits and GTs in “logits” format (the inverse of “logits to predictions” ops)
            Coef: 1[no_aug_epoch]
        L_objectness:
            for each cell add BCE with a label of 1 if there is GT assigned to the cell
            Coef: self.obj_weight (default: 1)

        :param predictions:     output from all Yolo levels, each of shape
                                [Batch x Num_Anchors x GridSizeY x GridSizeX x (4 + 1 + Num_classes)]
        :param targets:         [Num_targets x (4 + 2)], values on dim 1 are: image id in a batch, class, box x y w h

        :return:                loss, all losses separately in a detached tensor
        """
        x_shifts, y_shifts, expanded_strides, transformed_outputs, raw_outputs = self.prepare_predictions(predictions)

        bbox_preds = transformed_outputs[:, :, :4]  # [batch, n_anchors_all, 4]
        obj_preds = transformed_outputs[:, :, 4:5]  # [batch, n_anchors_all, 1]
        cls_preds = transformed_outputs[:, :, 5:]  # [batch, n_anchors_all, n_cls]

        # assign cells to ground truths, at most one GT per cell
        matched_fg_ids, matched_gt_classes, matched_gt_ids, matched_img_ids, matched_ious, flattened_gts = self._compute_matching(
            bbox_preds, cls_preds, obj_preds, expanded_strides, x_shifts, y_shifts, targets
        )

        num_gts = flattened_gts.shape[0]
        num_gts_clamped = max(flattened_gts.shape[0], 1)
        num_fg = max(matched_gt_ids.shape[0], 1)
        total_num_anchors = max(transformed_outputs.shape[0] * transformed_outputs.shape[1], 1)

        cls_targets = F.one_hot(matched_gt_classes.to(torch.int64), self.num_classes) * matched_ious.unsqueeze(dim=1)
        obj_targets = transformed_outputs.new_zeros((transformed_outputs.shape[0], transformed_outputs.shape[1]))
        obj_targets[matched_img_ids, matched_fg_ids] = 1
        reg_targets = flattened_gts[matched_gt_ids][:, 1:]
        if self.use_l1 and num_gts > 0:
            l1_targets = self.get_l1_target(
                transformed_outputs.new_zeros((num_fg, 4)),
                flattened_gts[matched_gt_ids][:, 1:],
                expanded_strides.squeeze()[matched_fg_ids],
                x_shifts=x_shifts.squeeze()[matched_fg_ids],
                y_shifts=y_shifts.squeeze()[matched_fg_ids],
            )
        if self.sync_num_fgs and dist.group.WORLD is not None:
            num_fg = torch.scalar_tensor(num_fg).to(matched_gt_ids.device)
            dist.all_reduce(num_fg, op=torch._C._distributed_c10d.ReduceOp.AVG)

        loss_iou = self.iou_loss(bbox_preds[matched_img_ids, matched_fg_ids], reg_targets).sum() / num_fg
        loss_obj = self.obj_bcewithlog_loss(obj_preds.squeeze(-1), obj_targets).sum() / (total_num_anchors if self.obj_loss_fix else num_fg)
        loss_cls = self.cls_bcewithlog_loss(cls_preds[matched_img_ids, matched_fg_ids], cls_targets).sum() / num_fg

        if self.use_l1 and num_gts > 0:
            loss_l1 = self.l1_loss(raw_outputs[matched_img_ids, matched_fg_ids], l1_targets).sum() / num_fg
        else:
            loss_l1 = 0.0

        loss = self.iou_weight * loss_iou + self.obj_weight * loss_obj + self.cls_weight * loss_cls + loss_l1

        return (
            loss,
            torch.cat(
                (
                    loss_iou.unsqueeze(0),
                    loss_obj.unsqueeze(0),
                    loss_cls.unsqueeze(0),
                    torch.tensor(loss_l1).unsqueeze(0).to(transformed_outputs.device),
                    torch.tensor(num_fg / num_gts_clamped).unsqueeze(0).to(transformed_outputs.device),
                    loss.unsqueeze(0),
                )
            ).detach(),
        )

    def _get_initial_matching(
        self, gt_bboxes: torch.Tensor, expanded_strides: torch.Tensor, x_shifts: torch.Tensor, y_shifts: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Get candidates using a mask for all cells.
        Mask in only foreground cells that have a center located:
            * withing a GT box (param: is_in_boxes);
            OR
            * within a fixed radius around a GT box (center sampling) (param: is_in_centers);

        return:
            initial_matching: get a list of candidates pairs of (gt box id, anchor box id) based on cell = is_in_boxes | is_in_centers.
                              shape: [num_candidates, 2]
            strong candidate mask: get a list whether a candidate is a strong one or not.
                                   strong candidate is a cell from is_in_boxes & is_in_centers.
                                   shape: [num_candidates].
        """
        cell_x_centers = (x_shifts + 0.5) * expanded_strides
        cell_y_centers = (y_shifts + 0.5) * expanded_strides

        gt_bboxes_x_centers = gt_bboxes[:, 0].unsqueeze(1)
        gt_bboxes_y_centers = gt_bboxes[:, 1].unsqueeze(1)

        gt_bboxes_half_w = (0.5 * gt_bboxes[:, 2]).unsqueeze(1)
        gt_bboxes_half_h = (0.5 * gt_bboxes[:, 3]).unsqueeze(1)

        is_in_boxes = (
            (cell_x_centers > gt_bboxes_x_centers - gt_bboxes_half_w)
            & (gt_bboxes_x_centers + gt_bboxes_half_w > cell_x_centers)
            & (cell_y_centers > gt_bboxes_y_centers - gt_bboxes_half_h)
            & (gt_bboxes_y_centers + gt_bboxes_half_h > cell_y_centers)
        )

        radius_shifts = 2.5 * expanded_strides

        is_in_centers = (
            (cell_x_centers + radius_shifts > gt_bboxes_x_centers)
            & (gt_bboxes_x_centers > cell_x_centers - radius_shifts)
            & (cell_y_centers + radius_shifts > gt_bboxes_y_centers)
            & (gt_bboxes_y_centers > cell_y_centers - radius_shifts)
        )

        initial_mask = is_in_boxes | is_in_centers
        initial_matching = initial_mask.nonzero()
        strong_candidate_mask = (is_in_boxes & is_in_centers)[initial_mask]

        return initial_matching[:, 0], initial_matching[:, 1], strong_candidate_mask

    @torch.no_grad()
    def _compute_matching(
        self,
        bbox_preds: torch.Tensor,
        cls_preds: torch.Tensor,
        obj_preds: torch.Tensor,
        expanded_strides: torch.Tensor,
        x_shifts: torch.Tensor,
        y_shifts: torch.Tensor,
        labels: torch.Tensor,
        ious_loss_cost_coeff: float = 3.0,
        outside_boxes_and_center_cost_coeff: float = 100000.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Match cells to ground truth:
            * at most 1 GT per cell
            * dynamic number of cells per GT

        :param bbox_preds: predictions of bounding boxes. shape [batch, n_anchors_all, 4]
        :param cls_preds:  predictions of class.          shape [batch, n_anchors_all, n_cls]
        :param obj_preds:  predictions for objectness.    shape [batch, n_anchors_all, 1]
        :param expanded_strides:  stride of the output grid the prediction is coming from. shape [1, n_anchors_all]
        :param x_shifts: x coordinate on the grid cell the prediction is coming from.      shape [1, n_anchors_all]
        :param y_shifts: y coordinate on the grid cell the prediction is coming from.      shape [1, n_anchors_all]
        :param labels:   ground truth targets, shape [num_targets, (4 + 2)]; values on dim 1 are: image id in a batch, class, box x y w h
        :return: candidate_fg_ids       shape [num_fg]
                 candidate_gt_classes   shape [num_fg]
                 candidate_gt_ids       shape [num_fg]
                 candidate_img_ids      shape [num_fg]
                 candidate_ious         shape [num_fg]
                 flattened_gts          shape [num_gts, 5]
        """

        flattened_gts, gt_id_to_img_id = labels[:, 1:], labels[:, 0].type(torch.int64)

        # COMPUTE CANDIDATES
        candidate_gt_ids, candidate_fg_ids, strong_candidate_mask = self._get_initial_matching(flattened_gts[:, 1:], expanded_strides, x_shifts, y_shifts)
        candidate_img_ids = gt_id_to_img_id[candidate_gt_ids]
        candidate_gts_bbox = flattened_gts[candidate_gt_ids, 1:]
        candidate_det_bbox = bbox_preds[candidate_img_ids, candidate_fg_ids]

        # COMPUTE DYNAMIC KS
        candidate_ious = self._calculate_pairwise_bbox_iou(candidate_gts_bbox, candidate_det_bbox, xyxy=False)
        dynamic_ks, matching_index_to_dynamic_k_index = self._compute_dynamic_ks(candidate_gt_ids, candidate_ious, self.dynamic_ks_bias)
        del candidate_gts_bbox, candidate_det_bbox

        # ORDER CANDIDATES BY COST
        candidate_gt_classes = flattened_gts[candidate_gt_ids, 0]
        cost_order = self._compute_cost_order(
            self.num_classes,
            candidate_img_ids,
            candidate_gt_classes,
            candidate_fg_ids,
            candidate_ious,
            cls_preds,
            obj_preds,
            strong_candidate_mask,
            ious_loss_cost_coeff,
            outside_boxes_and_center_cost_coeff,
        )

        candidate_gt_ids = candidate_gt_ids[cost_order]
        candidate_gt_classes = candidate_gt_classes[cost_order]
        candidate_img_ids = candidate_img_ids[cost_order]
        candidate_fg_ids = candidate_fg_ids[cost_order]
        candidate_ious = candidate_ious[cost_order]
        matching_index_to_dynamic_k_index = matching_index_to_dynamic_k_index[cost_order]
        del cost_order

        # FILTER MATCHING TO LOWEST K COST MATCHES PER GT
        ranks = self._compute_ranks(candidate_gt_ids)
        corresponding_dynamic_ks = dynamic_ks[matching_index_to_dynamic_k_index]
        topk_mask = ranks < corresponding_dynamic_ks

        candidate_gt_ids = candidate_gt_ids[topk_mask]
        candidate_gt_classes = candidate_gt_classes[topk_mask]
        candidate_img_ids = candidate_img_ids[topk_mask]
        candidate_fg_ids = candidate_fg_ids[topk_mask]
        candidate_ious = candidate_ious[topk_mask]
        del ranks, topk_mask, dynamic_ks, matching_index_to_dynamic_k_index, corresponding_dynamic_ks

        # FILTER MATCHING TO AT MOST 1 MATCH FOR DET BY TAKING THE LOWEST COST MATCH
        candidate_img_and_fg_ids_combined = self._combine_candidates_img_id_fg_id(candidate_img_ids, candidate_fg_ids)
        top1_mask = self._compute_is_first_mask(candidate_img_and_fg_ids_combined)
        candidate_gt_ids = candidate_gt_ids[top1_mask]
        candidate_gt_classes = candidate_gt_classes[top1_mask]
        candidate_fg_ids = candidate_fg_ids[top1_mask]
        candidate_img_ids = candidate_img_ids[top1_mask]
        candidate_ious = candidate_ious[top1_mask]

        return candidate_fg_ids, candidate_gt_classes, candidate_gt_ids, candidate_img_ids, candidate_ious, flattened_gts

    def _combine_candidates_img_id_fg_id(self, candidate_img_ids, candidate_anchor_ids):
        """
        Create one dim tensor with unique pairs of img_id and fg_id.
        e.g: candidate_img_ids = [0,1,0,0]
             candidate_fg_ids = [0,0,0,1]
             result = [0,1,0,2]
        """
        candidate_img_and_fg_ids_combined = torch.stack((candidate_img_ids, candidate_anchor_ids), dim=1).unique(dim=0, return_inverse=True)[1]
        return candidate_img_and_fg_ids_combined

    def _compute_dynamic_ks(self, ids: torch.Tensor, ious: torch.Tensor, dynamic_ks_bias) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :param ids:                 ids of GTs, shape: [num_candidates]
        :param ious:                pairwise IoUs, shape: [num_candidates]
        :param dynamic_ks_bias:     multiply the resulting k to compensate for the discrepancy with the regular loss
        :return:                    dynamic k per unique GT id, and a mapping from each candidate to its dynamic-k index
        """
        assert len(ids.shape) == 1, "ids must be of shape [num_candidates]"
        assert len(ious.shape) == 1, "ious must be of shape [num_candidates]"
        assert ids.shape[0] == ious.shape[0], "num of ids.shape[0] must be the same as num of ious.shape[0]"
        # sort ious and ids by ious
        ious, ious_argsort = ious.sort(descending=True)
        ids = ids[ious_argsort]

        # stable sort indices, so that ious are first sorted by id and second by value
        ids, ids_argsort = ids.sort(stable=True)
        ious = ious[ids_argsort]

        unique_ids, ids_index_to_unique_ids_index = ids.unique_consecutive(dim=0, return_inverse=True)
        num_unique_ids = unique_ids.shape[0]

        if ids.shape[0] > 10:
            is_in_top_10 = torch.cat((torch.ones((10,), dtype=torch.bool, device=ids.device), ids[10:] != ids[:-10]))
        else:
            is_in_top_10 = torch.ones_like(ids, dtype=torch.bool)

        dynamic_ks = torch.zeros((num_unique_ids,), dtype=ious.dtype, device=ious.device)
        dynamic_ks.index_put_((ids_index_to_unique_ids_index,), is_in_top_10 * ious, accumulate=True)
        if dynamic_ks_bias is not None:
            dynamic_ks *= dynamic_ks_bias
        dynamic_ks = dynamic_ks.long().clamp(min=1)

        all_argsort = ious_argsort[ids_argsort]
        inverse_all_argsort = torch.zeros_like(ious_argsort)
        inverse_all_argsort[all_argsort] = torch.arange(all_argsort.shape[0], dtype=all_argsort.dtype, device=all_argsort.device)

        return dynamic_ks, ids_index_to_unique_ids_index[inverse_all_argsort]

    def _compute_cost_order(
        self,
        num_classes,
        candidate_gt_img_ids: torch.Tensor,
        candidate_gt_classes: torch.Tensor,
        candidate_anchor_ids: torch.Tensor,
        candidate_ious: torch.Tensor,
        cls_preds: torch.Tensor,
        obj_preds: torch.Tensor,
        strong_candidate_mask: torch.Tensor,
        ious_loss_cost_coeff: float,
        outside_boxes_and_center_cost_coeff: float,
    ) -> torch.Tensor:
        gt_cls_per_image = F.one_hot(candidate_gt_classes.to(torch.int64), num_classes).float()
        with torch.cuda.amp.autocast(enabled=False):
            cls_preds_ = (
                cls_preds[candidate_gt_img_ids, candidate_anchor_ids].float().sigmoid_()
                * obj_preds[candidate_gt_img_ids, candidate_anchor_ids].float().sigmoid_()
            )
            pair_wise_cls_cost = F.binary_cross_entropy(cls_preds_.sqrt_(), gt_cls_per_image, reduction="none").sum(-1)

        ious_cost = -torch.log(candidate_ious + 1e-8)
        cost = pair_wise_cls_cost + ious_loss_cost_coeff * ious_cost + outside_boxes_and_center_cost_coeff * strong_candidate_mask.logical_not()
        return cost.argsort()

    def _calculate_pairwise_bbox_iou(self, bboxes_a: torch.Tensor, bboxes_b: torch.Tensor, xyxy=True) -> torch.Tensor:
        if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
            raise IndexError

        if xyxy:
            tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2])
            br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:])
            area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
            area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
        else:
            tl = torch.max(
                (bboxes_a[:, :2] - bboxes_a[:, 2:] / 2),
                (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
            )
            br = torch.min(
                (bboxes_a[:, :2] + bboxes_a[:, 2:] / 2),
                (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
            )

            area_a = torch.prod(bboxes_a[:, 2:], 1)
            area_b = torch.prod(bboxes_b[:, 2:], 1)
        en = (tl < br).prod(dim=1)
        area_i = torch.prod(br - tl, 1) * en
        return area_i / (area_a + area_b - area_i)

    def _compute_ranks(self, ids: torch.Tensor) -> torch.Tensor:
        ids, ids_argsort = ids.sort(stable=True)

        if ids.shape[0] > 1:
            is_not_first = torch.cat((torch.zeros((1,), dtype=torch.bool, device=ids.device), ids[1:] == ids[:-1]))
        else:
            is_not_first = torch.zeros_like(ids, dtype=torch.bool)

        subtract = torch.arange(ids.shape[0], dtype=ids_argsort.dtype, device=ids.device)
        subtract[is_not_first] = 0
        subtract = subtract.cummax(dim=0)[0]
        rank = torch.arange(ids.shape[0], dtype=ids_argsort.dtype, device=ids.device) - subtract

        inverse_argsort = torch.zeros_like(ids_argsort)
        inverse_argsort[ids_argsort] = torch.arange(ids_argsort.shape[0], dtype=ids_argsort.dtype, device=ids_argsort.device)

        return rank[inverse_argsort]

    def _compute_is_first_mask(self, ids: torch.Tensor) -> torch.Tensor:
        """
        Filter fg that matches two gts.
        """
        ids, ids_argsort = ids.sort(stable=True)

        if ids.shape[0] > 1:
            is_first = torch.cat((torch.ones((1,), dtype=torch.bool, device=ids.device), ids[1:] != ids[:-1]))
        else:
            is_first = torch.ones_like(ids, dtype=torch.bool)

        inverse_argsort = torch.zeros_like(ids_argsort)
        inverse_argsort[ids_argsort] = torch.arange(ids_argsort.shape[0], dtype=ids_argsort.dtype, device=ids_argsort.device)

        return is_first[inverse_argsort]
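
A hedged usage sketch, assuming the class is importable from super_gradients.training.losses (it is also registered under Losses.YOLOX_FAST_LOSS, so it can be referenced by name in training recipes). Constructor arguments follow the signature shown above; the target layout follows the _compute_loss docstring:

import torch
from super_gradients.training.losses import YoloXFastDetectionLoss  # assumed import path

criterion = YoloXFastDetectionLoss(strides=[8, 16, 32], num_classes=80, dynamic_ks_bias=1.1)

# Targets use the documented layout [num_targets x (4 + 2)]:
# image id in the batch, class id, then box cx, cy, w, h.
targets = torch.tensor(
    [
        [0.0, 14.0, 100.0, 120.0, 40.0, 60.0],
        [1.0, 2.0, 300.0, 200.0, 80.0, 30.0],
    ]
)

# Assuming the standard (predictions, targets) call convention of SuperGradients
# detection losses, where predictions are the raw per-level YoloX head outputs:
# loss, loss_items = criterion(model_output, targets)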