Skip to content

Datasets

RandAugment RandAugment is a variant of AutoAugment which randomly selects transformations from AutoAugment to be applied on an image.

RandomAugmentation Implementation adapted from: https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py

Papers: RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719

AugmentOp

single auto augment operations

Source code in V3_4/src/super_gradients/training/datasets/auto_augment.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
class AugmentOp:
    """
    One auto-augment operation: a named transform applied with a given probability and magnitude.
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        # Fall back to the module-level defaults when hparams is missing or empty.
        if not hparams:
            hparams = _HPARAMS_DEFAULT

        # Resolve the transform and its magnitude-to-arguments converter by name.
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        self.hparams = hparams.copy()

        # Keyword arguments forwarded to the transform on every call.
        self.kwargs = {
            "fillcolor": hparams.get("img_mean", _FILL),
            "resample": hparams.get("interpolation", _RANDOM_INTERPOLATION),
        }

        # A non-zero magnitude_std makes the magnitude random on each call.
        self.magnitude_std = self.hparams.get("magnitude_std", 0)

    def __call__(self, img):
        # Skip the op with probability (1 - prob); no random number is drawn when prob >= 1.
        if self.prob < 1.0 and random.random() > self.prob:
            return img

        level = self.magnitude
        std = self.magnitude_std
        if std:
            if std == float("inf"):
                # Infinite std: sample uniformly between 0 and the configured magnitude.
                level = random.uniform(0, level)
            elif std > 0:
                level = random.gauss(level, std)

        # clip to valid range
        level = min(_MAX_MAGNITUDE, max(0, level))

        if self.level_fn is None:
            extra_args = tuple()
        else:
            extra_args = self.level_fn(level, self.hparams)
        return self.aug_fn(img, *extra_args, **self.kwargs)

RandAugment

Random auto augment class, will select auto augment transforms according to probability weights for each op

Source code in V3_4/src/super_gradients/training/datasets/auto_augment.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
class RandAugment:
    """
    Applies `num_layers` randomly chosen ops to an image, optionally drawing them according to
    per-op probability weights.
    """

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops, self.num_layers, self.choice_weights = ops, num_layers, choice_weights

    def __call__(self, img):
        # Unweighted draws are made with replacement; weighted draws are made without.
        weighted = self.choice_weights is not None
        selected = np.random.choice(
            self.ops,
            self.num_layers,
            replace=not weighted,
            p=self.choice_weights,
        )
        result = img
        for augment in selected:
            result = augment(result)
        return result

rand_augment_transform(config_str, crop_size, img_mean)

Create a RandAugment transform

Parameters:

Name Type Description Default
config_str

String defining configuration of random augmentation. Consists of multiple sections separated by dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining sections, which may appear in any order, determine: 'm' - integer magnitude of rand augment; 'n' - integer num layers (number of transform ops selected per image); 'w' - integer probability weight index (index of a set of weights to influence choice of op); 'mstd' - float std deviation of magnitude noise applied; 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0). Examples: 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5; 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2.

required
crop_size int

The size of crop image

required
img_mean List[float]

Average per channel

required

Returns:

Type Description

A PyTorch compatible Transform

Source code in V3_4/src/super_gradients/training/datasets/auto_augment.py
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
@register_transform(Transforms.RandAugmentTransform)
def rand_augment_transform(config_str, crop_size: int, img_mean: List[float]):
    """
    Create a RandAugment transform

    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

    :param crop_size: The size of crop image
    :param img_mean:  Average per channel

    :return: A PyTorch compatible Transform
    :raises AssertionError: If a section with a numeric value uses an unknown key.
    """
    hparams = dict(
        translate_const=int(crop_size * 0.45),
        # Scale each channel mean by 255, capped at 255, for use as the fill colour.
        img_mean=tuple(min(255, round(255 * channel_mean)) for channel_mean in img_mean),
    )

    magnitude = _MAX_MAGNITUDE  # default to _MAX_MAGNITUDE for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS

    for section in config_str.split("-"):
        # Split "<key><number...>" at the first digit; sections without a digit (e.g. 'rand') are skipped.
        parts = re.split(r"(\d.*)", section)
        if len(parts) < 2:
            continue
        key, val = parts[:2]
        if key == "mstd":
            # noise param injected via hparams for now
            hparams.setdefault("magnitude_std", float(val))
        elif key == "inc":
            # `val` is a string, and any non-empty string (including "0") is truthy,
            # so parse it as an integer before testing it.
            if bool(int(val)):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == "m":
            magnitude = int(val)
        elif key == "n":
            num_layers = int(val)
        elif key == "w":
            weight_idx = int(val)
        else:
            # Raised explicitly (rather than `assert False`) so the check survives `python -O`.
            raise AssertionError("Unknown RandAugment config section")

    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms)
    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)

Cifar10

Bases: CIFAR10, HasPreprocessingParams

CIFAR10 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/cifar.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
@register_dataset(Datasets.CIFAR_10)
class Cifar10(CIFAR10, HasPreprocessingParams):
    """
    CIFAR10 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """

    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        # Backward compatibility: a plain list of transforms is wrapped in Compose here. To be removed once
        # torchvision/native transforms are handled uniformly by the factories (i.e. Compose stated in configs).
        sample_transform = Compose(transforms) if isinstance(transforms, list) else transforms

        super(Cifar10, self).__init__(
            root=root,
            train=train,
            transform=sample_transform,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Build the preprocessing params for the dataset.
        The dataset's transforms are translated into a list of processing steps, which is returned
        together with the class names.
        :return: (dict) Preprocessing params with keys "image_processor" and "class_names"
        """
        processings = get_torchvision_transforms_equivalent_processing(self.transforms)
        return {
            "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
            "class_names": self.classes,
        }

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/cifar.py
48
49
50
51
52
53
54
55
56
57
58
59
60
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Build the preprocessing params for the dataset.
    The dataset's transforms are translated into a list of processing steps, which is returned
    together with the class names.
    :return: (dict) Preprocessing params with keys "image_processor" and "class_names"
    """
    processings = get_torchvision_transforms_equivalent_processing(self.transforms)
    return {
        "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
        "class_names": self.classes,
    }

Cifar100

Bases: CIFAR100, HasPreprocessingParams

Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/cifar.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@register_dataset(Datasets.CIFAR_100)
class Cifar100(CIFAR100, HasPreprocessingParams):
    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        """
        CIFAR100 Dataset

        :param root:                    Path for the data to be extracted
        :param train:                   Bool to load training (True) or validation (False) part of the dataset
        :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
        :param target_transform:        Transform to apply to target output
        :param download:                Download (True) the dataset from source
        """
        # Backward compatibility: a plain list of transforms is wrapped in Compose here. To be removed once
        # torchvision/native transforms are handled uniformly by the factories (i.e. Compose stated in configs).
        sample_transform = Compose(transforms) if isinstance(transforms, list) else transforms

        super(Cifar100, self).__init__(
            root=root,
            train=train,
            transform=sample_transform,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Build the preprocessing params for the dataset.
        The dataset's transforms are translated into a list of processing steps, which is returned
        together with the class names.
        :return: (dict) Preprocessing params with keys "image_processor" and "class_names"
        """
        processings = get_torchvision_transforms_equivalent_processing(self.transforms)
        return {
            "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
            "class_names": self.classes,
        }

__init__(root, train=True, transforms=None, target_transform=None, download=False)

CIFAR100 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/cifar.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
@resolve_param("transforms", TransformsFactory())
def __init__(
    self,
    root: str,
    train: bool = True,
    transforms: Union[list, dict] = None,
    target_transform: Optional[Callable] = None,
    download: bool = False,
) -> None:
    """
    CIFAR100 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """
    # Backward compatibility: a plain list of transforms is wrapped in Compose here. To be removed once
    # torchvision/native transforms are handled uniformly by the factories (i.e. Compose stated in configs).
    sample_transform = Compose(transforms) if isinstance(transforms, list) else transforms

    super(Cifar100, self).__init__(
        root=root,
        train=train,
        transform=sample_transform,
        target_transform=target_transform,
        download=download,
    )

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/cifar.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Build the preprocessing params for the dataset.
    The dataset's transforms are translated into a list of processing steps, which is returned
    together with the class names.
    :return: (dict) Preprocessing params with keys "image_processor" and "class_names"
    """
    processings = get_torchvision_transforms_equivalent_processing(self.transforms)
    return {
        "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
        "class_names": self.classes,
    }

ImageNetDataset

Bases: torch_datasets.ImageFolder, HasPreprocessingParams

ImageNetDataset dataset.

To use this Dataset you need to:

  • Download imagenet dataset (https://image-net.org/download.php) Imagenet ├──train │ ├──n02093991 │ │ ├──n02093991_1001.JPEG │ │ ├──n02093991_1004.JPEG │ │ └──... │ ├──n02093992 │ └──... └──val ├──n02093991 ├──n02093992 └──...

  • Instantiate the dataset: >> train_set = ImageNetDataset(root='.../Imagenet/train', ...) >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)

Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@register_dataset(Datasets.IMAGENET_DATASET)
class ImageNetDataset(torch_datasets.ImageFolder, HasPreprocessingParams):
    """ImageNetDataset dataset.

    To use this Dataset you need to:

    - Download imagenet dataset (https://image-net.org/download.php)
        Imagenet
         ├──train
         │  ├──n02093991
         │  │   ├──n02093991_1001.JPEG
         │  │   ├──n02093991_1004.JPEG
         │  │   └──...
         │  ├──n02093992
         │  └──...
         └──val
            ├──n02093991
            ├──n02093992
            └──...

    - Instantiate the dataset:
        >> train_set = ImageNetDataset(root='.../Imagenet/train', ...)
        >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)
    """

    @resolve_param("transforms", factory=TransformsFactory())
    def __init__(self, root: str, transforms: Union[list, dict, None] = None, *args, **kwargs):
        """
        :param root:        Root directory, forwarded to the parent ImageFolder.
        :param transforms:  List of transforms to apply sequentially on sample. A list is wrapped internally
                            with Compose; None (the default) is treated as an empty list.
        :param args:        Extra positional arguments forwarded to the parent constructor.
        :param kwargs:      Extra keyword arguments forwarded to the parent constructor.
        """
        # Use a fresh list per instance. A mutable `[]` default would be handed to Compose and so
        # shared by every dataset created without explicit transforms.
        if transforms is None:
            transforms = []

        # TO KEEP BACKWARD COMPATIBILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALIGN TORCHVISION/NATIVE TRANSFORMS
        # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
        if isinstance(transforms, list):
            transforms = Compose(transforms)
        # NOTE(review): extra positional args are forwarded right after `root`; presumably they would
        # collide with the `transform` keyword in the parent signature - confirm before relying on *args.
        super(ImageNetDataset, self).__init__(root, transform=transforms, *args, **kwargs)

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """

        pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
        params = dict(
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            class_names=self.classes,
        )
        return params

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
47
48
49
50
51
52
53
54
55
56
57
58
59
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Build the preprocessing params for the dataset.
    The dataset's transforms are translated into a list of processing steps, which is returned
    together with the class names.
    :return: (dict) Preprocessing params with keys "image_processor" and "class_names"
    """
    processings = get_torchvision_transforms_equivalent_processing(self.transforms)
    return {
        "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
        "class_names": self.classes,
    }

get_torchvision_transforms_equivalent_processing(transforms)

Get the equivalent processing pipeline for torchvision transforms.

Returns:

Type Description
List[Dict[str, Any]]

List of Processings operations

Source code in V3_4/src/super_gradients/training/datasets/classification_datasets/torchvision_utils.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def _center_crop_size_as_int(size) -> int:
    """Collapse a CenterCrop size (an int, or a square 1- or 2-element sequence) into a single int."""
    if isinstance(size, (tuple, list)):
        if len(size) == 1 or (len(size) == 2 and size[0] == size[1]):
            return int(size[0])
        raise ValueError(f"Unsupported non-square CenterCrop size: {size}")
    return int(size)


def get_torchvision_transforms_equivalent_processing(transforms: List[Any]) -> List[Dict[str, Any]]:
    """
    Get the equivalent processing pipeline for torchvision transforms.

    :param transforms: A list of transforms, a Compose, or a StandardTransform wrapping either of them.
    :return: List of Processings operations, always ending with an ImagePermute step.
    :raises ValueError: If a transform has no equivalent processing, or a CenterCrop is not square.
    """
    # NOTE(review): an earlier comment here described reversing channels to BGR because images are
    # read with cv2.imread; no channel reversal is performed in this function - confirm it is still relevant.
    pipeline = []

    # Unwrap the containers down to the plain sequence of transforms.
    if isinstance(transforms, StandardTransform):
        transforms = transforms.transform

    if isinstance(transforms, Compose):
        transforms = transforms.transforms

    for transform in transforms:
        if isinstance(transform, ToTensor):
            pipeline.append({Processings.StandardizeImage: {"max_value": 255}})
        elif isinstance(transform, Normalize):
            pipeline.append({Processings.NormalizeImage: {"mean": tuple(map(float, transform.mean)), "std": tuple(map(float, transform.std))}})
        elif isinstance(transform, Resize):
            pipeline.append({Processings.Resize: {"size": int(transform.size)}})
        elif isinstance(transform, CenterCrop):
            # torchvision's CenterCrop stores `size` as an (h, w) pair even when built from an int,
            # so int() on it directly raises TypeError; collapse square sizes to a single int.
            pipeline.append({Processings.CenterCrop: {"size": _center_crop_size_as_int(transform.size)}})
        else:
            raise ValueError(f"Unsupported transform: {transform}")

    pipeline.append({Processings.ImagePermute: {"permutation": (2, 0, 1)}})
    return pipeline

Lighting

Bases: object

Lighting noise (AlexNet-style PCA-based noise). Taken from fastai Imagenet training - https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103 To use: - training_params = {"imagenet_pca_aug": 0.1} - Default training_params arg is 0.0 ("don't use") - 0.1 is the default in the original paper

Source code in V3_4/src/super_gradients/training/datasets/data_augmentation.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
@register_transform(Transforms.Lighting)
class Lighting(object):
    """
    Lighting noise (AlexNet-style PCA-based noise)
    Taken from fastai Imagenet training -
    https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103
    To use:
        - training_params = {"imagenet_pca_aug": 0.1}
        - Default training_params arg is 0.0 ("don't use")
        - 0.1 is the default in the original paper
    """

    def __init__(self, alphastd, eigval=IMAGENET_PCA["eigval"], eigvec=IMAGENET_PCA["eigvec"]):
        self.alphastd, self.eigval, self.eigvec = alphastd, eigval, eigvec

    def __call__(self, img):
        # A zero std disables the augmentation entirely.
        if self.alphastd == 0:
            return img

        # Draw three normally distributed coefficients with std `alphastd`, matching img's type.
        alpha = img.new().resize_(3).normal_(0, self.alphastd)

        # Broadcast the coefficients and eigenvalues to 3x3 so they scale the eigenvectors element-wise.
        alpha_grid = alpha.view(1, 3).expand(3, 3)
        eigval_grid = self.eigval.view(1, 3).expand(3, 3)
        scaled_eigvec = self.eigvec.type_as(img).clone().mul(alpha_grid).mul(eigval_grid)
        rgb = scaled_eigvec.sum(1).squeeze()

        # Add one offset per channel; assumes img has 3 channels in its third-from-last dim - TODO confirm.
        return img.add(rgb.view(3, 1, 1).expand_as(img))

RandomErase

Bases: RandomErasing

A simple class that translates the parameters supported in SuperGradient's code base

Source code in V3_4/src/super_gradients/training/datasets/data_augmentation.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
@register_transform(Transforms.RandomErase)
class RandomErase(RandomErasing):
    """
    A simple class that translates the parameters supported in SuperGradient's code base
    """

    def __init__(self, probability: float, value: str):
        """
        :param probability: Forwarded to the parent class as `p`.
        :param value:       Forwarded to the parent class as `value`. Converted to float first when
                            possible; otherwise passed through unchanged.
        """
        # value might be a string representing a float. First we try to convert to float and if fails,
        # pass it as-is to super
        try:
            value = float(value)
        except (TypeError, ValueError):
            # ValueError: a non-numeric string. TypeError: a non-scalar such as a list or tuple,
            # which must also reach the parent unchanged instead of crashing here.
            pass
        super().__init__(p=probability, value=value)

BoundingBoxFormat

Abstract class for describing a bounding box format. It exposes two methods, to_xyxy and from_xyxy, to convert whatever format of boxes we are dealing with to the internal xyxy format and vice versa. This conversion to and from the intermediate xyxy format has a subtle performance impact, but greatly reduces the amount of boilerplate code needed to support all combinations of conversion xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh, cxcywh, yxyx.

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class BoundingBoxFormat:
    """
    Base class describing a bounding box format. It exposes two methods, to_xyxy and from_xyxy, which convert
    boxes between this format and the internal xyxy format. Routing every conversion through the intermediate
    xyxy format has a subtle performance impact, but greatly reduces the amount of boilerplate code needed to
    support all combinations of conversion xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh, cxcywh, yxyx.
    """

    def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert input boxes to XYXY format
        :param bboxes: Input bounding boxes [..., 4]
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :param inplace: Passed to get_to_xyxy to select the conversion callable.
        :return: Converted bounding boxes [..., 4] in XYXY format
        """
        converter = self.get_to_xyxy(inplace)
        return converter(bboxes, image_shape)

    def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert XYXY boxes to target bboxes format
        :param bboxes: Input bounding boxes [..., 4] in XYXY format
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :param inplace: Passed to get_from_xyxy to select the conversion callable.
        :return: Converted bounding boxes [..., 4] in target format
        """
        converter = self.get_from_xyxy(inplace)
        return converter(bboxes, image_shape)

    @abstractmethod
    def get_to_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        # Subclasses return a callable taking (bboxes, image_shape).
        raise NotImplementedError

    @abstractmethod
    def get_from_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        # Subclasses return a callable taking (bboxes, image_shape).
        raise NotImplementedError

    def get_num_parameters(self) -> int:
        # Number of values per box in the base format.
        return 4

from_xyxy(bboxes, image_shape, inplace)

Convert XYXY boxes to target bboxes format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4] in XYXY format

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in target format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
27
28
29
30
31
32
33
34
35
def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert XYXY boxes to target bboxes format
    :param bboxes: Input bounding boxes [..., 4] in XYXY format
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :param inplace: Passed to get_from_xyxy to select the conversion callable.
    :return: Converted bounding boxes [..., 4] in target format
    """
    converter = self.get_from_xyxy(inplace)
    return converter(bboxes, image_shape)

to_xyxy(bboxes, image_shape, inplace)

Convert input boxes to XYXY format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4]

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in XYXY format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
17
18
19
20
21
22
23
24
25
def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert input boxes to XYXY format
    :param bboxes: Input bounding boxes [..., 4]
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :param inplace: Passed to get_to_xyxy to select the conversion callable.
    :return: Converted bounding boxes [..., 4] in XYXY format
    """
    converter = self.get_to_xyxy(inplace)
    return converter(bboxes, image_shape)

convert_bboxes(bboxes, image_shape, source_format, target_format, inplace)

Convert bboxes from source to target format

Parameters:

Name Type Description Default
bboxes

Tensor of shape (..., 4) with input bounding boxes

required
image_shape Tuple[int, int]

Tuple of (rows, cols) corresponding to image shape

required
source_format BoundingBoxFormat

Format of the source bounding boxes

required
target_format BoundingBoxFormat

Format of the output bounding boxes

required

Returns:

Type Description

Tensor of shape (..., 4) with resulting bounding boxes

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
49
50
51
52
53
54
55
56
57
58
59
def convert_bboxes(bboxes, image_shape: Tuple[int, int], source_format: BoundingBoxFormat, target_format: BoundingBoxFormat, inplace: bool):
    """
    Convert bboxes from source to target format by going through the intermediate XYXY format.
    :param bboxes: Tensor of shape (..., 4) with input bounding boxes
    :param image_shape: Tuple of (rows, cols) corresponding to image shape
    :param source_format: Format of the source bounding boxes
    :param target_format: Format of the output bounding boxes
    :param inplace: Passed unchanged to both the source and the target format conversions.
    :return: Tensor of shape (..., 4) with resulting bounding boxes
    """
    return target_format.from_xyxy(
        source_format.to_xyxy(bboxes, image_shape, inplace),
        image_shape,
        inplace,
    )

cxcywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from CX-CY-W-H format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def cxcywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format, returning a new tensor/array.
    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format (Torch tensor or Numpy array)
    :param image_shape: Not used by this conversion.
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    center_x = bboxes[..., 0]
    center_y = bboxes[..., 1]
    width = bboxes[..., 2]
    height = bboxes[..., 3]

    # The first corner is the center shifted back by half the extent; the second adds the full extent.
    x_min = center_x - 0.5 * width
    y_min = center_y - 0.5 * height
    x_max = x_min + width
    y_max = y_min + height

    if torch.jit.is_scripting():
        return torch.stack([x_min, y_min, x_max, y_max], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([x_min, y_min, x_max, y_max], dim=-1)
        elif isinstance(bboxes, np.ndarray):
            return np.stack([x_min, y_min, x_max, y_max], axis=-1)
        raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

cxcywh_to_xyxy_inplace(bboxes, image_shape)

Note that the bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def cxcywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format. This function operates in-place.
    Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :param image_shape: Not used by this conversion.
    :return: BBoxes of shape (..., 4) in XYXY format (the same object that was passed in)
    """
    if not torch.jit.is_scripting():
        # Tensors are checked with torch only; everything else goes through the array check.
        # The checks must be exclusive, otherwise tensors also reach the array-oriented helper
        # and are warned about a second time (or spuriously).
        if torch.is_tensor(bboxes):
            if not torch.is_floating_point(bboxes):
                warnings.warn(
                    f"Detected non floating-point ({bboxes.dtype}) input to cxcywh_to_xyxy_inplace function. "
                    f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
                )
        elif not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    # Order matters: the second statement reads the x1y1 values written by the first.
    bboxes[..., 0:2] -= bboxes[..., 2:4] * 0.5  # cxcy -> x1y1
    bboxes[..., 2:4] += bboxes[..., 0:2]  # x1y1 + wh -> x2y2
    return bboxes

xyxy_to_cxcywh(bboxes, image_shape)

Transforms bboxes from xyxy format to CX-CY-W-H format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def xyxy_to_cxcywh(bboxes, image_shape: Tuple[int, int]):
    """
    Convert XYXY boxes into CX-CY-W-H boxes, returning a new tensor/array.

    :param bboxes: BBoxes of shape (..., 4) in XYXY format (Torch tensor or Numpy array)
    :param image_shape: Image shape (rows,cols). Not used by this conversion.
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    width = bboxes[..., 2] - left
    height = bboxes[..., 3] - top
    center_x = left + 0.5 * width
    center_y = top + 0.5 * height
    components = [center_x, center_y, width, height]

    if torch.jit.is_scripting():
        return torch.stack(components, dim=-1)
    else:
        if isinstance(bboxes, np.ndarray):
            return np.stack(components, axis=-1)
        if torch.is_tensor(bboxes):
            return torch.stack(components, dim=-1)
        raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_cxcywh_inplace(bboxes, image_shape)

Transforms bboxes from XYXY format to CX-CY-W-H format. This function operates in-place. Note that the bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def xyxy_to_cxcywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    In-place conversion of XYXY boxes into CX-CY-W-H boxes.
    Note that the dtype of bboxes is preserved, so non floating-point inputs may suffer from
    rounding errors when the box center is computed; a warning is emitted in that case.

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :param image_shape: Image shape (rows,cols). Not used by this conversion.
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format (the same object that was passed in)
    """
    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes):
            if not torch.is_floating_point(bboxes):
                message = (
                    f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                    "You may want to convert your array to floating-point precision first."
                )
                warnings.warn(message)
        elif isinstance(bboxes, np.ndarray) and not is_floating_point_array(bboxes):
            message = (
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
            warnings.warn(message)

    # Order matters: the center is derived from x1y1 plus half of the freshly computed wh.
    bboxes[..., 2:4] -= bboxes[..., 0:2]  # x2y2 - x1y1 -> wh
    bboxes[..., 0:2] += bboxes[..., 2:4] * 0.5  # x1y1 + wh / 2 -> cxcy
    return bboxes

NormalizedXYXYCoordinateFormat

Bases: BoundingBoxFormat

Normalized X1,Y1,X2,Y2 bounding boxes format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class NormalizedXYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format with unit-normalized X1, Y1, X2, Y2 coordinates.
    """

    def __init__(self):
        super().__init__()
        self.format = "normalized_xyxy"
        self.normalized = True

    def get_to_xyxy(self, inplace: bool):
        """Return the normalized-XYXY -> XYXY converter; the in-place variant when `inplace` is truthy."""
        return normalized_xyxy_to_xyxy_inplace if inplace else normalized_xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        """Return the XYXY -> normalized-XYXY converter; the in-place variant when `inplace` is truthy."""
        return xyxy_to_normalized_xyxy_inplace if inplace else xyxy_to_normalized_xyxy

normalized_xyxy_to_xyxy(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def normalized_xyxy_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Scale unit-normalized XYXY boxes to pixel units, returning a new tensor/array.

    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format
    """
    height, width = image_shape
    # x coordinates scale with the number of columns, y coordinates with the number of rows
    per_coord = [width, height, width, height]

    if torch.jit.is_scripting():
        scale = torch.tensor(per_coord, dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
    else:
        if torch.is_tensor(bboxes):
            scale = torch.tensor(per_coord, dtype=bboxes.dtype, device=bboxes.device)
            scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        elif isinstance(bboxes, np.ndarray):
            leading_ones = [1] * (len(bboxes.shape) - 1)
            scale = np.array(per_coord, dtype=bboxes.dtype).reshape(leading_ones + [4])
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

    return bboxes * scale

normalized_xyxy_to_xyxy_inplace(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
67
68
69
70
71
72
73
74
75
76
77
def normalized_xyxy_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Scale unit-normalized XYXY boxes to pixel units. The input is modified in-place and returned.

    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format
    """
    height, width = image_shape
    bboxes[..., 0:3:2] *= width  # x1, x2 scale with the number of columns
    bboxes[..., 1:4:2] *= height  # y1, y2 scale with the number of rows
    return bboxes

xyxy_to_normalized_xyxy(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format

Parameters:

Name Type Description Default
bboxes Tensor

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description
Tensor

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def xyxy_to_normalized_xyxy(bboxes: Tensor, image_shape: Tuple[int, int]) -> Tensor:
    """
    Divide pixel XYXY boxes by the image size to obtain unit-normalized XYXY boxes (new tensor/array).

    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    """
    height, width = image_shape
    # x coordinates are divided by the number of columns, y coordinates by the number of rows
    per_coord = [width, height, width, height]

    if torch.jit.is_scripting():
        scale = torch.tensor(per_coord, dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
    else:
        if torch.is_tensor(bboxes):
            scale = torch.tensor(per_coord, dtype=bboxes.dtype, device=bboxes.device)
            scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        elif isinstance(bboxes, np.ndarray):
            # A flat (4,) divisor broadcasts against the trailing axis of bboxes
            scale = np.array(per_coord, dtype=bboxes.dtype)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")
    return bboxes / scale

xyxy_to_normalized_xyxy_inplace(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def xyxy_to_normalized_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Divide pixel XYXY boxes by the image size to obtain unit-normalized XYXY boxes.
    The input is modified in-place and returned. A warning is emitted for non floating-point inputs.

    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    """
    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            message = (
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_normalized_xyxy_inplace function. "
                "This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
            warnings.warn(message)
        if isinstance(bboxes, np.ndarray) and not np.issubdtype(bboxes.dtype, np.floating):
            message = (
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_normalized_xyxy_inplace function. "
                "This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
            warnings.warn(message)

    height, width = image_shape
    bboxes[..., 0:3:2] /= width  # x1, x2 are divided by the number of columns
    bboxes[..., 1:4:2] /= height  # y1, y2 are divided by the number of rows
    return bboxes

xywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def xywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Convert XYWH boxes into XYXY boxes, returning a new tensor/array.

    :param bboxes: BBoxes of shape (..., 4) in XYWH format (Torch tensor or Numpy array)
    :param image_shape: Image shape (rows,cols). Not used by this conversion.
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    right = left + bboxes[..., 2]
    bottom = top + bboxes[..., 3]
    corners = [left, top, right, bottom]

    if torch.jit.is_scripting():
        return torch.stack(corners, dim=-1)
    else:
        if isinstance(bboxes, np.ndarray):
            return np.stack(corners, axis=-1)
        if torch.is_tensor(bboxes):
            return torch.stack(corners, dim=-1)
        raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xywh_to_xyxy_inplace(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
65
66
67
68
69
70
71
72
def xywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert XYWH boxes into XYXY boxes. The input is modified in-place and returned.

    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :param image_shape: Image shape (rows,cols). Not used by this conversion.
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    bboxes[..., 2:4] += bboxes[..., 0:2]  # wh + x1y1 -> x2y2
    return bboxes

xyxy_to_xywh(bboxes, image_shape)

Transforms bboxes from XYXY format to XYWH format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def xyxy_to_xywh(bboxes, image_shape: Tuple[int, int]):
    """
    Convert XYXY boxes into XYWH boxes, returning a new tensor/array (the input is left untouched).

    :param bboxes: BBoxes of shape (..., 4) in XYXY format (Torch tensor or Numpy array)
    :param image_shape: Image shape (rows,cols). Not used by this conversion.
    :return: BBoxes of shape (..., 4) in XYWH format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    width = bboxes[..., 2] - left
    height = bboxes[..., 3] - top
    components = [left, top, width, height]

    if torch.jit.is_scripting():
        return torch.stack(components, dim=-1)
    else:
        if isinstance(bboxes, np.ndarray):
            return np.stack(components, axis=-1)
        if torch.is_tensor(bboxes):
            return torch.stack(components, dim=-1)
        raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_xywh_inplace(bboxes, image_shape)

Transforms bboxes inplace from XYXY format to XYWH format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
55
56
57
58
59
60
61
62
def xyxy_to_xywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert XYXY boxes into XYWH boxes. The input is modified in-place and returned.

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :param image_shape: Image shape (rows,cols). Not used by this conversion.
    :return: BBoxes of shape (..., 4) in XYWH format
    """
    bboxes[..., 2:4] -= bboxes[..., 0:2]  # x2y2 - x1y1 -> wh
    return bboxes

XYXYCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format X1, Y1, X2, Y2

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/xyxy.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
class XYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format X1, Y1, X2, Y2
    """

    def __init__(self):
        # Initialise the base class first, like the other coordinate format classes do,
        # then set the attributes describing this format.
        super().__init__()
        self.format = "xyxy"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        """
        Return the function converting boxes of this format to XYXY.

        :param inplace: Ignored; the same function is returned for both values.
        :return: xyxy_to_xyxy
        """
        return xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        """
        Return the function converting XYXY boxes to this format.

        :param inplace: Ignored; the same function is returned for both values.
        :return: xyxy_to_xyxy
        """
        return xyxy_to_xyxy

YXYXCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format Y1, X1, Y2, X2

Source code in V3_4/src/super_gradients/training/datasets/data_formats/bbox_formats/yxyx.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class YXYXCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format Y1, X1, Y2, X2
    """

    def __init__(self):
        super().__init__()
        self.format = "yxyx"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        """Return the YXYX -> XYXY converter; the in-place variant when `inplace` is truthy."""
        # The same routine serves both directions (see get_from_xyxy).
        return xyxy_to_yxyx_inplace if inplace else xyxy_to_yxyx

    def get_from_xyxy(self, inplace: bool):
        """Return the XYXY -> YXYX converter; the in-place variant when `inplace` is truthy."""
        # XYXY <-> YXYX is an interchangeable operation, so the same routine is reused here.
        return xyxy_to_yxyx_inplace if inplace else xyxy_to_yxyx

ConcatenatedTensorFormatConverter

Source code in V3_4/src/super_gradients/training/datasets/data_formats/format_converter.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class ConcatenatedTensorFormatConverter:
    """
    Callable that converts concatenated tensors from one ConcatenatedTensorFormat to another:
    channels are reordered to the output layout and the bounding boxes are converted to the output bbox format.
    """

    def __init__(
        self,
        input_format: ConcatenatedTensorFormat,
        output_format: ConcatenatedTensorFormat,
        image_shape: Union[Tuple[int, int], None],
    ):
        """
        Converts concatenated tensors from input format to output format.

        Example:
            >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
            >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
            >>> h, w = 100, 200
            >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
            >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
            >>>
            >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
            >>>
            >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
            >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        self.permutation_indexes = get_permutation_indexes(input_format, output_format)

        self.input_format = input_format
        self.output_format = output_format
        self.image_shape = image_shape
        self.input_length = input_format.num_channels

    def __call__(self, tensor: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        """
        Convert a concatenated tensor from the input format to the output format.

        :param tensor: Tensor or array of shape (..., C), where C is the number of channels of the input format.
        :return: Converted tensor or array with channels ordered as in the output layout.
        :raises RuntimeError: If the last dimension of the tensor differs from the number of channels of the input format.
        """
        if tensor.shape[-1] != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({tensor.shape[-1]}) must be "
                f"equal to {self.input_length} as defined by input format."
            )
        # Permute along the LAST axis: both the channel check above and apply_on_bboxes address
        # channels through the trailing dimension. For 2D input this is identical to tensor[:, idx].
        tensor = tensor[..., self.permutation_indexes]
        # After the permutation the bbox channels sit at the output-layout location but still hold
        # input-format values; convert them to the output bbox format.
        tensor = apply_on_bboxes(fn=self._convert_bbox, tensor=tensor, tensor_format=self.output_format)
        return tensor

    def _convert_bbox(self, bboxes: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        """Convert bboxes from the input bbox format to the output bbox format (convert_bboxes is called with inplace=False)."""
        return convert_bboxes(
            bboxes=bboxes,
            source_format=self.input_format.bboxes_format.format,
            target_format=self.output_format.bboxes_format.format,
            inplace=False,
            image_shape=self.image_shape,
        )

__init__(input_format, output_format, image_shape)

Converts concatenated tensors from input format to output format.

Example: >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY >>> h, w = 100, 200 >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32) >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32) >>> >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w)) >>> >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in V3_4/src/super_gradients/training/datasets/data_formats/format_converter.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(
    self,
    input_format: ConcatenatedTensorFormat,
    output_format: ConcatenatedTensorFormat,
    image_shape: Union[Tuple[int, int], None],
):
    """
    Set up a converter of concatenated tensors from the input format to the output format.

    Example:
        >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
        >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
        >>> h, w = 100, 200
        >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
        >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
        >>>
        >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
        >>>
        >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
        >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        Can be None when normalized coordinates are not used.
    """
    # Computed first so that incompatible formats fail before any attribute is stored
    self.permutation_indexes = get_permutation_indexes(input_format, output_format)

    self.input_format, self.output_format = input_format, output_format
    self.image_shape = image_shape
    # Expected size of the last dimension of tensors passed to the converter
    self.input_length = input_format.num_channels
ConcatenatedTensorFormat

Bases: DetectionOutputFormat

Defines an output format that returns a single tensor of shape [N,M] (N - number of detections, M - sum of bbox attributes), concatenated from bbox coordinates and other fields. A layout defines the order of the concatenated tensors. For instance: - layout: (bboxes, scores, labels) gives a Tensor that is the result of torch.cat([bboxes, scores, labels], dim=1) - layout: (labels, bboxes) produces a Tensor from torch.cat([labels, bboxes], dim=1)

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

custom_format = ConcatenatedTensorFormat( layout=( BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()), TensorSliceItem(name="label", length=1), TensorSliceItem(name="distance", length=1), TensorSliceItem(name="attributes", length=4), ) )

Source code in V3_4/src/super_gradients/training/datasets/data_formats/formats.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class ConcatenatedTensorFormat(DetectionOutputFormat):
    """
    Describes an output format made of a single tensor of shape [N,M] (N - number of detections,
    M - sum of bbox attributes), concatenated from bbox coordinates and other fields.
    The layout defines the order of the concatenated tensors. For instance:
    - layout: (bboxes, scores, labels) gives a Tensor that is the result of torch.cat([bboxes, scores, labels], dim=1)
    - layout: (labels, bboxes) produces a Tensor from torch.cat([labels, bboxes], dim=1)


    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> custom_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>         TensorSliceItem(name="label", length=1),
    >>>         TensorSliceItem(name="distance", length=1),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>     )
    >>> )

    """

    # item name -> layout item, in layout order
    layout: Mapping[str, TensorSliceItem]
    # item name -> (start, stop) channel range of the item
    locations: Mapping[str, Tuple[int, int]]
    # item name -> explicit list of channel indexes of the item
    indexes: Mapping[str, List[int]]
    # total number of channels over all layout items
    num_channels: int

    @property
    def bboxes_format(self) -> BoundingBoxesTensorSliceItem:
        """Return the first layout item that is a BoundingBoxesTensorSliceItem."""
        candidates = [entry for entry in self.layout.values() if isinstance(entry, BoundingBoxesTensorSliceItem)]
        return candidates[0]

    def __init__(self, layout: Union[List[TensorSliceItem], Tuple[TensorSliceItem, ...]]):
        """
        :param layout: Ordered items describing the concatenated tensor; exactly one of them must be a BoundingBoxesTensorSliceItem.
        :raises RuntimeError: If the number of bounding box items is not exactly 1.
        """
        num_bbox_items = sum(1 for entry in layout if isinstance(entry, BoundingBoxesTensorSliceItem))
        if num_bbox_items != 1:
            raise RuntimeError("Number of bounding box items must be strictly equal to 1")

        items_by_name = collections.OrderedDict()
        ranges_by_name = collections.OrderedDict()
        indexes_by_name = collections.OrderedDict()

        # Walk the layout, assigning each item the next `length` consecutive channels
        cursor = 0
        for entry in layout:
            stop = cursor + entry.length
            items_by_name[entry.name] = entry
            ranges_by_name[entry.name] = (cursor, stop)
            indexes_by_name[entry.name] = list(range(cursor, stop))
            cursor = stop

        self.layout = items_by_name
        self.locations = ranges_by_name
        self.indexes = indexes_by_name
        self.num_channels = cursor

    def __repr__(self):
        return str(self.layout)

apply_on_bboxes(fn, tensor, tensor_format)

Apply inplace a function only on the bboxes of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the bboxes

Source code in V3_4/src/super_gradients/training/datasets/data_formats/formats.py
105
106
107
108
109
110
111
112
113
114
115
116
117
def apply_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Apply a function, in-place, to the bboxes channels of a concatenated tensor.

    Delegates to apply_on_layout using the name of the bboxes item of the format.

    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that includes - among others - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    bboxes_layout_name = tensor_format.bboxes_format.name
    return apply_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=bboxes_layout_name)

apply_on_layout(fn, tensor, tensor_format, layout_name)

Apply inplace a function only on a specific layout of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the layout

Source code in V3_4/src/super_gradients/training/datasets/data_formats/formats.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def apply_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Apply a function, in-place, to the channels of one layout item of a concatenated tensor.

    :param fn:              Function to apply on the selected channels.
    :param tensor:          Concatenated tensor that includes - among others - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after applying INPLACE the fn on the layout
    """
    # Channel range of the requested item along the last axis
    selection = (..., slice(*tensor_format.locations[layout_name]))
    # Write the result of fn back into the same channels of the input tensor
    tensor[selection] = fn(tensor[selection])
    return tensor

filter_on_bboxes(fn, tensor, tensor_format)

Filter the tensor according to a condition on the bboxes.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after filtering the bboxes according to fn.

Source code in V3_4/src/super_gradients/training/datasets/data_formats/formats.py
139
140
141
142
143
144
145
146
147
148
149
150
151
def filter_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Keep only the entries of a concatenated tensor whose bboxes satisfy a condition.

    Delegates to filter_on_layout using the name of the bboxes item of the format.

    :param fn:              Function to filter the bboxes (keep only True elements).
    :param tensor:          Concatenated tensor that includes - among others - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after filtering the bboxes according to fn.
    """
    bboxes_layout_name = tensor_format.bboxes_format.name
    return filter_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=bboxes_layout_name)

filter_on_layout(fn, tensor, tensor_format, layout_name)

Filter the tensor according to a condition on a specific layout.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after filtering the bboxes according to fn.

Source code in V3_4/src/super_gradients/training/datasets/data_formats/formats.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def filter_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Keep only the entries of a concatenated tensor for which a condition on one layout holds.

    :param fn:              Function evaluated on the slice of the requested layout; its result is used as
                            the mask that indexes the tensor (keep only True elements).
    :param tensor:          Concatenated tensor that includes, among other fields, the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                The tensor indexed by the mask returned by fn.
    """
    # The location entry is unpacked into slice(), selecting the layout's channels on the last dimension.
    layout_slice = slice(*tensor_format.locations[layout_name])
    keep_mask = fn(tensor[..., layout_slice])
    return tensor[keep_mask]

get_permutation_indexes(input_format, output_format)

Compute the permutations required to change the format layout order.

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Input format to transform from

required
output_format ConcatenatedTensorFormat

Output format to transform to

required

Returns:

Type Description
List[int]

Permutation indexes to go from input to output format.

Source code in V3_4/src/super_gradients/training/datasets/data_formats/formats.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def get_permutation_indexes(input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat) -> List[int]:
    """Build the list of input indexes that reorders a tensor from the input layout to the output layout.

    The output layout is walked in order; for each item the indexes it has in the input format are appended.

    :param input_format:    Input format to transform from
    :param output_format:   Output format to transform to
    :return:                Permutation indexes to go from input to output format.
    :raises KeyError:       If an item of the output layout is missing from the input layout.
    :raises RuntimeError:   If an item has a different length in the input and output layouts.
    """
    permutation = []
    for name, requested_spec in output_format.layout.items():
        if name not in input_format.layout:
            raise KeyError(f"Requested item '{name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

        available_spec = input_format.layout[name]
        if available_spec.length != requested_spec.length:
            raise RuntimeError(
                f"Length of the output must match in input and output format. "
                f"Input spec size is {available_spec.length} for key '{name}' and output spec size is {requested_spec.length}."
            )
        permutation.extend(input_format.indexes[name])
    return permutation

ConvertBoundingBoxes

Bases: nn.Module

Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class ConvertBoundingBoxes(nn.Module):
    """
    Re-encode the bounding-box channels of a tensor by chaining two callables:
    `to_xyxy` is applied first and its result is passed to `from_xyxy`.
    """

    to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]
    from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]

    def __init__(
        self,
        location: Tuple[int, int],
        to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        image_shape: Tuple[int, int],
    ):
        """
        :param location:    (start, end) bounds of the bbox channels on the last dimension of the input.
        :param to_xyxy:     Callable taking (bboxes, image_shape), applied first.
        :param from_xyxy:   Callable taking (bboxes, image_shape), applied to the result of `to_xyxy`.
        :param image_shape: Value forwarded as second argument to both callables.
        """
        super().__init__()
        self.location = location
        self.image_shape = image_shape
        self.to_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], to_xyxy)
        self.from_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], from_xyxy)

    def forward(self, x: Tensor) -> Tensor:
        """
        Convert the bbox channels of `x`.

        :param x:   Tensor whose last dimension contains the bboxes at `self.location`.
        :return:    The same tensor `x`; its bbox channels are overwritten in place with the converted values.
        """
        box_channels = slice(self.location[0], self.location[1])
        boxes_xyxy = self.to_xyxy(x[..., box_channels], self.image_shape)
        x[..., box_channels] = self.from_xyxy(boxes_xyxy, self.image_shape)
        return x

forward(x)

Parameters:

Name Type Description Default
x Tensor required
image_shape required

Returns:

Type Description
Tensor
Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
56
57
58
59
60
61
62
63
64
65
66
67
def forward(self, x: Tensor) -> Tensor:
    """
    Convert the bbox channels of `x` by applying `self.to_xyxy` and then `self.from_xyxy`.

    :param x:   Tensor whose last dimension contains the bboxes at `self.location`.
    :return:    The same tensor `x`; its bbox channels are overwritten in place with the converted values.
    """
    box_channels = slice(self.location[0], self.location[1])
    boxes_xyxy = self.to_xyxy(x[..., box_channels], self.image_shape)
    x[..., box_channels] = self.from_xyxy(boxes_xyxy, self.image_shape)
    return x

DetectionOutputAdapter

Bases: nn.Module

Adapter class for converting model's predictions for object detection to a desired format. This adapter supports torch.jit tracing & scripting & onnx conversion.

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

class CustomDetectionHead(nn.Module): num_classes: int = 123

@property def format(self): ''' Describe the semantics of the model's output. In this example model's output consists of - Bounding boxes in XYXY format [4] - Predicted probas of N classes [N] - A distance predictions [1] - K additional labels [K] ''' return ConcatenatedTensorFormat( layout=( BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()), TensorSliceItem(name="label", length=1), TensorSliceItem(name="distance", length=1), TensorSliceItem(name="attributes", length=4), ) )

yolox = YoloX(head=CustomDetectionHead)

Suppose we want to return predictions in another format.

Let it be:

- Bounding boxes in normalized XYWH [4]

- Predicted attributes [4]

- Predicted label [1]

output_format = ConcatenatedTensorFormat( layout=( # Note: For output format it is not required to specify location attribute as it will be # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()), TensorSliceItem(name="attributes", length=4), TensorSliceItem(name="label", length=1), ) )

Now we can construct output adapter and attach it to the model

output_adapter = DetectionOutputAdapter( input_format=yolox.head.format, output_format=output_format, image_shape=(640, 640) )

yolox = nn.Sequential(yolox, output_adapter)

Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
class DetectionOutputAdapter(nn.Module):
    """
    Adapter class for converting model's predictions for object detection to a desired format.
    This adapter supports torch.jit tracing & scripting & onnx conversion.

    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> class CustomDetectionHead(nn.Module):
    >>>    num_classes: int = 123
    >>>
    >>>    @property
    >>>    def format(self):
    >>>        '''
    >>>        Describe the semantics of the model's output. In this example model's output consists of
    >>>         - Bounding boxes in XYXY format [4]
    >>>         - Predicted probas of N classes [N]
    >>>         - A distance predictions [1]
    >>>         - K additional labels [K]
    >>>        '''
    >>>        return ConcatenatedTensorFormat(
    >>>            layout=(
    >>>                BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>                TensorSliceItem(name="label", length=1),
    >>>                TensorSliceItem(name="distance", length=1),
    >>>                TensorSliceItem(name="attributes", length=4),
    >>>            )
    >>>        )
    >>>
    >>> yolox = YoloX(head=CustomDetectionHead)
    >>>
    >>> # Suppose we want to return predictions in another format.
    >>> # Let it be:
    >>> # - Bounding boxes in normalized XYWH [4]
    >>> # - Predicted attributes [4]
    >>> # - Predicted label [1]
    >>> output_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         # Note: For output format it is not required to specify location attribute as it will be
    >>>         # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>         TensorSliceItem(name="label", length=1),
    >>>     )
    >>> )
    >>>
    >>> # Now we can construct output adapter and attach it to the model
    >>> output_adapter = DetectionOutputAdapter(
    >>>     input_format=yolox.head.format,
    >>>     output_format=output_format,
    >>>     image_shape=(640, 640)
    >>> )
    >>>
    >>> yolox = nn.Sequential(yolox, output_adapter)
    >>>
    """

    def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
        """

        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        super().__init__()

        # Step 1 of forward(): convert the bbox channels (still located as in the input format).
        self.format_conversion: nn.Module = self.get_format_conversion_module(
            location=input_format.locations[input_format.bboxes_format.name],
            input_bbox_format=input_format.bboxes_format.format,
            output_bbox_format=output_format.bboxes_format.format,
            image_shape=image_shape,
        )

        # Step 2 of forward(): pick/reorder channels; the rearranged format returned alongside is not stored.
        self.rearrange_outputs, _ = self.get_rearrange_outputs_module(input_format, output_format)

        self.input_format = input_format
        self.output_format = output_format
        self.input_length = input_format.num_channels

    def forward(self, predictions: Tensor) -> Tensor:
        """
        Convert output detections to the user-specified format.

        :param predictions: Tensor whose last dimension has `input_format.num_channels` elements.
        :return:            New tensor with converted bboxes and channels rearranged as in the output format.
                            The input tensor is cloned before conversion, so it is not modified.
        :raises RuntimeError: If the last dimension of `predictions` does not match the input format.
        """
        if predictions.size(-1) != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({predictions.size(-1)}) must be "
                f"equal to {self.input_length} as defined by input format."
            )

        predictions = self.format_conversion(predictions.clone())
        predictions = self.rearrange_outputs(predictions)
        return predictions

    @classmethod
    def get_rearrange_outputs_module(
        cls, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat
    ) -> Tuple[RearrangeOutput, ConcatenatedTensorFormat]:
        """
        Build the module that selects and reorders input channels to follow the output layout.

        :param input_format:  Format definition of the inputs
        :param output_format: Format definition of the outputs
        :return:              Tuple of (module that indexes the last dimension with the computed indexes,
                              format built from deep copies of the output layout items).
        :raises KeyError:     If an item of the output layout is missing from the input layout.
        :raises RuntimeError: If an item has a different length in the input and output layouts.
        """
        output_indexes = []
        rearranged_layout = []

        for output_name, output_spec in output_format.layout.items():
            if output_name not in input_format.layout:
                raise KeyError(f"Requested item '{output_name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

            input_spec = input_format.layout[output_name]

            if input_spec.length != output_spec.length:
                # These must be f-strings, otherwise the placeholders are printed verbatim.
                raise RuntimeError(
                    f"Length of the output must match in input and output format. "
                    f"Input spec size is {input_spec.length} for key '{output_name}' and output spec size is {output_spec.length}."
                )

            output_indexes.extend(input_format.indexes[output_name])
            rearranged_layout.append(copy.deepcopy(output_spec))

        rearranged_format = ConcatenatedTensorFormat(rearranged_layout)
        return RearrangeOutput(torch.tensor(output_indexes).long()), rearranged_format

    @classmethod
    def get_format_conversion_module(
        cls, location: Tuple[int, int], input_bbox_format: BoundingBoxFormat, output_bbox_format: BoundingBoxFormat, image_shape: Union[Tuple[int, int], None]
    ) -> ConvertBoundingBoxes:
        """
        Build the module converting bboxes from the input bbox format to the output bbox format.

        :param location:            Bounds of the bbox channels in the input tensor.
        :param input_bbox_format:   Bbox format of the input; provides the "to xyxy" conversion.
        :param output_bbox_format:  Bbox format of the output; provides the "from xyxy" conversion.
        :param image_shape:         Image shape forwarded to both conversions.
        :return:                    Module applying both conversions on the bbox channels.
        """
        # NOTE(review): the meaning of the boolean flags (False / True) is defined by BoundingBoxFormat,
        # which is not visible here - presumably an "inplace" switch; confirm before changing.
        return ConvertBoundingBoxes(
            location=location,
            to_xyxy=input_bbox_format.get_to_xyxy(False),
            from_xyxy=output_bbox_format.get_from_xyxy(True),
            image_shape=image_shape,
        )

__init__(input_format, output_format, image_shape)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
    """
    Build the two sub-modules used by forward(): the bbox format conversion and the channel rearrangement.

    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    super().__init__()

    # The bbox channels are located through the input format.
    bboxes_layout_name = input_format.bboxes_format.name
    self.format_conversion: nn.Module = self.get_format_conversion_module(
        location=input_format.locations[bboxes_layout_name],
        input_bbox_format=input_format.bboxes_format.format,
        output_bbox_format=output_format.bboxes_format.format,
        image_shape=image_shape,
    )

    # Only the module is kept; the rearranged format returned with it is discarded.
    rearrange_module, _ = self.get_rearrange_outputs_module(input_format, output_format)
    self.rearrange_outputs = rearrange_module

    self.input_format = input_format
    self.output_format = output_format
    self.input_length = input_format.num_channels

forward(predictions)

Convert output detections to the user-specified format

Parameters:

Name Type Description Default
predictions Tensor required

Returns:

Type Description
Tensor
Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def forward(self, predictions: Tensor) -> Tensor:
    """
    Convert output detections to the user-specified format.

    :param predictions: Tensor whose last dimension must have `self.input_length` elements.
    :return:            Result of `self.rearrange_outputs` applied on the converted clone of `predictions`.
    :raises RuntimeError: If the size of the last dimension differs from `self.input_length`.
    """
    num_channels = predictions.size(-1)
    if num_channels != self.input_length:
        raise RuntimeError(
            f"Number of channels in last dimension of input tensor ({num_channels}) must be "
            f"equal to {self.input_length} as defined by input format."
        )

    # A clone is handed to the conversion so the caller's tensor is left untouched.
    converted = self.format_conversion(predictions.clone())
    return self.rearrange_outputs(converted)

RearrangeOutput

Bases: nn.Module

Rearrange elements in last dimension of input tensor with respect to index argument

Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class RearrangeOutput(nn.Module):
    """
    Select/reorder the elements of the last dimension of a tensor according to a fixed index tensor.
    """

    def __init__(self, indexes: Tensor):
        """
        :param indexes: Index tensor applied on the last dimension of the input in forward().
        """
        super().__init__()
        self.indexes = indexes

    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: Input tensor of  [..., N] shape
        :return: Output tensor of [..., N[index]] shape
        """
        if torch.jit.is_scripting():
            # Workaround "Ellipses followed by tensor indexing is currently not supported"
            # https://github.com/pytorch/pytorch/issues/34837
            # Bring the last axis to the front, index it, then move it back.
            selected = torch.moveaxis(x, -1, 0)[self.indexes]
            return torch.moveaxis(selected, 0, -1)
        else:
            return x[..., self.indexes]

forward(x)

Parameters:

Name Type Description Default
x Tensor

Input tensor of [..., N] shape

required

Returns:

Type Description
Tensor

Output tensor of [..., N[index]] shape

Source code in V3_4/src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def forward(self, x: Tensor) -> Tensor:
    """
    Index the last dimension of `x` with `self.indexes`.

    :param x: Input tensor of  [..., N] shape
    :return: Output tensor of [..., N[index]] shape
    """
    if torch.jit.is_scripting():
        # Workaround "Ellipses followed by tensor indexing is currently not supported"
        # https://github.com/pytorch/pytorch/issues/34837
        # Bring the last axis to the front, index it, then move it back.
        selected = torch.moveaxis(x, -1, 0)[self.indexes]
        return torch.moveaxis(selected, 0, -1)
    else:
        return x[..., self.indexes]

AbstractCollateFunction

Bases: ABC

A collate function (for torch DataLoader)

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
76
77
78
79
80
81
82
83
class AbstractCollateFunction(ABC):
    """
    Base interface of a collate function (for torch DataLoader).
    Subclasses must implement `__call__`.
    """

    @abstractmethod
    def __call__(self, batch):
        """Collate `batch`; the base implementation does nothing and returns None."""

AbstractPrePredictionCallback

Bases: ABC

Abstract class for forward pass preprocessing function, to be used by passing its inheritors through training_params pre_prediction_callback keyword arg.

Should implement `__call__` and return the images and targets after applying the desired preprocessing.

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
175
176
177
178
179
180
181
182
183
184
185
class AbstractPrePredictionCallback(ABC):
    """
    Base interface of a forward pass preprocessing function. Inheritors are used by passing them through
     the training_params pre_prediction_callback keyword arg.

    Inheritors should implement __call__ and return images, targets after applying the desired preprocessing.
    """

    @abstractmethod
    def __call__(self, inputs, targets, batch_idx):
        """Preprocess one batch; the base implementation does nothing and returns None."""

ComposedCollateFunction

Bases: AbstractCollateFunction

A function (for torch DataLoader) which executes a sequence of sub collate functions

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
86
87
88
89
90
91
92
93
94
95
96
97
98
@register_collate_function()
class ComposedCollateFunction(AbstractCollateFunction):
    """
    Collate function (for torch DataLoader) that chains several sub collate functions:
    each one receives the output of the previous one.
    """

    def __init__(self, functions: list):
        """
        :param functions: Sub collate functions, applied in list order.
        """
        self.functions = functions

    def __call__(self, batch):
        """
        :param batch: Batch handed to the first sub collate function.
        :return:      Output of the last sub collate function (the batch itself if the list is empty).
        """
        collated = batch
        for collate_fn in self.functions:
            collated = collate_fn(collated)
        return collated

DatasetStatisticsTensorboardLogger

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
class DatasetStatisticsTensorboardLogger:
    """
    Computes statistics and plots for a dataset and reports them through `sg_logger`
    (sample images, class distribution, box size distribution, anchors coverage and a text summary).

    NOTE: `analyze` currently only logs a warning; `_analyze_detection` is not called from it.
    """

    logger = get_logger(__name__)
    DEFAULT_SUMMARY_PARAMS = {
        "sample_images": 32,  # by default, 32 images will be sampled from each dataset
        "plot_class_distribution": True,
        "plot_box_size_distribution": True,
        "plot_anchors_coverage": True,
        "max_batches": 30,
    }

    def __init__(self, sg_logger, summary_params: dict = DEFAULT_SUMMARY_PARAMS):
        """
        :param sg_logger:       Logger used to report images, figures and text.
        :param summary_params:  Overrides for DEFAULT_SUMMARY_PARAMS; missing keys fall back to the defaults.
        """
        self.sg_logger = sg_logger
        # Merge into a new dict so that neither the defaults nor the caller's dict are mutated.
        self.summary_params = {**DatasetStatisticsTensorboardLogger.DEFAULT_SUMMARY_PARAMS, **summary_params}

    def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
        """
        Entry point of the dataset analysis. Currently it only logs a warning (see FIXME below).

        :param data_loader: the dataset data loader
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param all_classes: the list of all classes names
        :param anchors: the list of anchors used by the model. applicable only for detection datasets
        """
        # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
        # if isinstance(data_loader.dataset, DetectionDataSet):
        #     self._analyze_detection(data_loader=data_loader, title=title,
        #                             all_classes=all_classes, anchors=anchors)
        # else:
        #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
        DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

    def _analyze_detection(self, data_loader, title, all_classes, anchors=None):
        """
        Analyze a detection dataset. Any failure is logged and stops the analysis; nothing is raised.

        :param data_loader: the dataset data loader
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param all_classes: the list of all classes names
        :param anchors: the list of anchors used by the model. if not provided, anchors coverage will not be analyzed
        """
        try:
            color_mean = AverageMeter()
            color_std = AverageMeter()
            all_labels = []
            image_size = 0
            for i, (images, labels) in enumerate(tqdm(data_loader)):

                # max_batches <= 0 disables the limit
                if i >= self.summary_params["max_batches"] > 0:
                    break

                if i == 0:
                    # The first batch is used for the image size and for the sample images.
                    image_size = max(images[0].shape[1], images[0].shape[2])
                    if images.shape[0] > self.summary_params["sample_images"]:
                        samples = images[: self.summary_params["sample_images"]]
                    else:
                        samples = images

                    # Empty predictions: only the targets are drawn.
                    pred = [torch.zeros(size=(0, 6)) for _ in range(len(samples))]
                    try:
                        result_images = DetectionVisualization.visualize_batch(
                            image_tensor=samples,
                            pred_boxes=pred,
                            target_boxes=copy.deepcopy(labels),
                            batch_name=title,
                            class_names=all_classes,
                            box_thickness=1,
                            gt_alpha=1.0,
                        )

                        # Move channels first and reverse the channel order before logging.
                        self.sg_logger.add_images(tag=f"{title} sample images", images=np.stack(result_images).transpose([0, 3, 1, 2])[:, ::-1, :, :])
                    except Exception as e:
                        DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at adding an example batch:\n{e}")
                        return

                all_labels.append(labels)
                color_mean.update(torch.mean(images, dim=[0, 2, 3]), 1)
                color_std.update(torch.std(images, dim=[0, 2, 3]), 1)

            # NOTE(review): the first row is dropped here; the reason is not visible from this code - confirm.
            all_labels = torch.cat(all_labels, dim=0)[1:].numpy()

            try:
                if self.summary_params["plot_class_distribution"]:
                    self._analyze_class_distribution(labels=all_labels, num_classes=len(all_classes), title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing class distributions.\n{e}")
                return

            try:
                if self.summary_params["plot_box_size_distribution"]:
                    self._analyze_object_size_distribution(labels=all_labels, title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing object size " f"distributions.\n{e}")
                return

            summary = ""
            summary += f"dataset size: {len(data_loader)}  \n"
            summary += f"color mean: {color_mean.average}  \n"
            summary += f"color std: {color_std.average}  \n"

            try:
                if anchors is not None and image_size > 0:
                    coverage = self._analyze_anchors_coverage(anchors=anchors, image_size=image_size, title=title, labels=all_labels)
                    summary += f"anchors: {anchors}  \n"
                    summary += f"anchors coverage: {coverage}  \n"
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing anchors " f"coverage.\n{e}")
                return

            self.sg_logger.add_text(tag=f"{title} Statistics", text_string=summary)
            self.sg_logger.flush()

        except Exception as e:
            DatasetStatisticsTensorboardLogger.logger.error(f"dataset analysis failed!\n{e}")

    def _analyze_class_distribution(self, labels: list, num_classes: int, title: str):
        """
        Add a bar plot and a text summary of the number of labels per class.

        :param labels: all the labels of the dataset; column 0 is used as the class label
        :param num_classes: the number of classes (one histogram bin per class)
        :param title: the dataset title
        """
        # Fix the range so that bin i always counts class i, even when some classes (in particular
        # the first or last ones) are absent. Without it the bin edges are derived from the data min/max.
        hist, edges = np.histogram(labels[:, 0], bins=num_classes, range=(0, num_classes))

        f = plt.figure(figsize=[10, 8])

        plt.bar(range(num_classes), hist, width=0.5, color="#0504aa", alpha=0.7)
        plt.xlim(-1, num_classes)
        plt.grid(axis="y", alpha=0.75)
        plt.xlabel("Value", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.ylabel("Frequency", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.xticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.yticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.title(f"{title} class distribution", fontsize=STAT_LOGGER_FONT_SIZE)

        self.sg_logger.add_figure(f"{title} class distribution", figure=f)
        text_dist = "".join(f"[{i}]: {val}, " for i, val in enumerate(hist))

        self.sg_logger.add_text(tag=f"{title} class distribution", text_string=text_dist)

    def _analyze_object_size_distribution(self, labels: list, title: str):
        """
        This function will add two plots to the tensorboard.
        one is a 2D histogram and the other is a scatter plot. in both cases the X axis is the object width and Y axis
        is the object height (both normalized by image size)
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        """

        # histogram plot
        hist, xedges, yedges = np.histogram2d(labels[:, 4], labels[:, 3], 50)  # x and y are deliberately switched

        fig = plt.figure(figsize=(10, 6))
        fig.suptitle(f"{title} boxes w/h distribution")
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        # log(hist + 1) compresses the dynamic range of the counts
        plt.imshow(np.log(hist + 1), interpolation="nearest", origin="lower", extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

        # scatter plot
        if len(labels) > 10000:
            # we randomly sample just 10000 objects so that the scatter plot will not get too dense
            # (randint's upper bound is exclusive, so len(labels) makes every label eligible)
            labels = labels[np.random.randint(0, len(labels), 10000)]
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        plt.scatter(labels[:, 3], labels[:, 4], marker=".")

        self.sg_logger.add_figure(tag=f"{title} boxes w/h distribution", figure=fig)

    @staticmethod
    def _get_rect(w, h):
        """
        Build the rectangle spanning from a quarter of (w, h) to four times (w, h).

        :param w: anchor width
        :param h: anchor height
        :return: an unfilled, blue-edged matplotlib Rectangle
        """
        min_w = w / 4.0
        min_h = h / 4.0
        return Rectangle((min_w, min_h), w * 4 - min_w, h * 4 - min_h, linewidth=1, edgecolor="b", facecolor="none")

    @staticmethod
    def _get_score(anchors: np.ndarray, points: np.ndarray, image_size: int):
        """
        Calculate the ratio (and 1/ratio) between each anchor width and height and each point (representing a possible
        object width and height).
        i.e. for an anchor with w=10,h=20 the point w=11,h=25 will have the ratios 11/10=1.1 and 25/20=1.25
        or 10/11=0.91 and 20/25=0.8 respectively

        :param anchors: array of anchors of the shape [N,2] (one (w, h) row per anchor)
        :param points: array of points of the shape [2,M]
        :param image_size: the size of the input image

        :returns: an array of size [image_size - 1, image_size - 1] where each cell i,j holds the score of that
        cell (point) with respect to its closest anchor; scores above 0.75 are replaced by 2
        """

        # [N, 2, 1] / [2, M] broadcasts to [N, 2, M]
        ratio = anchors[:, :, None] / points
        inv_ratio = 1 / ratio
        min_ratio = 1 - np.minimum(ratio, inv_ratio)
        # worst of the width / height mismatch for each (anchor, point) pair
        min_ratio = np.max(min_ratio, axis=1)
        # best anchor for each point
        to_closest_anchor = np.min(min_ratio, axis=0)
        to_closest_anchor[to_closest_anchor > 0.75] = 2
        return to_closest_anchor.reshape(image_size - 1, -1)

    def _analyze_anchors_coverage(self, anchors: Anchors, image_size: int, labels: list, title: str):
        """
        This function will add anchors coverage plots to the tensorboard.
        :param anchors: the anchors object (its `anchors` tensor and `num_anchors` are used)
        :param image_size: the input image size for this training
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        :return: the fraction of labels whose width and height are strictly between 1/4 and 4 times
                 the (image-size normalized) width and height of at least one anchor
        """

        fig = plt.figure(figsize=(12, 5))
        fig.suptitle(f"{title} anchors coverage")

        # box style plot
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_xlim([0, image_size])
        ax.set_ylim([0, image_size])

        anchors_boxes = anchors.anchors.cpu().numpy()
        anchors_len = anchors.num_anchors

        anchors_boxes = anchors_boxes.reshape(-1, 2)

        for i in range(anchors_len):
            rect = self._get_rect(anchors_boxes[i][0], anchors_boxes[i][1])
            rect.set_alpha(0.3)
            rect.set_facecolor([random.random(), random.random(), random.random(), 0.3])
            ax.add_patch(rect)

        # distance from anchor plot
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        x = np.arange(1, image_size, 1)
        y = np.arange(1, image_size, 1)

        xx, yy = np.meshgrid(x, y, sparse=False, indexing="xy")
        points = np.concatenate([xx.reshape(1, -1), yy.reshape(1, -1)])

        color = self._get_score(anchors_boxes, points, image_size)

        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.imshow(color, interpolation="nearest", origin="lower", extent=[0, image_size, 0, image_size])

        # calculate the coverage for the dataset labels
        cover_masks = []
        for i in range(anchors_len):
            w_max = (anchors_boxes[i][0] / image_size) * 4
            w_min = (anchors_boxes[i][0] / image_size) * 0.25
            h_max = (anchors_boxes[i][1] / image_size) * 4
            h_min = (anchors_boxes[i][1] / image_size) * 0.25
            cover_masks.append(
                np.logical_and(np.logical_and(np.logical_and(labels[:, 3] < w_max, labels[:, 3] > w_min), labels[:, 4] < h_max), labels[:, 4] > h_min)
            )
        cover_masks = np.stack(cover_masks)
        coverage = np.count_nonzero(np.any(cover_masks, axis=0)) / len(labels)

        self.sg_logger.add_figure(tag=f"{title} anchors coverage", figure=fig)
        return coverage

analyze(data_loader, title, all_classes, anchors=None)

Parameters:

Name Type Description Default
data_loader torch.utils.data.DataLoader

the dataset data loader

required
dataset_params

the dataset parameters

required
title str

the title for this dataset (i.e. Coco 2017 test set)

required
anchors list

the list of anchors used by the model. applicable only for detection datasets

None
all_classes List[str]

the list of all classes names

required
Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None) -> None:
    """
    Entry point for dataset analysis. Currently a placeholder: the detection analysis call is
    commented out (see FIXME below), so the arguments are not used and only a warning is logged.

    :param data_loader: the dataset data loader
    :param title: the title for this dataset (e.g. Coco 2017 test set)
    :param all_classes: the list of all classes names
    :param anchors: the list of anchors used by the model. applicable only for detection datasets
    """
    # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
    # if isinstance(data_loader.dataset, DetectionDataSet):
    #     self._analyze_detection(data_loader=data_loader, title=title,
    #                             all_classes=all_classes, anchors=anchors)
    # else:
    #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
    # Until the FIXME is resolved every call ends here, whatever dataset type is passed.
    DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

DetectionMultiscalePrePredictionCallback

Bases: MultiscalePrePredictionCallback

Multiscale pre-prediction callback for object detection.

When passed through train_params images, targets will be applied by the below transform to support multi scaling on the fly.

After each self.frequency forward passes, change size randomly from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ..., input_size + self.multiscale_range * self.image_size_steps) and apply the same rescaling to the box coordinates.

Parameters:

Name Type Description Default
multiscale_range

Range of values for resize sizes as discussed above (default=5)

required
image_size_steps

Image step sizes as discussed above (default=32)

required
change_frequency

The frequency to apply change in input size.

required
Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
@register_callback(Callbacks.DETECTION_MULTISCALE_PREPREDICTION)
class DetectionMultiscalePrePredictionCallback(MultiscalePrePredictionCallback):
    """
    Multiscale pre-prediction callback for object detection.

    When passed through train_params, images and targets go through the transform below so that
    multi scaling is applied on the fly.

    After each self.frequency forward passes, change size randomly from
     (input_size-self.multiscale_range*self.image_size_steps, input_size-(self.multiscale_range-1)*self.image_size_steps,
     ...input_size+self.multiscale_range*self.image_size_steps) and apply the same rescaling to the box coordinates.


    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed above (default=32)
    :param change_frequency: The frequency to apply change in input size.

    """

    def __call__(self, inputs, targets, batch_idx):
        # The parent callback resizes the images; comparing the spatial dims before and after
        # tells us which scale (if any) must also be applied to the boxes.
        dim2_before, dim3_before = inputs.shape[2], inputs.shape[3]
        inputs, targets = super(DetectionMultiscalePrePredictionCallback, self).__call__(inputs, targets, batch_idx)
        dim2_after, dim3_after = inputs.shape[2], inputs.shape[3]

        scale_y = dim2_after / dim2_before
        scale_x = dim3_after / dim3_before
        if scale_x == 1 and scale_y == 1:
            # Nothing was resized, leave the targets untouched.
            return inputs, targets

        # Every second column starting at 2 is scaled with x, every second column starting at 3 with y.
        # presumably columns are (..., cx, cy, w, h) -- TODO confirm against the target format
        targets[..., 2::2] = targets[..., 2::2] * scale_x
        targets[..., 3::2] = targets[..., 3::2] * scale_y
        return inputs, targets

MultiScaleCollateFunction

Bases: AbstractCollateFunction

a collate function to implement multi-scale data augmentation according to https://arxiv.org/pdf/1612.08242.pdf

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
@register_collate_function()
class MultiScaleCollateFunction(AbstractCollateFunction):
    """
    Collate function implementing multi-scale data augmentation
    (see https://arxiv.org/pdf/1612.08242.pdf): batches are rescaled to a size that is
    re-drawn every `change_frequency` calls.
    """

    # Class-level state: the call counter, the current size and the lock guarding __call__.
    _counter = AtomicInteger(0)
    _current_size = AtomicInteger(0)
    _lock = Lock()

    def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
        """
        Configure the set of sizes the collate function may pick from.
        Candidate sizes span [min_image_size, max_image_size] in increments of image_size_steps;
        a new one is drawn at random every change_frequency calls to the collate function.
            :param target_size: when given, the range is derived from it as [0.66 * target_size, 1.5 * target_size]
            :param min_image_size: the minimum size to scale down to (in pixels)
            :param max_image_size: the maximum size to scale up to (in pixels)
            :param image_size_steps: typically, the stride of the net, which defines the possible image
                    size multiplications
            :param change_frequency: number of calls between two random re-draws of the size
        """
        explicit_range_given = max_image_size is not None and min_image_size is not None
        assert target_size is not None or explicit_range_given, "either target_size or min_image_size and max_image_size has to be set"
        assert target_size is None or max_image_size is None, "target_size and max_image_size cannot be both defined"

        if target_size is not None:
            scaled_down = 0.66 * target_size
            scaled_up = 1.5 * target_size
            # Lower bound: drop the remainder, then add one full step. Upper bound: drop the remainder.
            min_image_size = int(scaled_down - (scaled_down % image_size_steps) + image_size_steps)
            max_image_size = int(scaled_up - (scaled_up % image_size_steps))

        print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

        # The stop value is pushed one step further so that max_image_size itself is a candidate.
        self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        self._current_size = random.choice(self.sizes)

    def __call__(self, batch):
        # The whole call runs under the class-level lock.
        with self._lock:
            # Important: tailored for one specific input layout -- the batch must be a tuple
            # whose first item holds the images.
            assert isinstance(batch, tuple), "this collate function expects the input to be a tuple (images, labels)"
            images = batch[0]

            # Re-draw the size once every `frequency` calls.
            if self._counter % self.frequency == 0:
                self._current_size = random.choice(self.sizes)
            self._counter += 1

            dim2, dim3 = images.shape[2], images.shape[3]
            assert dim2 % self.image_size_steps == 0 and dim3 % self.image_size_steps == 0, (
                "images sized not divisible by %d. (resize images before calling multi_scale)" % self.image_size_steps
            )

            longest_side = max(images.shape[2:])
            if self._current_size != longest_side:
                # Scale so that the longest side hits the current size; the other side keeps the ratio.
                ratio = float(self._current_size) / longest_side
                new_size = (int(round(dim2 * ratio)), int(round(dim3 * ratio)))
                images = F.interpolate(images, size=new_size, mode="bilinear", align_corners=False)

            return images, batch[1]

__init__(target_size=None, min_image_size=None, max_image_size=None, image_size_steps=32, change_frequency=10)

set parameters for the multi-scale collate function the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps a new size will be randomly selected every change_frequency calls to the collate_fn() :param target_size: scales will be [0.66 * target_size, 1.5 * target_size] :param min_image_size: the minimum size to scale down to (in pixels) :param max_image_size: the maximum size to scale up to (in pixels) :param image_size_steps: typically, the stride of the net, which defines the possible image size multiplications :param change_frequency:

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
    """
    Configure the set of sizes the collate function may pick from.
    Candidate sizes span [min_image_size, max_image_size] in increments of image_size_steps;
    a new one is drawn at random every change_frequency calls to the collate function.
        :param target_size: when given, the range is derived from it as [0.66 * target_size, 1.5 * target_size]
        :param min_image_size: the minimum size to scale down to (in pixels)
        :param max_image_size: the maximum size to scale up to (in pixels)
        :param image_size_steps: typically, the stride of the net, which defines the possible image
                size multiplications
        :param change_frequency: number of calls between two random re-draws of the size
    """
    explicit_range_given = max_image_size is not None and min_image_size is not None
    assert target_size is not None or explicit_range_given, "either target_size or min_image_size and max_image_size has to be set"
    assert target_size is None or max_image_size is None, "target_size and max_image_size cannot be both defined"

    if target_size is not None:
        scaled_down = 0.66 * target_size
        scaled_up = 1.5 * target_size
        # Lower bound: drop the remainder, then add one full step. Upper bound: drop the remainder.
        min_image_size = int(scaled_down - (scaled_down % image_size_steps) + image_size_steps)
        max_image_size = int(scaled_up - (scaled_up % image_size_steps))

    print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

    # The stop value is pushed one step further so that max_image_size itself is a candidate.
    self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
    self.image_size_steps = image_size_steps
    self.frequency = change_frequency
    self._current_size = random.choice(self.sizes)

MultiscalePrePredictionCallback

Bases: AbstractPrePredictionCallback

Multiscale pre-prediction callback pass function.

When passed through train_params images, targets will be applied by the below transform to support multi scaling on the fly.

After each self.frequency forward passes, change size randomly from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ..., input_size + self.multiscale_range * self.image_size_steps)

Parameters:

Name Type Description Default
multiscale_range int

Range of values for resize sizes as discussed above (default=5)

5
image_size_steps int

Image step sizes as discussed above (default=32)

32
change_frequency int

The frequency to apply change in input size.

10
Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
class MultiscalePrePredictionCallback(AbstractPrePredictionCallback):
    """
    Multiscale pre-prediction callback pass function.

    When passed through train_params images, targets will be applied by the below transform to support multi scaling
    on the fly.

    After each self.frequency forward passes, change size randomly from
     (input_size-self.multiscale_range*self.image_size_steps, input_size-(self.multiscale_range-1)*self.image_size_steps,
     ...input_size+self.multiscale_range*self.image_size_steps)


    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed above (default=32)
    :param change_frequency: The frequency to apply change in input size.
    """

    def __init__(self, multiscale_range: int = 5, image_size_steps: int = 32, change_frequency: int = 10):

        self.multiscale_range = multiscale_range
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        # rank / is_distributed are resolved lazily on the first __call__.
        self.rank = None
        self.is_distributed = None
        # False until the first size has been chosen; that first choice is always the largest one.
        self.sampled_imres_once = False
        # Last chosen target size; stays None until a call with batch_idx % frequency == 0 happens.
        self.new_input_size = None

    def __call__(self, inputs, targets, batch_idx):
        """
        Resize `inputs` to the currently selected size and return `(inputs, targets)`.

        A new size is selected whenever `batch_idx % self.frequency == 0`; `targets` are returned unchanged.

        :param inputs: batch tensor; dims 2 and 3 are read as the spatial size
        :param targets: passed through untouched
        :param batch_idx: index of the current batch, used to decide when to re-sample the size
        :return: tuple (inputs, targets)
        """
        if self.rank is None:
            self.rank = get_local_rank()
        if self.is_distributed is None:
            self.is_distributed = get_world_size() > 1

        # Generate a new size and broadcast it to the other ranks so they all use the same scale.
        input_size = inputs.shape[2:]
        if batch_idx % self.frequency == 0:
            # Uninitialized 2-element buffer; only rank 0 writes real values into it.
            # NOTE(review): when not distributed and rank != 0 nothing fills it -- confirm this cannot happen.
            tensor = torch.LongTensor(2).to(inputs.device)

            if self.rank == 0:
                # Ratio between dim 3 and dim 2 of the input, used to keep the aspect ratio.
                size_factor = input_size[1] * 1.0 / input_size[0]
                # Bounds are expressed in units of image_size_steps, centered on the current dim-2 size.
                min_size = int(input_size[0] / self.image_size_steps) - self.multiscale_range
                max_size = int(input_size[0] / self.image_size_steps) + self.multiscale_range
                random_size = (min_size, max_size)
                if self.sampled_imres_once:
                    size = random.randint(*random_size)
                else:
                    # sample the biggest resolution first to make sure the run fits into the GPU memory
                    size = max_size
                    self.sampled_imres_once = True
                # Convert back from step units; both entries are multiples of image_size_steps.
                size = (int(self.image_size_steps * size), self.image_size_steps * int(size * size_factor))
                tensor[0] = size[0]
                tensor[1] = size[1]

            if self.is_distributed:
                dist.barrier()
                dist.broadcast(tensor, 0)

            self.new_input_size = (tensor[0].item(), tensor[1].item())

        # NOTE(review): if the very first call has batch_idx % frequency != 0, new_input_size is still None here.
        scale_y = self.new_input_size[0] / input_size[0]
        scale_x = self.new_input_size[1] / input_size[1]
        if scale_x != 1 or scale_y != 1:
            inputs = torch.nn.functional.interpolate(inputs, size=self.new_input_size, mode="bilinear", align_corners=False)
        return inputs, targets

RandomResizedCropAndInterpolation

Bases: RandomResizedCrop

Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

A crop of random size (default: of 0.08 to 1.0) of the original size and a random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop is finally resized to given size. This is popularly used to train the Inception networks.

Parameters:

Name Type Description Default
size

Expected output size of each edge

required
scale

Range of size of the origin size cropped

(0.08, 1.0)
ratio

Range of aspect ratio of the origin aspect ratio cropped

(3.0 / 4.0, 4.0 / 3.0)
interpolation

Default: PIL.Image.BILINEAR

'default'
Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
@register_transform(Transforms.RandomResizedCropAndInterpolation)
class RandomResizedCropAndInterpolation(RandomResizedCrop):
    """
    Randomly crop the given PIL Image (random size and aspect ratio) and resize the crop using
    either an explicitly chosen or a randomly picked interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
    is finally resized to given size.
    This is popularly used to train the Inception networks.

    :param size: Expected output size of each edge
    :param scale: Range of size of the origin size cropped
    :param ratio: Range of aspect ratio of the origin aspect ratio cropped
    :param interpolation: "default" (bilinear), "random" (picked per call from _RANDOM_INTERPOLATION),
                          or any value understood by _pil_interp
    """

    def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation="default"):
        super(RandomResizedCropAndInterpolation, self).__init__(size=size, scale=scale, ratio=ratio, interpolation=interpolation)
        # Override whatever the parent stored with the resolved interpolation.
        if interpolation == "random":
            resolved = _RANDOM_INTERPOLATION
        elif interpolation == "default":
            resolved = InterpolationMode.BILINEAR
        else:
            resolved = _pil_interp(interpolation)
        self.interpolation = resolved

    def forward(self, img: Image) -> Image:
        """
        :param img: Image to be cropped and resized.
        :return: Image: Randomly cropped and resized image.
        """
        top, left, height, width = self.get_params(img, self.scale, self.ratio)
        # A tuple/list of interpolations means one is drawn at random on every call.
        multiple_choices = isinstance(self.interpolation, (tuple, list))
        chosen = random.choice(self.interpolation) if multiple_choices else self.interpolation
        return torchvision.transforms.functional.resized_crop(img, top, left, height, width, self.size, chosen)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            interpolate_str = " ".join(_pil_interpolation_to_str[mode] for mode in self.interpolation)
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        rounded_scale = tuple(round(s, 4) for s in self.scale)
        rounded_ratio = tuple(round(r, 4) for r in self.ratio)
        return (
            f"{self.__class__.__name__}(size={self.size}"
            f", scale={rounded_scale}"
            f", ratio={rounded_ratio}"
            f", interpolation={interpolate_str})"
        )

forward(img)

Parameters:

Name Type Description Default
img Image

Image to be cropped and resized.

required

Returns:

Type Description
Image

Image: Randomly cropped and resized image.

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
344
345
346
347
348
349
350
351
352
353
354
def forward(self, img: Image) -> Image:
    """
    :param img: Image to be cropped and resized.
    :return: Image: Randomly cropped and resized image.
    """
    top, left, height, width = self.get_params(img, self.scale, self.ratio)
    # A tuple/list of interpolations means one is drawn at random on every call.
    multiple_choices = isinstance(self.interpolation, (tuple, list))
    chosen = random.choice(self.interpolation) if multiple_choices else self.interpolation
    return torchvision.transforms.functional.resized_crop(img, top, left, height, width, self.size, chosen)

get_color_augmentation(rand_augment_config_string, color_jitter, crop_size=224, img_mean=[0.485, 0.456, 0.406])

Returns color augmentation class. As these augmentation cannot work on top one another, only one is returned according to rand_augment_config_string

Parameters:

Name Type Description Default
rand_augment_config_string str

string which defines the auto augment configurations. If none, color jitter will be returned. For possible values see auto_augment.py

required
color_jitter tuple

tuple for color jitter value.

required
crop_size

relevant only for auto augment

224
img_mean

relevant only for auto augment

[0.485, 0.456, 0.406]

Returns:

Type Description

RandAugment transform or ColorJitter

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
def get_color_augmentation(rand_augment_config_string: str, color_jitter: tuple, crop_size=224, img_mean=[0.485, 0.456, 0.406]):
    """
    Build the color augmentation transform. RandAugment and ColorJitter cannot be stacked on top of
    one another, so exactly one of them is returned, selected by rand_augment_config_string.

    :param rand_augment_config_string: string which defines the auto augment configurations.
                                       If empty/None, color jitter is returned. For possible values see auto_augment.py
    :param color_jitter: tuple for color jitter value (unpacked into ColorJitter).
    :param crop_size: relevant only for auto augment
    :param img_mean: relevant only for auto augment
    :return: RandAugment transform or ColorJitter
    """
    if not rand_augment_config_string:
        # RandAugment already includes colorjitter-like augmentations, so jitter is only used without it.
        return transforms.ColorJitter(*color_jitter)
    return rand_augment_transform(rand_augment_config_string, crop_size, img_mean)

get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224)

A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

Parameters:

Name Type Description Default
data_dir

String, path to none-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"

None
dataloader

a torch DataLoader, as it would feed the data into the trainer (including transforms etc).

None
RandomResizeSize

Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet, this value should be 224).

224

Returns:

Type Description

2 lists, mean and std, each one of len 3 (1 for each channel)

Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224):
    """
    A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

    Two passes are made over the loader: the first accumulates per-channel sums to get the mean,
    the second accumulates squared deviations from that mean to get the (unbiased) std.
    The spatial size is taken from the first batch and assumed constant for all batches.

    :param data_dir: String, path to non-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet".
                     The images are read from its "train" sub-folder.
    :param dataloader: a torch DataLoader, as it would feed the data into the trainer (including transforms etc).
                       Must yield (inputs, targets) pairs.
    :param num_workers: number of workers for the DataLoader built from data_dir (ignored when dataloader is given).
    :param RandomResizeSize: Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet,
    this value should be 224).
    :raises ValueError: if neither data_dir nor dataloader is given, or if the loader yields no batches.
    :return: 2 lists, mean and std, each one of len 3 (1 for each channel)
    """
    assert data_dir is None or dataloader is None, "Please provide either path to data folder or DataLoader, not both."
    if data_dir is None and dataloader is None:
        raise ValueError("Please provide either path to data folder or DataLoader.")

    if dataloader is None:
        traindir = os.path.join(os.path.abspath(data_dir), "train")
        trainset = ImageFolder(
            traindir, transforms.Compose([transforms.RandomResizedCrop(RandomResizeSize), transforms.RandomHorizontalFlip(), transforms.ToTensor()])
        )
        dataloader = torch.utils.data.DataLoader(trainset, batch_size=1, num_workers=num_workers)

    # Use the loader's own dataset for the count: `trainset` only exists when the loader was built
    # above, and not every dataset exposes a `.targets` attribute.
    print(f"Calculating on {len(dataloader.dataset)} Training Samples")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    h, w = 0, 0
    num_samples = 0
    chsum = None
    for inputs, _ in dataloader:
        inputs = inputs.to(device)
        # Count what is actually seen so samplers / drop_last cannot skew the normalization.
        num_samples += inputs.size(0)
        batch_sum = inputs.sum(dim=(0, 2, 3), keepdim=True)
        if chsum is None:
            h, w = inputs.size(2), inputs.size(3)
            print(f"Min: {inputs.min()}, Max: {inputs.max()}")
            chsum = batch_sum
        else:
            chsum += batch_sum
    if chsum is None:
        raise ValueError("The DataLoader did not yield any batch, cannot compute mean and std.")
    mean = chsum / num_samples / h / w
    print(f"mean: {mean.view(-1)}")

    sq_sum = None
    for inputs, _ in dataloader:
        inputs = inputs.to(device)
        batch_sq_sum = (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
        if sq_sum is None:
            sq_sum = batch_sq_sum
        else:
            sq_sum += batch_sq_sum
    # Unbiased estimate: divide by (number of values per channel - 1).
    std = torch.sqrt(sq_sum / (num_samples * h * w - 1))
    print(f"std: {std.view(-1)}")
    return mean.view(-1).cpu().numpy().tolist(), std.view(-1).cpu().numpy().tolist()

worker_init_reset_seed(worker_id)

Make sure each process has different random seed, especially for 'fork' method. Check https://github.com/pytorch/pytorch/issues/63311 for more details.

Parameters:

Name Type Description Default
worker_id

placeholder (needs to be passed to DataLoader init).

required
Source code in V3_4/src/super_gradients/training/datasets/datasets_utils.py
657
658
659
660
661
662
663
664
665
666
667
def worker_init_reset_seed(worker_id):
    """
    Re-seed python's `random`, torch and numpy with one fresh random seed, so that every worker
    process ends up with a different random state (relevant especially with the 'fork' start method).
    Check https://github.com/pytorch/pytorch/issues/63311 for more details.

    :param worker_id: placeholder (needs to be passed to DataLoader init).
    """
    # uuid4 supplies the randomness; reduce it to 32 bits so the same value can seed all three libraries.
    fresh_seed = uuid.uuid4().int % (2**32)

    seeded_generator = torch.manual_seed(fresh_seed)
    torch.set_rng_state(seeded_generator.get_state())

    random.seed(fresh_seed)
    np.random.seed(fresh_seed)

COCODetectionDataset

Bases: COCOFormatDetectionDataset

Dataset for COCO object detection.

To use this Dataset you need to:

- Download coco dataset:
    annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
    train2017: http://images.cocodataset.org/zips/train2017.zip
    val2017: http://images.cocodataset.org/zips/val2017.zip

- Unzip and organize it as below:
    coco
    ├── annotations
    │      ├─ instances_train2017.json
    │      ├─ instances_val2017.json
    │      └─ ...
    └── images
        ├── train2017
        │   ├─ 000000000001.jpg
        │   └─ ...
        └── val2017
            └─ ...

- Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

- Instantiate the dataset:
    >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
    >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/coco_detection.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@register_dataset(Datasets.COCO_DETECTION_DATASET)
class COCODetectionDataset(COCOFormatDetectionDataset):
    """COCO object detection dataset.

    Steps required before using this Dataset:

        - Download coco dataset:
            annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
            train2017: http://images.cocodataset.org/zips/train2017.zip
            val2017: http://images.cocodataset.org/zips/val2017.zip

        - Unzip and organize it as below:
            coco
            ├── annotations
            │      ├─ instances_train2017.json
            │      ├─ instances_val2017.json
            │      └─ ...
            └── images
                ├── train2017
                │   ├─ 000000000001.jpg
                │   └─ ...
                └── val2017
                    └─ ...

        - Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

        - Instantiate the dataset:
            >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
            >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
    """

    def __init__(
        self,
        json_file: str = "instances_train2017.json",
        subdir: str = "images/train2017",
        *args,
        **kwargs,
    ):
        """
        :param json_file:           Name of the coco json file, that resides in data_dir/annotations/json_file.
        :param subdir:              Sub directory of data_dir containing the data.
        :param tight_box_rotation:  (forwarded through kwargs) bool, whether to use of segmentation maps
                                    convex hull as target_seg (check get_sample docs).
        :param with_crowd:          (forwarded through kwargs) Add the crowd groundtruths to __getitem__

        kwargs:
            all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
        """
        # The annotation file is always looked up under the "annotations" folder.
        annotation_path = os.path.join("annotations", json_file)
        super().__init__(*args, json_annotation_file=annotation_path, images_dir=subdir, **kwargs)

__init__(json_file='instances_train2017.json', subdir='images/train2017', *args, **kwargs)

Parameters:

Name Type Description Default
json_file str

Name of the coco json file, that resides in data_dir/annotations/json_file.

'instances_train2017.json'
subdir str

Sub directory of data_dir containing the data.

'images/train2017'
tight_box_rotation

bool, whether to use of segmentation maps convex hull as target_seg (check get_sample docs).

required
with_crowd

Add the crowd groundtruths to `__getitem__`. kwargs: all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.

required
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/coco_detection.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    json_file: str = "instances_train2017.json",
    subdir: str = "images/train2017",
    *args,
    **kwargs,
):
    """
    :param json_file:           Name of the coco json file, that resides in data_dir/annotations/json_file.
    :param subdir:              Sub directory of data_dir containing the data.
    :param tight_box_rotation:  (forwarded through kwargs) bool, whether to use of segmentation maps
                                convex hull as target_seg (check get_sample docs).
    :param with_crowd:          (forwarded through kwargs) Add the crowd groundtruths to __getitem__

    kwargs:
        all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
    """
    # The annotation file is always looked up under the "annotations" folder.
    annotation_path = os.path.join("annotations", json_file)
    super().__init__(*args, json_annotation_file=annotation_path, images_dir=subdir, **kwargs)

COCOFormatDetectionDataset

Bases: DetectionDataset

Base dataset to load ANY dataset that is with a similar structure to the COCO dataset. - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh). - One folder with all the images.

Output format: (x, y, x, y, class_id)

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
class COCOFormatDetectionDataset(DetectionDataset):
    """Base dataset to load ANY dataset that has a similar structure to the COCO dataset.
    - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh).
    - One folder with all the images.

    Output format: (x, y, x, y, class_id)
    """

    def __init__(
        self,
        data_dir: str,
        json_annotation_file: str,
        images_dir: str,
        tight_box_rotation: bool = False,
        with_crowd: bool = True,
        class_ids_to_ignore: Optional[List[int]] = None,
        *args,
        **kwargs,
    ):
        """
        :param data_dir:                Where the data is stored.
        :param json_annotation_file:    Name of the coco json file. Path relative to data_dir.
        :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
        :param tight_box_rotation:      bool, whether to use of segmentation maps convex hull as target_seg
                                            (check get_sample docs).
        :param with_crowd:              Add the crowd groundtruths to __getitem__
        :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
        :raises ParameterMismatchException: If `all_classes_list` is a strict subset of the classes found in the json.
        :raises DatasetValidationException: If the number of classes in the json differs from `all_classes_list`
                                            for any other reason.
        """
        # These attributes are read by _setup_data_source / _load_annotation, which the parent
        # constructor invokes, so they must exist before super().__init__ is called.
        self.images_dir = images_dir
        self.json_annotation_file = json_annotation_file
        self.tight_box_rotation = tight_box_rotation
        self.with_crowd = with_crowd
        self.class_ids_to_ignore = class_ids_to_ignore or []

        target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
        kwargs["target_fields"] = target_fields
        kwargs["output_fields"] = ["image", *target_fields]
        kwargs["original_target_format"] = XYXY_LABEL
        super().__init__(data_dir=data_dir, *args, **kwargs)

        if len(self.original_classes) != len(self.all_classes_list):
            if set(self.all_classes_list).issubset(set(self.original_classes)):
                raise ParameterMismatchException(
                    "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                    "Please use `class_inclusion_list` to train with reduced number of classes",
                )
            else:
                raise DatasetValidationException(
                    "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                    "Most likely this indicates an error in your all_classes_list parameter"
                )

    def _setup_data_source(self) -> int:
        """Load the COCO annotation file and derive the class ids, class names and image ids from it.

        :return: Number of images listed in the annotation file.
        """

        self.coco = self._init_coco()
        # Class ids are sorted so that the position of an id in this list is its class index in the targets.
        self.class_ids = sorted(cls_id for cls_id in self.coco.getCatIds() if cls_id not in self.class_ids_to_ignore)
        self.original_classes = [category["name"] for category in self.coco.loadCats(self.class_ids)]
        self.classes = copy.deepcopy(self.original_classes)
        self.sample_id_to_coco_id = self.coco.getImgIds()
        return len(self.sample_id_to_coco_id)

    @property
    def _all_classes(self) -> List[str]:
        """Class names read from the annotation file, without the ignored class ids."""
        return self.original_classes

    def _init_coco(self) -> COCO:
        """Instantiate the COCO object for the annotation file and strip the metadata that is not needed.

        :return: COCO object built from data_dir/json_annotation_file.
        :raises ValueError: If the annotation file does not exist.
        """
        annotation_file_path = os.path.join(self.data_dir, self.json_annotation_file)
        if not os.path.exists(annotation_file_path):
            raise ValueError("Could not find annotation file under " + str(annotation_file_path))

        if not self.verbose:
            # Silence whatever COCO prints while loading; the devnull handle is closed on exit
            # instead of being left open for the lifetime of the process.
            with open(os.devnull, "w") as devnull, redirect_stdout(devnull):
                coco = COCO(annotation_file_path)
        else:
            coco = COCO(annotation_file_path)

        remove_useless_info(coco, self.tight_box_rotation)
        return coco

    def _load_annotation(self, sample_id: int) -> dict:
        """
        Load relevant information of a specific image.

        Annotations whose category id is listed in `class_ids_to_ignore` are dropped, as are annotations
        with a non-positive area or an inverted box after clipping to the image.

        :param sample_id:               Sample_id in the dataset
        :return target:                 Target Bboxes (detection) in XYXY_LABEL format
        :return crowd_target:           Crowd target Bboxes (detection) in XYXY_LABEL format
        :return target_segmentation:    Segmentation
        :return initial_img_shape:      Image (height, width)
        :return resized_img_shape:      Resized image (height, width)
        :return img_path:               Path to the associated image
        :return id:                     Array holding the COCO image id
        """

        img_id = self.sample_id_to_coco_id[sample_id]

        img_metadata = self.coco.loadImgs(img_id)[0]
        width = img_metadata["width"]
        height = img_metadata["height"]

        img_annotation_ids = self.coco.getAnnIds(imgIds=[int(img_id)])
        img_annotations = self.coco.loadAnns(img_annotation_ids)

        cleaned_annotations = []
        for annotation in img_annotations:
            # Ignored classes are absent from self.class_ids, so looking up their index below would raise.
            if annotation.get("category_id") in self.class_ids_to_ignore:
                continue
            # COCO boxes are (x, y, w, h): convert to corners and clip to the image.
            x1 = np.max((0, annotation["bbox"][0]))
            y1 = np.max((0, annotation["bbox"][1]))
            x2 = np.min((width, x1 + np.max((0, annotation["bbox"][2]))))
            y2 = np.min((height, y1 + np.max((0, annotation["bbox"][3]))))
            if annotation["area"] > 0 and x2 >= x1 and y2 >= y1:
                annotation["clean_bbox"] = [x1, y1, x2, y2]
                cleaned_annotations.append(annotation)

        non_crowd_annotations = [annotation for annotation in cleaned_annotations if annotation["iscrowd"] == 0]

        target = np.zeros((len(non_crowd_annotations), 5))
        num_seg_values = 98 if self.tight_box_rotation else 0
        # Rows are NaN-filled; only the leading values of each row are overwritten with hull coordinates.
        target_segmentation = np.ones((len(non_crowd_annotations), num_seg_values))
        target_segmentation.fill(np.nan)
        for ix, annotation in enumerate(non_crowd_annotations):
            cls = self.class_ids.index(annotation["category_id"])
            target[ix, 0:4] = annotation["clean_bbox"]
            target[ix, 4] = cls
            if self.tight_box_rotation:
                seg_points = [j for i in annotation.get("segmentation", []) for j in i]
                if seg_points:
                    seg_points_c = np.array(seg_points).reshape((-1, 2)).astype(np.int32)
                    seg_points_convex = cv2.convexHull(seg_points_c).ravel()
                else:
                    seg_points_convex = []
                target_segmentation[ix, : len(seg_points_convex)] = seg_points_convex

        crowd_annotations = [annotation for annotation in cleaned_annotations if annotation["iscrowd"] == 1]

        crowd_target = np.zeros((len(crowd_annotations), 5))
        for ix, annotation in enumerate(crowd_annotations):
            cls = self.class_ids.index(annotation["category_id"])
            crowd_target[ix, 0:4] = annotation["clean_bbox"]
            crowd_target[ix, 4] = cls

        # Currently, the base class includes a feature to resize the image, so we need to resize the target as well when self.input_dim is set.
        initial_img_shape = (height, width)
        if self.input_dim is not None:
            r = min(self.input_dim[0] / height, self.input_dim[1] / width)
            target[:, :4] *= r
            crowd_target[:, :4] *= r
            target_segmentation *= r
            resized_img_shape = (int(height * r), int(width * r))
        else:
            resized_img_shape = initial_img_shape

        # Fall back to the zero-padded image id when the json carries no file name.
        file_name = img_metadata["file_name"] if "file_name" in img_metadata else "{:012}".format(img_id) + ".jpg"
        img_path = os.path.join(self.data_dir, self.images_dir, file_name)

        annotation = {
            "target": target,
            "crowd_target": crowd_target,
            "target_segmentation": target_segmentation,
            "initial_img_shape": initial_img_shape,
            "resized_img_shape": resized_img_shape,
            "img_path": img_path,
            "id": np.array([img_id]),
        }
        return annotation

__init__(data_dir, json_annotation_file, images_dir, tight_box_rotation=False, with_crowd=True, class_ids_to_ignore=None, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
json_annotation_file str

Name of the coco json file. Path relative to data_dir.

required
images_dir str

Name of the directory that includes all the images. Path relative to data_dir.

required
tight_box_rotation bool

bool, whether to use of segmentation maps convex hull as target_seg (check get_sample docs).

False
with_crowd bool

Add the crowd groundtruths to `__getitem__`

True
class_ids_to_ignore Optional[List[int]]

List of class ids to ignore in the dataset. By default, doesnt ignore any class.

None
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __init__(
    self,
    data_dir: str,
    json_annotation_file: str,
    images_dir: str,
    tight_box_rotation: bool = False,
    with_crowd: bool = True,
    class_ids_to_ignore: Optional[List[int]] = None,
    *args,
    **kwargs,
):
    """
    Store the COCO-specific settings, then let the parent constructor build the dataset.

    :param data_dir:                Where the data is stored.
    :param json_annotation_file:    Name of the coco json file. Path relative to data_dir.
    :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
    :param tight_box_rotation:      bool, whether to use of segmentation maps convex hull as target_seg
                                        (check get_sample docs).
    :param with_crowd:              Add the crowd groundtruths to __getitem__
    :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
    """
    self.images_dir = images_dir
    self.json_annotation_file = json_annotation_file
    self.tight_box_rotation = tight_box_rotation
    self.with_crowd = with_crowd
    self.class_ids_to_ignore = class_ids_to_ignore or []

    # Crowd boxes are exposed as an extra target field only when requested.
    target_fields = ["target"]
    if self.with_crowd:
        target_fields.append("crowd_target")

    kwargs.update(
        target_fields=target_fields,
        output_fields=["image", *target_fields],
        original_target_format=XYXY_LABEL,
    )
    super().__init__(data_dir=data_dir, *args, **kwargs)

    # Nothing more to validate when the json and all_classes_list agree on the class count.
    if len(self.original_classes) == len(self.all_classes_list):
        return

    if set(self.all_classes_list).issubset(set(self.original_classes)):
        raise ParameterMismatchException(
            "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
            "Please use `class_inclusion_list` to train with reduced number of classes",
        )

    raise DatasetValidationException(
        "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
        "Most likely this indicates an error in your all_classes_list parameter"
    )

remove_useless_info(coco, use_seg_info=False)

Remove useless info in coco dataset. COCO object is modified inplace. This function is mainly used for saving memory (save about 30% mem).

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def remove_useless_info(coco: COCO, use_seg_info: bool = False) -> None:
    """
    Drop metadata that is not needed from a COCO object. The object is modified in place.
    This function is mainly used for saving memory (save about 30% mem).

    :param coco:         COCO object to slim down. Anything that is not a COCO instance is left untouched.
    :param use_seg_info: When True, the "segmentation" entry of each annotation is kept.
    """
    if not isinstance(coco, COCO):
        return

    dataset = coco.dataset
    for top_level_key in ("info", "licenses"):
        dataset.pop(top_level_key, None)

    for img in dataset["images"]:
        for image_key in ("license", "coco_url", "date_captured", "flickr_url"):
            img.pop(image_key, None)

    # Segmentation is kept only on request, and there may be no annotations at all.
    if use_seg_info or "annotations" not in dataset:
        return

    for anno in dataset["annotations"]:
        anno.pop("segmentation", None)

DetectionDataset

Bases: Dataset, HasPreprocessingParams

Detection dataset.

This is a boilerplate class to facilitate the implementation of datasets.

HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ? - Inherit from DetectionDataSet - implement the method `self._load_annotation` to return at least the fields "target" and "img_path" - Call `super().__init__` with the required params. //!\ `super().__init__` will call `self._load_annotation`, so make sure that every required attribute is set up before calling `super().__init__` (ideally just call it last)

WORKFLOW: - On instantiation: - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step. - If cache is True, the images are also cached

- On call (__getitem__) for a specific image index:
    - The image and annotations are grouped together in a dict called SAMPLE
    - the sample is processed according to the transform
    - Only the specified fields are returned by __getitem__

TERMINOLOGY - TARGET: Groundtruth, made of bboxes. The format can vary from one dataset to another - ANNOTATION: Combination of targets (groundtruth) and metadata of the image, but without the image itself. > Has to include the fields "target" and "img_path" > Can include other fields like "crowd_target", "image_info", "segmentation", ... - SAMPLE: Output of the dataset: > Has to include the fields "target" and "image" > Can include other fields like "crowd_target", "image_info", "segmentation", ... - Index: Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1 - Sample ID: Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
@register_dataset(Datasets.DETECTION_DATASET)
class DetectionDataset(Dataset, HasPreprocessingParams):
    """Detection dataset.

    This is a boilerplate class to facilitate the implementation of datasets.

    HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ?
        - Inherit from DetectionDataSet
        - implement the method self._load_annotation to return at least the fields "target" and "img_path"
        - Call super().__init__ with the required params.
                //!\\ super().__init__ will call self._load_annotation, so make sure that every required
                      attributes are set up before calling super().__init__ (ideally just call it last)

    WORKFLOW:
        - On instantiation:
            - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.
            - If cache is True, the images are also cached

        - On call (__getitem__) for a specific image index:
            - The image and annotations are grouped together in a dict called SAMPLE
            - the sample is processed according to th transform
            - Only the specified fields are returned by __getitem__

    TERMINOLOGY
        - TARGET:       Groundtruth, made of bboxes. The format can vary from one dataset to another
        - ANNOTATION:   Combination of targets (groundtruth) and metadata of the image, but without the image itself.
                            > Has to include the fields "target" and "img_path"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - SAMPLE:       Outout of the dataset:
                            > Has to include the fields "target" and "image"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - Index:        Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        - Sample ID:    Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(
        self,
        data_dir: str,
        original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
        max_num_samples: int = None,
        cache: bool = False,
        cache_annotations: bool = True,
        cache_dir: str = None,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: List[DetectionTransform] = [],
        all_classes_list: Optional[List[str]] = [],
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        target_fields: List[str] = None,
        output_fields: List[str] = None,
        verbose: bool = True,
        show_all_warnings: bool = False,
    ):
        """Detection dataset.

        :param data_dir:                Where the data is stored
        :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                        None means that the image will be loaded as is.
                                        Scalar (size) - Image will be resized to (size, size)
                                        Tuple (rows,cols) - Image will be resized to (rows, cols)
        :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                        differ based on transforms.
        :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
        :param cache:                   Whether to cache images or not.
        :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                        but requires more RAM and more time to instantiate the dataset when working on very large datasets.
        :param cache_dir:              Path to the directory where cached images will be stored in an optimized format.
        :param transforms:              List of transforms to apply sequentially on sample.
        :param all_classes_list:        All the class names. When empty or None, the names come from `self._all_classes`.
        :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                        Classes not in this list will excluded from training.
                                        Thus, number of classes in model must be adjusted accordingly.
        :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                                will be ignored.
        :param target_fields:                   List of the fields target fields. This has to include regular target,
                                                but can also include crowd target, segmentation target, ...
                                                It has to include at least "target" but can include other.
        :param output_fields:                   Fields that will be outputed by __getitem__.
                                                It has to include at least "image" and "target" but can include other.
        :param verbose:                 Whether to show additional information or not, such as loading progress. (doesnt include warnings)
        :param show_all_warnings:       Whether to show all warnings or not.

        :raises RuntimeError:               If `data_dir` does not exist.
        :raises ValueError:                 If `_setup_data_source()` does not return a positive int, or if `output_fields`
                                            does not start with "image" then "target".
        :raises DatasetValidationException: If a class list contains duplicates, or if `class_inclusion_list` names
                                            a class that is not among the dataset classes.
        :raises KeyError:                   If "target" is missing from `target_fields`.
        :raises EmptyDatasetException:      If `ignore_empty_annotations` is True and no sample has any target.
        """
        super().__init__()
        self.verbose = verbose
        self.show_all_warnings = show_all_warnings

        if isinstance(original_target_format, DetectionTargetsFormat):
            logger.warning(
                "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
                "Support for DetectionTargetsFormat will be removed in 3.1"
            )

        self.data_dir = data_dir
        if not Path(data_dir).exists():
            raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

        # Number of images that are available (regardless of ignored images)
        n_dataset_samples = self._setup_data_source()
        if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
            raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
        n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

        self.input_dim = ensure_is_tuple_of_two(input_dim)
        self.original_target_format = original_target_format

        # `all_classes_list` is Optional: None means "not provided", exactly like the empty default.
        if all_classes_list is not None and len(all_classes_list) != len(set(all_classes_list)):
            raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

        if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
            raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

        self.all_classes_list = all_classes_list or self._all_classes
        self.class_inclusion_list = class_inclusion_list
        self.classes = self.class_inclusion_list or self.all_classes_list

        # Compare against the resolved class list (self.all_classes_list), not the raw argument: the argument
        # is empty when the names come from `self._all_classes`, which would report every class as wrong.
        wrong_classes = set(self.classes) - set(self.all_classes_list)
        if len(wrong_classes) > 0:
            raise DatasetValidationException(
                f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
            )

        self.ignore_empty_annotations = ignore_empty_annotations
        self.target_fields = target_fields or ["target"]
        if "target" not in self.target_fields:
            raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

        self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

        # NOTE(review): the default for `transforms` is a shared list object; it is only read here,
        # but mutating `self.transforms` in place on a default-constructed dataset would affect later instances.
        self.transforms = transforms

        self.output_fields = output_fields or ["image", "target"]
        if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
            raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

        self._cache_annotations = cache_annotations
        self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

        # Maps (dataset index) -> (non-empty sample ids)
        self._non_empty_sample_ids: Optional[List[int]] = None

        # Some transform may require non-empty annotations to be indexed.
        transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

        # Iterate over the whole dataset to index the images with/without annotations.
        if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:

            if self._cache_annotations:
                logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
            elif self.ignore_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
                )
            elif transform_require_non_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. "
                    "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
                )

            # Map indexes to sample annotations.
            non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
            if self._cache_annotations:
                if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                    self._cached_annotations = non_empty_annotations
                else:
                    # Non overlapping dicts. since they map unique sample_ids -> sample
                    self._cached_annotations = {**non_empty_annotations, **empty_annotations}

            if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
                raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

            self._non_empty_sample_ids = list(non_empty_annotations.keys())

        self._n_samples = n_samples  # Regardless of any filtering

        # CACHE IMAGE
        self.cache = cache
        self.cache_dir = cache_dir
        self.cached_imgs_padded = self._cache_images() if self.cache else None

    @property
    def _all_classes(self):
        """Placeholder to set up the class names. This is an alternative to passing "all_classes_list" to __init__.
        This is useful when all_classes_list is not known in advance, only after loading the dataset.

        :raises NotImplementedError: Always in this base implementation; a subclass that relies on it has to override it.
        """
        raise NotImplementedError

    def _setup_data_source(self) -> int:
        """Set up the data source and store relevant objects as attributes.

        :return: Number of available samples, (i.e. how many images we have, regardless of any filter we might want to use)
        :raises NotImplementedError: Always in this base implementation; subclasses have to override it.
        """
        raise NotImplementedError

    def _load_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load annotations associated to a specific sample.
        Please note that the targets should be resized according to self.input_dim!

        :param sample_id:   Sample ID refers to the index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
        :return:            Annotation, a dict with any field but has to include at least the fields specified in self._required_annotation_fields.
        :raises NotImplementedError: Always in this base implementation; subclasses have to override it.
        """
        raise NotImplementedError

    def _get_sample_annotations(self, index: int, ignore_empty_annotations: bool) -> Dict[str, Union[np.ndarray, Any]]:
        """Get the annotation associated to a specific sample. Use cache if enabled.
        :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    Whether to ignore empty annotations or not.
        :return:                            Dict representing the annotation of a specific image
        """
        sample_id = self._non_empty_sample_ids[index] if ignore_empty_annotations else index
        if self._cache_annotations:
            return self._cached_annotations[sample_id]
        else:
            return self._load_sample_annotation(sample_id=sample_id)

    def _load_sample_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load the annotation associated to a specific sample and apply subclassing.
        :param sample_id:   Sample ID refers to the index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
        """
        sample_annotations = self._load_annotation(sample_id=sample_id)
        if not self._required_annotation_fields.issubset(set(sample_annotations.keys())):
            raise KeyError(
                f"_load_annotation is expected to return at least the fields {self._required_annotation_fields}, but got {set(sample_annotations.keys())}"
            )

        # Filter out classes that are not in self.class_inclusion_list
        if self.class_inclusion_list is not None:
            sample_annotations = self._sub_class_annotation(annotation=sample_annotations)

        return sample_annotations

    def _load_all_annotations(self, n_samples: int) -> Tuple[Dict[int, Dict[str, Any]], Dict[int, Dict[str, Any]]]:
        """Walk over the first `n_samples` samples and split them into non-empty and empty annotations.

        This is usually required when `ignore_empty_annotations=True`, because the whole dataset has to be
        visited once in order to know which sample is empty and which is not.
        Question: Why not just check if annotation is empty on the fly ?
        Answer: When running with DDP, we split the dataset into small chunks.
                Therefore, we need to make sure that each chunk includes a similar subset of index.
                If we were to check on the fly, we would not know in advance the size of dataset/chunks
                and this means that some chunks would be smaller than others

        :param n_samples:   Number of samples in the datasets (including samples without annotations).
        :return:            A tuple of two dicts, one for non-empty annotations and one for empty annotations
                                - non_empty_annotations: Dict mapping sample id -> annotation with at least one target
                                - empty_annotations:     Dict mapping sample id -> annotation without any target
                            The values are None instead of the annotations when annotation caching is disabled.
        :raises EmptyDatasetException: If no sample at all was indexed.
        """
        invalid_bbox_count = 0
        with_targets: Dict[int, Any] = {}
        without_targets: Dict[int, Any] = {}

        for sample_id in tqdm(range(n_samples), desc="Indexing dataset annotations", disable=not self.verbose):
            annotation = self._load_sample_annotation(sample_id=sample_id)
            invalid_bbox_count += annotation.get("n_invalid_labels", 0)

            # A sample counts as non-empty as soon as one of the target fields holds something.
            has_targets = any(len(annotation[field]) != 0 for field in self.target_fields)
            bucket = with_targets if has_targets else without_targets
            # Only the sample id is recorded when annotations are not cached.
            bucket[sample_id] = annotation if self._cache_annotations else None

        if len(with_targets) + len(without_targets) == 0:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        if invalid_bbox_count > 0:
            logger.warning(f"Found {invalid_bbox_count} invalid bbox that were ignored. For more information, please set `show_all_warnings=True`.")

        return with_targets, without_targets

    def _sub_class_annotation(self, annotation: dict) -> Union[dict, None]:
        """Restrict every field listed in self.target_fields (targets, crowd_targets, ...) to the included classes.

        The annotation dict is updated in place and that same dict is returned.

        :param annotation: Dict representing the annotation of a specific image
        :return:           The annotation, with each target field replaced by its subclassed version
        """
        label_position = _get_class_index_in_target(target_format=self.original_target_format)
        for target_field in self.target_fields:
            field_targets = annotation[target_field]
            annotation[target_field] = self._sub_class_target(targets=field_targets, class_index=label_position)
        return annotation

    def _sub_class_target(self, targets: np.ndarray, class_index: int) -> np.ndarray:
        """Subclass the targets of a specific image.

        Only targets whose class name is in `self.class_inclusion_list` are kept, and their class id is replaced
        by the position of that name in `self.class_inclusion_list`. The input array is left untouched.

        :param targets:     Target array to subclass of shape [n_targets, 5], 5 representing a bbox
        :param class_index: Position of the class id in a bbox
                                ex: 0 if bbox of format label_xyxy | -1 if bbox of format xyxy_label
        :return:            Subclassed targets, or an empty float32 array of shape (0, 5) when nothing is kept
        """
        # Build the name -> new id lookup once instead of scanning the list for every target.
        # `setdefault` keeps the first position of a name, which is what `list.index` would return.
        new_id_by_name = {}
        for new_id, included_name in enumerate(self.class_inclusion_list):
            new_id_by_name.setdefault(included_name, new_id)

        targets_kept = []
        for target in targets:
            cls_name = self.all_classes_list[int(target[class_index])]
            new_id = new_id_by_name.get(cls_name)
            if new_id is None:
                continue
            # Work on a copy: `target` is a view on the caller's array, and writing the new id into it would
            # corrupt the labels if the same array were ever subclassed a second time.
            kept = np.array(target, copy=True)
            kept[class_index] = new_id
            targets_kept.append(kept)

        return np.array(targets_kept) if len(targets_kept) > 0 else np.zeros((0, 5), dtype=np.float32)

    def _cache_images(self) -> np.ndarray:
        """Cache the resized images in a memory-mapped file, so that they can be loaded faster next time.

        The cache file name embeds a hash of every sample's resized shape and image file name, so an existing file
        is only reused as long as the images and their sizes are the same.

        :return: Memory-mapped uint8 array of shape (len(self), input_dim[0], input_dim[1], 3).
                 Each image is written in the top-left corner of its slot.
        :raises ValueError:   If `self.cache_dir` is None.
        :raises RuntimeError: If `self.input_dim` is None.
        """
        # Validate BEFORE building the Path: Path(None) raises a TypeError, which would hide this message.
        if self.cache_dir is None:
            raise ValueError("You must specify a cache_dir if you want to cache your images. If you did not mean to use cache, please set cache=False")
        cache_dir = Path(self.cache_dir)
        cache_dir.mkdir(parents=True, exist_ok=True)

        logger.warning(
            "\n********************************************************************************\n"
            "You are using cached images in RAM to accelerate training.\n"
            "This requires large system RAM.\n"
            "********************************************************************************"
        )

        if self.input_dim is None:
            raise RuntimeError("caching is not possible when input_dim is not set")
        max_h, max_w = self.input_dim[0], self.input_dim[1]
        n_images = len(self)
        cache_shape = (n_images, max_h, max_w, 3)

        # The cache should be the same as long as the images and their sizes are the same
        images_hash = hashlib.sha256()
        for index in range(n_images):
            annotation = self._get_sample_annotations(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
            values_to_hash = [annotation["resized_img_shape"][0], annotation["resized_img_shape"][1], Path(annotation["img_path"]).name]
            for value in values_to_hash:
                images_hash.update(str(value).encode("utf-8"))
        cache_hash = images_hash.hexdigest()

        img_resized_cache_path = cache_dir / f"img_resized_cache_{cache_hash}.array"

        if not img_resized_cache_path.exists():
            logger.info("Caching images for the first time. Be aware that this will stay in the disk until you delete it yourself.")
            # os.cpu_count() returns None when the number of CPUs cannot be determined.
            num_threads = min(8, os.cpu_count() or 1)

            # Inline-function because we should not pollute the rest of the class with this function.
            # This function is required because of legacy design - ideally we should not have to load annotations in order to get the image path.
            def _load_image_from_index(index: int) -> np.ndarray:
                annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                return self._load_resized_img(image_path=annotations["img_path"])

            # Initialize placeholder for images
            cached_imgs = np.memmap(str(img_resized_cache_path), shape=cache_shape, dtype=np.uint8, mode="w+")

            # The pool is terminated when the `with` block exits, so the lazy `imap` has to be consumed inside it.
            with ThreadPool(num_threads) as pool:
                loaded_images = pool.imap(func=_load_image_from_index, iterable=range(n_images))

                # Store images in the placeholder
                with tqdm(enumerate(loaded_images), total=n_images, disable=not self.verbose) as loaded_images_pbar:
                    for i, image in loaded_images_pbar:
                        cached_imgs[i][: image.shape[0], : image.shape[1], :] = image
            cached_imgs.flush()
        else:
            logger.warning("You are using cached imgs!")

        logger.info("Loading cached imgs...")
        cached_imgs = np.memmap(str(img_resized_cache_path), shape=cache_shape, dtype=np.uint8, mode="r+")
        return cached_imgs

    def _load_resized_img(self, image_path: str) -> np.ndarray:
        """Load an image and, when `self.input_dim` is set, rescale it to fit inside `self.input_dim`.

        Both sides are scaled by the same ratio, so the aspect ratio of the image is kept.

        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        """
        img = self._load_image(image_path=image_path)

        # Nothing to resize to: hand the image back as loaded.
        if self.input_dim is None:
            return img

        img_h, img_w = img.shape[0], img.shape[1]
        # Largest ratio for which both sides still fit in input_dim (rows, cols).
        ratio = min(self.input_dim[0] / img_h, self.input_dim[1] / img_w)
        width_height = (int(img_w * ratio), int(img_h * ratio))  # cv2 expects dsize as (width, height)
        resized = cv2.resize(src=img, dsize=width_height, interpolation=cv2.INTER_LINEAR)
        return resized.astype(np.uint8)

    def _load_image(self, image_path: str) -> np.ndarray:
        """Load an image from disk with OpenCV.

        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        :raises FileNotFoundError: If `cv2.imread` returned None for this path.
        """
        # Normalize path-like objects to a plain path (what the single-argument os.path.join used to do here).
        img_file = os.fspath(image_path)
        img = cv2.imread(img_file)

        if img is None:
            raise FileNotFoundError(f"{img_file} was not found. Please make sure that the dataset was downloaded and that the path is correct")
        return img

    def __del__(self):
        """Drop the `cached_imgs_padded` attribute (the cached images), if it exists."""
        if not hasattr(self, "cached_imgs_padded"):
            return
        del self.cached_imgs_padded

    def __len__(self) -> int:
        """Number of samples in the dataset, AFTER filtering out empty annotations (if relevant)."""
        if self.ignore_empty_annotations:
            return len(self._non_empty_sample_ids)
        return self._n_samples

    def __getitem__(self, index: int) -> Tuple:
        """Get the sample post transforms at a specific index of the dataset.
        The output of this function will be collated to form batches.

        :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :return:        Tuple holding the value of every field of `self.output_fields`, in that order
        :raises KeyError: If one of `self.output_fields` is missing from the transformed sample
        """
        sample = self.apply_transforms(self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations))
        for field in self.output_fields:
            if field not in sample:
                raise KeyError(f"The field {field} must be present in the sample but was not found. Please check the output fields of your transforms.")
        return tuple(sample[field] for field in self.output_fields)

    def get_random_item(self):
        """Get the sample post transforms (same output as `__getitem__`) at a random index of the dataset.

        :return: Whatever `self[index]` returns for an index drawn uniformly from 0..len(self)-1
        """
        # `self[...]` expects an integer index. `get_random_sample` returns a raw sample dict, so it cannot be
        # used as the index; draw the index directly instead.
        random_index = random.randint(0, len(self) - 1)
        return self[random_index]

    def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Build the raw sample (image + annotations), before any transform (beside subclassing).

        :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    If True, empty annotations will be ignored
        :return:                            Dict with the image under "image", followed by a deep copy of the sample annotations
        """
        annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)

        if not self.cache:
            image = self._load_resized_img(image_path=annotations["img_path"])
        else:
            image = self._get_cached_image(index=index, cached_image_shape=annotations["resized_img_shape"])

        # The annotations are deep-copied so that the returned sample does not share them with `annotations`.
        sample = {"image": image}
        sample.update(deepcopy(annotations))
        return sample

    def _get_cached_image(self, index: int, cached_image_shape: Tuple[int, int]) -> np.ndarray:
        """Read an image back from `self.cached_imgs_padded`, cropped to its own (height, width).

        :param index:               Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param cached_image_shape:  Shape of the cached image (after resizing if input_dim is set)
        :return:                    Copy of the image, without the padding of its cache slot
        """
        cache_slot = self.cached_imgs_padded[index]
        height, width = cached_image_shape
        # The image sits in the top-left corner of its slot; copy it so the caller does not get a view on the cache.
        return cache_slot[:height, :width, :].copy()

    def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
        """
        Run every transform of self.transforms on the sample, one after the other.

        Before each transform, the additional samples it asks for (through its 'additional_samples_count' attribute,
         optionally combined with 'non_empty_annotations' to only load samples with objects in them) are loaded
         and stored in sample["additional_samples"].

        :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
        :return: Transformed sample
        """
        current = sample
        for transform in self.transforms:
            extra_samples = self._get_additional_inputs_for_transform(transform=transform)
            current["additional_samples"] = extra_samples
            current = transform(sample=current)
            # additional_samples is not useful after the transform
            current.pop("additional_samples")
        return current

    def _get_additional_inputs_for_transform(self, transform: DetectionTransform) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Load the random additional samples requested by a transform.

        :param transform: Transform to serve. Its optional attributes `additional_samples_count` (default 0) and
                          `non_empty_annotations` (default False) define how many samples to load and whether
                          samples with empty annotations are ignored.
        :return:          List of samples loaded with `self.get_random_samples`
        """
        n_requested = getattr(transform, "additional_samples_count", 0)
        only_non_empty = getattr(transform, "non_empty_annotations", False)
        return self.get_random_samples(count=n_requested, ignore_empty_annotations=only_non_empty)

    def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Draw `count` random samples, one `get_random_sample` call each.

        :param count: The number of samples wanted
        :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
        :return: A list of samples satisfying input params
        """
        samples = []
        for _ in range(count):
            samples.append(self.get_random_sample(ignore_empty_annotations))
        return samples

    def get_random_sample(self, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Load the raw sample (see `get_sample`) at a random index.

        :param ignore_empty_annotations: If True, the index is drawn among the non-empty samples only
        :return:                         The sample returned by `self.get_sample` for the drawn index
        """
        if ignore_empty_annotations:
            n_candidates = len(self._non_empty_sample_ids)
        else:
            n_candidates = self._n_samples
        drawn_index = random.randint(0, n_candidates - 1)
        return self.get_sample(index=drawn_index, ignore_empty_annotations=ignore_empty_annotations)

    @property
    def output_target_format(self):
        """Format of the targets once the transforms were applied.

        :return: `output_format` of the last `DetectionTargetsFormatTransform` found in `self.transforms`,
                 or `self.original_target_format` when there is none.
        """
        fallback_format = self.original_target_format
        declared_formats = [transform.output_format for transform in self.transforms if isinstance(transform, DetectionTargetsFormatTransform)]
        return declared_formats[-1] if declared_formats else fallback_format

    def plot(self, max_samples_per_plot: int = 16, n_plots: int = 1, plot_transformed_data: bool = True):
        """Combine samples of images with bbox into plots and display the result.

        Plot number `i` shows the samples `i * max_samples_per_plot` to `(i + 1) * max_samples_per_plot - 1`.
        Plotting stops early when the dataset has no more samples to show.

        :param max_samples_per_plot:    Maximum number of images to be displayed per plot
        :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                        If False, the plot will be over the raw samples (i.e. on get_sample)
        :raises ValueError:             If the target format to plot is a DetectionTargetsFormat
        :return:
        """
        input_format = self.output_target_format if plot_transformed_data else self.original_target_format
        if isinstance(input_format, DetectionTargetsFormat):
            raise ValueError(
                "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenatedTensorFormat instead."
            )
        target_format_transform = DetectionTargetsFormatTransform(input_format=input_format, output_format=XYXY_LABEL)

        # Side of the square grid of subplots that can hold max_samples_per_plot images.
        n_subplot = int(np.ceil(max_samples_per_plot**0.5))
        n_available = len(self)

        for plot_i in range(n_plots):
            # Offset by the actual page size, so that consecutive plots show consecutive samples.
            first_index = plot_i * max_samples_per_plot
            if first_index >= n_available:
                break  # No sample left: do not open an empty figure.

            fig = plt.figure(figsize=(10, 10))
            for img_i in range(max_samples_per_plot):
                index = first_index + img_i
                if index >= n_available:
                    break  # Last plot may be only partially filled.

                if plot_transformed_data:
                    image, targets, *_ = self[index]
                    image = image.transpose(1, 2, 0).astype(np.int32)
                else:
                    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                    image, targets = sample["image"], sample["target"]

                sample = target_format_transform({"image": image, "target": targets})

                # shape = [padding_size x 4] (The dataset will most likely pad the targets to a fixed dim)
                boxes = sample["target"][:, 0:4]

                # shape = [n_box x 4] (We remove padded boxes, which corresponds to boxes with only 0)
                boxes = boxes[(boxes != 0).any(axis=1)]
                # Channels are reversed for display (images are loaded in BGR).
                plt.subplot(n_subplot, n_subplot, img_i + 1).imshow(image[:, :, ::-1])
                # Draw each box as a closed polyline going through its 4 corners.
                plt.plot(boxes[:, [0, 2, 2, 0, 0]].T, boxes[:, [1, 1, 3, 3, 1]].T, ".-")
                plt.axis("off")
            fig.tight_layout()
            plt.show()
            plt.close()

    def get_dataset_preprocessing_params(self):
        """
        Describe the preprocessing equivalent to what this dataset does: the hardcoded steps, the adaptation for
         PIL.Image image reading (RGB), then the equivalent preprocessing of every transform.
         image_processor is returned as a list of dicts to be resolved by processing factory.
        :return: Dict with the keys "class_names", "image_processor", "iou" and "conf"
        """
        processing_steps = [Processings.ReverseImageChannels]

        if self.input_dim is not None:
            processing_steps.append({Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}})

        for transform in self.transforms:
            processing_steps.extend(transform.get_equivalent_preprocessing())

        return {
            "class_names": self.classes,
            "image_processor": {Processings.ComposeProcessing: {"processings": processing_steps}},
            "iou": 0.65,
            "conf": 0.5,
        }

__del__()

Clear the cached images

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
411
412
413
414
def __del__(self):
    """Drop the `cached_imgs_padded` attribute (the cached images), if it exists."""
    if not hasattr(self, "cached_imgs_padded"):
        return
    del self.cached_imgs_padded

__getitem__(index)

Get the sample post transforms at a specific index of the dataset. The output of this function will be collated to form batches.

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required

Returns:

Type Description
Tuple

Sample, i.e. a tuple holding the value of every field listed in output_fields (at least "image" and "target"), in that order

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
420
421
422
423
424
425
426
427
428
429
430
431
def __getitem__(self, index: int) -> Tuple:
    """Get the sample post transforms at a specific index of the dataset.
    The output of this function will be collated to form batches.

    :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :return:        Tuple holding the value of every field of `self.output_fields`, in that order
    :raises KeyError: If one of `self.output_fields` is missing from the transformed sample
    """
    sample = self.apply_transforms(self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations))
    for field in self.output_fields:
        if field not in sample:
            raise KeyError(f"The field {field} must be present in the sample but was not found. Please check the output fields of your transforms.")
    return tuple(sample[field] for field in self.output_fields)

__init__(data_dir, original_target_format, max_num_samples=None, cache=False, cache_annotations=True, cache_dir=None, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, target_fields=None, output_fields=None, verbose=True, show_all_warnings=False)

Detection dataset.

Parameters:

Name Type Description Default
data_dir str

Where the data is stored

required
input_dim Union[int, Tuple[int, int], None]

Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols). None means that the image will be loaded as is. Scalar (size) - Image will be resized to (size, size) Tuple (rows,cols) - Image will be resized to (rows, cols)

None
original_target_format Union[ConcatenatedTensorFormat, DetectionTargetsFormat]

Format of targets stored on disk. raw data format, the output format might differ based on transforms.

required
max_num_samples int

If not None, set the maximum size of the dataset by only indexing the first n annotations/images.

None
cache bool

Whether to cache images or not.

False
cache_annotations bool

Whether to cache annotations or not. This reduces training time by pre-loading all the annotations, but requires more RAM and more time to instantiate the dataset when working on very large datasets.

True
cache_dir str

Path to the directory where cached images will be stored in an optimized format.

None
transforms List[DetectionTransform]

List of transforms to apply sequentially on sample.

[]
all_classes_list Optional[List[str]]

All the class names.

[]
class_inclusion_list Optional[List[str]]

If not None, defines the subset of classes to be included as targets. Classes not in this list will be excluded from training. Thus, the number of classes in the model must be adjusted accordingly.

None
ignore_empty_annotations bool

If True and class_inclusion_list not None, images without any target will be ignored.

True
target_fields List[str]

List of the target fields. This has to include the regular target, but can also include crowd target, segmentation target, ... It has to include at least "target" but can include others.

None
output_fields List[str]

Fields that will be output by __getitem__. It has to include at least "image" and "target" but can include others.

None
verbose bool

Whether to show additional information or not, such as loading progress. (This does not include warnings.)

True
show_all_warnings bool

Whether to show all warnings or not.

False
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
@resolve_param("transforms", ListFactory(TransformsFactory()))
def __init__(
    self,
    data_dir: str,
    original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
    max_num_samples: int = None,
    cache: bool = False,
    cache_annotations: bool = True,
    cache_dir: str = None,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: List[DetectionTransform] = [],
    all_classes_list: Optional[List[str]] = [],
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    target_fields: List[str] = None,
    output_fields: List[str] = None,
    verbose: bool = True,
    show_all_warnings: bool = False,
):
    """Detection dataset.

    :param data_dir:                Where the data is stored
    :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                    None means that the image will be loaded as is.
                                    Scalar (size) - Image will be resized to (size, size)
                                    Tuple (rows,cols) - Image will be resized to (rows, cols)
    :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                    differ based on transforms.
    :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
    :param cache:                   Whether to cache images or not.
    :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                    but requires more RAM and more time to instantiate the dataset when working on very large datasets.
    :param cache_dir:              Path to the directory where cached images will be stored in an optimized format.
    :param transforms:              List of transforms to apply sequentially on sample.
    :param all_classes_list:        All the class names. When empty, `self._all_classes` is used instead.
    :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                    Classes not in this list will be excluded from training.
                                    Thus, number of classes in model must be adjusted accordingly.
    :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                            will be ignored.
    :param target_fields:                   List of the target fields. This has to include regular target,
                                            but can also include crowd target, segmentation target, ...
                                            It has to include at least "target" but can include other.
    :param output_fields:                   Fields that will be output by __getitem__.
                                            It has to include at least "image" and "target" but can include other.
    :param verbose:                 Whether to show additional information or not, such as loading progress. (doesnt include warnings)
    :param show_all_warnings:       Whether to show all warnings or not.

    :raises RuntimeError:               If `data_dir` does not exist.
    :raises ValueError:                 If `_setup_data_source()` does not return a positive int, or if `output_fields`
                                        does not start with "image" then "target".
    :raises DatasetValidationException: If a class list contains duplicates, or if `class_inclusion_list` holds
                                        classes that are not part of the full class list.
    :raises KeyError:                   If "target" is not part of `target_fields`.
    :raises EmptyDatasetException:      If `ignore_empty_annotations` is set and every indexed sample is empty.
    """
    super().__init__()
    self.verbose = verbose
    self.show_all_warnings = show_all_warnings

    if isinstance(original_target_format, DetectionTargetsFormat):
        logger.warning(
            "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
            "Support for DetectionTargetsFormat will be removed in 3.1"
        )

    self.data_dir = data_dir
    if not Path(data_dir).exists():
        raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

    # Number of images that are available (regardless of ignored images)
    n_dataset_samples = self._setup_data_source()
    if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
        raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
    n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

    self.input_dim = ensure_is_tuple_of_two(input_dim)
    self.original_target_format = original_target_format

    if len(all_classes_list) != len(set(all_classes_list)):
        raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

    if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
        raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

    self.all_classes_list = all_classes_list or self._all_classes
    self.class_inclusion_list = class_inclusion_list
    self.classes = self.class_inclusion_list or self.all_classes_list

    # Compare against the RESOLVED class list: the raw `all_classes_list` argument may be empty (fallback on
    # self._all_classes), in which case every included class would be wrongly reported as missing.
    wrong_classes = set(self.classes) - set(self.all_classes_list)
    if len(wrong_classes) > 0:
        raise DatasetValidationException(
            f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
        )

    self.ignore_empty_annotations = ignore_empty_annotations
    self.target_fields = target_fields or ["target"]
    if "target" not in self.target_fields:
        raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

    self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

    self.transforms = transforms

    self.output_fields = output_fields or ["image", "target"]
    if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
        raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

    self._cache_annotations = cache_annotations
    self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

    # Maps (dataset index) -> (non-empty sample ids)
    self._non_empty_sample_ids: Optional[List[int]] = None

    # Some transform may require non-empty annotations to be indexed.
    transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

    # Iterate over the whole dataset to index the images with/without annotations.
    if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:

        if self._cache_annotations:
            logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
        elif self.ignore_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
            )
        elif transform_require_non_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. "
                "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
            )

        # Map indexes to sample annotations.
        non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
        if self._cache_annotations:
            if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                self._cached_annotations = non_empty_annotations
            else:
                # Non overlapping dicts. since they map unique sample_ids -> sample
                self._cached_annotations = {**non_empty_annotations, **empty_annotations}

        if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        self._non_empty_sample_ids = list(non_empty_annotations.keys())

    self._n_samples = n_samples  # Regardless of any filtering

    # CACHE IMAGE
    self.cache = cache
    self.cache_dir = cache_dir
    self.cached_imgs_padded = self._cache_images() if self.cache else None

__len__()

Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant).

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
416
417
418
def __len__(self) -> int:
    """Number of samples in the dataset, AFTER filtering out empty annotations (if relevant)."""
    if self.ignore_empty_annotations:
        return len(self._non_empty_sample_ids)
    return self._n_samples

apply_transforms(sample)

Applies self.transforms sequentially to sample

If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load only additional samples with objects in them.

Parameters:

Name Type Description Default
sample Dict[str, Union[np.ndarray, Any]]

Sample to apply the transforms on to (loaded with self.get_sample)

required

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Transformed sample

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
    """
    Run every transform of self.transforms on the sample, one after the other.

    Before each transform, the additional samples it asks for (through its 'additional_samples_count' attribute,
     optionally combined with 'non_empty_annotations' to only load samples with objects in them) are loaded
     and stored in sample["additional_samples"].

    :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
    :return: Transformed sample
    """
    current = sample
    for transform in self.transforms:
        extra_samples = self._get_additional_inputs_for_transform(transform=transform)
        current["additional_samples"] = extra_samples
        current = transform(sample=current)
        # additional_samples is not useful after the transform
        current.pop("additional_samples")
    return current

get_dataset_preprocessing_params()

Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB). image_processor is returned as a list of dicts to be resolved by the processing factory.

Returns:

Type Description
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
def get_dataset_preprocessing_params(self):
    """
    Describe the preprocessing that mirrors this dataset's image handling.

    The pipeline always starts with a channel reversal, optionally followed by a longest-max-size rescale
    to self.input_dim (when it is set), and then by whatever each transform reports through
    get_equivalent_preprocessing(). The steps are wrapped in a single ComposeProcessing entry, expressed
    as dicts to be resolved by the processing factory.

    :return: Dict with the keys "class_names", "image_processor", "iou" and "conf".
    """
    processings = [Processings.ReverseImageChannels]

    if self.input_dim is not None:
        processings.append({Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}})

    for transform in self.transforms:
        processings.extend(transform.get_equivalent_preprocessing())

    return {
        "class_names": self.classes,
        "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
        "iou": 0.65,
        "conf": 0.5,
    }

get_random_samples(count, ignore_empty_annotations=False)

Load random samples.

Parameters:

Name Type Description Default
count int

The number of samples wanted

required
ignore_empty_annotations bool

If true, only return samples with at least 1 annotation

False

Returns:

Type Description
List[Dict[str, Union[np.ndarray, Any]]]

A list of samples satisfying input params

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
483
484
485
486
487
488
489
490
def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
    """Draw several random samples, one self.get_random_sample call per requested sample.

    :param count: The number of samples wanted
    :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
    :return: A list of `count` samples satisfying input params
    """
    samples = []
    for _ in range(count):
        samples.append(self.get_random_sample(ignore_empty_annotations))
    return samples

get_sample(index, ignore_empty_annotations=False)

Get raw sample, before any transform (beside subclassing).

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required
ignore_empty_annotations bool

If True, empty annotations will be ignored

False

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Sample, i.e. a dictionary including at least "image" and "target"

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
436
437
438
439
440
441
442
443
444
445
446
447
def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
    """Get raw sample, before any transform (beside subclassing).

    The image comes from the cache when self.cache is truthy, otherwise it is loaded (resized) from the
    annotation's "img_path". The annotations are deep-copied so that callers cannot mutate the stored ones.

    :param index:                       Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :param ignore_empty_annotations:    If True, empty annotations will be ignored
    :return:                            Sample, i.e. a dictionary including at least "image" and "target"
    """
    annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)

    image = (
        self._get_cached_image(index=index, cached_image_shape=annotations["resized_img_shape"])
        if self.cache
        else self._load_resized_img(image_path=annotations["img_path"])
    )

    sample = {"image": image}
    sample.update(deepcopy(annotations))
    return sample

plot(max_samples_per_plot=16, n_plots=1, plot_transformed_data=True)

Combine samples of images with bbox into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of images to be displayed per plot

16
n_plots int

Number of plots to display (each plot being a combination of img with bbox)

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e. on getitem). If False, the plot will be over the raw samples (i.e. on get_sample)

True

Returns:

Type Description
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
def plot(self, max_samples_per_plot: int = 16, n_plots: int = 1, plot_transformed_data: bool = True):
    """Combine samples of images with bbox into plots and display the result.

    Plot number `plot_i` shows the samples with indexes
    [plot_i * max_samples_per_plot, (plot_i + 1) * max_samples_per_plot), so consecutive plots never
    overlap nor skip samples, whatever the page size.

    :param max_samples_per_plot:    Maximum number of images to be displayed per plot
    :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                    If False, the plot will be over the raw samples (i.e. on get_sample)
    :raises ValueError:             If the relevant target format is a DetectionTargetsFormat instance
    :return:
    """
    input_format = self.output_target_format if plot_transformed_data else self.original_target_format
    if isinstance(input_format, DetectionTargetsFormat):
        raise ValueError(
            "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an isntance of ConcatenateTransform instead."
        )
    target_format_transform = DetectionTargetsFormatTransform(input_format=input_format, output_format=XYXY_LABEL)

    # Side of the smallest square grid able to hold max_samples_per_plot images.
    n_subplot = int(np.ceil(max_samples_per_plot**0.5))

    for plot_i in range(n_plots):
        fig = plt.figure(figsize=(10, 10))
        for img_i in range(max_samples_per_plot):
            # Offset by the actual page size (was a hard-coded 16, only right for the default).
            index = img_i + plot_i * max_samples_per_plot

            if plot_transformed_data:
                image, targets, *_ = self[index]
                # Transformed images are indexed channel-first here; move channels last for imshow.
                image = image.transpose(1, 2, 0).astype(np.int32)
            else:
                sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                image, targets = sample["image"], sample["target"]

            sample = target_format_transform({"image": image, "target": targets})

            # shape = [padding_size x 4] (The dataset will most likely pad the targets to a fixed dim)
            boxes = sample["target"][:, 0:4]

            # shape = [n_box x 4] (We remove padded boxes, which corresponds to boxes with only 0)
            boxes = boxes[(boxes != 0).any(axis=1)]
            # Channel order is reversed for display.
            plt.subplot(n_subplot, n_subplot, img_i + 1).imshow(image[:, :, ::-1])
            # Each box is drawn as a closed polyline going through its 4 corners.
            plt.plot(boxes[:, [0, 2, 2, 0, 0]].T, boxes[:, [1, 1, 3, 3, 1]].T, ".-")
            plt.axis("off")
        fig.tight_layout()
        plt.show()
        plt.close()

PascalVOCDetectionDataset

Bases: DetectionDataset

Dataset for Pascal VOC object detection

To use this Dataset you need to: >> train_set = PascalVOCDetectionDataset(download=True, ...)

Dataset structure: ├─images │ ├─ train2012 │ ├─ val2012 │ ├─ VOCdevkit │ │ ├─ VOC2007 │ │ │ ├──JPEGImages │ │ │ ├──SegmentationClass │ │ │ ├──ImageSets │ │ │ ├──ImageSets/Segmentation │ │ │ ├──ImageSets/Main │ │ │ ├──ImageSets/Layout │ │ │ ├──Annotations │ │ │ └──SegmentationObject │ │ └──VOC2012 │ │ ├──JPEGImages │ │ ├──SegmentationClass │ │ ├──ImageSets │ │ ├──ImageSets/Segmentation │ │ ├──ImageSets/Main │ │ ├──ImageSets/Action │ │ ├──ImageSets/Layout │ │ ├──Annotations │ │ └──SegmentationObject │ ├─train2007 │ ├─test2007 │ └─val2007 └─labels ├─train2012 ├─val2012 ├─train2007 ├─test2007 └─val2007

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
@register_dataset(Datasets.PASCAL_VOC_DETECTION_DATASET)
class PascalVOCDetectionDataset(DetectionDataset):
    """Dataset for Pascal VOC object detection

    To use this Dataset you need to:
        >> train_set = PascalVOCDetectionDataset(download=True, ...)

    Dataset structure:
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    """

    def __init__(self, images_sub_directory: str, download: bool = False, *args, **kwargs):
        """Dataset for Pascal VOC object detection

        :param images_sub_directory:    Sub directory of data_dir that includes images.
        :param download:                If True, download and convert the dataset into data_dir first.
        :param kwargs:                  Forwarded to the parent class; must include a non-None `data_dir`.
                                        `original_target_format` and `all_classes_list` are always overridden.
        :raises ValueError:             If `data_dir` is missing from kwargs or is None.
        """

        self.images_sub_directory = images_sub_directory
        self.img_and_target_path_list = None
        data_dir = kwargs.get("data_dir")
        if data_dir is None:
            raise ValueError("Must pass data_dir != None through **kwargs")
        if download:
            PascalVOCDetectionDataset.download(data_dir)

        kwargs["original_target_format"] = DetectionTargetsFormat.XYXY_LABEL
        kwargs["all_classes_list"] = PASCAL_VOC_2012_CLASSES_LIST
        super().__init__(*args, **kwargs)

    def _setup_data_source(self) -> int:
        """Initialize img_and_target_path_list and warn if label files are missing.

        Fills self.img_and_target_path_list with (img_path, target_path) tuples, keeping only the images
        whose label file exists.

        :raises FileNotFoundError: If the images folder does not exist, holds no .jpg file, or if none of
                                   the images has a label file.
        :return: Number of (img_path, target_path) pairs found
        """
        img_files_folder = os.path.join(self.data_dir, self.images_sub_directory)
        if not Path(img_files_folder).exists():
            raise FileNotFoundError(
                f"{img_files_folder} not found...\n"
                f"Please make sure that {self.data_dir} points toward your PascalVOC dataset folder.\n"
                f"If you don't have it locally, you can set PascalVOCDetectionDataset(..., download=True)"
            )

        # Join with the path separator so that images_sub_directory works with or without a trailing slash.
        img_files = glob.glob(os.path.join(img_files_folder, "*.jpg"))
        if len(img_files) == 0:
            raise FileNotFoundError(f"No image file found at {img_files_folder}")

        # Labels mirror the images tree: <...>/images/<split>/<id>.jpg -> <...>/labels/<split>/<id>.txt
        target_files = [img_file.replace("images", "labels").replace(".jpg", ".txt") for img_file in img_files]

        img_and_target_path_list = [(img_file, target_file) for img_file, target_file in zip(img_files, target_files) if os.path.exists(target_file)]
        if len(img_and_target_path_list) == 0:
            raise FileNotFoundError("No target file associated to the images was found")

        num_missing_files = len(img_files) - len(img_and_target_path_list)
        if num_missing_files > 0:
            logger.warning(f"{num_missing_files} labels files were not loaded our of {len(img_files)} image files")

        self.img_and_target_path_list = img_and_target_path_list
        return len(self.img_and_target_path_list)

    def _load_annotation(self, sample_id: int) -> dict:
        """Load annotations associated to a specific sample.

        :param sample_id: Index into self.img_and_target_path_list
        :return: Annotation including:
                    - target in XYXY_LABEL format (rescaled to the resized image)
                    - img_path
                    - resized_img_shape
        """
        img_path, target_path = self.img_and_target_path_list[sample_id]
        with open(target_path, "r") as targets_file:
            target = np.array([x.split() for x in targets_file.read().splitlines()], dtype=np.float32)

        # A label file is empty when the image holds no kept object (`download` writes such files when
        # every object is filtered out). np.array then yields a 1-D array, which the 2-D slicing below
        # cannot index, so use an explicit "no box" array with the 5 XYXY_LABEL columns instead.
        if target.size == 0:
            target = np.zeros((0, 5), dtype=np.float32)

        height, width = get_image_size_from_path(img_path)

        # We have to rescale the targets because the images will be resized.
        # NOTE(review): input_dim[1] is paired with height and input_dim[0] with width - confirm the
        # expected ordering of input_dim for non-square inputs.
        r = min(self.input_dim[1] / height, self.input_dim[0] / width)
        target[:, :4] *= r

        resized_img_shape = (int(height * r), int(width * r))

        return {"img_path": img_path, "target": target, "resized_img_shape": resized_img_shape}

    @staticmethod
    def download(data_dir: str) -> None:
        """Download Pascal dataset in XYXY_LABEL format.

        Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

        The archives are extracted under <data_dir>/images; then, for every (year, image set), the images
        are moved to <data_dir>/images/<set><year> and one label file per image is written to
        <data_dir>/labels/<set><year>.

        :param data_dir: Root directory where the dataset will be stored.
        """

        def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
            """Parse and save the labels of an image in XYXY_LABEL format."""

            with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
                xml_parser = ElementTree.parse(f).getroot()

            labels = []
            for obj in xml_parser.iter("object"):
                cls = obj.find("name").text
                # Objects of unknown classes and objects flagged as difficult are skipped.
                if cls in PASCAL_VOC_2012_CLASSES_LIST and not int(obj.find("difficult").text) == 1:
                    xml_box = obj.find("bndbox")

                    def get_coord(box_coord):
                        return xml_box.find(box_coord).text

                    xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                    labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

            # The file is written even when no object was kept, which leaves it empty.
            with open(new_label_path, "w") as f:
                f.write("\n".join(labels))

        urls = [
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
        ]  # 1.86G, 17125 images
        data_dir = Path(data_dir)
        download_and_untar_from_url(urls, dir=data_dir / "images")

        # Convert
        data_path = data_dir / "images" / "VOCdevkit"
        for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
            dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
            dest_imgs_path.mkdir(exist_ok=True, parents=True)

            dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
            dest_labels_path.mkdir(exist_ok=True, parents=True)

            with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
                image_ids = f.read().strip().split()

            # `image_id` rather than `id`, to avoid shadowing the builtin.
            for image_id in tqdm(image_ids, desc=f"{image_set}{year}"):
                img_path = data_path / f"VOC{year}/JPEGImages/{image_id}.jpg"
                new_img_path = dest_imgs_path / img_path.name
                new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
                img_path.rename(new_img_path)  # Move image to dest folder
                _parse_and_save_labels(data_path, new_label_path, year, image_id)

__init__(images_sub_directory, download=False, *args, **kwargs)

Dataset for Pascal VOC object detection

Parameters:

Name Type Description Default
images_sub_directory str

Sub directory of data_dir that includes images.

required
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def __init__(self, images_sub_directory: str, download: bool = False, *args, **kwargs):
    """Set up a Pascal VOC detection dataset rooted at kwargs["data_dir"].

    :param images_sub_directory:    Sub directory of data_dir that includes images.
    :param download:                If True, download the dataset into data_dir before initializing.
    :param kwargs:                  Forwarded to the parent class; `data_dir` is mandatory, while
                                    `original_target_format` and `all_classes_list` are always overridden.
    """
    self.images_sub_directory = images_sub_directory
    self.img_and_target_path_list = None

    if kwargs.get("data_dir") is None:
        raise ValueError("Must pass data_dir != None through **kwargs")

    if download:
        PascalVOCDetectionDataset.download(kwargs["data_dir"])

    # Pascal VOC labels are stored as XYXY_LABEL and always use the fixed VOC class list.
    kwargs.update(
        original_target_format=DetectionTargetsFormat.XYXY_LABEL,
        all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
    )
    super().__init__(*args, **kwargs)

download(data_dir) staticmethod

Download Pascal dataset in XYXY_LABEL format.

Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
@staticmethod
def download(data_dir: str) -> None:
    """Download Pascal dataset in XYXY_LABEL format.

    Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

    The archives are extracted under <data_dir>/images. Then, for every (year, image set) pair, each image
    is moved to <data_dir>/images/<set><year> and its label file is written to <data_dir>/labels/<set><year>.

    :param data_dir: Root directory where the dataset will be stored.
    """

    def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
        """Parse and save the labels of an image in XYXY_LABEL format."""
        with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as annotation_file:
            root = ElementTree.parse(annotation_file).getroot()

        rows = []
        for obj in root.iter("object"):
            class_name = obj.find("name").text
            # Skip objects of unknown classes and objects flagged as difficult.
            if class_name not in PASCAL_VOC_2012_CLASSES_LIST or int(obj.find("difficult").text) == 1:
                continue

            bndbox = obj.find("bndbox")
            coords = [bndbox.find(tag).text for tag in ("xmin", "ymin", "xmax", "ymax")]
            class_index = str(PASCAL_VOC_2012_CLASSES_LIST.index(class_name))
            rows.append(" ".join(coords + [class_index]))

        with open(new_label_path, "w") as label_file:
            label_file.write("\n".join(rows))

    archive_urls = [
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",  # 1.86G, 17125 images
    ]
    root_dir = Path(data_dir)
    images_root = root_dir / "images"
    download_and_untar_from_url(archive_urls, dir=images_root)

    # Convert the extracted devkit layout into per-split image and label folders.
    devkit_dir = images_root / "VOCdevkit"
    splits = (("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"))
    for year, image_set in splits:
        split_name = f"{image_set}{year}"

        images_dest = images_root / split_name
        images_dest.mkdir(exist_ok=True, parents=True)

        labels_dest = root_dir / "labels" / split_name
        labels_dest.mkdir(exist_ok=True, parents=True)

        with open(devkit_dir / f"VOC{year}/ImageSets/Main/{image_set}.txt") as ids_file:
            image_ids = ids_file.read().strip().split()

        for image_id in tqdm(image_ids, desc=split_name):
            source_img = devkit_dir / f"VOC{year}/JPEGImages/{image_id}.jpg"
            moved_img = images_dest / source_img.name
            label_path = (labels_dest / source_img.name).with_suffix(".txt")
            source_img.rename(moved_img)  # Move (not copy) the image to its split folder
            _parse_and_save_labels(devkit_dir, label_path, year, image_id)

PascalVOCUnifiedDetectionTrainDataset

Bases: ConcatDataset

Unified Dataset for Pascal VOC object detection

To use this Dataset you need to: >> train_set = PascalVOCUnifiedDetectionTrainDataset(download=True, ...)

Dataset structure: ├─images │ ├─ train2012 │ ├─ val2012 │ ├─ VOCdevkit │ │ ├─ VOC2007 │ │ │ ├──JPEGImages │ │ │ ├──SegmentationClass │ │ │ ├──ImageSets │ │ │ ├──ImageSets/Segmentation │ │ │ ├──ImageSets/Main │ │ │ ├──ImageSets/Layout │ │ │ ├──Annotations │ │ │ └──SegmentationObject │ │ └──VOC2012 │ │ ├──JPEGImages │ │ ├──SegmentationClass │ │ ├──ImageSets │ │ ├──ImageSets/Segmentation │ │ ├──ImageSets/Main │ │ ├──ImageSets/Action │ │ ├──ImageSets/Layout │ │ ├──Annotations │ │ └──SegmentationObject │ ├─train2007 │ ├─test2007 │ └─val2007 └─labels ├─train2012 ├─val2012 ├─train2007 ├─test2007 └─val2007

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
class PascalVOCUnifiedDetectionTrainDataset(ConcatDataset):
    """Unified Dataset for Pascal VOC object detection

    Concatenates the "train2007", "val2007", "train2012" and "val2012" PascalVOCDetectionDataset splits.

    To use this Dataset you need to:
        >> train_set = PascalVOCUnifiedDetectionTrainDataset(download=True, ...)

    Dataset structure:
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007
    """

    def __init__(
        self,
        data_dir: str,
        input_dim: tuple,
        cache: bool = False,
        cache_dir: str = None,
        transforms: Optional[List[DetectionTransform]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        max_num_samples: int = None,
        download: bool = False,
    ):
        """
        :param data_dir:                Root directory of the Pascal VOC dataset.
        :param input_dim:               Forwarded to every underlying PascalVOCDetectionDataset.
        :param cache:                   Forwarded to every underlying PascalVOCDetectionDataset.
        :param cache_dir:               Forwarded to every underlying PascalVOCDetectionDataset.
        :param transforms:              Transforms shared by every underlying dataset. Defaults to no transform.
        :param class_inclusion_list:    Forwarded to every underlying PascalVOCDetectionDataset.
        :param max_num_samples:         Total sample budget, divided between the underlying datasets.
                                        A falsy value (None or 0) sets no limit.
        :param download:                If True, download the dataset into data_dir first.
        """
        # None instead of a mutable `[]` default, which would be one list shared by every instantiation.
        if transforms is None:
            transforms = []

        if download:
            PascalVOCDetectionDataset.download(data_dir=data_dir)

        train_dataset_names = ["train2007", "val2007", "train2012", "val2012"]
        # We divide train_max_num_samples between the datasets
        if max_num_samples:
            # NOTE(review): a budget smaller than the number of datasets hands 0 to some of them; confirm
            # how the underlying dataset interprets max_num_samples=0.
            max_num_samples_per_train_dataset = [len(segment) for segment in np.array_split(range(max_num_samples), len(train_dataset_names))]
        else:
            max_num_samples_per_train_dataset = [None] * len(train_dataset_names)
        train_sets = [
            PascalVOCDetectionDataset(
                data_dir=data_dir,
                input_dim=input_dim,
                cache=cache,
                cache_dir=cache_dir,
                transforms=transforms,
                images_sub_directory="images/" + trainset_name + "/",
                class_inclusion_list=class_inclusion_list,
                max_num_samples=max_num_samples_per_train_dataset[i],
            )
            for i, trainset_name in enumerate(train_dataset_names)
        ]
        super(PascalVOCUnifiedDetectionTrainDataset, self).__init__(train_sets)

RoboflowDetectionDataset

Bases: COCOFormatDetectionDataset

Dataset that can be used with ANY of the Roboflow100 benchmark datasets for object detection. Checkout the datasets at https://universe.roboflow.com/roboflow-100?ref=blog.roboflow.com

To use this Dataset you need to:

- Follow the official instructions to download Roboflow100: https://github.com/roboflow/roboflow-100-benchmark?ref=roboflow-blog
    //!\ To use this dataset, you have to download the "coco" format, NOT the yolov5.

- Your dataset should look like this:
    rf100
    ├── 4-fold-defect
    │      ├─ train
    │      │    ├─ 000000000001.jpg
    │      │    ├─ ...
    │      │    └─ _annotations.coco.json
    │      ├─ valid
    │      │    └─ ...
    │      └─ test
    │           └─ ...
    ├── abdomen-mri
    │      └─ ...
    └── ...

- Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

- Instantiate the dataset (in this case we load the dataset called "digits-t2eg6")"
    >> train_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="train")
    >> valid_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="valid")

Note: dataset_name refers to the official name of the dataset. You can run RoboflowDetectionDataset.list_datasets() to see all available datasets, OR you can find it in the URL of the dataset: https://universe.roboflow.com/roboflow-100/digits-t2eg6 -> digits-t2eg6

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/roboflow/roboflow100.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class RoboflowDetectionDataset(COCOFormatDetectionDataset):
    """Detection dataset usable with ANY of the Roboflow100 benchmark datasets.
    Checkout the datasets at https://universe.roboflow.com/roboflow-100?ref=blog.roboflow.com

    To use this Dataset you need to:

        - Follow the official instructions to download Roboflow100: https://github.com/roboflow/roboflow-100-benchmark?ref=roboflow-blog
            WARNING: you have to download the "coco" format, NOT the yolov5 one.

        - Your dataset should look like this:
            rf100
            ├── 4-fold-defect
            │      ├─ train
            │      │    ├─ 000000000001.jpg
            │      │    ├─ ...
            │      │    └─ _annotations.coco.json
            │      ├─ valid
            │      │    └─ ...
            │      └─ test
            │           └─ ...
            ├── abdomen-mri
            │      └─ ...
            └── ...

        - Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

        - Instantiate the dataset (in this case we load the dataset called "digits-t2eg6")
            >> train_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="train")
            >> valid_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="valid")

    Note: `dataset_name` refers to the official name of the dataset. You can run RoboflowDetectionDataset.list_datasets()
          to see all available datasets, OR you can find it in the url of the dataset:
          https://universe.roboflow.com/roboflow-100/digits-t2eg6 -> digits-t2eg6
    """

    def __init__(self, data_dir: str, dataset_name: str, split: str, *args, **kwargs):
        """
        :param data_dir:        Where the data is stored.
        :param dataset_name:    Name of one of the 100 datasets. (You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)
        :param split:           train, valid or test.
        :raises ValueError:     If split is not one of the three supported values.
        """
        supported_splits = ("train", "valid", "test")
        if split not in supported_splits:
            raise ValueError(f"split must be one of ('train', 'valid', 'test'). Got '{split}'.")

        self.dataset_name = dataset_name

        # Images and their COCO annotation file live side by side in <dataset_name>/<split>.
        split_dir = os.path.join(dataset_name, split)
        annotation_path = os.path.join(split_dir, "_annotations.coco.json")

        super().__init__(
            *args,
            data_dir=data_dir,
            json_annotation_file=annotation_path,
            images_dir=split_dir,
            class_ids_to_ignore=[0],
            **kwargs,
        )

    @staticmethod
    def list_datasets(categories: Optional[List[str]] = None) -> List[str]:
        """List all available datasets of specified categories. By default, list all the datasets."""
        # Inside a method body this name resolves to the module-level helper, not to this staticmethod.
        dataset_names = list_datasets(categories=categories)
        return dataset_names

    @property
    def metadata(self) -> Optional[Dict[str, Union[str, int]]]:
        """Metadata of this dataset, looked up by its name (None when no metadata is found)."""
        dataset_metadata = get_dataset_metadata(self.dataset_name)
        return dataset_metadata

metadata: Optional[Dict[str, Union[str, int]]] property

Category of the dataset. Note that each dataset has one and only one category.

__init__(data_dir, dataset_name, split, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
dataset_name str

One of the 100 dataset name. (You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)

required
split str

train, valid or test.

required
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/roboflow/roboflow100.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(self, data_dir: str, dataset_name: str, split: str, *args, **kwargs):
    """
    :param data_dir:        Where the data is stored.
    :param dataset_name:    Name of one of the 100 datasets. (You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)
    :param split:           train, valid or test.
    :raises ValueError:     If split is not one of the three supported values.
    """
    supported_splits = ("train", "valid", "test")
    if split not in supported_splits:
        raise ValueError(f"split must be one of ('train', 'valid', 'test'). Got '{split}'.")

    self.dataset_name = dataset_name

    # Images and their COCO annotation file live side by side in <dataset_name>/<split>.
    split_dir = os.path.join(dataset_name, split)
    annotation_path = os.path.join(split_dir, "_annotations.coco.json")

    super().__init__(
        *args,
        data_dir=data_dir,
        json_annotation_file=annotation_path,
        images_dir=split_dir,
        class_ids_to_ignore=[0],
        **kwargs,
    )

list_datasets(categories=None) staticmethod

List all available datasets of specified categories. By default, list all the datasets.

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/roboflow/roboflow100.py
60
61
62
63
@staticmethod
def list_datasets(categories: Optional[List[str]] = None) -> List[str]:
    """Return the names of the available datasets, optionally restricted to the given categories.

    Thin wrapper forwarding `categories` to `list_datasets`; with the default (None), every dataset is listed.
    """
    dataset_names = list_datasets(categories=categories)
    return dataset_names

get_dataset_metadata(dataset_name)

Get the metadata of a specific roboflow dataset.

Parameters:

Name Type Description Default
dataset_name str

Name of the dataset, as listed in the official repo - https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv

required

Returns:

Type Description
Optional[Dict[str, Union[str, int]]]

Metadata of the dataset

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/roboflow/utils.py
15
16
17
18
19
20
21
22
23
24
def get_dataset_metadata(dataset_name: str) -> Optional[Dict[str, Union[str, int]]]:
    """Look up the metadata of a specific roboflow dataset.

    An unknown name is not an error: a warning is logged and None is returned.

    :param dataset_name: Name of the dataset, as listed in the official repo -
                            https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv
    :return:             Metadata of the dataset, or None when the name is not in DATASETS_METADATA
    """
    metadata = DATASETS_METADATA.get(dataset_name)
    if metadata is not None:
        return metadata

    logger.warning(f"No metadata found for dataset_name={dataset_name}. This might be due to a recent change in the dataset name.")
    return None

get_dataset_num_classes(dataset_name)

Get the number of classes of a specific roboflow dataset.

Parameters:

Name Type Description Default
dataset_name str

Name of the dataset, as listed in the official repo - https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv

required

Returns:

Type Description
int

Number of classes of the dataset. Note that the number of classes in the official documentation is different to the actual one.

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/roboflow/utils.py
27
28
29
30
31
32
33
34
35
36
def get_dataset_num_classes(dataset_name: str) -> int:
    """Return how many classes a roboflow dataset has.

    :param dataset_name: Name of the dataset, as listed in the official repo -
                            https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv
    :return:             Value stored under "num_classes_found" in the dataset metadata.
                         Note that the number of classes in the official documentation is different to the actual one.
    :raises ValueError:  When no metadata is available for `dataset_name`.
    """
    dataset_metadata = get_dataset_metadata(dataset_name)
    if dataset_metadata is not None:
        return dataset_metadata["num_classes_found"]
    raise ValueError(f"No num_classes found for dataset_name={dataset_name}. This might be due to a recent change in the dataset name.")

list_datasets(categories=None)

List all available datasets of specified categories. By default, list all the datasets.

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/roboflow/utils.py
 9
10
11
12
def list_datasets(categories: Optional[List[str]] = None) -> List[str]:
    """List all available datasets of specified categories. By default, list all the datasets.

    :param categories: Categories to keep. When None (or empty), every category in `DATASETS_CATEGORIES` is used.
    :return:           Names of the datasets whose metadata "category" is one of `categories`.
    """
    # `None` is the documented default, so the annotation is Optional (matches the static-method wrapper of the same name).
    categories = categories or DATASETS_CATEGORIES
    return [dataset_name for dataset_name, metadata in DATASETS_METADATA.items() if metadata["category"] in categories]

YoloDarknetFormatDetectionDataset

Bases: DetectionDataset

Base dataset to load ANY dataset that has a structure similar to the Yolo/Darknet dataset.

Note: For compatibility reasons, the dataset returns labels in Coco format (XYXY_LABEL) and NOT in Yolo format (LABEL_CXCYWH).

The dataset can have any structure, as long as images_dir and labels_dir are inside data_dir. Each image is expected to have a label file with the same base name as the image.

Example1: data_dir ├── images │ ├─ 0001.jpg │ ├─ 0002.jpg │ └─ ... └── labels ├─ 0001.txt ├─ 0002.txt └─ ... >> data_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="images", labels_dir="labels", classes=[<to-fill>])

Example2: data_dir ├── train │ ├── images │ │ ├─ 0001.jpg │ │ ├─ 0002.jpg │ │ └─ ... │ └── labels │ ├─ 0001.txt │ ├─ 0002.txt │ └─ ... └── val ├── images │ ├─ 434343.jpg │ ├─ 434344.jpg │ └─ ... └── labels ├─ 434343.txt ├─ 434344.txt └─ ...

>> train_set = YoloDarknetFormatDetectionDataset(
        data_dir='<path-to>/data_dir', images_dir="train/images", labels_dir="train/labels", classes=[<to-fill>]
    )
>> val_set = YoloDarknetFormatDetectionDataset(
        data_dir='<path-to>/data_dir', images_dir="val/images", labels_dir="val/labels", classes=[<to-fill>]
    )

Example3: data_dir ├── train │ ├─ 0001.jpg │ ├─ 0001.txt │ ├─ 0002.jpg │ ├─ 0002.txt │ └─ ... └── val ├─ 4343.jpg ├─ 4343.txt ├─ 4344.jpg ├─ 4344.txt └─ ...

>> train_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="train", labels_dir="train", classes=[<to-fill>])
>> val_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="val", labels_dir="val", classes=[<to-fill>])

Each label file being in LABEL_NORMALIZED_CXCYWH format: 0 0.33 0.33 0.50 0.44 1 0.21 0.54 0.30 0.60 ...

Output format: XYXY_LABEL (x, y, x, y, class_id)

Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
class YoloDarknetFormatDetectionDataset(DetectionDataset):
    """Base dataset to load ANY dataset that has a structure similar to the Yolo/Darknet dataset.

    **Note**: For compatibility reasons, the dataset returns labels in Coco format (XYXY_LABEL) and NOT in Yolo format (LABEL_CXCYWH).

    The dataset can have any structure, as long as `images_dir` and `labels_dir` are inside `data_dir`.
    Each image is expected to have a label file with the same base name as the image.

    Example1:
        data_dir
        ├── images
        │      ├─ 0001.jpg
        │      ├─ 0002.jpg
        │      └─ ...
        └── labels
               ├─ 0001.txt
               ├─ 0002.txt
               └─ ...
        >> data_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="images", labels_dir="labels", classes=[<to-fill>])

    Example2:
        data_dir
        ├── train
        │   ├── images
        │   │      ├─ 0001.jpg
        │   │      ├─ 0002.jpg
        │   │      └─ ...
        │   └── labels
        │          ├─ 0001.txt
        │          ├─ 0002.txt
        │          └─ ...
        └── val
            ├── images
            │      ├─ 434343.jpg
            │      ├─ 434344.jpg
            │      └─ ...
            └── labels
                   ├─ 434343.txt
                   ├─ 434344.txt
                   └─ ...

        >> train_set = YoloDarknetFormatDetectionDataset(
                data_dir='<path-to>/data_dir', images_dir="train/images", labels_dir="train/labels", classes=[<to-fill>]
            )
        >> val_set = YoloDarknetFormatDetectionDataset(
                data_dir='<path-to>/data_dir', images_dir="val/images", labels_dir="val/labels", classes=[<to-fill>]
            )

    Example3:
        data_dir
        ├── train
        │      ├─ 0001.jpg
        │      ├─ 0001.txt
        │      ├─ 0002.jpg
        │      ├─ 0002.txt
        │      └─ ...
        └── val
               ├─ 4343.jpg
               ├─ 4343.txt
               ├─ 4344.jpg
               ├─ 4344.txt
               └─ ...

        >> train_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="train", labels_dir="train", classes=[<to-fill>])
        >> val_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="val", labels_dir="val", classes=[<to-fill>])

    Each label file being in LABEL_NORMALIZED_CXCYWH format:
        0 0.33 0.33 0.50 0.44
        1 0.21 0.54 0.30 0.60
        ...


    Output format: XYXY_LABEL (x, y, x, y, class_id)
    """

    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        labels_dir: str,
        classes: List[str],
        class_ids_to_ignore: Optional[List[int]] = None,
        ignore_invalid_labels: bool = True,
        show_all_warnings: bool = False,
        *args,
        **kwargs,
    ):
        """
        :param data_dir:                Where the data is stored.
        :param images_dir:              Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
        :param labels_dir:              Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
        :param classes:                 List of class names.
        :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, does not ignore any class.
        :param ignore_invalid_labels:   Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.
        :param show_all_warnings:       Whether to show every warning emitted by the yolo format parser.
        """
        # NOTE(review): these attributes are set before `super().__init__`, presumably because the base-class
        # constructor invokes hooks such as `_setup_data_source` that read them - confirm against DetectionDataset.
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.class_ids_to_ignore = class_ids_to_ignore or []
        self.classes = classes
        self.ignore_invalid_labels = ignore_invalid_labels
        self.show_all_warnings = show_all_warnings

        # These three values are fixed by this dataset; any value passed by the caller is overridden.
        kwargs["target_fields"] = ["target"]
        kwargs["output_fields"] = ["image", "target"]
        kwargs["original_target_format"] = XYXY_LABEL  # We convert yolo format (LABEL_CXCYWH) to Coco format (XYXY_LABEL) when loading the annotation
        super().__init__(data_dir=data_dir, show_all_warnings=show_all_warnings, *args, **kwargs)

    @property
    def _all_classes(self) -> List[str]:
        """Class names, exactly as passed to the constructor."""
        return self.classes

    def _setup_data_source(self) -> int:
        """Initialize `images_file_names` / `labels_file_names` and warn about images and label files that cannot be paired.

        Only images that have a label file with the same base name (file name without extension) are kept.

        :return: number of images in the dataset
        """
        self.images_folder = os.path.join(self.data_dir, self.images_dir)
        self.labels_folder = os.path.join(self.data_dir, self.labels_dir)

        all_images_file_names = [image_name for image_name in os.listdir(self.images_folder) if is_image(image_name)]
        all_labels_file_names = [label_name for label_name in os.listdir(self.labels_folder) if label_name.endswith(".txt")]

        def remove_file_extension(file_name: str) -> str:
            return os.path.splitext(os.path.basename(file_name))[0]

        unique_image_file_base_names = {remove_file_extension(image_file_name) for image_file_name in all_images_file_names}
        unique_label_file_base_names = {remove_file_extension(label_file_name) for label_file_name in all_labels_file_names}

        images_not_in_labels = unique_image_file_base_names - unique_label_file_base_names
        if images_not_in_labels:
            logger.warning(f"{len(images_not_in_labels)} images are not associated to any label file.")

        labels_not_in_images = unique_label_file_base_names - unique_image_file_base_names
        if labels_not_in_images:
            logger.warning(f"{len(labels_not_in_images)} label files are not associated to any image.")

        # Only keep names that are in both the images and the labels
        valid_base_names = unique_image_file_base_names & unique_label_file_base_names

        # Summarize only when one of the two warnings above was emitted, so that "As a consequence" always
        # follows its cause (this also covers the case where only label files are orphaned).
        if images_not_in_labels or labels_not_in_images:
            logger.warning(
                f"As a consequence, "
                f"{len(valid_base_names)}/{len(all_images_file_names)} images and "
                f"{len(valid_base_names)}/{len(all_labels_file_names)} label files will be used."
            )

        self.images_file_names = []
        self.labels_file_names = []
        for image_full_name in all_images_file_names:
            base_name = remove_file_extension(image_full_name)
            if base_name in valid_base_names:
                self.images_file_names.append(image_full_name)
                self.labels_file_names.append(base_name + ".txt")
        return len(self.images_file_names)

    def _load_annotation(self, sample_id: int) -> dict:
        """Load relevant information of a specific image.

        :param sample_id:   Sample_id in the dataset
        :return:            Dictionary with the following keys:
            - "target":             Target Bboxes (detection) in XYXY_LABEL format
            - "initial_img_shape":  Image (height, width)
            - "resized_img_shape":  Resized image (height, width)
            - "img_path":           Path to the associated image
            - "id":                 np.ndarray holding `sample_id`
            - "n_invalid_labels":   Number of lines of the label file that failed to be parsed
        """
        image_path = os.path.join(self.images_folder, self.images_file_names[sample_id])
        label_path = os.path.join(self.labels_folder, self.labels_file_names[sample_id])

        # imagesize returns (width, height); shapes in this class are stored as (height, width).
        image_width, image_height = imagesize.get(image_path)
        image_shape = (image_height, image_width)

        yolo_format_target, invalid_labels = self._parse_yolo_label_file(
            label_file_path=label_path,
            num_classes=len(self.all_classes_list),
            ignore_invalid_labels=self.ignore_invalid_labels,
            show_warnings=self.show_all_warnings,
        )

        converter = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_CXCYWH, output_format=XYXY_LABEL, image_shape=image_shape)
        target = converter(yolo_format_target)

        # The base class includes a feature to resize the image, so we need to resize the target as well when self.input_dim is set.
        if self.input_dim is not None:
            # Single scale factor for both axes, i.e. the aspect ratio is preserved.
            r = min(self.input_dim[0] / image_height, self.input_dim[1] / image_width)
            target[:, :4] *= r
            resized_img_shape = (int(image_height * r), int(image_width * r))
        else:
            resized_img_shape = image_shape

        annotation = {
            "target": target,
            "initial_img_shape": image_shape,
            "resized_img_shape": resized_img_shape,
            "img_path": image_path,
            "id": np.array([sample_id]),
            "n_invalid_labels": len(invalid_labels),
        }
        return annotation

    @staticmethod
    def _parse_yolo_label_file(
        label_file_path: str,
        ignore_invalid_labels: bool = True,
        show_warnings: bool = True,
        num_classes: Optional[int] = None,
    ) -> Tuple[np.ndarray, List[str]]:
        """Parse a single label file in yolo format.

        Blank and whitespace-only lines are skipped; they are not reported as invalid labels.

        #TODO: Add support for additional fields (with ConcatenatedTensorFormat)
        :param label_file_path:         Path to the label file in yolo format.
        :param ignore_invalid_labels:   Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.
        :param show_warnings:           Whether to show the warnings or not.
        :param num_classes:             Number of classes in the dataset. Used to ensure that class ids are within the range [0, num_classes - 1].
                                        If None, ignore.

        :return:
            - labels:           np.ndarray of shape (n_labels, 5) in yolo format (LABEL_NORMALIZED_CXCYWH)
            - invalid_labels:   List of lines that failed to be parsed
        :raises RuntimeError:   When a line cannot be parsed and `ignore_invalid_labels` is False.
        """
        with open(label_file_path, "r") as f:
            lines = f.readlines()

        labels_yolo_format, invalid_labels = [], []
        for line in lines:
            # Skip empty lines, including lines that only contain spaces/tabs, instead of flagging them as invalid labels.
            if not line.strip():
                continue
            try:
                label_id, cx, cy, w, h = line.split()
                label_id, cx, cy, w, h = int(label_id), float(cx), float(cy), float(w), float(h)

                if (num_classes is not None) and (label_id not in range(num_classes)):
                    raise ValueError(f"`class_id={label_id}` invalid. It should be between (0 - {num_classes - 1}).")

                labels_yolo_format.append([label_id, cx, cy, w, h])
            except Exception as e:
                error_msg = (
                    f"Line `{line}` of file {label_file_path} will be ignored because it cannot be parsed to (label, cx, cy, w, h) format, "
                    f"with Exception:\n{e}"
                )
                if ignore_invalid_labels:
                    invalid_labels.append(line)
                    if show_warnings:
                        logger.warning(error_msg)
                else:
                    # Chain the original exception so the root cause stays visible in the traceback.
                    raise RuntimeError(error_msg) from e
        return np.array(labels_yolo_format) if labels_yolo_format else np.zeros((0, 5)), invalid_labels

__init__(data_dir, images_dir, labels_dir, classes, class_ids_to_ignore=None, ignore_invalid_labels=True, show_all_warnings=False, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
images_dir str

Local path to directory that includes all the images. Path relative to data_dir. Can be the same as labels_dir.

required
labels_dir str

Local path to directory that includes all the labels. Path relative to data_dir. Can be the same as images_dir.

required
classes List[str]

List of class names.

required
class_ids_to_ignore Optional[List[int]]

List of class ids to ignore in the dataset. By default, does not ignore any class.

None
ignore_invalid_labels bool

Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.

True
show_all_warnings bool

Whether to show every warning emitted by the yolo format parser.

False
Source code in V3_4/src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def __init__(
    self,
    data_dir: str,
    images_dir: str,
    labels_dir: str,
    classes: List[str],
    class_ids_to_ignore: Optional[List[int]] = None,
    ignore_invalid_labels: bool = True,
    show_all_warnings: bool = False,
    *args,
    **kwargs,
):
    """
    :param data_dir:                Root directory of the dataset.
    :param images_dir:              Directory holding the images, relative to `data_dir`. May be the same directory as `labels_dir`.
    :param labels_dir:              Directory holding the label files, relative to `data_dir`. May be the same directory as `images_dir`.
    :param classes:                 Names of the classes.
    :param class_ids_to_ignore:     Ids of the classes to ignore. No class is ignored when omitted.
    :param ignore_invalid_labels:   If True, labels that cannot be parsed are skipped with a warning; otherwise an error is raised.
    :param show_all_warnings:       Whether every warning of the yolo format parser should be shown.
    """
    # Store the configuration on the instance before handing over to the base-class constructor.
    self.classes = classes
    self.images_dir = images_dir
    self.labels_dir = labels_dir
    self.class_ids_to_ignore = class_ids_to_ignore if class_ids_to_ignore else []
    self.show_all_warnings = show_all_warnings
    self.ignore_invalid_labels = ignore_invalid_labels

    # Annotations are converted from yolo format (LABEL_CXCYWH) to Coco format (XYXY_LABEL) at load time,
    # so XYXY_LABEL is what gets declared as the original target format.
    kwargs.update(
        target_fields=["target"],
        output_fields=["image", "target"],
        original_target_format=XYXY_LABEL,
    )
    super().__init__(data_dir=data_dir, show_all_warnings=show_all_warnings, *args, **kwargs)

Mixup and Cutmix

Papers: mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412)

CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899)

Code Reference: CutMix: https://github.com/clovaai/CutMix-PyTorch CutMix by timm: https://github.com/rwightman/pytorch-image-models/timm

CollateMixup

Collate with Mixup/Cutmix that applies different params to each element or to the whole batch. A Mixup implementation that is performed while collating the batches.

Source code in V3_4/src/super_gradients/training/datasets/mixup.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
@register_collate_function()
class CollateMixup:
    """
    Collate with Mixup/Cutmix that applies different params to each element or whole batch
    A Mixup impl that's performed while collating the batches.
    """

    def __init__(
        self,
        mixup_alpha: float = 1.0,
        cutmix_alpha: float = 0.0,
        cutmix_minmax: List[float] = None,
        prob: float = 1.0,
        switch_prob: float = 0.5,
        mode: str = "batch",
        correct_lam: bool = True,
        label_smoothing: float = 0.1,
        num_classes: int = 1000,
    ):
        """
        Mixup/Cutmix that applies different params to each element or whole batch

        :param mixup_alpha: mixup alpha value, mixup is active if > 0.
        :param cutmix_alpha: cutmix alpha value, cutmix is active if > 0.
        :param cutmix_minmax: cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
        :param prob: probability of applying mixup or cutmix per batch or element
        :param switch_prob: probability of switching to cutmix instead of mixup when both are active
        :param mode: how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
        :param correct_lam: apply lambda correction when cutmix bbox clipped by image borders
        :param label_smoothing: apply label smoothing to the mixed target tensor
        :param num_classes: number of classes for target
        """
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha
        self.cutmix_minmax = cutmix_minmax
        if self.cutmix_minmax is not None:
            assert len(self.cutmix_minmax) == 2
            # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
            self.cutmix_alpha = 1.0
        self.mix_prob = prob
        self.switch_prob = switch_prob
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.mode = mode
        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
        self.mixup_enabled = True  # set to false to disable mixing (intended tp be set by train loop)

    def _params_per_elem(self, batch_size):
        """
        generate two random masks to define which elements of the batch will be mixed and how (depending on the
        self.mixup_enabled, self.mixup_alpha, self.cutmix_alpha parameters

        :param batch_size:
        :return: two tensors with shape=batch_size - the first contains the lambda value per batch element
        and the second is a binary flag indicating use of cutmix per batch element
        """
        lam = torch.ones(batch_size, dtype=torch.float32)
        use_cutmix = torch.zeros(batch_size, dtype=torch.bool)
        if self.mixup_enabled:
            if self.mixup_alpha > 0.0 and self.cutmix_alpha