Skip to content

Datasets

RandAugment RandAugment is a variant of AutoAugment which randomly selects transformations from AutoAugment to be applied on an image.

RandomAugmentation Implementation adapted from: https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py

Papers: RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719

AugmentOp

single auto augment operations

Source code in src/super_gradients/training/datasets/auto_augment.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
class AugmentOp:
    """
    A single auto-augment operation: applies one named transform to an image
    with probability `prob` and a (possibly noise-perturbed) magnitude.
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        hparams = hparams or _HPARAMS_DEFAULT
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        self.hparams = hparams.copy()
        # Per-op PIL kwargs: fill color and resample mode fall back to module defaults.
        self.kwargs = dict(
            fillcolor=hparams.get("img_mean", _FILL),
            resample=hparams.get("interpolation", _RANDOM_INTERPOLATION),
        )

        # A positive magnitude_std randomizes the magnitude on every call.
        self.magnitude_std = self.hparams.get("magnitude_std", 0)

    def __call__(self, img):
        # Probability gate: skip the op entirely with probability (1 - prob).
        if self.prob < 1.0 and random.random() > self.prob:
            return img
        level = self.magnitude
        if self.magnitude_std:
            if self.magnitude_std == float("inf"):
                # "infinite" std means: sample uniformly in [0, magnitude]
                level = random.uniform(0, level)
            elif self.magnitude_std > 0:
                level = random.gauss(level, self.magnitude_std)
        # clip to valid range
        level = min(_MAX_MAGNITUDE, max(0, level))
        args = () if self.level_fn is None else self.level_fn(level, self.hparams)
        return self.aug_fn(img, *args, **self.kwargs)

RandAugment

Random auto augment class, will select auto augment transforms according to probability weights for each op

Source code in src/super_gradients/training/datasets/auto_augment.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
class RandAugment:
    """
    Random auto-augment: on each call, samples `num_layers` ops from `ops`
    (optionally weighted by `choice_weights`) and applies them in sequence.
    """

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops = ops
        self.num_layers = num_layers
        self.choice_weights = choice_weights

    def __call__(self, img):
        # Sampling is WITHOUT replacement whenever explicit weights are given.
        with_replacement = self.choice_weights is None
        chosen = np.random.choice(self.ops, self.num_layers, replace=with_replacement, p=self.choice_weights)
        for transform in chosen:
            img = transform(img)
        return img

rand_augment_transform(config_str, crop_size, img_mean)

Create a RandAugment transform

Parameters:

Name Type Description Default
config_str

String defining configuration of random augmentation. Consists of multiple sections separated by dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining sections, not order specific, determine 'm' - integer magnitude of rand augment 'n' - integer num layers (number of transform ops selected per image) 'w' - integer probability weight index (index of a set of weights to influence choice of op) 'mstd' - float std deviation of magnitude noise applied 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

required
crop_size int

The size of crop image

required
img_mean List[float]

Average per channel

required

Returns:

Type Description

A PyTorch compatible Transform

Source code in src/super_gradients/training/datasets/auto_augment.py
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
@register_transform(Transforms.RandAugmentTransform)
def rand_augment_transform(config_str, crop_size: int, img_mean: List[float]):
    """
    Create a RandAugment transform

    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

    :param crop_size: The size of crop image
    :param img_mean:  Average per channel

    :return: A PyTorch compatible Transform
    """
    hparams = dict(translate_const=int(crop_size * 0.45), img_mean=tuple([min(255, round(255 * channel_mean)) for channel_mean in img_mean]))

    magnitude = _MAX_MAGNITUDE  # default to _MAX_MAGNITUDE for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS
    config = config_str.split("-")
    for c in config:
        # Split each section into a key prefix and its numeric tail, e.g. 'mstd0.5' -> ('mstd', '0.5').
        cs = re.split(r"(\d.*)", c)
        if len(cs) < 2:
            # section with no numeric value (e.g. the leading 'rand') - nothing to parse
            continue
        key, val = cs[:2]
        if key == "mstd":
            # noise param injected via hparams for now
            hparams.setdefault("magnitude_std", float(val))
        elif key == "inc":
            # BUGFIX: was `bool(val)`, which is True for the non-empty string "0",
            # so 'inc0' wrongly enabled increasing transforms. Parse as int instead.
            if int(val):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == "m":
            magnitude = int(val)
        elif key == "n":
            num_layers = int(val)
        elif key == "w":
            weight_idx = int(val)
        else:
            # Raise instead of `assert False` so the check survives `python -O`.
            raise ValueError(f"Unknown RandAugment config section: {key}")
    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms)
    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)

Cifar10

Bases: CIFAR10, HasPreprocessingParams

CIFAR10 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
@register_dataset(Datasets.CIFAR_10)
class Cifar10(CIFAR10, HasPreprocessingParams):
    """
    CIFAR10 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """

    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        # Backward compatibility: a bare list of transforms is wrapped with torchvision's
        # Compose. To be removed once torchvision/native transform handling is aligned
        # in the factories (i.e. stating Compose in configs).
        wrapped = Compose(transforms) if isinstance(transforms, list) else transforms

        super().__init__(
            root=root,
            train=train,
            transform=wrapped,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """
        processings = get_torchvision_transforms_equivalent_processing(self.transforms)
        return dict(
            image_processor={Processings.ComposeProcessing: {"processings": processings}},
            class_names=self.classes,
        )

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
48
49
50
51
52
53
54
55
56
57
58
59
60
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    # Translate the torchvision transforms attached to this dataset into the
    # equivalent list of SG processing steps.
    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

Cifar100

Bases: CIFAR100, HasPreprocessingParams

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@register_dataset(Datasets.CIFAR_100)
class Cifar100(CIFAR100, HasPreprocessingParams):
    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        """
        CIFAR100 Dataset

        :param root:                    Path for the data to be extracted
        :param train:                   Bool to load training (True) or validation (False) part of the dataset
        :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
        :param target_transform:        Transform to apply to target output
        :param download:                Download (True) the dataset from source
        """
        # Backward compatibility: a bare list of transforms is wrapped with torchvision's
        # Compose. To be removed once torchvision/native transform handling is aligned
        # in the factories (i.e. stating Compose in configs).
        composed = Compose(transforms) if isinstance(transforms, list) else transforms

        super().__init__(
            root=root,
            train=train,
            transform=composed,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """
        processings = get_torchvision_transforms_equivalent_processing(self.transforms)
        return dict(
            image_processor={Processings.ComposeProcessing: {"processings": processings}},
            class_names=self.classes,
        )

__init__(root, train=True, transforms=None, target_transform=None, download=False)

CIFAR100 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
@resolve_param("transforms", TransformsFactory())
def __init__(
    self,
    root: str,
    train: bool = True,
    transforms: Union[list, dict] = None,
    target_transform: Optional[Callable] = None,
    download: bool = False,
) -> None:
    """
    CIFAR100 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """
    # Backward compatibility: a bare list of transforms is wrapped with torchvision's
    # Compose. To be removed once torchvision/native transform handling is aligned
    # in the factories (i.e. stating Compose in configs).
    if isinstance(transforms, list):
        transforms = Compose(transforms)

    super(Cifar100, self).__init__(
        root=root,
        train=train,
        transform=transforms,
        target_transform=target_transform,
        download=download,
    )

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    # Translate the torchvision transforms attached to this dataset into the
    # equivalent list of SG processing steps.
    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

ImageNetDataset

Bases: torch_datasets.ImageFolder, HasPreprocessingParams

ImageNetDataset dataset.

To use this Dataset you need to:

  • Download imagenet dataset (https://image-net.org/download.php) Imagenet ├──train │ ├──n02093991 │ │ ├──n02093991_1001.JPEG │ │ ├──n02093991_1004.JPEG │ │ └──... │ ├──n02093992 │ └──... └──val ├──n02093991 ├──n02093992 └──...

  • Instantiate the dataset: >> train_set = ImageNetDataset(root='.../Imagenet/train', ...) >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)

Source code in src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@register_dataset(Datasets.IMAGENET_DATASET)
class ImageNetDataset(torch_datasets.ImageFolder, HasPreprocessingParams):
    """ImageNetDataset dataset.

    To use this Dataset you need to:

    - Download imagenet dataset (https://image-net.org/download.php)
        Imagenet
         ├──train
         │  ├──n02093991
         │  │   ├──n02093991_1001.JPEG
         │  │   ├──n02093991_1004.JPEG
         │  │   └──...
         │  ├──n02093992
         │  └──...
         └──val
            ├──n02093991
            ├──n02093992
            └──...

    - Instantiate the dataset:
        >> train_set = ImageNetDataset(root='.../Imagenet/train', ...)
        >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)
    """

    @resolve_param("transforms", factory=TransformsFactory())
    def __init__(self, root: str, transforms: Union[list, dict] = None, *args, **kwargs):
        """
        :param root:        Root directory of the split (e.g. '.../Imagenet/train')
        :param transforms:  List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
        """
        # BUGFIX: the default used to be the mutable `[]`; use a None sentinel and
        # reproduce the historical default explicitly (an empty list is wrapped below).
        if transforms is None:
            transforms = []
        # TO KEEP BACKWARD COMPATABILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALLIGN TORCHVISION/NATIVE TRANSFORMS
        # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
        if isinstance(transforms, list):
            transforms = Compose(transforms)
        super(ImageNetDataset, self).__init__(root, transform=transforms, *args, **kwargs)

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """

        pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
        params = dict(
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            class_names=self.classes,
        )
        return params

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
47
48
49
50
51
52
53
54
55
56
57
58
59
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    # Translate the torchvision transforms attached to this dataset into the
    # equivalent list of SG processing steps.
    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

get_torchvision_transforms_equivalent_processing(transforms)

Get the equivalent processing pipeline for torchvision transforms.

Returns:

Type Description
List[Dict[str, Any]]

List of Processings operations

Source code in src/super_gradients/training/datasets/classification_datasets/torchvision_utils.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def get_torchvision_transforms_equivalent_processing(transforms: List[Any]) -> List[Dict[str, Any]]:
    """
    Get the equivalent processing pipeline for torchvision transforms.

    Supported transforms: ToTensor, Normalize, Resize, CenterCrop. Any other
    transform raises ValueError.

    :param transforms: Transforms attached to the dataset (a raw list, a Compose,
                       or a torchvision StandardTransform wrapper).
    :return: List of Processings operations
    """
    # Since we are using cv2.imread to read images, our model in fact is trained on BGR images.
    # In our pipelines the convention that input images are RGB, so we need to reverse the channels to get BGR
    # to match with the expected input of the model.
    pipeline = []

    # Unwrap torchvision's StandardTransform / Compose containers to get the plain list.
    if isinstance(transforms, StandardTransform):
        transforms = transforms.transform

    if isinstance(transforms, Compose):
        transforms = transforms.transforms

    for transform in transforms:
        if isinstance(transform, ToTensor):
            # ToTensor scales uint8 [0, 255] to float [0, 1]; StandardizeImage mirrors that.
            pipeline.append({Processings.StandardizeImage: {"max_value": 255}})
        elif isinstance(transform, Normalize):
            pipeline.append({Processings.NormalizeImage: {"mean": tuple(map(float, transform.mean)), "std": tuple(map(float, transform.std))}})
        elif isinstance(transform, Resize):
            # NOTE(review): int(transform.size) assumes a scalar size; a tuple-valued
            # Resize size would raise TypeError here rather than the ValueError below — confirm intended.
            pipeline.append({Processings.Resize: {"size": int(transform.size)}})
        elif isinstance(transform, CenterCrop):
            pipeline.append({Processings.CenterCrop: {"size": int(transform.size)}})
        else:
            raise ValueError(f"Unsupported transform: {transform}")

    # Model input is channels-first: HWC -> CHW.
    pipeline.append({Processings.ImagePermute: {"permutation": (2, 0, 1)}})
    return pipeline

Lighting

Bases: object

Lighting noise (AlexNet-style PCA-based noise) Taken from fastai Imagenet training - https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103 To use: - training_params = {"imagenet_pca_aug": 0.1} - Default training_params arg is 0.0 ("don't use") - 0.1 is the default in the original paper

Source code in src/super_gradients/training/datasets/data_augmentation.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
@register_transform(Transforms.Lighting)
class Lighting(object):
    """
    Lighting noise (AlexNet-style, PCA-based noise).
    Taken from fastai Imagenet training -
    https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103
    To use:
        - training_params = {"imagenet_pca_aug": 0.1}
        - the default training_params value is 0.0 ("don't use")
        - 0.1 is the default used in the original paper
    """

    def __init__(self, alphastd, eigval=IMAGENET_PCA["eigval"], eigvec=IMAGENET_PCA["eigvec"]):
        self.alphastd = alphastd
        self.eigval = eigval
        self.eigvec = eigvec

    def __call__(self, img):
        # alphastd == 0 disables the augmentation entirely.
        if self.alphastd == 0:
            return img
        # Random per-call weights for the three principal components.
        alpha = img.new().resize_(3).normal_(0, self.alphastd)
        scaled_vecs = self.eigvec.type_as(img).clone().mul(alpha.view(1, 3).expand(3, 3))
        rgb = scaled_vecs.mul(self.eigval.view(1, 3).expand(3, 3)).sum(1).squeeze()
        # Add the same per-channel offset to every pixel.
        return img.add(rgb.view(3, 1, 1).expand_as(img))

RandomErase

Bases: RandomErasing

A simple class that translates the parameters supported in SuperGradient's code base

Source code in src/super_gradients/training/datasets/data_augmentation.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
@register_transform(Transforms.RandomErase)
class RandomErase(RandomErasing):
    """
    A simple class that translates the parameters supported in SuperGradient's code base
    """

    def __init__(self, probability: float, value: str):
        # `value` may be a string encoding a float; try the numeric interpretation
        # first, and on failure hand the raw string through to the parent class.
        try:
            parsed = float(value)
        except ValueError:
            parsed = value
        super().__init__(p=probability, value=parsed)

BoundingBoxFormat

Abstract class for describing a bounding boxes format. It exposes two methods: to_xyxy and from_xyxy to convert whatever format of boxes we are dealing with to the internal xyxy format and vice versa. This conversion from and to the intermediate xyxy format has a subtle performance impact, but greatly reduces the amount of boilerplate code needed to support all combinations of conversion xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh, cxcywh, yxyx.

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class BoundingBoxFormat:
    """
    Abstract class for describing a bounding boxes format. Concrete formats supply
    get_to_xyxy / get_from_xyxy, which return callables converting to and from the
    internal XYXY representation. Routing every conversion through the intermediate
    XYXY format has a subtle performance impact, but greatly reduces the boilerplate
    needed to support all conversion pairs xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh,
    cxcywh, yxyx.
    """

    def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert input boxes to XYXY format
        :param bboxes: Input bounding boxes [..., 4]
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :return: Converted bounding boxes [..., 4] in XYXY format
        """
        converter = self.get_to_xyxy(inplace)
        return converter(bboxes, image_shape)

    def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert XYXY boxes to target bboxes format
        :param bboxes: Input bounding boxes [..., 4] in XYXY format
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :return: Converted bounding boxes [..., 4] in target format
        """
        converter = self.get_from_xyxy(inplace)
        return converter(bboxes, image_shape)

    @abstractmethod
    def get_to_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        raise NotImplementedError()

    @abstractmethod
    def get_from_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        raise NotImplementedError()

    def get_num_parameters(self) -> int:
        # Every supported format encodes a box with exactly 4 numbers.
        return 4

from_xyxy(bboxes, image_shape, inplace)

Convert XYXY boxes to target bboxes format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4] in XYXY format

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in target format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
27
28
29
30
31
32
33
34
35
def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert XYXY boxes to target bboxes format
    :param bboxes: Input bounding boxes [..., 4] in XYXY format
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :param inplace: When True, request the in-place converter variant from the subclass.
    :return: Converted bounding boxes [..., 4] in target format
    """
    # Delegate to the converter callable supplied by the concrete format subclass.
    return self.get_from_xyxy(inplace)(bboxes, image_shape)

to_xyxy(bboxes, image_shape, inplace)

Convert input boxes to XYXY format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4]

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
17
18
19
20
21
22
23
24
25
def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert input boxes to XYXY format
    :param bboxes: Input bounding boxes [..., 4]
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :param inplace: When True, request the in-place converter variant from the subclass.
    :return: Converted bounding boxes [..., 4] in XYXY format
    """
    # Delegate to the converter callable supplied by the concrete format subclass.
    return self.get_to_xyxy(inplace)(bboxes, image_shape)

convert_bboxes(bboxes, image_shape, source_format, target_format, inplace)

Convert bboxes from source to target format

Parameters:

Name Type Description Default
bboxes

Tensor of shape (..., 4) with input bounding boxes

required
image_shape Tuple[int, int]

Tuple of (rows, cols) corresponding to image shape

required
source_format BoundingBoxFormat

Format of the source bounding boxes

required
target_format BoundingBoxFormat

Format of the output bounding boxes

required

Returns:

Type Description

Tensor of shape (..., 4) with resulting bounding boxes

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
49
50
51
52
53
54
55
56
57
58
59
def convert_bboxes(bboxes, image_shape: Tuple[int, int], source_format: BoundingBoxFormat, target_format: BoundingBoxFormat, inplace: bool):
    """
    Convert bboxes from source to target format
    :param bboxes: Tensor of shape (..., 4) with input bounding boxes
    :param image_shape: Tuple of (rows, cols) corresponding to image shape
    :param source_format: Format of the source bounding boxes
    :param target_format: Format of the output bounding boxes
    :return: Tensor of shape (..., 4) with resulting bounding boxes
    """
    # Route the conversion through the intermediate XYXY representation.
    as_xyxy = source_format.to_xyxy(bboxes, image_shape, inplace)
    return target_format.from_xyxy(as_xyxy, image_shape, inplace)

cxcywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from CX-CY-W-H format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def cxcywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    cx = bboxes[..., 0]
    cy = bboxes[..., 1]
    w = bboxes[..., 2]
    h = bboxes[..., 3]
    left = cx - 0.5 * w
    top = cy - 0.5 * h
    right = left + w
    bottom = top + h

    # Under TorchScript only the tensor path is compiled.
    if torch.jit.is_scripting():
        return torch.stack([left, top, right, bottom], dim=-1)
    if torch.is_tensor(bboxes):
        return torch.stack([left, top, right, bottom], dim=-1)
    if isinstance(bboxes, np.ndarray):
        return np.stack([left, top, right, bottom], axis=-1)
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

cxcywh_to_xyxy_inplace(bboxes, image_shape)

Note that bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def cxcywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format, operating in-place.
    Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :return: BBoxes of shape (..., 4) in XYXY format (the same object as the input, modified in-place)
    """
    if not torch.jit.is_scripting():
        # Warn on integer dtypes: the 0.5 * wh computation below truncates.
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
        # NOTE(review): this second check also runs for torch tensors; presumably
        # is_floating_point_array handles both numpy arrays and tensors — confirm,
        # otherwise a floating-point tensor could trigger a spurious warning here.
        if not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    bboxes[..., 0:2] -= bboxes[..., 2:4] * 0.5  # cxcy -> x1y1
    bboxes[..., 2:4] += bboxes[..., 0:2]  # x1y1 + wh -> x2y2
    return bboxes

xyxy_to_cxcywh(bboxes, image_shape)

Transforms bboxes from xyxy format to CX-CY-W-H format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def xyxy_to_cxcywh(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from xyxy format to CX-CY-W-H format
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    right, bottom = bboxes[..., 2], bboxes[..., 3]
    width = right - left
    height = bottom - top
    center_x = left + width * 0.5
    center_y = top + height * 0.5

    if torch.jit.is_scripting():
        return torch.stack([center_x, center_y, width, height], dim=-1)
    if torch.is_tensor(bboxes):
        return torch.stack([center_x, center_y, width, height], dim=-1)
    if isinstance(bboxes, np.ndarray):
        return np.stack([center_x, center_y, width, height], axis=-1)
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_cxcywh_inplace(bboxes, image_shape)

Transforms bboxes from xyxy format to CX-CY-W-H format. This function operates in-place. Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def xyxy_to_cxcywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from xyxy format to CX-CY-W-H format. This function operates in-place.
    Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format (same object as the input)
    """
    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
        elif isinstance(bboxes, np.ndarray) and not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
    # First turn (x2, y2) into (w, h), then shift (x1, y1) to the box center.
    bboxes[..., 2] -= bboxes[..., 0]  # x2 - x1 -> w
    bboxes[..., 3] -= bboxes[..., 1]  # y2 - y1 -> h
    bboxes[..., 0] += bboxes[..., 2] * 0.5  # x1 + w/2 -> cx
    bboxes[..., 1] += bboxes[..., 3] * 0.5  # y1 + h/2 -> cy
    return bboxes

NormalizedXYXYCoordinateFormat

Bases: BoundingBoxFormat

Normalized X1,Y1,X2,Y2 bounding boxes format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class NormalizedXYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes in unit-normalized X1, Y1, X2, Y2 format.
    """

    def __init__(self):
        super().__init__()
        self.format = "normalized_xyxy"
        self.normalized = True

    def get_to_xyxy(self, inplace: bool):
        # Scale normalized coordinates up to pixel units.
        return normalized_xyxy_to_xyxy_inplace if inplace else normalized_xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        # Scale pixel coordinates down to the unit-normalized range.
        return xyxy_to_normalized_xyxy_inplace if inplace else xyxy_to_normalized_xyxy

normalized_xyxy_to_xyxy(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def normalized_xyxy_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format
    """
    rows, cols = image_shape
    # x channels scale by image width, y channels by image height.
    if torch.jit.is_scripting():
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        return bboxes * scale
    if torch.is_tensor(bboxes):
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        return bboxes * scale
    if isinstance(bboxes, np.ndarray):
        scale = np.array([cols, rows, cols, rows], dtype=bboxes.dtype)
        scale = scale.reshape([1] * (bboxes.ndim - 1) + [4])
        return bboxes * scale
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

normalized_xyxy_to_xyxy_inplace(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
67
68
69
70
71
72
73
74
75
76
77
def normalized_xyxy_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format (same object as the input)
    """
    rows, cols = image_shape
    # Channels 0 and 2 hold x coordinates (scale by width), 1 and 3 hold y (scale by height).
    bboxes[..., 0] *= cols
    bboxes[..., 1] *= rows
    bboxes[..., 2] *= cols
    bboxes[..., 3] *= rows
    return bboxes

xyxy_to_normalized_xyxy(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format

Parameters:

Name Type Description Default
bboxes Tensor

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description
Tensor

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def xyxy_to_normalized_xyxy(bboxes: Tensor, image_shape: Tuple[int, int]) -> Tensor:
    """
    Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format
    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    """
    rows, cols = image_shape
    if torch.jit.is_scripting():
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
    else:
        if torch.is_tensor(bboxes):
            scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
            scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        elif isinstance(bboxes, np.ndarray):
            scale = np.array([cols, rows, cols, rows], dtype=bboxes.dtype)
            # CONSISTENCY FIX: reshape the scale to broadcast explicitly against (..., 4),
            # matching the torch branch above and the numpy branch of normalized_xyxy_to_xyxy.
            # (Previously this relied on implicit broadcasting of the flat (4,) array.)
            scale = scale.reshape([1] * (len(bboxes.shape) - 1) + [4])
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")
    return bboxes / scale

xyxy_to_normalized_xyxy_inplace(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def xyxy_to_normalized_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format (same object as the input)
    """

    # Integer inputs would silently truncate under in-place division, so warn about them.
    if not torch.jit.is_scripting():
        non_float_tensor = torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes)
        if non_float_tensor:
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_normalized_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
        non_float_array = isinstance(bboxes, np.ndarray) and not np.issubdtype(bboxes.dtype, np.floating)
        if non_float_array:
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_normalized_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    rows, cols = image_shape
    # Channels 0 and 2 hold x coordinates (divide by width), 1 and 3 hold y (divide by height).
    bboxes[..., 0] /= cols
    bboxes[..., 1] /= rows
    bboxes[..., 2] /= cols
    bboxes[..., 3] /= rows
    return bboxes

xywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def xywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYWH format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    right = left + bboxes[..., 2]  # x1 + w
    bottom = top + bboxes[..., 3]  # y1 + h

    if torch.jit.is_scripting():
        return torch.stack([left, top, right, bottom], dim=-1)
    if torch.is_tensor(bboxes):
        return torch.stack([left, top, right, bottom], dim=-1)
    if isinstance(bboxes, np.ndarray):
        return np.stack([left, top, right, bottom], axis=-1)
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xywh_to_xyxy_inplace(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
65
66
67
68
69
70
71
72
def xywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYWH format to XYXY format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :return: BBoxes of shape (..., 4) in XYXY format (same object as the input)
    """
    # Add the top-left corner to width/height channels: (w, h) -> (x2, y2).
    bboxes[..., 2] += bboxes[..., 0]
    bboxes[..., 3] += bboxes[..., 1]
    return bboxes

xyxy_to_xywh(bboxes, image_shape)

Transforms bboxes from XYXY format to XYWH format (returns a new tensor; the input is not modified)

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def xyxy_to_xywh(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYXY format to XYWH format.

    DOCFIX: the previous docstring claimed this function works "inplace"; it does not --
    it builds and returns a new stacked tensor/array and leaves the input untouched
    (the in-place variant is xyxy_to_xywh_inplace).

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: New BBoxes of shape (..., 4) in XYWH format
    """
    x1, y1, x2, y2 = bboxes[..., 0], bboxes[..., 1], bboxes[..., 2], bboxes[..., 3]
    w = x2 - x1
    h = y2 - y1

    if torch.jit.is_scripting():
        return torch.stack([x1, y1, w, h], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([x1, y1, w, h], dim=-1)
        elif isinstance(bboxes, np.ndarray):
            return np.stack([x1, y1, w, h], axis=-1)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_xywh_inplace(bboxes, image_shape)

Transforms bboxes inplace from XYXY format to XYWH format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
55
56
57
58
59
60
61
62
def xyxy_to_xywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYXY format to XYWH format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in XYWH format (same object as the input)
    """
    # Subtract the top-left corner from the bottom-right: (x2, y2) -> (w, h).
    bboxes[..., 2] -= bboxes[..., 0]
    bboxes[..., 3] -= bboxes[..., 1]
    return bboxes

XYXYCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format X1, Y1, X2, Y2

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xyxy.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
class XYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format X1, Y1, X2, Y2
    """

    def __init__(self):
        # CONSISTENCY FIX: invoke the base-class initializer, as every sibling format
        # class (e.g. NormalizedXYXYCoordinateFormat, YXYXCoordinateFormat) does.
        super().__init__()
        self.format = "xyxy"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        # XYXY is the canonical format, so conversion is an identity mapping
        # regardless of the `inplace` flag.
        return xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        return xyxy_to_xyxy

YXYXCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format Y1, X1, Y2, X2

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/yxyx.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class YXYXCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format Y1, X1, Y2, X2
    """

    def __init__(self):
        super().__init__()
        self.format = "yxyx"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        # Swapping the X/Y pairs is its own inverse, so the xyxy_to_yxyx routines
        # convert in this direction as well.
        if inplace:
            return xyxy_to_yxyx_inplace
        else:
            return xyxy_to_yxyx

    def get_from_xyxy(self, inplace: bool):
        # XYXY <-> YXYX is interchangable operation, so we may reuse same routine here
        if inplace:
            return xyxy_to_yxyx_inplace
        else:
            return xyxy_to_yxyx

ConcatenatedTensorFormatConverter

Source code in src/super_gradients/training/datasets/data_formats/format_converter.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class ConcatenatedTensorFormatConverter:
    def __init__(
        self,
        input_format: ConcatenatedTensorFormat,
        output_format: ConcatenatedTensorFormat,
        image_shape: Union[Tuple[int, int], None],
    ):
        """
        Converts concatenated tensors from input format to output format.

        Example:
            >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
            >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
            >>> h, w = 100, 200
            >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
            >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
            >>>
            >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
            >>>
            >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
            >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        # Channel permutation that reorders the input layout into the output layout.
        self.permutation_indexes = get_permutation_indexes(input_format, output_format)
        self.input_format = input_format
        self.output_format = output_format
        self.image_shape = image_shape
        # Expected channel count in the last dimension of incoming tensors.
        self.input_length = input_format.num_channels

    def __call__(self, tensor: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        if tensor.shape[-1] != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({tensor.shape[-1]}) must be "
                f"equal to {self.input_length} as defined by input format."
            )
        # First reorder the channels, then convert only the bbox slice.
        permuted = tensor[:, self.permutation_indexes]
        return apply_on_bboxes(fn=self._convert_bbox, tensor=permuted, tensor_format=self.output_format)

    def _convert_bbox(self, bboxes: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        # Non-bbox channels pass through unchanged; only the bbox coordinates are converted.
        return convert_bboxes(
            bboxes=bboxes,
            source_format=self.input_format.bboxes_format.format,
            target_format=self.output_format.bboxes_format.format,
            inplace=False,
            image_shape=self.image_shape,
        )

__init__(input_format, output_format, image_shape)

Converts concatenated tensors from input format to output format.

Example: >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY >>> h, w = 100, 200 >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32) >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32) >>> >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w)) >>> >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in src/super_gradients/training/datasets/data_formats/format_converter.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(
    self,
    input_format: ConcatenatedTensorFormat,
    output_format: ConcatenatedTensorFormat,
    image_shape: Union[Tuple[int, int], None],
):
    """
    Converts concatenated tensors from input format to output format.

    Example:
        >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
        >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
        >>> h, w = 100, 200
        >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
        >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
        >>>
        >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
        >>>
        >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
        >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    self.permutation_indexes = get_permutation_indexes(input_format, output_format)

    self.input_format = input_format
    self.output_format = output_format
    self.image_shape = image_shape
    self.input_length = input_format.num_channels

ConcatenatedTensorFormat

Bases: DetectionOutputFormat

Define the output format that return a single tensor of shape [N,M] (N - number of detections, M - sum of bbox attributes) that is a concatenated from bbox coordinates and other fields. A layout defines the order of concatenated tensors. For instance: - layout: (bboxes, scores, labels) gives a Tensor that is product of torch.cat([bboxes, scores, labels], dim=1) - layout: (labels, bboxes) produce a Tensor from torch.cat([labels, bboxes], dim=1)

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

custom_format = ConcatenatedTensorFormat( layout=( BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()), TensorSliceItem(name="label", length=1), TensorSliceItem(name="distance", length=1), TensorSliceItem(name="attributes", length=4), ) )

Source code in src/super_gradients/training/datasets/data_formats/formats.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class ConcatenatedTensorFormat(DetectionOutputFormat):
    """
    Define the output format that return a single tensor of shape [N,M] (N - number of detections,
    M - sum of bbox attributes) that is a concatenated from bbox coordinates and other fields.
    A layout defines the order of concatenated tensors. For instance:
    - layout: (bboxes, scores, labels) gives a Tensor that is product of torch.cat([bboxes, scores, labels], dim=1)
    - layout: (labels, bboxes) produce a Tensor from torch.cat([labels, bboxes], dim=1)


    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> custom_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>         TensorSliceItem(name="label", length=1),
    >>>         TensorSliceItem(name="distance", length=1),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>     )
    >>> )

    """

    layout: Mapping[str, TensorSliceItem]
    locations: Mapping[str, Tuple[int, int]]
    indexes: Mapping[str, List[int]]
    num_channels: int

    @property
    def bboxes_format(self) -> BoundingBoxesTensorSliceItem:
        # Exactly one bbox item exists in the layout; this is enforced by __init__.
        bbox_items = [item for item in self.layout.values() if isinstance(item, BoundingBoxesTensorSliceItem)]
        return bbox_items[0]

    def __init__(self, layout: Union[List[TensorSliceItem], Tuple[TensorSliceItem, ...]]):
        bbox_items = [item for item in layout if isinstance(item, BoundingBoxesTensorSliceItem)]
        if len(bbox_items) != 1:
            raise RuntimeError("Number of bounding box items must be strictly equal to 1")

        layout_by_name = collections.OrderedDict()
        locations_by_name = collections.OrderedDict()
        indexes_by_name = collections.OrderedDict()

        # Walk the layout once, assigning each item a contiguous channel range.
        offset = 0
        for item in layout:
            end = offset + item.length
            layout_by_name[item.name] = item
            locations_by_name[item.name] = (offset, end)
            indexes_by_name[item.name] = list(range(offset, end))
            offset = end

        self.layout = layout_by_name
        self.locations = locations_by_name
        self.indexes = indexes_by_name
        self.num_channels = offset

    def __repr__(self):
        return str(self.layout)

apply_on_bboxes(fn, tensor, tensor_format)

Apply inplace a function only on the bboxes of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the bboxes

Source code in src/super_gradients/training/datasets/data_formats/formats.py
105
106
107
108
109
110
111
112
113
114
115
116
117
def apply_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Apply inplace a function only on the bboxes of a concatenated tensor.

    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that include - among other - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    # The bbox slice is identified by name; delegate the actual slicing to apply_on_layout.
    bboxes_layout_name = tensor_format.bboxes_format.name
    return apply_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=bboxes_layout_name)

apply_on_layout(fn, tensor, tensor_format, layout_name)

Apply inplace a function only on a specific layout of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the layout

Source code in src/super_gradients/training/datasets/data_formats/formats.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def apply_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Apply inplace a function only on a specific layout of a concatenated tensor.
    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that include - among other - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after applying INPLACE the fn on the layout
    """
    # locations[layout_name] is a (start, stop) pair; unpack it directly into slice().
    # (Previously wrapped in a redundant iter() call -- filter_on_layout already uses the plain form.)
    location = slice(*tensor_format.locations[layout_name])
    result = fn(tensor[..., location])
    tensor[..., location] = result
    return tensor

filter_on_bboxes(fn, tensor, tensor_format)

Filter the tensor according to a condition on the bboxes.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the bboxes

Source code in src/super_gradients/training/datasets/data_formats/formats.py
139
140
141
142
143
144
145
146
147
148
149
150
151
def filter_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Filter the tensor according to a condition on the bboxes.

    :param fn:              Function to filter the bboxes (keep only True elements).
    :param tensor:          Concatenated tensor that include - among other - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    # Resolve which named slice carries the bboxes, then delegate to the generic filter.
    bboxes_layout_name = tensor_format.bboxes_format.name
    return filter_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=bboxes_layout_name)

filter_on_layout(fn, tensor, tensor_format, layout_name)

Filter the tensor according to a condition on a specific layout.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after filtering the bboxes according to fn.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def filter_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Keep only the rows of the tensor for which ``fn`` returns True on a given layout item.

    :param fn:              Function computing a boolean keep-mask from the layout slice.
    :param tensor:          Concatenated tensor that includes - among others - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after filtering the rows according to fn.
    """
    # locations[layout_name] is a (start, stop) pair of channel indexes in the last dimension.
    start, stop = tensor_format.locations[layout_name]
    keep_mask = fn(tensor[..., start:stop])
    return tensor[keep_mask]

get_permutation_indexes(input_format, output_format)

Compute the permutations required to change the format layout order.

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Input format to transform from

required
output_format ConcatenatedTensorFormat

Output format to transform to

required

Returns:

Type Description
List[int]

Permutation indexes to go from input to output format.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def get_permutation_indexes(input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat) -> List[int]:
    """Compute the permutations required to change the format layout order.

    :param input_format:    Input format to transform from
    :param output_format:   Output format to transform to
    :return: Permutation indexes to go from input to output format.
    :raises KeyError:     If an output item is missing from the input format.
    :raises RuntimeError: If an item has different lengths in input and output formats.
    """
    permutation: List[int] = []
    for name, out_spec in output_format.layout.items():
        if name not in input_format.layout:
            raise KeyError(f"Requested item '{name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

        in_spec = input_format.layout[name]
        if in_spec.length != out_spec.length:
            raise RuntimeError(
                f"Length of the output must match in input and output format. "
                f"Input spec size is {in_spec.length} for key '{name}' and output spec size is {out_spec.length}."
            )
        # Channel indexes of this item inside the input tensor, appended in output-layout order.
        permutation.extend(input_format.indexes[name])
    return permutation

ConvertBoundingBoxes

Bases: nn.Module

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class ConvertBoundingBoxes(nn.Module):
    """
    Module that rewrites, in-place, the bbox slice of a tensor's last dimension:
    the slice is first mapped with ``to_xyxy`` and then with ``from_xyxy``.
    """

    to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]
    from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]

    def __init__(
        self,
        location: Tuple[int, int],
        to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        image_shape: Tuple[int, int],
    ):
        super().__init__()
        self.location = location
        self.image_shape = image_shape
        # torch.jit.annotate keeps the callables typed for TorchScript compilation.
        self.to_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], to_xyxy)
        self.from_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], from_xyxy)

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply ``to_xyxy`` then ``from_xyxy`` to the slice ``x[..., location[0]:location[1]]`` (in-place).

        :param x: Tensor whose last dimension contains - among others - the bbox coordinates.
        :return: The same tensor, with the bbox slice rewritten.
        """
        start, stop = self.location
        boxes_xyxy = self.to_xyxy(x[..., start:stop], self.image_shape)
        x[..., start:stop] = self.from_xyxy(boxes_xyxy, self.image_shape)
        return x

forward(x)

Parameters:

Name Type Description Default
x Tensor required
image_shape required

Returns:

Type Description
Tensor
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
56
57
58
59
60
61
62
63
64
65
66
67
def forward(self, x: Tensor) -> Tensor:
    """
    Rewrite the bbox slice of ``x``'s last dimension in-place by applying
    ``self.to_xyxy`` and then ``self.from_xyxy``.

    :param x: Tensor whose last dimension contains - among others - the bbox coordinates
              at ``self.location``.
    :return: The same tensor, with the bbox slice rewritten.
    """
    start, stop = self.location
    boxes = x[..., start:stop]
    as_xyxy = self.to_xyxy(boxes, self.image_shape)
    x[..., start:stop] = self.from_xyxy(as_xyxy, self.image_shape)
    return x

DetectionOutputAdapter

Bases: nn.Module

Adapter class for converting model's predictions for object detection to a desired format. This adapter supports torch.jit tracing & scripting & onnx conversion.

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

class CustomDetectionHead(nn.Module): num_classes: int = 123

@property def format(self): ''' Describe the semantics of the model's output. In this example model's output consists of - Bounding boxes in XYXY format [4] - Predicted probas of N classes [N] - A distance predictions [1] - K additional labels [K] ''' return ConcatenatedTensorFormat( layout=( BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()), TensorSliceItem(name="label", length=1), TensorSliceItem(name="distance", length=1), TensorSliceItem(name="attributes", length=4), ) )

yolox = YoloX(head=CustomDetectionHead)

Suppose we want to return predictions in another format.

Let it be:

- Bounding boxes in normalized XYWH [4]

- Predicted attributes [4]

- Predicted label [1]

output_format = ConcatenatedTensorFormat( layout=( # Note: For output format it is not required to specify location attribute as it will be # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()), TensorSliceItem(name="attributes", length=4), TensorSliceItem(name="label", length=1), ) )

Now we can construct output adapter and attach it to the model

output_adapter = DetectionOutputAdapter( input_format=yolox.head.format, output_format=output_format, image_shape=(640, 640) )

yolox = nn.Sequential(yolox, output_adapter)

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
class DetectionOutputAdapter(nn.Module):
    """
    Adapter class for converting model's predictions for object detection to a desired format.
    This adapter supports torch.jit tracing & scripting & onnx conversion.

    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> class CustomDetectionHead(nn.Module):
    >>>    num_classes: int = 123
    >>>
    >>>    @property
    >>>    def format(self):
    >>>        '''
    >>>        Describe the semantics of the model's output. In this example model's output consists of
    >>>         - Bounding boxes in XYXY format [4]
    >>>         - Predicted probas of N classes [N]
    >>>         - A distance predictions [1]
    >>>         - K additional labels [K]
    >>>        '''
    >>>        return ConcatenatedTensorFormat(
    >>>            layout=(
    >>>                BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>                TensorSliceItem(name="label", length=1),
    >>>                TensorSliceItem(name="distance", length=1),
    >>>                TensorSliceItem(name="attributes", length=4),
    >>>            )
    >>>        )
    >>>
    >>> yolox = YoloX(head=CustomDetectionHead)
    >>>
    >>> # Suppose we want to return predictions in another format.
    >>> # Let it be:
    >>> # - Bounding boxes in normalized XYWH [4]
    >>> # - Predicted attributes [4]
    >>> # - Predicted label [1]
    >>> output_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         # Note: For output format it is not required to specify location attribute as it will be
    >>>         # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>         TensorSliceItem(name="label", length=1),
    >>>     )
    >>> )
    >>>
    >>> # Now we can construct output adapter and attach it to the model
    >>> output_adapter = DetectionOutputAdapter(
    >>>     input_format=yolox.head.format,
    >>>     output_format=output_format,
    >>>     image_shape=(640, 640)
    >>> )
    >>>
    >>> yolox = nn.Sequential(yolox, output_adapter)
    >>>
    """

    def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
        """
        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        super().__init__()

        # Converts the bbox slice of the tensor from the input bbox format to the output bbox format.
        self.format_conversion: nn.Module = self.get_format_conversion_module(
            location=input_format.locations[input_format.bboxes_format.name],
            input_bbox_format=input_format.bboxes_format.format,
            output_bbox_format=output_format.bboxes_format.format,
            image_shape=image_shape,
        )

        # Permutes channels of the last dimension into the output layout order.
        self.rearrange_outputs, rearranged_format = self.get_rearrange_outputs_module(input_format, output_format)

        self.input_format = input_format
        self.output_format = output_format
        self.input_length = input_format.num_channels

    def forward(self, predictions: Tensor) -> Tensor:
        """
        Convert output detections to the user-specified format

        :param predictions: Predictions tensor whose last dimension follows the input format.
        :return: Predictions converted and rearranged to the output format.
        :raises RuntimeError: If the last dimension size does not match the input format.
        """
        if predictions.size(-1) != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({predictions.size(-1)}) must be "
                f"equal to {self.input_length} as defined by input format."
            )

        # Clone first: the bbox format conversion writes its slice in-place.
        predictions = self.format_conversion(predictions.clone())
        predictions = self.rearrange_outputs(predictions)
        return predictions

    @classmethod
    def get_rearrange_outputs_module(
        cls, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat
    ) -> Tuple[RearrangeOutput, ConcatenatedTensorFormat]:
        """
        Build the module that permutes input channels into the output layout order.

        :param input_format:  Format definition of the inputs.
        :param output_format: Format definition of the outputs; every item must exist in the input format with the same length.
        :return: Tuple of (RearrangeOutput module, format describing the rearranged layout).
        :raises KeyError:     If an output item is missing from the input format.
        :raises RuntimeError: If an item has different lengths in input and output formats.
        """
        output_indexes = []
        rearranged_layout = []

        for output_name, output_spec in output_format.layout.items():
            if output_name not in input_format.layout:
                raise KeyError(f"Requested item '{output_name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

            input_spec = input_format.layout[output_name]

            if input_spec.length != output_spec.length:
                # Bugfix: these were plain strings, so "{input_spec.length}" etc. were emitted literally.
                # Now f-strings, consistent with the identical check in get_permutation_indexes.
                raise RuntimeError(
                    f"Length of the output must match in input and output format. "
                    f"Input spec size is {input_spec.length} for key '{output_name}' and output spec size is {output_spec.length}."
                )
            output_indexes.extend(input_format.indexes[output_name])

            # Deep-copy so the rearranged format does not share mutable items with output_format.
            rearranged_layout.append(copy.deepcopy(output_spec))
        rearranged_format = ConcatenatedTensorFormat(rearranged_layout)
        return RearrangeOutput(torch.tensor(output_indexes).long()), rearranged_format

    @classmethod
    def get_format_conversion_module(
        cls, location: Tuple[int, int], input_bbox_format: BoundingBoxFormat, output_bbox_format: BoundingBoxFormat, image_shape: Union[Tuple[int, int], None]
    ) -> ConvertBoundingBoxes:
        """
        Build the module that converts the bbox slice at ``location`` from ``input_bbox_format`` to ``output_bbox_format``.

        :param location:           (start, stop) channel range of the bboxes in the last dimension.
        :param input_bbox_format:  Bbox coordinate format of the input tensor.
        :param output_bbox_format: Desired bbox coordinate format of the output tensor.
        :param image_shape:        (rows, cols) of the input image, or None when no normalization is involved.
        """
        return ConvertBoundingBoxes(
            location=location,
            to_xyxy=input_bbox_format.get_to_xyxy(False),
            from_xyxy=output_bbox_format.get_from_xyxy(True),
            image_shape=image_shape,
        )

__init__(input_format, output_format, image_shape)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
    """
    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    super().__init__()

    # Module that rewrites the bbox slice from the input bbox format to the output bbox format.
    bboxes_name = input_format.bboxes_format.name
    self.format_conversion: nn.Module = self.get_format_conversion_module(
        location=input_format.locations[bboxes_name],
        input_bbox_format=input_format.bboxes_format.format,
        output_bbox_format=output_format.bboxes_format.format,
        image_shape=image_shape,
    )

    # Module that permutes channels of the last dimension into the output layout order.
    self.rearrange_outputs, rearranged_format = self.get_rearrange_outputs_module(input_format, output_format)

    self.input_format = input_format
    self.output_format = output_format
    self.input_length = input_format.num_channels

forward(predictions)

Convert output detections to the user-specified format

Parameters:

Name Type Description Default
predictions Tensor required

Returns:

Type Description
Tensor
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def forward(self, predictions: Tensor) -> Tensor:
    """
    Convert output detections to the user-specified format.

    :param predictions: Predictions tensor whose last dimension follows the input format.
    :return: Predictions converted and rearranged to the output format.
    :raises RuntimeError: If the last dimension size does not match the input format.
    """
    num_channels = predictions.size(-1)
    if num_channels != self.input_length:
        raise RuntimeError(
            f"Number of channels in last dimension of input tensor ({num_channels}) must be "
            f"equal to {self.input_length} as defined by input format."
        )

    # Clone first: the bbox format conversion writes its slice in-place.
    converted = self.format_conversion(predictions.clone())
    return self.rearrange_outputs(converted)

RearrangeOutput

Bases: nn.Module

Rearrange elements in last dimension of input tensor with respect to index argument

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class RearrangeOutput(nn.Module):
    """
    Permute the last dimension of the input tensor according to the given index tensor.
    """

    def __init__(self, indexes: Tensor):
        super().__init__()
        self.indexes = indexes

    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: Input tensor of  [..., N] shape
        :return: Output tensor of [..., N[index]] shape
        """
        if not torch.jit.is_scripting():
            return x[..., self.indexes]
        # Workaround "Ellipses followed by tensor indexing is currently not supported"
        # https://github.com/pytorch/pytorch/issues/34837
        permuted = torch.moveaxis(x, -1, 0)[self.indexes]
        return torch.moveaxis(permuted, 0, -1)

forward(x)

Parameters:

Name Type Description Default
x Tensor

Input tensor of [..., N] shape

required

Returns:

Type Description
Tensor

Output tensor of [..., N[index]] shape

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def forward(self, x: Tensor) -> Tensor:
    """
    Rearrange the last dimension of ``x`` according to ``self.indexes``.

    :param x: Input tensor of  [..., N] shape
    :return: Output tensor of [..., N[index]] shape
    """
    if not torch.jit.is_scripting():
        return x[..., self.indexes]
    # Workaround "Ellipses followed by tensor indexing is currently not supported"
    # https://github.com/pytorch/pytorch/issues/34837
    rolled = torch.moveaxis(x, -1, 0)
    picked = rolled[self.indexes]
    return torch.moveaxis(picked, 0, -1)

AbstractCollateFunction

Bases: ABC

A collate function (for torch DataLoader)

Source code in src/super_gradients/training/datasets/datasets_utils.py
76
77
78
79
80
81
82
83
class AbstractCollateFunction(ABC):
    """
    Interface for collate functions passed to a torch DataLoader.

    Subclasses must implement ``__call__`` on a batch.
    """

    @abstractmethod
    def __call__(self, batch):
        ...

AbstractPrePredictionCallback

Bases: ABC

Abstract class for forward pass preprocessing function, to be used by passing its inheritors through training_params pre_prediction_callback keyword arg.

Should implement call and return images, targets after applying the desired preprocessing.

Source code in src/super_gradients/training/datasets/datasets_utils.py
175
176
177
178
179
180
181
182
183
184
185
class AbstractPrePredictionCallback(ABC):
    """
    Interface for forward pass preprocessing functions, to be used by passing inheritors through
    the training_params ``pre_prediction_callback`` keyword arg.

    Implementations of ``__call__`` should return images, targets after applying the desired preprocessing.
    """

    @abstractmethod
    def __call__(self, inputs, targets, batch_idx):
        ...

ComposedCollateFunction

Bases: AbstractCollateFunction

A function (for torch DataLoader) which executes a sequence of sub collate functions

Source code in src/super_gradients/training/datasets/datasets_utils.py
86
87
88
89
90
91
92
93
94
95
96
97
98
@register_collate_function()
class ComposedCollateFunction(AbstractCollateFunction):
    """
    Collate function (for torch DataLoader) that pipes the batch through a sequence
    of sub collate functions, in order.
    """

    def __init__(self, functions: list):
        self.functions = functions

    def __call__(self, batch):
        result = batch
        for collate_fn in self.functions:
            result = collate_fn(result)
        return result

DatasetStatisticsTensorboardLogger

Source code in src/super_gradients/training/datasets/datasets_utils.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
class DatasetStatisticsTensorboardLogger:
    """
    Compute dataset statistics (sample image grid, class distribution, box size distribution,
    anchors coverage) and log them through the given ``sg_logger``.
    """

    logger = get_logger(__name__)
    # Defaults; user-provided summary_params override these keys in __init__.
    DEFAULT_SUMMARY_PARAMS = {
        "sample_images": 32,  # by default, 32 images will be sampled from each dataset
        "plot_class_distribution": True,
        "plot_box_size_distribution": True,
        "plot_anchors_coverage": True,
        "max_batches": 30,  # analyze at most this many batches; a value <= 0 disables the cap (see _analyze_detection)
    }

    def __init__(self, sg_logger, summary_params: dict = DEFAULT_SUMMARY_PARAMS):
        """
        :param sg_logger:      Logger object used for output (must expose add_images/add_figure/add_text/flush).
        :param summary_params: Optional overrides for DEFAULT_SUMMARY_PARAMS keys.
        """
        # NOTE: mutable default argument is safe here — the dict is only read and merged, never mutated.
        self.sg_logger = sg_logger
        self.summary_params = {**DatasetStatisticsTensorboardLogger.DEFAULT_SUMMARY_PARAMS, **summary_params}

    def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
        """
        Analyze a dataset and log its statistics (currently a no-op stub — see FIXME below).

        :param data_loader: the dataset data loader
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param anchors: the list of anchors used by the model. applicable only for detection datasets
        :param all_classes: the list of all classes names
        """
        # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
        # if isinstance(data_loader.dataset, DetectionDataSet):
        #     self._analyze_detection(data_loader=data_loader, title=title,
        #                             all_classes=all_classes, anchors=anchors)
        # else:
        #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
        DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

    def _analyze_detection(self, data_loader, title, all_classes, anchors=None):
        """
        Analyze a detection dataset

        :param data_loader: the dataset data loader
        :param all_classes: the list of all classes names
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param anchors: the list of anchors used by the model. if not provided, anchors coverage will not be analyzed
        """
        try:
            color_mean = AverageMeter()
            color_std = AverageMeter()
            all_labels = []
            image_size = 0
            for i, (images, labels) in enumerate(tqdm(data_loader)):

                # Chained comparison: stop once max_batches is reached, unless max_batches <= 0 (no limit).
                if i >= self.summary_params["max_batches"] > 0:
                    break

                if i == 0:
                    # Image size and the sample-images grid are taken from the first batch only.
                    image_size = max(images[0].shape[1], images[0].shape[2])
                    if images.shape[0] > self.summary_params["sample_images"]:
                        samples = images[: self.summary_params["sample_images"]]
                    else:
                        samples = images

                    # Empty predictions (shape [0, 6]) so the visualization draws ground-truth boxes only.
                    pred = [torch.zeros(size=(0, 6)) for _ in range(len(samples))]
                    try:
                        result_images = DetectionVisualization.visualize_batch(
                            image_tensor=samples,
                            pred_boxes=pred,
                            target_boxes=copy.deepcopy(labels),
                            batch_name=title,
                            class_names=all_classes,
                            box_thickness=1,
                            gt_alpha=1.0,
                        )

                        # NHWC -> NCHW, then reverse the channel axis (presumably BGR -> RGB — confirm against visualize_batch).
                        self.sg_logger.add_images(tag=f"{title} sample images", images=np.stack(result_images).transpose([0, 3, 1, 2])[:, ::-1, :, :])
                    except Exception as e:
                        DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at adding an example batch:\n{e}")
                        return

                all_labels.append(labels)
                # Running per-channel color statistics across batches.
                color_mean.update(torch.mean(images, dim=[0, 2, 3]), 1)
                color_std.update(torch.std(images, dim=[0, 2, 3]), 1)

            # NOTE(review): [1:] drops the first concatenated label row — unclear why; confirm it is intentional.
            all_labels = torch.cat(all_labels, dim=0)[1:].numpy()

            try:
                if self.summary_params["plot_class_distribution"]:
                    self._analyze_class_distribution(labels=all_labels, num_classes=len(all_classes), title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing class distributions.\n{e}")
                return

            try:
                if self.summary_params["plot_box_size_distribution"]:
                    self._analyze_object_size_distribution(labels=all_labels, title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing object size " f"distributions.\n{e}")
                return

            summary = ""
            summary += f"dataset size: {len(data_loader)}  \n"
            summary += f"color mean: {color_mean.average}  \n"
            summary += f"color std: {color_std.average}  \n"

            try:
                # Anchors coverage needs the image size captured from the first batch.
                if anchors is not None and image_size > 0:
                    coverage = self._analyze_anchors_coverage(anchors=anchors, image_size=image_size, title=title, labels=all_labels)
                    summary += f"anchors: {anchors}  \n"
                    summary += f"anchors coverage: {coverage}  \n"
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing anchors " f"coverage.\n{e}")
                return

            self.sg_logger.add_text(tag=f"{title} Statistics", text_string=summary)
            self.sg_logger.flush()

        except Exception as e:
            # Dataset analysis is best-effort: never propagate failures to the caller.
            DatasetStatisticsTensorboardLogger.logger.error(f"dataset analysis failed!\n{e}")

    def _analyze_class_distribution(self, labels: list, num_classes: int, title: str):
        """
        Plot and log a histogram of the class labels (column 0 of the labels array), both as a
        figure and as a text summary.

        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param num_classes: the number of classes in the dataset
        :param title: the dataset title
        """
        hist, edges = np.histogram(labels[:, 0], num_classes)

        f = plt.figure(figsize=[10, 8])

        plt.bar(range(num_classes), hist, width=0.5, color="#0504aa", alpha=0.7)
        plt.xlim(-1, num_classes)
        plt.grid(axis="y", alpha=0.75)
        plt.xlabel("Value", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.ylabel("Frequency", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.xticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.yticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.title(f"{title} class distribution", fontsize=STAT_LOGGER_FONT_SIZE)

        self.sg_logger.add_figure(f"{title} class distribution", figure=f)
        # Also log the per-class counts as plain text for easy copy/paste.
        text_dist = ""
        for i, val in enumerate(hist):
            text_dist += f"[{i}]: {val}, "

        self.sg_logger.add_text(tag=f"{title} class distribution", text_string=text_dist)

    def _analyze_object_size_distribution(self, labels: list, title: str):
        """
        This function will add two plots to the tensorboard.
        one is a 2D histogram and the other is a scatter plot. in both cases the X axis is the object width and Y axis
        is the object width (both normalized by image size)
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        """

        # histogram plot
        hist, xedges, yedges = np.histogram2d(labels[:, 4], labels[:, 3], 50)  # x and y are deliberately switched

        fig = plt.figure(figsize=(10, 6))
        fig.suptitle(f"{title} boxes w/h distribution")
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        # log(hist + 1) compresses the dynamic range so rare sizes remain visible.
        plt.imshow(np.log(hist + 1), interpolation="nearest", origin="lower", extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

        # scatter plot
        if len(labels) > 10000:
            # we randomly sample just 10000 objects so that the scatter plot will not get too dense
            labels = labels[np.random.randint(0, len(labels) - 1, 10000)]
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        plt.scatter(labels[:, 3], labels[:, 4], marker=".")

        self.sg_logger.add_figure(tag=f"{title} boxes w/h distribution", figure=fig)

    @staticmethod
    def _get_rect(w, h):
        """Return a Rectangle spanning [w/4, w*4] x [h/4, h*4] — the coverage region of an anchor of size (w, h)."""
        min_w = w / 4.0
        min_h = h / 4.0
        return Rectangle((min_w, min_h), w * 4 - min_w, h * 4 - min_h, linewidth=1, edgecolor="b", facecolor="none")

    @staticmethod
    def _get_score(anchors: np.ndarray, points: np.ndarray, image_size: int):
        """
        Calculate the ratio (and 1/ratio) between each anchor width and height and each point (representing a possible
        object width and height).
        i.e. for an anchor with w=10,h=20 the point w=11,h=25 will have the ratios 11/10=1.1 and 25/20=1.25
        or 10/11=0.91 and 20/25=0.8 respectively

        :param anchors: array of anchors of the shape [2,N]
        :param points: array of points of the shape [2,M]
        :param image_size: the size of the input image

        :returns: an array of size [image_size - 1, image_size - 1] where each cell i,j represent the minimum ratio
        for that cell (point) from all anchors
        """

        # Broadcast anchors [N, 2, 1] against points [2, M] -> ratio of shape [N, 2, M].
        ratio = (
            anchors[:, :, None]
            / points[
                :,
            ]
        )
        inv_ratio = 1 / ratio
        # 1 - min(ratio, 1/ratio): 0 for a perfect match, approaching 1 as the mismatch grows.
        min_ratio = 1 - np.minimum(ratio, inv_ratio)
        # Worst (max) mismatch over the two dimensions (w, h) per anchor...
        min_ratio = np.max(min_ratio, axis=1)
        # ...then the best (min) anchor for each point.
        to_closest_anchor = np.min(min_ratio, axis=0)
        # Points farther than the 0.75 threshold from every anchor are marked with 2 (out of coverage).
        to_closest_anchor[to_closest_anchor > 0.75] = 2
        return to_closest_anchor.reshape(image_size - 1, -1)

    def _analyze_anchors_coverage(self, anchors: Anchors, image_size: int, labels: list, title: str):
        """
        This function will add anchors coverage plots to the tensorboard.
        :param anchors: a list of anchors
        :param image_size: the input image size for this training
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        :return: fraction of labels covered by at least one anchor (w and h within [anchor/4, anchor*4])
        """

        fig = plt.figure(figsize=(12, 5))
        fig.suptitle(f"{title} anchors coverage")

        # box style plot
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_xlim([0, image_size])
        ax.set_ylim([0, image_size])

        anchors_boxes = anchors.anchors.cpu().numpy()
        anchors_len = anchors.num_anchors

        # Flatten to one (w, h) pair per anchor.
        anchors_boxes = anchors_boxes.reshape(-1, 2)

        for i in range(anchors_len):
            # One translucent, randomly-colored rectangle per anchor coverage region.
            rect = self._get_rect(anchors_boxes[i][0], anchors_boxes[i][1])
            rect.set_alpha(0.3)
            rect.set_facecolor([random.random(), random.random(), random.random(), 0.3])
            ax.add_patch(rect)

        # distance from anchor plot
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        # Evaluate the anchor-distance score on every integer (w, h) point of the image plane.
        x = np.arange(1, image_size, 1)
        y = np.arange(1, image_size, 1)

        xx, yy = np.meshgrid(x, y, sparse=False, indexing="xy")
        points = np.concatenate([xx.reshape(1, -1), yy.reshape(1, -1)])

        color = self._get_score(anchors_boxes, points, image_size)

        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.imshow(color, interpolation="nearest", origin="lower", extent=[0, image_size, 0, image_size])

        # calculate the coverage for the dataset labels
        cover_masks = []
        for i in range(anchors_len):
            # A label is covered by an anchor when its normalized w and h are within [anchor/4, anchor*4].
            w_max = (anchors_boxes[i][0] / image_size) * 4
            w_min = (anchors_boxes[i][0] / image_size) * 0.25
            h_max = (anchors_boxes[i][1] / image_size) * 4
            h_min = (anchors_boxes[i][1] / image_size) * 0.25
            cover_masks.append(
                np.logical_and(np.logical_and(np.logical_and(labels[:, 3] < w_max, labels[:, 3] > w_min), labels[:, 4] < h_max), labels[:, 4] > h_min)
            )
        cover_masks = np.stack(cover_masks)
        # Fraction of labels covered by at least one anchor.
        coverage = np.count_nonzero(np.any(cover_masks, axis=0)) / len(labels)

        self.sg_logger.add_figure(tag=f"{title} anchors coverage", figure=fig)
        return coverage

analyze(data_loader, title, all_classes, anchors=None)

Parameters:

Name Type Description Default
data_loader torch.utils.data.DataLoader

the dataset data loader

required
dataset_params

the dataset parameters

required
title str

the title for this dataset (i.e. Coco 2017 test set)

required
anchors list

the list of anchors used by the model. applicable only for detection datasets

None
all_classes List[str]

the list of all classes names

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
    """
    Analyze a dataset and log its statistics.

    Currently a stub: the analysis is only implemented for the legacy DetectionDataSet
    (see FIXME below), so this method only emits a warning.

    :param data_loader: the dataset data loader
    :param title: the title for this dataset (i.e. Coco 2017 test set)
    :param anchors: the list of anchors used by the model. applicable only for detection datasets
    :param all_classes: the list of all classes names
    """
    # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
    # if isinstance(data_loader.dataset, DetectionDataSet):
    #     self._analyze_detection(data_loader=data_loader, title=title,
    #                             all_classes=all_classes, anchors=anchors)
    # else:
    #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
    DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

DetectionMultiscalePrePredictionCallback

Bases: MultiscalePrePredictionCallback

Multiscale pre-prediction callback for object detection.

When passed through train_params images, targets will be applied by the below transform to support multi scaling on the fly.

After each self.frequency forward passes, change size randomly from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ... input_size + self.multiscale_range * self.image_size_steps) and apply the same rescaling to the box coordinates.

Parameters:

Name Type Description Default
multiscale_range

Range of values for resize sizes as discussed above (default=5)

required
image_size_steps

Image step sizes as discussed above (default=32)

required
change_frequency

The frequency to apply change in input size.

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
@register_callback(Callbacks.DETECTION_MULTISCALE_PREPREDICTION)
class DetectionMultiscalePrePredictionCallback(MultiscalePrePredictionCallback):
    """
    Multiscale pre-prediction callback for object detection.

    Extends MultiscalePrePredictionCallback: whenever the parent callback rescales the
    images, the same scale factors are applied to the target box coordinates so that
    boxes stay aligned with the resized inputs.

    After each self.frequency forward passes, the input size is changed randomly from
    (input_size - self.multiscale_range * self.image_size_steps,
     input_size - (self.multiscale_range - 1) * self.image_size_steps,
     ... input_size + self.multiscale_range * self.image_size_steps)
    and the same rescaling is applied to the box coordinates.

    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed above (default=32)
    :param change_frequency: The frequency to apply change in input size.
    """

    def __call__(self, inputs, targets, batch_idx):
        # Let the parent decide on (and apply) any rescale, then mirror it on the boxes.
        height_width_before = inputs.shape[2:]
        inputs, targets = super().__call__(inputs, targets, batch_idx)
        height_width_after = inputs.shape[2:]
        ratio_h = height_width_after[0] / height_width_before[0]
        ratio_w = height_width_after[1] / height_width_before[1]
        if ratio_w != 1 or ratio_h != 1:
            # Columns 2, 4, ... scale with width; columns 3, 5, ... scale with height.
            targets[..., 2::2] = targets[..., 2::2] * ratio_w
            targets[..., 3::2] = targets[..., 3::2] * ratio_h
        return inputs, targets

MultiScaleCollateFunction

Bases: AbstractCollateFunction

a collate function to implement multi-scale data augmentation according to https://arxiv.org/pdf/1612.08242.pdf

Source code in src/super_gradients/training/datasets/datasets_utils.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
@register_collate_function()
class MultiScaleCollateFunction(AbstractCollateFunction):
    """
    a collate function to implement multi-scale data augmentation
    according to https://arxiv.org/pdf/1612.08242.pdf
    """

    # Class-level shared state: one counter/size across all instances, and _lock
    # serializes __call__ (collate may run concurrently in DataLoader workers).
    # NOTE(review): __init__ rebinds self._current_size to a plain value, shadowing the
    # class-level AtomicInteger, and `self._counter += 1` likewise rebinds on the
    # instance — presumably intentional, but verify against AtomicInteger's semantics.
    _counter = AtomicInteger(0)
    _current_size = AtomicInteger(0)
    _lock = Lock()

    def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
        """
        set parameters for the multi-scale collate function
        the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps
        a new size will be randomly selected every change_frequency calls to the collate_fn()
            :param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
            :param min_image_size: the minimum size to scale down to (in pixels)
            :param max_image_size: the maximum size to scale up to (in pixels)
            :param image_size_steps: typically, the stride of the net, which defines the possible image
                    size multiplications
            :param change_frequency: number of collate calls between random size re-draws
        """
        assert target_size is not None or (
            max_image_size is not None and min_image_size is not None
        ), "either target_size or min_image_size and max_image_size has to be set"
        assert target_size is None or max_image_size is None, "target_size and max_image_size cannot be both defined"

        if target_size is not None:
            # Derive [min, max] from target_size, snapped to multiples of image_size_steps
            # (min is rounded up to the next multiple, max is rounded down).
            min_image_size = int(0.66 * target_size - ((0.66 * target_size) % image_size_steps) + image_size_steps)
            max_image_size = int(1.5 * target_size - ((1.5 * target_size) % image_size_steps))

        print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

        # All candidate sizes, inclusive of max_image_size.
        self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        self._current_size = random.choice(self.sizes)

    def __call__(self, batch):

        with self._lock:

            # Important: this implementation was tailored for a specific input. it assumes the batch is a tuple where
            # the images are the first item
            assert isinstance(batch, tuple), "this collate function expects the input to be a tuple (images, labels)"
            images = batch[0]
            # Re-draw the target size once every self.frequency calls.
            if self._counter % self.frequency == 0:
                self._current_size = random.choice(self.sizes)
            self._counter += 1

            assert images.shape[2] % self.image_size_steps == 0 and images.shape[3] % self.image_size_steps == 0, (
                "images sized not divisible by %d. (resize images before calling multi_scale)" % self.image_size_steps
            )

            # Rescale only when the chosen size differs from the batch's current longest
            # edge; the aspect ratio of the incoming batch is preserved.
            if self._current_size != max(images.shape[2:]):
                ratio = float(self._current_size) / max(images.shape[2:])
                new_size = (int(round(images.shape[2] * ratio)), int(round(images.shape[3] * ratio)))
                images = F.interpolate(images, size=new_size, mode="bilinear", align_corners=False)

            return images, batch[1]

__init__(target_size=None, min_image_size=None, max_image_size=None, image_size_steps=32, change_frequency=10)

set parameters for the multi-scale collate function the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps a new size will be randomly selected every change_frequency calls to the collate_fn() :param target_size: scales will be [0.66 * target_size, 1.5 * target_size] :param min_image_size: the minimum size to scale down to (in pixels) :param max_image_size: the maximum size to scale up to (in pixels) :param image_size_steps: typically, the stride of the net, which defines the possible image size multiplications :param change_frequency:

Source code in src/super_gradients/training/datasets/datasets_utils.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
    """
    Set parameters for the multi-scale collate function.

    The possible image sizes are in range [min_image_size, max_image_size] in steps of
    image_size_steps; a new size will be randomly selected every change_frequency calls
    to the collate_fn().

    :param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
    :param min_image_size: the minimum size to scale down to (in pixels)
    :param max_image_size: the maximum size to scale up to (in pixels)
    :param image_size_steps: typically, the stride of the net, which defines the possible image
            size multiplications
    :param change_frequency: number of collate calls between random size re-draws
    :raises ValueError: if neither target_size nor the (min_image_size, max_image_size) pair
            is provided, or if target_size and max_image_size are both provided.
    """
    # Validate with explicit raises instead of `assert`: asserts are stripped under `python -O`,
    # which would silently disable this input validation.
    if target_size is None and (max_image_size is None or min_image_size is None):
        raise ValueError("either target_size or min_image_size and max_image_size has to be set")
    if target_size is not None and max_image_size is not None:
        raise ValueError("target_size and max_image_size cannot be both defined")

    if target_size is not None:
        # Derive [min, max] from target_size, snapped to multiples of image_size_steps
        # (min is rounded up to the next multiple, max is rounded down).
        min_image_size = int(0.66 * target_size - ((0.66 * target_size) % image_size_steps) + image_size_steps)
        max_image_size = int(1.5 * target_size - ((1.5 * target_size) % image_size_steps))

    print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

    # All candidate sizes, inclusive of max_image_size.
    self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
    self.image_size_steps = image_size_steps
    self.frequency = change_frequency
    self._current_size = random.choice(self.sizes)

MultiscalePrePredictionCallback

Bases: AbstractPrePredictionCallback

Multiscale pre-prediction callback pass function.

When passed through train_params images, targets will be applied by the below transform to support multi scaling on the fly.

After each self.frequency forward passes, change size randomly from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ... input_size + self.multiscale_range * self.image_size_steps)

Parameters:

Name Type Description Default
multiscale_range int

Range of values for resize sizes as discussed above (default=5)

5
image_size_steps int

Image step sizes as discussed above (default=32)

32
change_frequency int

The frequency to apply change in input size.

10
Source code in src/super_gradients/training/datasets/datasets_utils.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
class MultiscalePrePredictionCallback(AbstractPrePredictionCallback):
    """
    Multiscale pre-prediction callback pass function.

    When passed through train_params images, targets will be applied by the below transform to support multi scaling
    on the fly.

    After each self.frequency forward passes, change size randomly from
     (input_size-self.multiscale_range*self.image_size_steps, input_size-(self.multiscale_range-1)*self.image_size_steps,
     ...input_size+self.multiscale_range*self.image_size_steps)


    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed above (default=32)
    :param change_frequency: The frequency to apply change in input size.
    """

    def __init__(self, multiscale_range: int = 5, image_size_steps: int = 32, change_frequency: int = 10):

        self.multiscale_range = multiscale_range
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        # Rank / distributed-ness are resolved lazily on the first __call__
        # (the process group may not be initialized at construction time).
        self.rank = None
        self.is_distributed = None
        # Whether the largest resolution has already been sampled once (see __call__).
        self.sampled_imres_once = False
        # Last size chosen; reused for the batches between re-draws.
        self.new_input_size = None

    def __call__(self, inputs, targets, batch_idx):
        if self.rank is None:
            self.rank = get_local_rank()
        if self.is_distributed is None:
            self.is_distributed = get_world_size() > 1

        # GENERATE A NEW SIZE AND BROADCAST IT TO THE OTHER RANKS SO THEY HAVE THE SAME SCALE
        input_size = inputs.shape[2:]
        if batch_idx % self.frequency == 0:
            # Rank 0 fills this tensor; other ranks receive its value via broadcast below.
            tensor = torch.LongTensor(2).to(inputs.device)

            if self.rank == 0:
                size_factor = input_size[1] * 1.0 / input_size[0]
                min_size = int(input_size[0] / self.image_size_steps) - self.multiscale_range
                max_size = int(input_size[0] / self.image_size_steps) + self.multiscale_range
                random_size = (min_size, max_size)
                if self.sampled_imres_once:
                    size = random.randint(*random_size)
                else:
                    # sample the biggest resolution first to make sure the run fits into the GPU memory
                    size = max_size
                    self.sampled_imres_once = True
                # Snap height to a multiple of image_size_steps and derive width from the aspect ratio.
                size = (int(self.image_size_steps * size), self.image_size_steps * int(size * size_factor))
                tensor[0] = size[0]
                tensor[1] = size[1]

            if self.is_distributed:
                dist.barrier()
                dist.broadcast(tensor, 0)

            self.new_input_size = (tensor[0].item(), tensor[1].item())

        # batch_idx == 0 always enters the branch above, so new_input_size is set by here.
        scale_y = self.new_input_size[0] / input_size[0]
        scale_x = self.new_input_size[1] / input_size[1]
        if scale_x != 1 or scale_y != 1:
            inputs = torch.nn.functional.interpolate(inputs, size=self.new_input_size, mode="bilinear", align_corners=False)
        return inputs, targets

RandomResizedCropAndInterpolation

Bases: RandomResizedCrop

Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

A crop of random size (default: of 0.08 to 1.0) of the original size and a random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop is finally resized to given size. This is popularly used to train the Inception networks.

Parameters:

Name Type Description Default
size

Expected output size of each edge

required
scale

Range of size of the origin size cropped

(0.08, 1.0)
ratio

Range of aspect ratio of the origin aspect ratio cropped

(3.0 / 4.0, 4.0 / 3.0)
interpolation

Default: PIL.Image.BILINEAR

'default'
Source code in src/super_gradients/training/datasets/datasets_utils.py
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
@register_transform(Transforms.RandomResizedCropAndInterpolation)
class RandomResizedCropAndInterpolation(RandomResizedCrop):
    """
    Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made, then
    resized to the given size. This is popularly used to train the Inception networks.

    :param size: Expected output size of each edge
    :param scale: Range of size of the origin size cropped
    :param ratio: Range of aspect ratio of the origin aspect ratio cropped
    :param interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation="default"):
        super().__init__(size=size, scale=scale, ratio=ratio, interpolation=interpolation)
        # Resolve the interpolation argument: "random" keeps a pool that forward()
        # samples from on every call, "default" pins bilinear, anything else is
        # translated through _pil_interp.
        if interpolation == "random":
            resolved = _RANDOM_INTERPOLATION
        elif interpolation == "default":
            resolved = InterpolationMode.BILINEAR
        else:
            resolved = _pil_interp(interpolation)
        self.interpolation = resolved

    def forward(self, img: Image) -> Image:
        """
        :param img: Image to be cropped and resized.
        :return: Image: Randomly cropped and resized image.
        """
        top, left, height, width = self.get_params(img, self.scale, self.ratio)
        has_pool = isinstance(self.interpolation, (tuple, list))
        chosen = random.choice(self.interpolation) if has_pool else self.interpolation
        return torchvision.transforms.functional.resized_crop(img, top, left, height, width, self.size, chosen)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            interpolate_str = " ".join(_pil_interpolation_to_str[x] for x in self.interpolation)
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        return "{name}(size={size}, scale={scale}, ratio={ratio}, interpolation={interp})".format(
            name=self.__class__.__name__,
            size=self.size,
            scale=tuple(round(s, 4) for s in self.scale),
            ratio=tuple(round(r, 4) for r in self.ratio),
            interp=interpolate_str,
        )

forward(img)

Parameters:

Name Type Description Default
img Image

Image to be cropped and resized.

required

Returns:

Type Description
Image

Image: Randomly cropped and resized image.

Source code in src/super_gradients/training/datasets/datasets_utils.py
344
345
346
347
348
349
350
351
352
353
354
def forward(self, img: Image) -> Image:
    """
    Crop the input at a random location/aspect ratio and resize it to self.size.

    :param img: Image to be cropped and resized.
    :return: Image: Randomly cropped and resized image.
    """
    top, left, height, width = self.get_params(img, self.scale, self.ratio)
    if isinstance(self.interpolation, (tuple, list)):
        chosen = random.choice(self.interpolation)
    else:
        chosen = self.interpolation
    return torchvision.transforms.functional.resized_crop(img, top, left, height, width, self.size, chosen)

get_color_augmentation(rand_augment_config_string, color_jitter, crop_size=224, img_mean=[0.485, 0.456, 0.406])

Returns a color augmentation class. As these augmentations cannot be applied on top of one another, only one is returned according to rand_augment_config_string

Parameters:

Name Type Description Default
rand_augment_config_string str

string which defines the auto augment configurations. If none, color jitter will be returned. For possible values see auto_augment.py

required
color_jitter tuple

tuple for color jitter value.

required
crop_size

relevant only for auto augment

224
img_mean

relevant only for auto augment

[0.485, 0.456, 0.406]

Returns:

Type Description

RandAugment transform or ColorJitter

Source code in src/super_gradients/training/datasets/datasets_utils.py
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
def get_color_augmentation(rand_augment_config_string: str, color_jitter: tuple, crop_size=224, img_mean=None):
    """
    Returns color augmentation class. As these augmentation cannot work on top one another, only one is returned
    according to rand_augment_config_string

    :param rand_augment_config_string: string which defines the auto augment configurations.
                                       If none, color jitter will be returned. For possible values see auto_augment.py
    :param color_jitter: tuple for color jitter value.
    :param crop_size: relevant only for auto augment
    :param img_mean: relevant only for auto augment. Defaults to the ImageNet mean [0.485, 0.456, 0.406].
    :return: RandAugment transform or ColorJitter
    """
    # BUGFIX: the default used to be a mutable list literal in the signature, which is
    # shared across calls; a None sentinel avoids that while keeping the same default value.
    if img_mean is None:
        img_mean = [0.485, 0.456, 0.406]
    if rand_augment_config_string:
        color_augmentation = rand_augment_transform(rand_augment_config_string, crop_size, img_mean)

    else:  # RandAugment includes colorjitter like augmentations, both cannot be applied together.
        color_augmentation = transforms.ColorJitter(*color_jitter)
    return color_augmentation

get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224)

A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

Parameters:

Name Type Description Default
data_dir

String, path to none-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"

None
dataloader

a torch DataLoader, as it would feed the data into the trainer (including transforms etc).

None
RandomResizeSize

Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet, this value should be 224).

224

Returns:

Type Description

2 lists,mean and std, each one of len 3 (1 for each channel)

Source code in src/super_gradients/training/datasets/datasets_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224):
    """
    A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

    :param data_dir: String, path to none-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"
    :param dataloader: a torch DataLoader, as it would feed the data into the trainer (including transforms etc).
    :param num_workers: number of workers for the internally-created DataLoader (used only when data_dir is given).
    :param RandomResizeSize: Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet,
    this value should be 224).
    :return: 2 lists, mean and std, each one of len 3 (1 for each channel)
    """
    assert data_dir is None or dataloader is None, "Please provide either path to data folder or DataLoader, not both."

    if dataloader is None:
        traindir = os.path.join(os.path.abspath(data_dir), "train")
        trainset = ImageFolder(
            traindir, transforms.Compose([transforms.RandomResizedCrop(RandomResizeSize), transforms.RandomHorizontalFlip(), transforms.ToTensor()])
        )
        dataloader = torch.utils.data.DataLoader(trainset, batch_size=1, num_workers=num_workers)

    # BUGFIX: the sample count was previously taken from `trainset`, which is undefined
    # (NameError) when a ready-made dataloader is passed in; the old progress print also
    # assumed the dataset exposes a `.targets` attribute. Use the loader's dataset instead.
    num_samples = len(dataloader.dataset)
    print(f"Calculating on {num_samples} Training Samples")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    h, w = 0, 0
    # First pass: accumulate the per-channel sum to compute the mean.
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        if batch_idx == 0:
            # Assumes all batches share the same spatial size (true after the resize transform).
            h, w = inputs.size(2), inputs.size(3)
            print(f"Min: {inputs.min()}, Max: {inputs.max()}")
            chsum = inputs.sum(dim=(0, 2, 3), keepdim=True)
        else:
            chsum += inputs.sum(dim=(0, 2, 3), keepdim=True)
    mean = chsum / num_samples / h / w
    print(f"mean: {mean.view(-1)}")

    # Second pass: accumulate squared deviations to compute the unbiased std.
    chsum = None
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        if batch_idx == 0:
            chsum = (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
        else:
            chsum += (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
    std = torch.sqrt(chsum / (num_samples * h * w - 1))
    print(f"std: {std.view(-1)}")
    return mean.view(-1).cpu().numpy().tolist(), std.view(-1).cpu().numpy().tolist()

worker_init_reset_seed(worker_id)

Make sure each process has different random seed, especially for 'fork' method. Check https://github.com/pytorch/pytorch/issues/63311 for more details.

Parameters:

Name Type Description Default
worker_id

placeholder (needs to be passed to DataLoader init).

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
657
658
659
660
661
662
663
664
665
666
667
def worker_init_reset_seed(worker_id):
    """
    Make sure each process has different random seed, especially for 'fork' method.
    Check https://github.com/pytorch/pytorch/issues/63311 for more details.

    :param worker_id: placeholder (needs to be passed to DataLoader init).
    """
    # uuid4 is backed by os.urandom, so every worker process draws an independent seed.
    seed = uuid.uuid4().int % 2**32
    random.seed(seed)
    # torch.manual_seed already installs the new state on the default generator;
    # the previous `torch.set_rng_state(torch.manual_seed(seed).get_state())` re-set
    # the exact same state and was therefore redundant.
    torch.manual_seed(seed)
    np.random.seed(seed)

AbstractDepthEstimationDataset

Bases: Dataset

Abstract class for datasets for depth estimation task.

Attempting to follow principles provided in pose_estimation_dataset.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
class AbstractDepthEstimationDataset(Dataset):
    """
    Abstract class for datasets for depth estimation task.

    Attempting to follow principles provided in pose_estimation_dataset.
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(self, transforms: List[AbstractDepthEstimationTransform] = None):
        super().__init__()
        # Default to an empty pipeline so __getitem__ can always iterate the list.
        self.transforms = transforms or []

    @abc.abstractmethod
    def load_sample(self, index: int) -> DepthEstimationSample:
        """
        Load a depth estimation sample from the dataset.

        :param index: Index of the sample to load.
        :return: Instance of DepthEstimationSample.

        If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas,
        ensure that the same value is used as the `ignore_val` argument in your metric and loss functions.
        Fill the entries in the depth map that are supposed to be ignored with the `ignore_val` after loading the sample.
        """
        raise NotImplementedError()

    def load_random_sample(self) -> DepthEstimationSample:
        """
        Return a random sample from the dataset

        :return: Instance of DepthEstimationSample
        """
        num_samples = len(self)
        random_index = random.randrange(0, num_samples)
        return self.load_sample(random_index)

    def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get a transformed depth estimation sample from the dataset.

        :param index: Index of the sample to retrieve.
        :return: Tuple containing the transformed image and depth map as np.ndarrays.

        After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be
        a 2D array (e.g., Height x Width).

        Before returning the image and depth map, the image's channels are moved to CHW format and additional
         dummy dimension is added to the depth map resulting 1HW shape.
        """
        sample = self.load_sample(index)
        # Transforms are applied sequentially; each receives and returns a sample object.
        for transform in self.transforms:
            sample = transform(sample)
        # HWC -> CHW for the image; depth map gains a leading channel dim (1, H, W).
        return np.transpose(sample.image, (2, 0, 1)).astype(np.float32), np.expand_dims(sample.depth_map, axis=0).astype(np.float32)

    def plot(
        self,
        max_samples_per_plot: int = 8,
        n_plots: int = 1,
        plot_transformed_data: bool = True,
        color_scheme: Optional[int] = None,
        drop_extreme_percentage: float = 0,
        inverse: bool = False,
    ):
        """
        Combine samples of images with depth maps into plots and display the result.

        :param max_samples_per_plot:    Maximum number of samples (image with depth map) to be displayed per plot.
        :param n_plots:                 Number of plots to display.
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e., on __getitem__).
                                        If False, the plot will be over the raw samples (i.e., on load_sample).
        :param color_scheme:            OpenCV color scheme for the depth map visualization. If not specified:
                                        - If `inverse=True`, the default is COLORMAP_VIRIDIS.
                                        - If `inverse=False`, the default is COLORMAP_MAGMA.


        :param drop_extreme_percentage: Percentage of extreme values to drop on both ends of the depth spectrum.
        :param inverse:                 Apply inversion (1 / depth) if True to the depth map.

        :return: None
        """
        plot_counter = 0

        for plot_i in range(n_plots):
            # Row 0 holds the images, row 1 the matching depth maps.
            fig, axes = plt.subplots(2, max_samples_per_plot, figsize=(15, 5))
            for img_i in range(max_samples_per_plot):
                # Consecutive plots show consecutive dataset indices.
                index = img_i + plot_i * max_samples_per_plot
                if plot_transformed_data:
                    image, depth_map = self[index]

                    # Transpose to HWC format for visualization
                    image = image.transpose(1, 2, 0)
                    depth_map = depth_map.squeeze()  # Remove dummy dimension
                else:
                    sample = self.load_sample(index)
                    image, depth_map = sample.image, sample.depth_map

                # Plot the image
                axes[0, img_i].imshow(image)
                axes[0, img_i].axis("off")
                axes[0, img_i].set_title(f"Sample {index}")

                # Plot the depth map side by side with the selected color scheme
                depth_map = DepthVisualization.process_depth_map_for_visualization(depth_map, color_scheme, drop_extreme_percentage, inverse)
                axes[1, img_i].imshow(depth_map)
                axes[1, img_i].axis("off")
                axes[1, img_i].set_title(f"Depth Map {index}")

            plt.show()
            plt.close()

            plot_counter += 1
            # NOTE(review): this early-return duplicates the loop bound — range(n_plots)
            # above already stops after n_plots iterations.
            if plot_counter == n_plots:
                return

__getitem__(index)

Get a transformed depth estimation sample from the dataset.

Parameters:

Name Type Description Default
index int

Index of the sample to retrieve.

required

Returns:

Type Description
Tuple[np.ndarray, np.ndarray]

Tuple containing the transformed image and depth map as np.ndarrays. After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be a 2D array (e.g., Height x Width). Before returning the image and depth map, the image's channels are moved to CHW format and additional dummy dimension is added to the depth map resulting 1HW shape.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Get a transformed depth estimation sample from the dataset.

    :param index: Index of the sample to retrieve.
    :return: Tuple containing the transformed image and depth map as np.ndarrays.

    After applying the transforms pipeline, the image is expected to be in HWC format,
    and the depth map should be a 2D array (e.g., Height x Width).

    Before returning, the image's channels are moved to CHW format and a dummy leading
    dimension is added to the depth map, resulting in a 1HW shape.
    """
    transformed = self.load_sample(index)
    for transform_fn in self.transforms:
        transformed = transform_fn(transformed)
    image_chw = np.transpose(transformed.image, (2, 0, 1)).astype(np.float32)
    depth_1hw = np.expand_dims(transformed.depth_map, axis=0).astype(np.float32)
    return image_chw, depth_1hw

load_random_sample()

Return a random sample from the dataset

Returns:

Type Description
DepthEstimationSample

Instance of DepthEstimationSample

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
44
45
46
47
48
49
50
51
52
def load_random_sample(self) -> DepthEstimationSample:
    """
    Draw one sample from the dataset at a uniformly random index.

    :return: Instance of DepthEstimationSample
    """
    random_index = random.randrange(len(self))
    return self.load_sample(random_index)

load_sample(index) abstractmethod

Load a depth estimation sample from the dataset.

Parameters:

Name Type Description Default
index int

Index of the sample to load.

required

Returns:

Type Description
DepthEstimationSample

Instance of DepthEstimationSample. If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas, ensure that the same value is used as the ignore_val argument in your metric and loss functions. Fill the entries in the depth map that are supposed to be ignored with the ignore_val after loading the sample.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
30
31
32
33
34
35
36
37
38
39
40
41
42
@abc.abstractmethod
def load_sample(self, index: int) -> DepthEstimationSample:
    """
    Load one depth estimation sample from the dataset (must be implemented by subclasses).

    :param index: Index of the sample to load.
    :return: Instance of DepthEstimationSample.

    Implementations that have non-labeled regions marked with a sentinel value
    (e.g., -100) must fill those depth-map entries with the same `ignore_val`
    that the metric and loss functions are configured with.
    """
    raise NotImplementedError()

plot(max_samples_per_plot=8, n_plots=1, plot_transformed_data=True, color_scheme=None, drop_extreme_percentage=0, inverse=False)

Combine samples of images with depth maps into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of samples (image with depth map) to be displayed per plot.

8
n_plots int

Number of plots to display.

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e., on getitem). If False, the plot will be over the raw samples (i.e., on load_sample).

True
color_scheme Optional[int]

OpenCV color scheme for the depth map visualization. If not specified: - If inverse=True, the default is COLORMAP_VIRIDIS. - If inverse=False, the default is COLORMAP_MAGMA.

None
drop_extreme_percentage float

Percentage of extreme values to drop on both ends of the depth spectrum.

0
inverse bool

Apply inversion (1 / depth) if True to the depth map.

False

Returns:

Type Description

None

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def plot(
    self,
    max_samples_per_plot: int = 8,
    n_plots: int = 1,
    plot_transformed_data: bool = True,
    color_scheme: Optional[int] = None,
    drop_extreme_percentage: float = 0,
    inverse: bool = False,
):
    """
    Combine samples of images with depth maps into plots and display the result.

    :param max_samples_per_plot:    Maximum number of samples (image with depth map) to be displayed per plot.
    :param n_plots:                 Number of plots to display.
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e., on __getitem__).
                                    If False, the plot will be over the raw samples (i.e., on load_sample).
    :param color_scheme:            OpenCV color scheme for the depth map visualization. If not specified:
                                    - If `inverse=True`, the default is COLORMAP_VIRIDIS.
                                    - If `inverse=False`, the default is COLORMAP_MAGMA.
    :param drop_extreme_percentage: Percentage of extreme values to drop on both ends of the depth spectrum.
    :param inverse:                 Apply inversion (1 / depth) if True to the depth map.

    :return: None
    """
    # The range already bounds the number of plots; no extra counter/early-return needed.
    for plot_i in range(n_plots):
        fig, axes = plt.subplots(2, max_samples_per_plot, figsize=(15, 5))
        for img_i in range(max_samples_per_plot):
            # Samples are consumed sequentially across consecutive plots.
            index = img_i + plot_i * max_samples_per_plot
            if plot_transformed_data:
                image, depth_map = self[index]

                # __getitem__ yields a CHW image and a 1HW depth map; undo both for plotting.
                image = image.transpose(1, 2, 0)
                depth_map = depth_map.squeeze()  # Remove dummy dimension
            else:
                sample = self.load_sample(index)
                image, depth_map = sample.image, sample.depth_map

            # Plot the image
            axes[0, img_i].imshow(image)
            axes[0, img_i].axis("off")
            axes[0, img_i].set_title(f"Sample {index}")

            # Plot the depth map side by side with the selected color scheme
            depth_map = DepthVisualization.process_depth_map_for_visualization(depth_map, color_scheme, drop_extreme_percentage, inverse)
            axes[1, img_i].imshow(depth_map)
            axes[1, img_i].axis("off")
            axes[1, img_i].set_title(f"Depth Map {index}")

        plt.show()
        plt.close()

NYUv2DepthEstimationDataset

Bases: AbstractDepthEstimationDataset

Dataset class for NYU Depth V2 dataset for depth estimation.

Parameters:

Name Type Description Default
root str

Root directory containing the dataset.

required
df_path str

Path to the CSV file containing image and depth map file paths, relative to root.

required
transforms

Transforms to be applied to the samples. To use the NYUv2Dataset class, ensure that your dataset directory is organized as follows: - Root directory (specified as 'root' when initializing the dataset) - nyu2_train (or any other split) - scene_category_1 - image_1.jpg - image_2.png - ... - scene_category_2 - image_1.jpg - image_2.png - ... - ... - nyu2_test (or any other split) - 00000_colors.png - 00001_colors.png - 00002_colors.png ... The CSV file (specified as 'df_path' when initializing the dataset) should contain two columns: path to the color images, path to depth maps (both relative to the root). Example CSV content: data/nyu2_train/scene_category_1/image_1.jpg, data/nyu2_train/scene_category_1/image_1_depth.png data/nyu2_train/scene_category_1/image_2.jpg, data/nyu2_train/scene_category_1/image_2_depth.png data/nyu2_train/scene_category_2/image_1.jpg, data/nyu2_train/scene_category_2/image_1_depth.png Note: As of 14/12/2023 official download link is broken. Data can be obtained at https://www.kaggle.com/code/shreydan/monocular-depth-estimation-nyuv2/input ...

None
Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
@register_dataset(Datasets.NYUV2_DEPTH_ESTIMATION_DATASET)
class NYUv2DepthEstimationDataset(AbstractDepthEstimationDataset):
    """
    Dataset class for NYU Depth V2 dataset for depth estimation.

    :param root: Root directory containing the dataset.
    :param df_path: Path to the CSV file containing image and depth map file paths, relative to root.
    :param transforms: Transforms to be applied to the samples.

    Expected directory layout under `root`:

    - nyu2_train (or any other split)
      - scene_category_1
        - image_1.jpg
        - image_2.png
        - ...
      - scene_category_2
        - image_1.jpg
        - image_2.png
        - ...
      - ...
    - nyu2_test (or any other split)
      - 00000_colors.png
      - 00001_colors.png
      - 00002_colors.png
      ...

    The CSV file (specified as 'df_path' when initializing the dataset) has two columns,
    both relative to `root`: the color image path and the matching depth map path.

    Example CSV content:
    data/nyu2_train/scene_category_1/image_1.jpg,   data/nyu2_train/scene_category_1/image_1_depth.png
    data/nyu2_train/scene_category_1/image_2.jpg,   data/nyu2_train/scene_category_1/image_2_depth.png
    data/nyu2_train/scene_category_2/image_1.jpg,   data/nyu2_train/scene_category_2/image_1_depth.png

    Note: As of 14/12/2023 the official download link is broken.
     Data can be obtained at https://www.kaggle.com/code/shreydan/monocular-depth-estimation-nyuv2/input
    """

    def __init__(self, root: str, df_path: str, transforms=None):
        """
        Initialize NYUv2Dataset.

        :param root: Root directory containing the dataset.
        :param df_path: Path to the CSV file containing image and depth map file paths.
        :param transforms: Transforms to be applied to the samples.
        """
        super().__init__(transforms=transforms)
        # `root` must be assigned before reading the CSV: _read_df prefixes every path with it.
        self.root = root
        self.df = self._read_df(df_path)
        self._check_paths_exist()

    def _read_df(self, df_path: str) -> pd.DataFrame:
        """
        Read the two-column CSV of image/depth-map paths and prefix each entry with `self.root`.

        :param df_path: Path to the CSV file.
        :return: DataFrame whose columns 0 and 1 hold the full image and depth map paths.
        """
        df = pd.read_csv(df_path, header=None)
        for col in (0, 1):
            df[col] = df[col].map(lambda rel_path: os.path.join(self.root, rel_path))
        return df

    def load_sample(self, index: int) -> DepthEstimationSample:
        """
        Load a depth estimation sample at the specified index.

        :param index: Index of the sample.

        :return: Loaded depth estimation sample.
        """
        row = self.df.iloc[index]
        image = cv2.imread(row[0], cv2.IMREAD_COLOR)
        depth_map = cv2.imread(row[1], cv2.IMREAD_GRAYSCALE)
        return DepthEstimationSample(image=image, depth_map=depth_map)

    def __len__(self):
        """Return the number of samples, i.e. the number of rows in the paths DataFrame."""
        return self.df.shape[0]

    def _check_paths_exist(self):
        """
        Keep only rows of `self.df` whose paths all exist on disk; warn about every
        dropped row and raise FileNotFoundError if nothing remains.
        """
        kept_rows = []
        for _, row in self.df.iterrows():
            if all(os.path.exists(path) for path in row):
                kept_rows.append(row)
            else:
                warnings.warn(f"Warning: Removed the following line as one or more paths do not exist: {row}")

        if not kept_rows:
            raise FileNotFoundError("All lines in the dataset have been removed as some paths do not exist. Please check the paths and dataset structure.")

        self.df = pd.DataFrame(kept_rows, columns=[0, 1])

__init__(root, df_path, transforms=None)

Initialize NYUv2Dataset.

Parameters:

Name Type Description Default
root str

Root directory containing the dataset.

required
df_path str

Path to the CSV file containing image and depth map file paths.

required
transforms

Transforms to be applied to the samples.

None
Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(self, root: str, df_path: str, transforms=None):
    """
    Initialize NYUv2Dataset.

    :param root: Root directory containing the dataset.
    :param df_path: Path to the CSV file containing image and depth map file paths, relative to `root`.
    :param transforms: Transforms to be applied to the samples.
    """
    super(NYUv2DepthEstimationDataset, self).__init__(transforms=transforms)
    # `root` must be assigned before reading the CSV: _read_df joins it onto every path.
    self.root = root
    self.df = self._read_df(df_path)
    # Drops rows with missing image/depth files; raises FileNotFoundError if none remain.
    self._check_paths_exist()

__len__()

Get the number of samples in the dataset.

Returns:

Type Description

Number of samples in the dataset.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
 94
 95
 96
 97
 98
 99
100
def __len__(self):
    """
    Return the number of samples in the dataset (rows of the paths DataFrame).
    """
    return self.df.shape[0]

load_sample(index)

Load a depth estimation sample at the specified index.

Parameters:

Name Type Description Default
index int

Index of the sample.

required

Returns:

Type Description
DepthEstimationSample

Loaded depth estimation sample.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
80
81
82
83
84
85
86
87
88
89
90
91
92
def load_sample(self, index: int) -> DepthEstimationSample:
    """
    Read the image/depth-map pair at `index` from disk.

    :param index: Index of the sample.

    :return: Loaded depth estimation sample.
    """
    row = self.df.iloc[index]
    # Column 0 is the color image path, column 1 the depth map path.
    image = cv2.imread(row[0], cv2.IMREAD_COLOR)
    depth_map = cv2.imread(row[1], cv2.IMREAD_GRAYSCALE)
    return DepthEstimationSample(image=image, depth_map=depth_map)

COCODetectionDataset

Bases: COCOFormatDetectionDataset

Dataset for COCO object detection.

To use this Dataset you need to:

- Download coco dataset:
    annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
    train2017: http://images.cocodataset.org/zips/train2017.zip
    val2017: http://images.cocodataset.org/zips/val2017.zip

- Unzip and organize it as below:
    coco
    ├── annotations
    │      ├─ instances_train2017.json
    │      ├─ instances_val2017.json
    │      └─ ...
    └── images
        ├── train2017
        │   ├─ 000000000001.jpg
        │   └─ ...
        └── val2017
            └─ ...

- Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

- Instantiate the dataset:
    >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
    >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
Source code in src/super_gradients/training/datasets/detection_datasets/coco_detection.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
@register_dataset(Datasets.COCO_DETECTION_DATASET)
class COCODetectionDataset(COCOFormatDetectionDataset):
    """Dataset for COCO object detection.

    To use this Dataset you need to:

        - Download coco dataset:
            annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
            train2017: http://images.cocodataset.org/zips/train2017.zip
            val2017: http://images.cocodataset.org/zips/val2017.zip

        - Unzip and organize it as below:
            coco
            ├── annotations
            │      ├─ instances_train2017.json
            │      ├─ instances_val2017.json
            │      └─ ...
            └── images
                ├── train2017
                │   ├─ 000000000001.jpg
                │   └─ ...
                └── val2017
                    └─ ...

        - Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

        - Instantiate the dataset:
            >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
            >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
    """

    def __init__(
        self,
        json_file: str = "instances_train2017.json",
        subdir: str = "images/train2017",
        *args,
        **kwargs,
    ):
        """
        :param json_file: Name of the coco json file, expected at data_dir/annotations/json_file.
        :param subdir:    Sub directory of data_dir containing the data.

        Remaining args/kwargs are forwarded to COCOFormatDetectionDataset, e.g.:
            with_crowd: add the crowd groundtruths to __getitem__
            all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
        """
        # COCO annotation files always live in the "annotations" folder under data_dir.
        annotation_file = os.path.join("annotations", json_file)
        super().__init__(json_annotation_file=annotation_file, images_dir=subdir, *args, **kwargs)

__init__(json_file='instances_train2017.json', subdir='images/train2017', *args, **kwargs)

Parameters:

Name Type Description Default
json_file str

Name of the coco json file, that resides in data_dir/annotations/json_file.

'instances_train2017.json'
subdir str

Sub directory of data_dir containing the data.

'images/train2017'
with_crowd

Add the crowd groundtruths to `__getitem__`. kwargs: all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.

required
Source code in src/super_gradients/training/datasets/detection_datasets/coco_detection.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __init__(
    self,
    json_file: str = "instances_train2017.json",
    subdir: str = "images/train2017",
    *args,
    **kwargs,
):
    """
    :param json_file: Name of the coco json file, expected at data_dir/annotations/json_file.
    :param subdir:    Sub directory of data_dir containing the data.

    Remaining args/kwargs are forwarded to the base class, e.g.:
        with_crowd: add the crowd groundtruths to __getitem__
        all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
    """
    # COCO annotation files always live in the "annotations" folder under data_dir.
    annotation_file = os.path.join("annotations", json_file)
    super().__init__(json_annotation_file=annotation_file, images_dir=subdir, *args, **kwargs)

COCOFormatDetectionDataset

Bases: DetectionDataset

Base dataset to load ANY dataset that is with a similar structure to the COCO dataset. - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh). - One folder with all the images.

Output format: (x, y, x, y, class_id)

Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@register_dataset("COCOFormatDetectionDataset")
class COCOFormatDetectionDataset(DetectionDataset):
    """Base dataset to load ANY dataset that is with a similar structure to the COCO dataset.
    - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh).
    - One folder with all the images.

    Output format: (x, y, x, y, class_id)
    """

    @deprecated_parameter(
        "tight_box_rotation",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `tight_box_rotation` has been removed. This parameter has no effect anymore.",
    )
    def __init__(
        self,
        data_dir: str,
        json_annotation_file: str,
        images_dir: str,
        with_crowd: bool = True,
        class_ids_to_ignore: Optional[List[int]] = None,
        tight_box_rotation=None,
        *args,
        **kwargs,
    ):
        """
        :param data_dir:                Where the data is stored.
        :param json_annotation_file:    Name of the coco json file. Path can be either absolute, or relative to data_dir.
        :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
        :param with_crowd:              Add the crowd groundtruths to __getitem__
        :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
        :param tight_box_rotation:      This parameter is deprecated and will be removed in a SuperGradients 3.8.
        """
        if tight_box_rotation is not None:
            logger.warning(
                "Parameter `tight_box_rotation` is deprecated and will be removed in a SuperGradients 3.8." "Please remove this parameter from your code."
            )
        self.images_dir = images_dir
        self.json_annotation_file = json_annotation_file
        self.with_crowd = with_crowd
        self.class_ids_to_ignore = class_ids_to_ignore or []

        target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
        kwargs["target_fields"] = target_fields
        kwargs["output_fields"] = ["image", *target_fields]
        kwargs["original_target_format"] = XYXY_LABEL
        super().__init__(data_dir=data_dir, *args, **kwargs)

        if len(self.original_classes) != len(self.all_classes_list):
            if set(self.all_classes_list).issubset(set(self.original_classes)):
                raise ParameterMismatchException(
                    "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                    "Please use `class_inclusion_list` to train with reduced number of classes",
                )
            else:
                raise DatasetValidationException(
                    "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                    "Most likely this indicates an error in your all_classes_list parameter"
                )

    def _setup_data_source(self) -> int:
        """
        Parse COCO annotation file
        :return: Number of images in annotation JSON
        """
        if os.path.isabs(self.json_annotation_file):
            annotation_file_path = self.json_annotation_file
        else:
            annotation_file_path = os.path.join(self.data_dir, self.json_annotation_file)
        if not os.path.exists(annotation_file_path):
            raise ValueError("Could not find annotation file under " + str(annotation_file_path))

        all_class_names, annotations = parse_coco_into_detection_annotations(
            annotation_file_path,
            exclude_classes=None,
            include_classes=None,
            # This parameter exists solely for the purpose of keeping the backward compatibility with the old code.
            # Once we refactor base dataset, we can remove this parameter and use only exclude_classes/include_classes
            # at parsing time instead.
            class_ids_to_ignore=self.class_ids_to_ignore,
            image_path_prefix=os.path.join(self.data_dir, self.images_dir),
        )

        self.original_classes = list(all_class_names)
        self.classes = copy.deepcopy(self.original_classes)
        self._annotations = annotations
        return len(annotations)

    @property
    def _all_classes(self) -> List[str]:
        return self.original_classes

    def _load_annotation(self, sample_id: int) -> dict:
        """
        Load relevant information of a specific image.

        :param sample_id:               Sample_id in the dataset
        :return target:                 Target Bboxes (detection) in XYXY_LABEL format
        :return crowd_target:           Crowd target Bboxes (detection) in XYXY_LABEL format
        :return target_segmentation:    Segmentation
        :return initial_img_shape:      Image (height, width)
        :return resized_img_shape:      Resides image (height, width)
        :return img_path:               Path to the associated image
        """

        annotation = self._annotations[sample_id]

        width = annotation.image_width
        height = annotation.image_height

        # Make a copy of the annotations, so that we can modify them
        boxes_xyxy = change_bbox_bounds_for_image_size(annotation.ann_boxes_xyxy, img_shape=(height, width), inplace=False)
        iscrowd = annotation.ann_is_crowd.copy()
        labels = annotation.ann_labels.copy()

        # Exclude boxes with invalid dimensions (x1 > x2 or y1 > y2)
        mask = np.logical_and(boxes_xyxy[:, 2] >= boxes_xyxy[:, 0], boxes_xyxy[:, 3] >= boxes_xyxy[:, 1])
        boxes_xyxy = boxes_xyxy[mask]
        iscrowd = iscrowd[mask]
        labels = labels[mask]

        # Currently, the base class includes a feature to resize the image, so we need to resize the target as well when self.input_dim is set.
        initial_img_shape = (height, width)
        if self.input_dim is not None:
            scale_factor = min(self.input_dim[0] / height, self.input_dim[1] / width)
            resized_img_shape = (int(height * scale_factor), int(width * scale_factor))
        else:
            resized_img_shape = initial_img_shape
            scale_factor = 1

        targets = np.concatenate([boxes_xyxy[~iscrowd] * scale_factor, labels[~iscrowd, None]], axis=1).astype(np.float32)
        crowd_targets = np.concatenate([boxes_xyxy[iscrowd] * scale_factor, labels[iscrowd, None]], axis=1).astype(np.float32)

        annotation = {
            "target": targets,
            "crowd_target": crowd_targets,
            "initial_img_shape": initial_img_shape,
            "resized_img_shape": resized_img_shape,
            "img_path": annotation.image_path,
        }
        return annotation

__init__(data_dir, json_annotation_file, images_dir, with_crowd=True, class_ids_to_ignore=None, tight_box_rotation=None, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
json_annotation_file str

Name of the coco json file. Path can be either absolute, or relative to data_dir.

required
images_dir str

Name of the directory that includes all the images. Path relative to data_dir.

required
with_crowd bool

Add the crowd groundtruths to getitem

True
class_ids_to_ignore Optional[List[int]]

List of class ids to ignore in the dataset. By default, doesn't ignore any class.

None
tight_box_rotation

This parameter is deprecated and will be removed in a SuperGradients 3.8.

None
Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
@deprecated_parameter(
    "tight_box_rotation",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `tight_box_rotation` has been removed. This parameter has no effect anymore.",
)
def __init__(
    self,
    data_dir: str,
    json_annotation_file: str,
    images_dir: str,
    with_crowd: bool = True,
    class_ids_to_ignore: Optional[List[int]] = None,
    tight_box_rotation=None,
    *args,
    **kwargs,
):
    """
    :param data_dir:                Where the data is stored.
    :param json_annotation_file:    Name of the coco json file. Path can be either absolute, or relative to data_dir.
    :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
    :param with_crowd:              Add the crowd groundtruths to __getitem__
    :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
    :param tight_box_rotation:      This parameter is deprecated and will be removed in a SuperGradients 3.8.
    """
    if tight_box_rotation is not None:
        logger.warning(
            "Parameter `tight_box_rotation` is deprecated and will be removed in a SuperGradients 3.8." "Please remove this parameter from your code."
        )
    self.images_dir = images_dir
    self.json_annotation_file = json_annotation_file
    self.with_crowd = with_crowd
    self.class_ids_to_ignore = class_ids_to_ignore or []

    target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
    kwargs["target_fields"] = target_fields
    kwargs["output_fields"] = ["image", *target_fields]
    kwargs["original_target_format"] = XYXY_LABEL
    super().__init__(data_dir=data_dir, *args, **kwargs)

    if len(self.original_classes) != len(self.all_classes_list):
        if set(self.all_classes_list).issubset(set(self.original_classes)):
            raise ParameterMismatchException(
                "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                "Please use `class_inclusion_list` to train with reduced number of classes",
            )
        else:
            raise DatasetValidationException(
                "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                "Most likely this indicates an error in your all_classes_list parameter"
            )

parse_coco_into_detection_annotations(ann, exclude_classes=None, include_classes=None, class_ids_to_ignore=None, image_path_prefix=None)

Load COCO detection dataset from annotation file.

Parameters:

Name Type Description Default
ann str

A path to the JSON annotation file in COCO format.

required
exclude_classes Optional[List[str]]

List of classes to exclude from the dataset. All other classes will be included. This parameter is mutually exclusive with include_classes and class_ids_to_ignore.

None
include_classes Optional[List[str]]

List of classes to include in the dataset. All other classes will be excluded. This parameter is mutually exclusive with exclude_classes and class_ids_to_ignore.

None
class_ids_to_ignore Optional[List[int]]

List of category ids to ignore in the dataset. All other classes will be included. This parameter added for the purpose of backward compatibility with the class_ids_to_ignore argument of COCOFormatDetectionDataset but will be removed in future in favor of include_classes/exclude_classes. This parameter is mutually exclusive with exclude_classes and include_classes.

None
image_path_prefix

A prefix to add to the image paths in the annotation file.

None

Returns:

Type Description
Tuple[List[str], List[DetectionAnnotation]]

Tuple (class_names, annotations) where class_names is a list of class names (respecting include_classes/exclude_classes/class_ids_to_ignore) and annotations is a list of DetectionAnnotation objects.

Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
def parse_coco_into_detection_annotations(
    ann: str,
    exclude_classes: Optional[List[str]] = None,
    include_classes: Optional[List[str]] = None,
    class_ids_to_ignore: Optional[List[int]] = None,
    image_path_prefix=None,
) -> Tuple[List[str], List[DetectionAnnotation]]:
    """
    Load COCO detection dataset from annotation file.
    :param ann: A path to the JSON annotation file in COCO format.
    :param exclude_classes: List of classes to exclude from the dataset. All other classes will be included.
                                This parameter is mutually exclusive with include_classes and class_ids_to_ignore.

    :param include_classes:     List of classes to include in the dataset. All other classes will be excluded.
                                This parameter is mutually exclusive with exclude_classes and class_ids_to_ignore.
    :param class_ids_to_ignore: List of category ids to ignore in the dataset. All other classes will be included.
                                This parameter was added for backward compatibility with the class_ids_to_ignore
                                argument of COCOFormatDetectionDataset but will be
                                removed in future in favor of include_classes/exclude_classes.
                                This parameter is mutually exclusive with exclude_classes and include_classes.
    :param image_path_prefix:   A prefix to add to the image paths in the annotation file.
    :return:                    Tuple (class_names, annotations) where class_names is a list of class names
                                (respecting include_classes/exclude_classes/class_ids_to_ignore) and
                                annotations is a list of DetectionAnnotation objects.
    """
    with open(ann, "r") as f:
        coco = json.load(f)

    # Extract class names and class ids
    category_ids = np.array([category["id"] for category in coco["categories"]], dtype=int)
    category_names = np.array([category["name"] for category in coco["categories"]], dtype=str)

    # Extract box annotations and convert from COCO xywh to xyxy
    ann_box_xyxy = xywh_to_xyxy_inplace(np.array([annotation["bbox"] for annotation in coco["annotations"]], dtype=np.float32).reshape(-1, 4), image_shape=None)

    ann_category_id = np.array([annotation["category_id"] for annotation in coco["annotations"]], dtype=int).reshape(-1)
    ann_iscrowd = np.array([annotation["iscrowd"] for annotation in coco["annotations"]], dtype=bool).reshape(-1)
    ann_image_ids = np.array([annotation["image_id"] for annotation in coco["annotations"]], dtype=int).reshape(-1)

    # Extract image stuff
    img_ids = [img["id"] for img in coco["images"]]
    img_paths = [img["file_name"] if "file_name" in img else "{:012}".format(img["id"]) + ".jpg" for img in coco["images"]]
    img_width_height = [(img["width"], img["height"]) for img in coco["images"]]

    # Now, we can drop the annotations that belongs to the excluded classes
    if int(class_ids_to_ignore is not None) + int(exclude_classes is not None) + int(include_classes is not None) > 1:
        raise ValueError("Only one of exclude_classes, class_ids_to_ignore or include_classes can be specified")
    elif exclude_classes is not None:
        if len(exclude_classes) != len(set(exclude_classes)):
            raise ValueError("The excluded classes must be unique")
        classes_not_in_dataset = set(exclude_classes).difference(set(category_names))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the excluded classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_names, exclude_classes, invert=True)
    elif class_ids_to_ignore is not None:
        if len(class_ids_to_ignore) != len(set(class_ids_to_ignore)):
            raise ValueError("The ignored classes must be unique")
        classes_not_in_dataset = set(class_ids_to_ignore).difference(set(category_ids))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the ignored classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_ids, class_ids_to_ignore, invert=True)
    elif include_classes is not None:
        if len(include_classes) != len(set(include_classes)):
            raise ValueError("The included classes must be unique")
        classes_not_in_dataset = set(include_classes).difference(set(category_names))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the included classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_names, include_classes)
    else:
        keep_classes_mask = None

    if keep_classes_mask is not None:
        category_ids = category_ids[keep_classes_mask]
        category_names = category_names[keep_classes_mask]

        # BUGFIX: the mask must be applied to ALL per-annotation arrays, not only to
        # ann_category_id. Filtering just one of them would misalign the arrays zipped
        # below, pairing boxes/crowd-flags/image-ids with the wrong class labels.
        keep_anns_mask = np.isin(ann_category_id, category_ids)
        ann_category_id = ann_category_id[keep_anns_mask]
        ann_box_xyxy = ann_box_xyxy[keep_anns_mask]
        ann_iscrowd = ann_iscrowd[keep_anns_mask]
        ann_image_ids = ann_image_ids[keep_anns_mask]

    # category_ids can be non-sequential and not ordered
    num_categories = len(category_ids)

    # Make sequential
    order = np.argsort(category_ids, kind="stable")
    category_ids = category_ids[order]
    category_names = category_names[order]

    # Remap category ids to be in range [0, num_categories)
    class_label_table = np.zeros(np.max(category_ids) + 1, dtype=int) - 1
    new_class_ids = np.arange(num_categories, dtype=int)
    class_label_table[category_ids] = new_class_ids

    # Remap category ids in annotations
    ann_category_id = class_label_table[ann_category_id]
    if (ann_category_id < 0).any():
        raise ValueError("Some annotations have class ids that are not in the list of classes. This probably indicates a bug in the annotation file")

    annotations = []

    # Group the per-annotation arrays by image id so each image's targets can be gathered at once
    img_id2ann_box_xyxy = defaultdict(list)
    img_id2ann_iscrowd = defaultdict(list)
    img_id2ann_category_id = defaultdict(list)
    for ann_image_id, _ann_box_xyxy, _ann_iscrowd, _ann_category_id in zip(ann_image_ids, ann_box_xyxy, ann_iscrowd, ann_category_id):
        img_id2ann_box_xyxy[ann_image_id].append(_ann_box_xyxy)
        img_id2ann_iscrowd[ann_image_id].append(_ann_iscrowd)
        img_id2ann_category_id[ann_image_id].append(_ann_category_id)

    for img_id, image_path, (image_width, image_height) in zip(img_ids, img_paths, img_width_height):
        if image_path_prefix is not None:
            image_path = os.path.join(image_path_prefix, image_path)

        ann = DetectionAnnotation(
            image_id=img_id,
            image_path=image_path,
            image_width=image_width,
            image_height=image_height,
            ann_boxes_xyxy=np.asarray(img_id2ann_box_xyxy[img_id], dtype=np.float32).reshape(-1, 4),
            ann_is_crowd=np.asarray(img_id2ann_iscrowd[img_id], dtype=bool).reshape(-1),
            ann_labels=np.asarray(img_id2ann_category_id[img_id], dtype=int).reshape(-1),
        )
        annotations.append(ann)

    # Convert to a plain list of python strings to match the declared return type (List[str])
    return category_names.tolist(), annotations

DetectionDataset

Bases: Dataset, HasPreprocessingParams, HasClassesInformation

Detection dataset.

This is a boilerplate class to facilitate the implementation of datasets.

HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ? - Inherit from DetectionDataSet - implement the method self._load_annotation to return at least the fields "target" and "img_path" - Call super().__init__ with the required params. //!\ super().__init__ will call self._load_annotation, so make sure that every required attribute is set up before calling super().__init__ (ideally just call it last)

WORKFLOW: - On instantiation: - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.

- On call (__getitem__) for a specific image index:
    - The image and annotations are grouped together in a dict called SAMPLE
    - the sample is processed according to the transforms
    - Only the specified fields are returned by __getitem__

TERMINOLOGY - TARGET: Groundtruth, made of bboxes. The format can vary from one dataset to another - ANNOTATION: Combination of targets (groundtruth) and metadata of the image, but without the image itself. > Has to include the fields "target" and "img_path" > Can include other fields like "crowd_target", "image_info", "segmentation", ... - SAMPLE: Output of the dataset: > Has to include the fields "target" and "image" > Can include other fields like "crowd_target", "image_info", "segmentation", ... - Index: Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1 - Sample ID: Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
@register_dataset(Datasets.DETECTION_DATASET)
class DetectionDataset(Dataset, HasPreprocessingParams, HasClassesInformation):
    """Detection dataset.

    This is a boilerplate class to facilitate the implementation of datasets.

    HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ?
        - Inherit from DetectionDataSet
        - implement the method self._load_annotation to return at least the fields "target" and "img_path"
        - Call super().__init__ with the required params.
                //!\\ super().__init__ will call self._load_annotation, so make sure that every required
                      attributes are set up before calling super().__init__ (ideally just call it last)

    WORKFLOW:
        - On instantiation:
            - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.

        - On call (__getitem__) for a specific image index:
            - The image and annotations are grouped together in a dict called SAMPLE
            - the sample is processed according to th transform
            - Only the specified fields are returned by __getitem__

    TERMINOLOGY
        - TARGET:       Groundtruth, made of bboxes. The format can vary from one dataset to another
        - ANNOTATION:   Combination of targets (groundtruth) and metadata of the image, but without the image itself.
                            > Has to include the fields "target" and "img_path"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - SAMPLE:       Outout of the dataset:
                            > Has to include the fields "target" and "image"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - Index:        Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        - Sample ID:    Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(
        self,
        data_dir: str,
        original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
        max_num_samples: int = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: List[AbstractDetectionTransform] = [],  # NOTE(review): mutable default; never mutated here, but a None default would be safer
        all_classes_list: Optional[List[str]] = [],  # NOTE(review): mutable default; never mutated here
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        target_fields: List[str] = None,
        output_fields: List[str] = None,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """Detection dataset.

        :param data_dir:                Where the data is stored
        :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                        None means that the image will be loaded as is.
                                        Scalar (size) - Image will be resized to (size, size)
                                        Tuple (rows,cols) - Image will be resized to (rows, cols)
        :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                        differ based on transforms.
        :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
        :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                        but requires more RAM and more time to instantiate the dataset when working on very large datasets.
        :param transforms:              List of transforms to apply sequentially on sample.
        :param all_classes_list:        All the class names.
        :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                        Classes not in this list will excluded from training.
                                        Thus, number of classes in model must be adjusted accordingly.
        :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                                will be ignored.
        :param target_fields:                   List of the fields target fields. This has to include regular target,
                                                but can also include crowd target, segmentation target, ...
                                                It has to include at least "target" but can include other.
        :param output_fields:                   Fields that will be outputed by __getitem__.
                                                It has to include at least "image" and "target" but can include other.
        :param verbose:                 Whether to show additional information or not, such as loading progress. (doesnt include warnings)
        :param show_all_warnings:       Whether to show all warnings or not.
        :param cache:                   Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
        :param cache_dir:               Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
        """
        # Deprecated parameters are accepted but only trigger a warning; they have no effect.
        if cache is not None:
            warnings.warn(
                "cache parameter has been marked as deprecated and setting it has no effect. "
                "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
                DeprecationWarning,
            )
        if cache_dir is not None:
            warnings.warn(
                "cache_dir parameter has been marked as deprecated and setting it has no effect. "
                "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
                DeprecationWarning,
            )

        super().__init__()
        self.verbose = verbose
        self.show_all_warnings = show_all_warnings

        if isinstance(original_target_format, DetectionTargetsFormat):
            logger.warning(
                "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
                "Support for DetectionTargetsFormat will be removed in 3.1"
            )

        self.data_dir = data_dir
        if not Path(data_dir).exists():
            raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

        # Number of images that are available (regardless of ignored images)
        # _setup_data_source is implemented by subclasses; it must be callable at this point,
        # so subclasses have to set their attributes before calling super().__init__.
        n_dataset_samples = self._setup_data_source()
        if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
            raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
        n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

        self.input_dim = ensure_is_tuple_of_two(input_dim)
        self.original_target_format = original_target_format

        if len(all_classes_list) != len(set(all_classes_list)):
            raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

        if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
            raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

        # Fall back to the subclass-provided class list when all_classes_list is empty/None.
        self.all_classes_list = all_classes_list or self._all_classes
        self.all_classes_list = list(self.all_classes_list) if self.all_classes_list is not None else None
        self.class_inclusion_list = list(class_inclusion_list) if class_inclusion_list is not None else None
        self.classes = self.class_inclusion_list or self.all_classes_list
        if len(set(self.classes) - set(self.all_classes_list)) > 0:
            # NOTE(review): this uses the local `all_classes_list` (possibly empty) instead of
            # `self.all_classes_list`; when classes come from self._all_classes the reported
            # `wrong_classes` set may be misleading — verify intent.
            wrong_classes = set(self.classes) - set(all_classes_list)
            raise DatasetValidationException(
                f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
            )

        self.ignore_empty_annotations = ignore_empty_annotations
        self.target_fields = target_fields or ["target"]
        if "target" not in self.target_fields:
            raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

        # Fields every subclass' _load_annotation must return (validated in _load_sample_annotation).
        self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

        self.transforms = transforms

        self.output_fields = output_fields or ["image", "target"]
        if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
            raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

        self._cache_annotations = cache_annotations
        self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

        # Maps (dataset index) -> (non-empty sample ids)
        self._non_empty_sample_ids: Optional[List[int]] = None

        # Some transform may require non-empty annotations to be indexed.
        transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

        # Iterate over the whole dataset to index the images with/without annotations.
        if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:
            if self._cache_annotations:
                logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
            elif self.ignore_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
                )
            elif transform_require_non_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. "
                    "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
                )

            # Map indexes to sample annotations.
            non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
            if self._cache_annotations:
                if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                    self._cached_annotations = non_empty_annotations
                else:
                    # Non overlapping dicts. since they map unique sample_ids -> sample
                    self._cached_annotations = {**non_empty_annotations, **empty_annotations}

            if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
                raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

            self._non_empty_sample_ids = list(non_empty_annotations.keys())

        self._n_samples = n_samples  # Regardless of any filtering

    @property
    def _all_classes(self):
        """Hook for subclasses to expose the full list of class names.

        This is an alternative to passing "all_classes_list" to __init__, useful when
        the class names are only known once the dataset itself has been loaded."""
        raise NotImplementedError

    def _setup_data_source(self) -> int:
        """Prepare the underlying data source and attach the relevant objects to self.

        :return: Total number of available samples (i.e. how many images exist, before any filtering is applied)."""
        raise NotImplementedError

    def _load_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load the annotation of a single sample.
        Note that the targets should be resized according to self.input_dim!

        :param sample_id:   Position of the sample in the UNFILTERED dataset. 0<=sample_id<=len(source)-1
        :return:            Annotation dict; may contain any field but must include at least those in self._required_annotation_fields.
        """
        raise NotImplementedError

    def _get_sample_annotations(self, index: int, ignore_empty_annotations: bool) -> Dict[str, Union[np.ndarray, Any]]:
        """Fetch the annotation of a given sample, going through the cache when it is enabled.
        :param index:                       Position of the sample AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    Whether empty annotations are being skipped.
        :return:                            Annotation dict of the requested image
        """
        # When filtering is active, `index` addresses the filtered view and must be translated
        # back to the underlying (unfiltered) sample id.
        if ignore_empty_annotations:
            sample_id = self._non_empty_sample_ids[index]
        else:
            sample_id = index

        if not self._cache_annotations:
            return self._load_sample_annotation(sample_id=sample_id)
        return self._cached_annotations[sample_id]

    def _load_sample_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load one sample's annotation and apply class subclassing when configured.
        :param sample_id:   Position of the sample in the UNFILTERED dataset. 0<=sample_id<=len(source)-1
        """
        sample_annotations = self._load_annotation(sample_id=sample_id)

        # Guard against subclasses that forgot to return the mandatory fields.
        if not self._required_annotation_fields.issubset(set(sample_annotations.keys())):
            raise KeyError(
                f"_load_annotation is expected to return at least the fields {self._required_annotation_fields}, but got {set(sample_annotations.keys())}"
            )

        # Restrict targets to self.class_inclusion_list when one was provided.
        if self.class_inclusion_list is not None:
            sample_annotations = self._sub_class_annotation(annotation=sample_annotations)

        return sample_annotations

    def _load_all_annotations(self, n_samples: int) -> Tuple[Dict[int, Dict[str, Any]], Dict[int, Dict[str, Any]]]:
        """Read every annotation in one pass and split sample ids into "has targets" / "no targets".

        This full pass is usually required when `ignore_empty_annotations=True`: we must know
        upfront which samples are empty. Why not check lazily, on the fly? Because with DDP the
        dataset is split into chunks that must all cover a similar subset of indexes; deciding
        "empty or not" lazily would make the dataset/chunk sizes unpredictable, leaving some
        chunks smaller than others.

        :param n_samples:   Number of samples in the dataset (including samples without annotations).
        :return:            A tuple of two dicts, one for non-empty annotations and one for empty annotations
                                - non_empty_annotations: Dict mapping dataset index -> non-empty annotations
                                - empty_annotations:     Dict mapping dataset index -> empty annotations
                            (values are None when annotation caching is disabled)
        """
        invalid_bbox_count = 0
        with_targets: Dict[int, Dict[str, Any]] = {}
        without_targets: Dict[int, Dict[str, Any]] = {}

        for sample_id in tqdm(range(n_samples), desc="Indexing dataset annotations", disable=not self.verbose):
            annotation = self._load_sample_annotation(sample_id=sample_id)
            invalid_bbox_count += annotation.get("n_invalid_labels", 0)

            has_targets = any(len(annotation[field]) > 0 for field in self.target_fields)
            destination = with_targets if has_targets else without_targets
            # Only keep the annotation payload when caching is enabled; otherwise just record the id.
            destination[sample_id] = annotation if self._cache_annotations else None

        if not with_targets and not without_targets:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        if invalid_bbox_count > 0:
            logger.warning(f"Found {invalid_bbox_count} invalid bbox that were ignored. For more information, please set `show_all_warnings=True`.")

        return with_targets, without_targets

    def _sub_class_annotation(self, annotation: dict) -> Union[dict, None]:
        """Apply class subclassing to every field listed in self.target_fields (targets, crowd_targets, ...).

        :param annotation: Annotation dict of a specific image
        :return:           Subclassed annotation if non-empty after subclassing, otherwise None
        """
        cls_pos = _get_class_index_in_target(target_format=self.original_target_format)
        annotation.update({field: self._sub_class_target(targets=annotation[field], class_index=cls_pos) for field in self.target_fields})
        return annotation

    def _sub_class_target(self, targets: np.ndarray, class_index: int) -> np.ndarray:
        """Filter one image's targets down to self.class_inclusion_list and remap their class ids.

        :param targets:     Targets to subclass, shape [n_targets, 5], each row representing a bbox
        :param class_index: Position of the class id within a bbox row
                                ex: 0 if bbox of format label_xyxy | -1 if bbox of format xyxy_label
        :return:            Subclassed targets
        """
        kept_rows = []
        for row in targets:
            name = self.all_classes_list[int(row[class_index])]
            if name not in self.class_inclusion_list:
                continue
            # Remap: position in all_classes_list -> position in class_inclusion_list
            row[class_index] = self.class_inclusion_list.index(name)
            kept_rows.append(row)

        if not kept_rows:
            return np.zeros((0, 5), dtype=np.float32)
        return np.array(kept_rows)

    def _load_resized_img(self, image_path: str) -> np.ndarray:
        """Load an image and, if self.input_dim is set, rescale it to fit within it while keeping aspect ratio.
        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        """
        img = self._load_image(image_path=image_path)

        if self.input_dim is None:
            return img

        # Single scale factor so the image fits inside input_dim with its aspect ratio preserved.
        scale = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
        new_size = (int(img.shape[1] * scale), int(img.shape[0] * scale))  # (width, height), as cv2 expects
        return cv2.resize(src=img, dsize=new_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)

    def _load_image(self, image_path: str) -> np.ndarray:
        """Load an image from disk.
        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        :raises FileNotFoundError: If the image could not be read.
        """
        # Note: os.path.join with a single argument was a no-op and has been removed.
        img = cv2.imread(image_path)

        # cv2.imread returns None instead of raising when the file is missing or unreadable.
        if img is None:
            # Fixed message: previously rendered as "was no found ... dataset wasdownloaded"
            # (typo + missing space in the implicit string concatenation).
            raise FileNotFoundError(f"{image_path} was not found. Please make sure that the dataset was downloaded and that the path is correct")
        return img

    def __len__(self) -> int:
        """Length of the dataset, i.e. the number of samples AFTER filtering (if relevant)."""
        if self.ignore_empty_annotations:
            return len(self._non_empty_sample_ids)
        return self._n_samples

    def __getitem__(self, index: int) -> Tuple:
        """Get the sample post transforms at a specific index of the dataset.
        The output of this function will be collated to form batches.

        :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :return:        Tuple with the values of the fields listed in self.output_fields, in order
                        (at least "image" and "target").
        :raises KeyError: If any of self.output_fields is missing from the transformed sample.
        """
        sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
        sample = self.apply_transforms(sample)
        for field in self.output_fields:
            if field not in sample:
                # Fixed message: the implicit string concatenation previously rendered
                # "...was not found.Please check..." without a space.
                raise KeyError(f"The field {field} must be present in the sample but was not found. Please check the output fields of your transforms.")
        return tuple(sample[field] for field in self.output_fields)

    def get_random_item(self):
        """Return a random sample of the dataset, post transforms (same structure as __getitem__ output).

        Fix: the previous implementation did `self[self.get_random_sample(...)]`, i.e. it passed the
        sample *dict* returned by get_random_sample() as an index to __getitem__, which expects an int.
        We now draw a random valid index and index the dataset with it.
        """
        n_relevant_samples = len(self._non_empty_sample_ids) if self.ignore_empty_annotations else self._n_samples
        random_index = random.randint(0, n_relevant_samples - 1)
        return self[random_index]

    def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Get raw sample, before any transform (beside subclassing).
        :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    If True, empty annotations will be ignored
        :return:                            Sample, i.e. a dictionary including at least "image" and "target"
        """
        annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)
        # Deep-copy the annotations so downstream transforms cannot mutate any cached annotation dict.
        sample = {"image": self._load_resized_img(image_path=annotations["img_path"])}
        sample.update(deepcopy(annotations))
        return sample

    def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
        """
        Applies self.transforms sequentially to sample

        If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in
         sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load
         only additional samples with objects in them.

        :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
        :return: Transformed sample
        """

        # Remember whether the input sample carried a crowd target, so the output dict keeps the same keys.
        has_crowd_target = "crowd_target" in sample
        # Convert the legacy dict representation into a DetectionSample object and drop invalid boxes.
        detection_sample = LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(sample).sanitize_sample()
        target_format_transform: Optional[DetectionTargetsFormatTransform] = None

        for transform in self.transforms:
            # Transforms such as mixup/mosaic declare `additional_samples_count`; load those extra
            # samples and attach them before applying the transform.
            detection_sample.additional_samples = [
                LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(s) for s in self._get_additional_inputs_for_transform(transform=transform)
            ]
            detection_sample = transform.apply_to_sample(sample=detection_sample)

            # Release the extra samples so they are not kept alive (or reused) by the next transform.
            detection_sample.additional_samples = None
            # Keep a handle on the LAST format transform; it is re-applied on the dict below
            # so the returned targets are in that transform's output format.
            if isinstance(transform, DetectionTargetsFormatTransform):
                target_format_transform = transform

        transformed_dict = LegacyDetectionTransformMixin.convert_detection_sample_to_dict(detection_sample, include_crowd_target=has_crowd_target)
        if target_format_transform is not None:
            transformed_dict = target_format_transform(sample=transformed_dict)
        return transformed_dict

    def _get_additional_inputs_for_transform(self, transform: AbstractDetectionTransform) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Load the additional random samples a transform requires (e.g. mixup/mosaic).

        :param transform:   Transform that may declare `additional_samples_count` (how many extra
                            samples to load) and `non_empty_annotations` (restrict to annotated samples).
        :return:            List of extra samples (empty when the transform needs none).
        """
        # Idiom: getattr with a default replaces the previous `x if hasattr(...) else ...` ternaries.
        additional_samples_count = getattr(transform, "additional_samples_count", 0)
        non_empty_annotations = getattr(transform, "non_empty_annotations", False)
        return self.get_random_samples(count=additional_samples_count, ignore_empty_annotations=non_empty_annotations)

    def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Load random samples.

        :param count: The number of samples wanted
        :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
        :return: A list of samples satisfying input params
        """
        samples = []
        for _ in range(count):
            samples.append(self.get_random_sample(ignore_empty_annotations))
        return samples

    def get_random_sample(self, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Load one sample drawn uniformly at random from the (possibly filtered) dataset.

        :param ignore_empty_annotations: If true, draw only from samples with at least 1 annotation
        :return: A randomly chosen sample
        """
        if ignore_empty_annotations:
            pool_size = len(self._non_empty_sample_ids)
        else:
            pool_size = self._n_samples
        random_index = random.randint(0, pool_size - 1)
        return self.get_sample(index=random_index, ignore_empty_annotations=ignore_empty_annotations)

    @property
    def output_target_format(self):
        """Target format produced by __getitem__: the `output_format` of the LAST
        DetectionTargetsFormatTransform in self.transforms, falling back to the
        dataset's original_target_format when no such transform is present."""
        result = self.original_target_format
        for candidate in self.transforms:
            if isinstance(candidate, DetectionTargetsFormatTransform):
                result = candidate.output_format
        return result

    @staticmethod
    def _standardize_image(image):
        """Min-max normalize `image` to [0, 1], then rescale to uint8 in [0, 255].
        The 1e-8 epsilon guards against division by zero on constant images."""
        low, high = image.min(), image.max()
        scaled = (image - low) / (high - low + 1e-8)
        return (scaled * 255).astype(np.uint8)

    def plot(
        self,
        max_samples_per_plot: int = 16,
        n_plots: int = 1,
        plot_transformed_data: bool = True,
        box_thickness: int = 2,
    ):
        """Combine samples of images with bbox into plots and display the result.

        :param max_samples_per_plot:    Maximum number of images to be displayed per plot
        :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                        If False, the plot will be over the raw samples (i.e. on get_sample)
        :param box_thickness:           Thickness of the drawn bounding-box lines (was previously accepted but ignored).
        :return:
        """
        input_format = self.output_target_format if plot_transformed_data else self.original_target_format
        if isinstance(input_format, DetectionTargetsFormat):
            raise ValueError(
                "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenateTransform instead."
            )

        # Grid side length so that max_samples_per_plot images fit in a square grid.
        n_subplot = int(np.ceil(max_samples_per_plot**0.5))

        for plot_i in range(n_plots):
            fig = plt.figure(figsize=(10, 10))

            # Plot `max_samples_per_plot` images.
            for img_i in range(max_samples_per_plot):
                # Fixed: index previously used a hardcoded 16 instead of max_samples_per_plot,
                # so plots overlapped/skipped samples when max_samples_per_plot != 16.
                index = img_i + plot_i * max_samples_per_plot

                # LOAD IMAGE/TARGETS
                if plot_transformed_data:
                    # Access to the image and the target AFTER self.transform
                    image, targets, *_ = self[index]
                else:
                    # Access to the image and the target BEFORE self.transform
                    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                    image, targets = sample["image"], sample["target"]

                # FORMAT TARGETS
                if image.shape[0] in (1, 3):  # (C, H, W) -> (H, W, C)
                    image = image.transpose((1, 2, 0))

                image = self._standardize_image(image)
                image = image.astype(np.uint8)
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Detection dataset works with BGR images, so we have to convert to RGB

                # Convert to XYXY_LABEL format
                targets_format_converter = ConcatenatedTensorFormatConverter(input_format=input_format, output_format=LABEL_XYXY, image_shape=image.shape)
                targets_label_xyxy = targets_format_converter(targets)

                image = DetectionVisualization.visualize_image(
                    image_np=image, class_names=self.classes, target_boxes=targets_label_xyxy, box_thickness=box_thickness, gt_alpha=1
                )

                # Fixed: the image was previously drawn twice (once channel-reversed back to BGR,
                # then overwritten by a second imshow); draw it once, in RGB, on its subplot.
                plt.subplot(n_subplot, n_subplot, img_i + 1)
                plt.imshow(image)
                plt.axis("off")

            fig.tight_layout()
            plt.show()
            plt.close()

    def get_dataset_preprocessing_params(self):
        """
        Return the hardcoded preprocessing params, adapted for PIL.Image (RGB) image reading.
        The image_processor is returned as a list of dicts to be resolved by the processing factory.
        :return:
        """
        processings = [Processings.ReverseImageChannels]
        if self.input_dim is not None:
            processings.append({Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}})
        for transform in self.transforms:
            processings += transform.get_equivalent_preprocessing()
        return {
            "class_names": self.classes,
            "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
            "iou": 0.65,
            "conf": 0.5,
        }

    def get_sample_classes_information(self, index) -> np.ndarray:
        """Per-class object counts for the sample at `index` (length == len(self.classes))."""
        target = self._get_sample_annotations(index=index, ignore_empty_annotations=self.ignore_empty_annotations)["target"]
        n_classes = len(self.classes)
        if len(target) == 0:  # no objects in this sample
            return np.zeros(n_classes)

        # NOTE: could be sped-up with a property rather than computing per index.
        class_column = _get_class_index_in_target(target_format=self.original_target_format)
        sample_classes = target[:, class_column].astype(int)
        return np.bincount(sample_classes, minlength=n_classes)

    def get_dataset_classes_information(self) -> np.ndarray:
        """Stack per-sample class counts into an (n_samples, n_classes) matrix.

        :return: Array of shape (len(self), len(self.classes)) with per-sample class counts.
        """
        # np.vstack replaces np.row_stack: the latter is a deprecated alias removed in NumPy 2.x.
        return np.vstack([self.get_sample_classes_information(index=index) for index in range(len(self))])

__getitem__(index)

Get the sample post transforms at a specific index of the dataset. The output of this function will be collated to form batches.

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required

Returns:

Type Description
Tuple

Sample, i.e. a dictionary including at least "image" and "target"

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
367
368
369
370
371
372
373
374
375
376
377
378
379
def __getitem__(self, index: int) -> Tuple:
    """Get the sample post transforms at a specific index of the dataset.
    The output of this function will be collated to form batches.

    :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :return:        Sample, i.e. a dictionary including at least "image" and "target"
    """
    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
    sample = self.apply_transforms(sample)
    for field in self.output_fields:
        if field not in sample.keys():
            raise KeyError(f"The field {field} must be present in the sample but was not found." "Please check the output fields of your transforms.")
    return tuple(sample[field] for field in self.output_fields)

__init__(data_dir, original_target_format, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, target_fields=None, output_fields=None, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Detection dataset.

Parameters:

Name Type Description Default
data_dir str

Where the data is stored

required
input_dim Union[int, Tuple[int, int], None]

Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols). None means that the image will be loaded as is. Scalar (size) - Image will be resized to (size, size) Tuple (rows,cols) - Image will be resized to (rows, cols)

None
original_target_format Union[ConcatenatedTensorFormat, DetectionTargetsFormat]

Format of targets stored on disk. raw data format, the output format might differ based on transforms.

required
max_num_samples int

If not None, set the maximum size of the dataset by only indexing the first n annotations/images.

None
cache_annotations bool

Whether to cache annotations or not. This reduces training time by pre-loading all the annotations, but requires more RAM and more time to instantiate the dataset when working on very large datasets.

True
transforms List[AbstractDetectionTransform]

List of transforms to apply sequentially on sample.

[]
all_classes_list Optional[List[str]]

All the class names.

[]
class_inclusion_list Optional[List[str]]

If not None, define the subset of classes to be included as targets. Classes not in this list will be excluded from training. Thus, number of classes in model must be adjusted accordingly.

None
ignore_empty_annotations bool

If True and class_inclusion_list not None, images without any target will be ignored.

True
target_fields List[str]

List of the fields target fields. This has to include regular target, but can also include crowd target, segmentation target, ... It has to include at least "target" but can include other.

None
output_fields List[str]

Fields that will be output by __getitem__. It has to include at least "image" and "target" but can include other.

None
verbose bool

Whether to show additional information or not, such as loading progress. (doesn't include warnings)

True
show_all_warnings bool

Whether to show all warnings or not.

False
cache

Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8

None
cache_dir

Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8

None
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
@resolve_param("transforms", ListFactory(TransformsFactory()))
def __init__(
    self,
    data_dir: str,
    original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
    max_num_samples: int = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: List[AbstractDetectionTransform] = [],
    all_classes_list: Optional[List[str]] = [],
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    target_fields: List[str] = None,
    output_fields: List[str] = None,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """Detection dataset.

    :param data_dir:                Where the data is stored
    :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                    None means that the image will be loaded as is.
                                    Scalar (size) - Image will be resized to (size, size)
                                    Tuple (rows,cols) - Image will be resized to (rows, cols)
    :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                    differ based on transforms.
    :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
    :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                    but requires more RAM and more time to instantiate the dataset when working on very large datasets.
    :param transforms:              List of transforms to apply sequentially on sample.
    :param all_classes_list:        All the class names.
    :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                    Classes not in this list will excluded from training.
                                    Thus, number of classes in model must be adjusted accordingly.
    :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                            will be ignored.
    :param target_fields:                   List of the fields target fields. This has to include regular target,
                                            but can also include crowd target, segmentation target, ...
                                            It has to include at least "target" but can include other.
    :param output_fields:                   Fields that will be outputed by __getitem__.
                                            It has to include at least "image" and "target" but can include other.
    :param verbose:                 Whether to show additional information or not, such as loading progress. (doesnt include warnings)
    :param show_all_warnings:       Whether to show all warnings or not.
    :param cache:                   Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
    :param cache_dir:               Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
    """
    if cache is not None:
        warnings.warn(
            "cache parameter has been marked as deprecated and setting it has no effect. "
            "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
            DeprecationWarning,
        )
    if cache_dir is not None:
        warnings.warn(
            "cache_dir parameter has been marked as deprecated and setting it has no effect. "
            "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
            DeprecationWarning,
        )

    super().__init__()
    self.verbose = verbose
    self.show_all_warnings = show_all_warnings

    if isinstance(original_target_format, DetectionTargetsFormat):
        logger.warning(
            "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
            "Support for DetectionTargetsFormat will be removed in 3.1"
        )

    self.data_dir = data_dir
    if not Path(data_dir).exists():
        raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

    # Number of images that are available (regardless of ignored images)
    n_dataset_samples = self._setup_data_source()
    if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
        raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
    n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

    self.input_dim = ensure_is_tuple_of_two(input_dim)
    self.original_target_format = original_target_format

    if len(all_classes_list) != len(set(all_classes_list)):
        raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

    if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
        raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

    self.all_classes_list = all_classes_list or self._all_classes
    self.all_classes_list = list(self.all_classes_list) if self.all_classes_list is not None else None
    self.class_inclusion_list = list(class_inclusion_list) if class_inclusion_list is not None else None
    self.classes = self.class_inclusion_list or self.all_classes_list
    if len(set(self.classes) - set(self.all_classes_list)) > 0:
        wrong_classes = set(self.classes) - set(all_classes_list)
        raise DatasetValidationException(
            f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
        )

    self.ignore_empty_annotations = ignore_empty_annotations
    self.target_fields = target_fields or ["target"]
    if "target" not in self.target_fields:
        raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

    self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

    self.transforms = transforms

    self.output_fields = output_fields or ["image", "target"]
    if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
        raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

    self._cache_annotations = cache_annotations
    self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

    # Maps (dataset index) -> (non-empty sample ids)
    self._non_empty_sample_ids: Optional[List[int]] = None

    # Some transform may require non-empty annotations to be indexed.
    transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

    # Iterate over the whole dataset to index the images with/without annotations.
    if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:
        if self._cache_annotations:
            logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
        elif self.ignore_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
            )
        elif transform_require_non_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. "
                "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
            )

        # Map indexes to sample annotations.
        non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
        if self._cache_annotations:
            if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                self._cached_annotations = non_empty_annotations
            else:
                # Non overlapping dicts. since they map unique sample_ids -> sample
                self._cached_annotations = {**non_empty_annotations, **empty_annotations}

        if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        self._non_empty_sample_ids = list(non_empty_annotations.keys())

    self._n_samples = n_samples  # Regardless of any filtering

__len__()

Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant).

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
363
364
365
def __len__(self) -> int:
    """Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant)."""
    return len(self._non_empty_sample_ids) if self.ignore_empty_annotations else self._n_samples

apply_transforms(sample)

Applies self.transforms sequentially to sample

If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load only additional samples with objects in them.

Parameters:

Name Type Description Default
sample Dict[str, Union[np.ndarray, Any]]

Sample to apply the transforms on to (loaded with self.get_sample)

required

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Transformed sample

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
    """
    Applies self.transforms sequentially to sample

    If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in
     sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load
     only additional samples with objects in them.

    :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
    :return: Transformed sample
    """

    has_crowd_target = "crowd_target" in sample
    detection_sample = LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(sample).sanitize_sample()
    target_format_transform: Optional[DetectionTargetsFormatTransform] = None

    for transform in self.transforms:
        detection_sample.additional_samples = [
            LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(s) for s in self._get_additional_inputs_for_transform(transform=transform)
        ]
        detection_sample = transform.apply_to_sample(sample=detection_sample)

        detection_sample.additional_samples = None
        if isinstance(transform, DetectionTargetsFormatTransform):
            target_format_transform = transform

    transformed_dict = LegacyDetectionTransformMixin.convert_detection_sample_to_dict(detection_sample, include_crowd_target=has_crowd_target)
    if target_format_transform is not None:
        transformed_dict = target_format_transform(sample=transformed_dict)
    return transformed_dict

get_dataset_preprocessing_params()

Return the hardcoded preprocessing params, adapted for PIL.Image (RGB) image reading. The image_processor is returned as a list of dicts to be resolved by the processing factory.

Returns:

Type Description
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
def get_dataset_preprocessing_params(self):
    """
    Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB).
     image_processor as returned as as list of dicts to be resolved by processing factory.
    :return:
    """
    pipeline = [Processings.ReverseImageChannels]
    if self.input_dim is not None:
        pipeline += [{Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}}]
    for t in self.transforms:
        pipeline += t.get_equivalent_preprocessing()
    params = dict(
        class_names=self.classes,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        iou=0.65,
        conf=0.5,
    )
    return params

get_random_samples(count, ignore_empty_annotations=False)

Load random samples.

Parameters:

Name Type Description Default
count int

The number of samples wanted

required
ignore_empty_annotations bool

If true, only return samples with at least 1 annotation

False

Returns:

Type Description
List[Dict[str, Union[np.ndarray, Any]]]

A list of samples satisfying input params

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
431
432
433
434
435
436
437
438
def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
    """Load random samples.

    :param count: The number of samples wanted
    :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
    :return: A list of samples satisfying input params
    """
    return [self.get_random_sample(ignore_empty_annotations) for _ in range(count)]

get_sample(index, ignore_empty_annotations=False)

Get raw sample, before any transform (beside subclassing).

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required
ignore_empty_annotations bool

If True, empty annotations will be ignored

False

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Sample, i.e. a dictionary including at least "image" and "target"

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
384
385
386
387
388
389
390
391
392
def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
    """Get raw sample, before any transform (beside subclassing).
    :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :param ignore_empty_annotations:    If True, empty annotations will be ignored
    :return:                            Sample, i.e. a dictionary including at least "image" and "target"
    """
    sample_annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)
    image = self._load_resized_img(image_path=sample_annotations["img_path"])
    return {"image": image, **deepcopy(sample_annotations)}

plot(max_samples_per_plot=16, n_plots=1, plot_transformed_data=True, box_thickness=2)

Combine samples of images with bbox into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of images to be displayed per plot

16
n_plots int

Number of plots to display (each plot being a combination of img with bbox)

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e. on getitem). If False, the plot will be over the raw samples (i.e. on get_sample)

True

Returns:

Type Description
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
def plot(
    self,
    max_samples_per_plot: int = 16,
    n_plots: int = 1,
    plot_transformed_data: bool = True,
    box_thickness: int = 2,
):
    """Combine samples of images with bbox into plots and display the result.

    :param max_samples_per_plot:    Maximum number of images to be displayed per plot
    :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                    If False, the plot will be over the raw samples (i.e. on get_sample)
    :param box_thickness:           Thickness of the bounding boxes drawn on the images.
    :return:
    """
    input_format = self.output_target_format if plot_transformed_data else self.original_target_format
    if isinstance(input_format, DetectionTargetsFormat):
        raise ValueError(
            "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenateTransform instead."
        )

    for plot_i in range(n_plots):
        fig = plt.figure(figsize=(10, 10))

        # Smallest square grid that can hold `max_samples_per_plot` images.
        n_subplot = int(np.ceil(max_samples_per_plot**0.5))

        # Plot `max_samples_per_plot` images.
        for img_i in range(max_samples_per_plot):
            # BUGFIX: the stride between consecutive plots must be `max_samples_per_plot`,
            # not a hard-coded 16 (the old code broke for max_samples_per_plot != 16).
            index = img_i + plot_i * max_samples_per_plot

            # LOAD IMAGE/TARGETS
            if plot_transformed_data:
                # Access to the image and the target AFTER self.transform
                image, targets, *_ = self[index]
            else:
                # Access to the image and the target BEFORE self.transform
                sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                image, targets = sample["image"], sample["target"]

            # FORMAT TARGETS
            if image.shape[0] in (1, 3):  # (C, H, W) -> (H, W, C)
                image = image.transpose((1, 2, 0))

            image = self._standardize_image(image)
            image = image.astype(np.uint8)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Detection dataset works with BGR images, so we have to convert to RGB

            # Convert to XYXY_LABEL format
            # NOTE(review): `box_thickness` is currently not forwarded to the visualization call —
            # presumably it was meant for `DetectionVisualization.visualize_image`; confirm against its API.
            targets_format_converter = ConcatenatedTensorFormatConverter(input_format=input_format, output_format=LABEL_XYXY, image_shape=image.shape)
            targets_label_xyxy = targets_format_converter(targets)

            image = DetectionVisualization.visualize_image(image_np=image, class_names=self.classes, target_boxes=targets_label_xyxy, gt_alpha=1)

            # BUGFIX: draw the (already RGB) image exactly once. The previous implementation called
            # `imshow` twice on the same axes — first with channels reversed back to BGR, then again
            # with RGB — relying on the second call overdrawing the first.
            plt.subplot(n_subplot, n_subplot, img_i + 1)
            plt.imshow(image)
            plt.axis("off")

        fig.tight_layout()
        plt.show()
        plt.close()
    # Redundant `plot_counter` bookkeeping removed: the `range(n_plots)` loop already
    # terminates after exactly `n_plots` iterations.

PascalVOCDetectionDataset

Bases: PascalVOCFormatDetectionDataset

Dataset for Pascal VOC object detection

Parameters:
    data_dir (str): Base directory where the dataset is stored.
    images_dir (str, optional): Directory containing all the images, relative to `data_dir`. Defaults to None.
    labels_dir (str, optional): Directory containing all the labels, relative to `data_dir`. Defaults to None.
    images_sub_directory (str, optional): Deprecated. Subdirectory within data_dir that includes images. Defaults to None.
    download (bool, optional): If True, download the dataset to `data_dir`. Defaults to False.

Dataset structure:

./data/pascal_voc
├─images
│   ├─ train2012
│   ├─ val2012
│   ├─ VOCdevkit
│   │    ├─ VOC2007
│   │    │  ├──JPEGImages
│   │    │  ├──SegmentationClass
│   │    │  ├──ImageSets
│   │    │  ├──ImageSets/Segmentation
│   │    │  ├──ImageSets/Main
│   │    │  ├──ImageSets/Layout
│   │    │  ├──Annotations
│   │    │  └──SegmentationObject
│   │    └──VOC2012
│   │       ├──JPEGImages
│   │       ├──SegmentationClass
│   │       ├──ImageSets
│   │       ├──ImageSets/Segmentation
│   │       ├──ImageSets/Main
│   │       ├──ImageSets/Action
│   │       ├──ImageSets/Layout
│   │       ├──Annotations
│   │       └──SegmentationObject
│   ├─train2007
│   ├─test2007
│   └─val2007
└─labels
    ├─train2012
    ├─val2012
    ├─train2007
    ├─test2007
    └─val2007

Note: If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

Usage: voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc", images_dir="images/train2012/JPEGImages", labels_dir="labels/train2012/Annotations", download=True)

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
@register_dataset(Datasets.PASCAL_VOC_DETECTION_DATASET)
class PascalVOCDetectionDataset(PascalVOCFormatDetectionDataset):
    """Dataset for Pascal VOC object detection

        Parameters:
            data_dir (str): Base directory where the dataset is stored.
            images_dir (str, optional): Directory containing all the images, relative to `data_dir`. Defaults to None.
            labels_dir (str, optional): Directory containing all the labels, relative to `data_dir`. Defaults to None.
            images_sub_directory (str, optional): Deprecated. Subdirectory within data_dir that includes images. Defaults to None.
            download (bool, optional): If True, download the dataset to `data_dir`. Defaults to False.

        Dataset structure:

        ./data/pascal_voc
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Note:
        If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

    Usage:
        voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc",
                                            images_dir="images/train2012/JPEGImages",
                                            labels_dir="labels/train2012/Annotations",
                                            download=True)
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        images_sub_directory: Optional[str] = None,
        images_dir: Optional[str] = None,
        labels_dir: Optional[str] = None,
        download: bool = False,
        max_num_samples: Optional[int] = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: Optional[List[AbstractDetectionTransform]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """
        Initialize the Pascal VOC Detection Dataset.

        :param data_dir:                Base directory where the dataset is stored.
        :param images_sub_directory:    Deprecated. Use 'images_dir' and 'labels_dir' instead.
        :param images_dir:              Directory containing all the images, relative to `data_dir`.
        :param labels_dir:              Directory containing all the labels, relative to `data_dir`.
        :param download:                If True, download the dataset to `data_dir`.
        :raises ValueError:             If neither ('images_dir' and 'labels_dir') nor 'images_sub_directory' is provided.
        """
        # BUGFIX: `transforms` previously used a shared mutable default argument (`[]`),
        # which would be shared across all instances created without an explicit value.
        transforms = transforms if transforms is not None else []

        # Check for deprecated usage alongside new parameters.
        if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
            # BUGFIX: `logging.Logger.warning` does not take a warning category; passing
            # `DeprecationWarning` as a positional arg made it a lazy %-format argument and
            # produced a logging error at emit time (the message has no placeholders).
            logger.warning(
                "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
                "Prefer using 'images_dir' and 'labels_dir' for future compatibility."
            )

        elif images_sub_directory is not None:
            # Derive the labels directory from the deprecated images path.
            images_dir = images_sub_directory
            labels_dir = images_sub_directory.replace("images", "labels")
        elif images_dir is None or labels_dir is None:
            raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

        if download:
            self.download(data_dir)

        super().__init__(
            data_dir=data_dir,
            images_dir=images_dir,
            labels_dir=labels_dir,
            max_num_samples=max_num_samples,
            cache_annotations=cache_annotations,
            input_dim=input_dim,
            transforms=transforms,
            class_inclusion_list=class_inclusion_list,
            ignore_empty_annotations=ignore_empty_annotations,
            verbose=verbose,
            show_all_warnings=show_all_warnings,
            cache=cache,
            cache_dir=cache_dir,
            all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
        )

    @staticmethod
    def download(data_dir: str) -> None:
        """Download Pascal dataset in XYXY_LABEL format.

        Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

        :param data_dir: Destination directory; images go under `data_dir/images`, labels under `data_dir/labels`.
        """

        def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
            """Parse and save the labels of an image in XYXY_LABEL format."""

            with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
                xml_parser = ElementTree.parse(f).getroot()

            labels = []
            for obj in xml_parser.iter("object"):
                cls = obj.find("name").text
                # Skip classes outside the 2012 class list and objects flagged "difficult".
                if cls in PASCAL_VOC_2012_CLASSES_LIST and int(obj.find("difficult").text) != 1:
                    xml_box = obj.find("bndbox")

                    def get_coord(box_coord):
                        return xml_box.find(box_coord).text

                    xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                    labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

            with open(new_label_path, "w") as f:
                f.write("\n".join(labels))

        urls = [
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
        ]  # 1.86G, 17125 images
        data_dir = Path(data_dir)
        download_and_untar_from_url(urls, dir=data_dir / "images")

        # Convert the VOCdevkit layout into flat per-split image/label directories.
        data_path = data_dir / "images" / "VOCdevkit"
        for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
            dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
            dest_imgs_path.mkdir(exist_ok=True, parents=True)

            dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
            dest_labels_path.mkdir(exist_ok=True, parents=True)

            with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
                image_ids = f.read().strip().split()

            # Renamed loop variable from `id` to avoid shadowing the builtin.
            for image_id in tqdm(image_ids, desc=f"{image_set}{year}"):
                img_path = data_path / f"VOC{year}/JPEGImages/{image_id}.jpg"
                new_img_path = dest_imgs_path / img_path.name
                new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
                img_path.rename(new_img_path)  # Move image to dest folder
                _parse_and_save_labels(data_path, new_label_path, year, image_id)

__init__(data_dir, images_sub_directory=None, images_dir=None, labels_dir=None, download=False, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], class_inclusion_list=None, ignore_empty_annotations=True, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Initialize the Pascal VOC Detection Dataset.

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@deprecated_parameter(
    "images_sub_directory",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
)
def __init__(
    self,
    data_dir: str,
    images_sub_directory: Optional[str] = None,
    images_dir: Optional[str] = None,
    labels_dir: Optional[str] = None,
    download: bool = False,
    max_num_samples: Optional[int] = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: Optional[List[AbstractDetectionTransform]] = None,
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """
    Initialize the Pascal VOC Detection Dataset.

    :param data_dir:                Base directory where the dataset is stored.
    :param images_sub_directory:    Deprecated. Use 'images_dir' and 'labels_dir' instead.
    :param images_dir:              Directory containing all the images, relative to `data_dir`.
    :param labels_dir:              Directory containing all the labels, relative to `data_dir`.
    :param download:                If True, download the dataset to `data_dir`.
    :raises ValueError:             If neither ('images_dir' and 'labels_dir') nor 'images_sub_directory' is provided.
    """
    # BUGFIX: `transforms` previously used a shared mutable default argument (`[]`).
    transforms = transforms if transforms is not None else []

    # Check for deprecated usage alongside new parameters.
    if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
        # BUGFIX: `logging.Logger.warning` does not take a warning category; the old
        # `DeprecationWarning` positional arg was treated as a %-format argument and
        # caused a logging error at emit time.
        logger.warning(
            "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
            "Prefer using 'images_dir' and 'labels_dir' for future compatibility."
        )

    elif images_sub_directory is not None:
        # Derive the labels directory from the deprecated images path.
        images_dir = images_sub_directory
        labels_dir = images_sub_directory.replace("images", "labels")
    elif images_dir is None or labels_dir is None:
        raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

    if download:
        self.download(data_dir)

    super().__init__(
        data_dir=data_dir,
        images_dir=images_dir,
        labels_dir=labels_dir,
        max_num_samples=max_num_samples,
        cache_annotations=cache_annotations,
        input_dim=input_dim,
        transforms=transforms,
        class_inclusion_list=class_inclusion_list,
        ignore_empty_annotations=ignore_empty_annotations,
        verbose=verbose,
        show_all_warnings=show_all_warnings,
        cache=cache,
        cache_dir=cache_dir,
        all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
    )

download(data_dir) staticmethod

Download Pascal dataset in XYXY_LABEL format.

Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
@staticmethod
def download(data_dir: str) -> None:
    """Download Pascal dataset in XYXY_LABEL format.

    Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

    :param data_dir: Destination directory; images go under `data_dir/images`, labels under `data_dir/labels`.
    """

    def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
        """Parse and save the labels of an image in XYXY_LABEL format."""

        with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
            xml_parser = ElementTree.parse(f).getroot()

        labels = []
        for obj in xml_parser.iter("object"):
            cls = obj.find("name").text
            # Skip classes outside the 2012 class list and objects flagged "difficult".
            if cls in PASCAL_VOC_2012_CLASSES_LIST and int(obj.find("difficult").text) != 1:
                xml_box = obj.find("bndbox")

                def get_coord(box_coord):
                    return xml_box.find(box_coord).text

                xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

        with open(new_label_path, "w") as f:
            f.write("\n".join(labels))

    urls = [
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
    ]  # 1.86G, 17125 images
    data_dir = Path(data_dir)
    download_and_untar_from_url(urls, dir=data_dir / "images")

    # Convert the VOCdevkit layout into flat per-split image/label directories.
    data_path = data_dir / "images" / "VOCdevkit"
    for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
        dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
        dest_imgs_path.mkdir(exist_ok=True, parents=True)

        dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
        dest_labels_path.mkdir(exist_ok=True, parents=True)

        with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
            image_ids = f.read().strip().split()

        # Renamed loop variable from `id` to avoid shadowing the builtin.
        for image_id in tqdm(image_ids, desc=f"{image_set}{year}"):
            img_path = data_path / f"VOC{year}/JPEGImages/{image_id}.jpg"
            new_img_path = dest_imgs_path / img_path.name
            new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
            img_path.rename(new_img_path)  # Move image to dest folder
            _parse_and_save_labels(data_path, new_label_path, year, image_id)

PascalVOCUnifiedDetectionTrainDataset

Bases: ConcatDataset

Unified Dataset for Pascal VOC object detection.

Unified Dataset class for training on Pascal VOC object detection datasets.

This class combines datasets from multiple years (e.g., 2007, 2012) into a single dataset for training purposes.

Parameters: data_dir (str): Base directory where the dataset is stored. input_dim (tuple): Input dimension that the images should be resized to. cache (optional): Cache configuration. cache_dir (optional): Directory for cache. transforms (List[AbstractDetectionTransform], optional): List of transforms to apply. class_inclusion_list (Optional[List[str]], optional): List of classes to include. max_num_samples (int, optional): Maximum number of samples to include from each dataset part. download (bool, optional): If True, downloads the dataset parts to data_dir. Defaults to False. images_dir (Optional[str], optional): Directory containing all the images, relative to data_dir. Should only be used without 'images_sub_directory'. labels_dir (Optional[str], optional): Directory containing all the labels, relative to data_dir. Should only be used without 'images_sub_directory'. images_sub_directory (Optional[str], optional): Deprecated. Use 'images_dir' and 'labels_dir' instead for future compatibility.

Example Dataset structure:

    ./data/pascal_voc/
    ├─images
    │   ├─ train2012
    │   ├─ val2012
    │   ├─ VOCdevkit
    │   │    ├─ VOC2007
    │   │    │  ├──JPEGImages
    │   │    │  ├──SegmentationClass
    │   │    │  ├──ImageSets
    │   │    │  ├──ImageSets/Segmentation
    │   │    │  ├──ImageSets/Main
    │   │    │  ├──ImageSets/Layout
    │   │    │  ├──Annotations
    │   │    │  └──SegmentationObject
    │   │    └──VOC2012
    │   │       ├──JPEGImages
    │   │       ├──SegmentationClass
    │   │       ├──ImageSets
    │   │       ├──ImageSets/Segmentation
    │   │       ├──ImageSets/Main
    │   │       ├──ImageSets/Action
    │   │       ├──ImageSets/Layout
    │   │       ├──Annotations
    │   │       └──SegmentationObject
    │   ├─train2007
    │   ├─test2007
    │   └─val2007
    └─labels
        ├─train2012
        ├─val2012
        ├─train2007
        ├─test2007
        └─val2007
    Usage:
unified_dataset = PascalVOCUnifiedDetectionTrainDataset(data_dir="./data/pascal_voc",
                                                        input_dim=(512, 512),
                                                        download=True,
                                                        images_dir="images",
                                                        labels_dir="labels")
Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
class PascalVOCUnifiedDetectionTrainDataset(ConcatDataset):
    """Unified Dataset for Pascal VOC object detection.

    Unified Dataset class for training on Pascal VOC object detection datasets.

    This class combines datasets from multiple years (e.g., 2007, 2012) into a single dataset for training purposes.

    Parameters:
        data_dir (str): Base directory where the dataset is stored.
        input_dim (tuple): Input dimension that the images should be resized to.
        cache (optional): Cache configuration.
        cache_dir (optional): Directory for cache.
        transforms (List[AbstractDetectionTransform], optional): List of transforms to apply.
        class_inclusion_list (Optional[List[str]], optional): List of classes to include.
        max_num_samples (int, optional): Maximum number of samples to include from each dataset part.
        download (bool, optional): If True, downloads the dataset parts to `data_dir`. Defaults to False.
        images_dir (Optional[str], optional): Directory containing all the images, relative to `data_dir`. Should only be used without 'images_sub_directory'.
        labels_dir (Optional[str], optional): Directory containing all the labels, relative to `data_dir`. Should only be used without 'images_sub_directory'.
        images_sub_directory (Optional[str], optional): Deprecated. Use 'images_dir' and 'labels_dir' instead for future compatibility.

    Example Dataset structure:

        ./data/pascal_voc/
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Usage:
        unified_dataset = PascalVOCUnifiedDetectionTrainDataset(data_dir="./data/pascal_voc",
                                                                input_dim=(512, 512),
                                                                download=True,
                                                                images_dir="images",
                                                                labels_dir="labels")
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility. Please use " "'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        input_dim: tuple,
        cache=None,
        cache_dir=None,
        transforms: Optional[List[AbstractDetectionTransform]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        max_num_samples: Optional[int] = None,
        download: bool = False,
        images_dir: Optional[str] = None,
        labels_dir: Optional[str] = None,
        images_sub_directory: Optional[str] = None,  # Marked for deprecation.
    ):
        # BUGFIX: `transforms` previously used a shared mutable default argument (`[]`).
        transforms = transforms if transforms is not None else []

        if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
            # BUGFIX: removed the `DeprecationWarning` positional argument —
            # `logging.Logger.warning` treats extra positionals as %-format args,
            # which caused a logging error at emit time.
            logger.warning(
                "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
                "Prefer using 'images_dir' and 'labels_dir' for future compatibility."
            )
        if download:
            PascalVOCDetectionDataset.download(data_dir=data_dir)

        train_dataset_names = ["train2007", "val2007", "train2012", "val2012"]
        # Split the global sample budget (if any) as evenly as possible across the splits.
        if max_num_samples:
            max_num_samples_per_train_dataset = [len(segment) for segment in np.array_split(range(max_num_samples), len(train_dataset_names))]
        else:
            max_num_samples_per_train_dataset = [None] * len(train_dataset_names)

        train_sets = []
        for i, trainset_name in enumerate(train_dataset_names):
            dataset_kwargs = {
                "data_dir": data_dir,
                "input_dim": input_dim,
                "cache": cache,
                "cache_dir": cache_dir,
                "transforms": transforms,
                "class_inclusion_list": class_inclusion_list,
                "max_num_samples": max_num_samples_per_train_dataset[i],
            }
            if images_dir is not None and labels_dir is not None:
                dataset_kwargs["images_dir"] = os.path.join(images_dir, trainset_name)
                dataset_kwargs["labels_dir"] = os.path.join(labels_dir, trainset_name)
            elif images_sub_directory is not None:
                deprecated_images_path = os.path.join("images", trainset_name)
                deprecated_labels_path = os.path.join("labels", trainset_name)
                dataset_kwargs["images_dir"] = deprecated_images_path
                dataset_kwargs["labels_dir"] = deprecated_labels_path
            else:
                raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

            train_sets.append(PascalVOCDetectionDataset(**dataset_kwargs))

        # BUGFIX: `super().__init__(train_sets)` was previously indented inside the loop above,
        # re-initializing the ConcatDataset (and recomputing its cumulative sizes) on every
        # iteration. Initialize it exactly once, with the complete list of per-split datasets.
        super().__init__(train_sets)

PascalVOCFormatDetectionDataset

Bases: DetectionDataset

Dataset for Pascal VOC object detection

Parameters: data_dir (str): Base directory where the dataset is stored.

images_dir (Optional[str]): Directory containing all the images, relative to `data_dir`. Defaults to None.

labels_dir (Optional[str]): Directory containing all the labels, relative to `data_dir`. Defaults to None.

max_num_samples (Optional[int]): If not None, sets the maximum size of the dataset by only indexing the first
 n annotations/images. Defaults to None.

cache_annotations (bool): Whether to cache annotations. Reduces training time by pre-loading all annotations
 but requires more RAM. Defaults to True.

input_dim (Optional[Union[int, Tuple[int, int]]]): Image size when loaded, before transforms. Can be None, a scalar,
 or a tuple (height, width). Defaults to None.

transforms (List[AbstractDetectionTransform]): List of transforms to apply sequentially on each sample.
 Defaults to an empty list.

all_classes_list (Optional[List[str]]): All class names in the dataset. Defaults to an empty list.

class_inclusion_list (Optional[List[str]]): Subset of classes to include. Classes not in this list will be excluded.
 Adjust the number of model classes accordingly. Defaults to None.

ignore_empty_annotations (bool): If True and class_inclusion_list is not None, images without any target will be
 ignored. Defaults to True.

verbose (bool): If True, displays additional information (does not include warnings). Defaults to True.

show_all_warnings (bool): If True, displays all warnings. Defaults to False.

cache (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a
 future version.

cache_dir (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in
 a future version.



Dataset structure:

./data/pascal_voc
├─images
│   ├─ train2012
│   ├─ val2012
│   ├─ VOCdevkit
│   │    ├─ VOC2007
│   │    │  ├──JPEGImages
│   │    │  ├──SegmentationClass
│   │    │  ├──ImageSets
│   │    │  ├──ImageSets/Segmentation
│   │    │  ├──ImageSets/Main
│   │    │  ├──ImageSets/Layout
│   │    │  ├──Annotations
│   │    │  └──SegmentationObject
│   │    └──VOC2012
│   │       ├──JPEGImages
│   │       ├──SegmentationClass
│   │       ├──ImageSets
│   │       ├──ImageSets/Segmentation
│   │       ├──ImageSets/Main
│   │       ├──ImageSets/Action
│   │       ├──ImageSets/Layout
│   │       ├──Annotations
│   │       └──SegmentationObject
│   ├─train2007
│   ├─test2007
│   └─val2007
└─labels
    ├─train2012
    ├─val2012
    ├─train2007
    ├─test2007
    └─val2007

Note: If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

Usage: voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc", images_dir="images/train2012/JPEGImages", labels_dir="labels/train2012/Annotations", download=True)

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_format_detection.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
@register_dataset("PascalVOCFormatDetectionDataset")
class PascalVOCFormatDetectionDataset(DetectionDataset):
    """Dataset for Pascal VOC object detection

    Parameters:
        data_dir (str): Base directory where the dataset is stored.

        images_dir (Optional[str]): Directory containing all the images, relative to `data_dir`. Defaults to None.

        labels_dir (Optional[str]): Directory containing all the labels, relative to `data_dir`. Defaults to None.

        max_num_samples (Optional[int]): If not None, sets the maximum size of the dataset by only indexing the first
         n annotations/images. Defaults to None.

        cache_annotations (bool): Whether to cache annotations. Reduces training time by pre-loading all annotations
         but requires more RAM. Defaults to True.

        input_dim (Optional[Union[int, Tuple[int, int]]]): Image size when loaded, before transforms. Can be None, a scalar,
         or a tuple (height, width). Defaults to None.

        transforms (Optional[List[AbstractDetectionTransform]]): List of transforms to apply sequentially on each sample.
         Defaults to None, which is treated as an empty list.

        all_classes_list (Optional[List[str]]): All class names in the dataset. Defaults to None, which is treated as an
         empty list.

        class_inclusion_list (Optional[List[str]]): Subset of classes to include. Classes not in this list will be excluded.
         Adjust the number of model classes accordingly. Defaults to None.

        ignore_empty_annotations (bool): If True and class_inclusion_list is not None, images without any target will be
         ignored. Defaults to True.

        verbose (bool): If True, displays additional information (does not include warnings). Defaults to True.

        show_all_warnings (bool): If True, displays all warnings. Defaults to False.

        cache (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a
         future version.

        cache_dir (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in
         a future version.



        Dataset structure:

        ./data/pascal_voc
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Note:
        If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

    Usage (via the `PascalVOCDetectionDataset` subclass, which adds the `download` parameter):
        voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc",
                                            images_dir="images/train2012/JPEGImages",
                                            labels_dir="labels/train2012/Annotations",
                                            download=True)
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        labels_dir: str,
        max_num_samples: Optional[int] = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: Optional[List[AbstractDetectionTransform]] = None,
        all_classes_list: Optional[List[str]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """
        Initialize the Pascal VOC Detection Dataset.

        See the class docstring for the description of every parameter.
        """

        self.data_dir = data_dir

        self.images_dir = os.path.join(data_dir, images_dir)
        self.labels_dir = os.path.join(data_dir, labels_dir)

        # `None` defaults avoid the shared-mutable-default-argument pitfall; the base class
        # still receives concrete lists, so downstream behavior is unchanged.
        super(PascalVOCFormatDetectionDataset, self).__init__(
            data_dir=data_dir,
            original_target_format=XYXY_LABEL,
            max_num_samples=max_num_samples,
            cache_annotations=cache_annotations,
            input_dim=input_dim,
            transforms=transforms if transforms is not None else [],
            all_classes_list=all_classes_list if all_classes_list is not None else [],
            class_inclusion_list=class_inclusion_list,
            ignore_empty_annotations=ignore_empty_annotations,
            verbose=verbose,
            show_all_warnings=show_all_warnings,
            cache=cache,
            cache_dir=cache_dir,
        )

    def _setup_data_source(self) -> int:
        """Initialize img_and_target_path_list and warn if label file is missing

        :return: Number of (img_path, target_path) pairs found.
        :raises FileNotFoundError: If the image directory is missing, contains no .jpg files,
                                   or no image has a matching label file.
        """
        if not Path(self.images_dir).exists():
            raise FileNotFoundError(f"{self.images_dir} not found.")

        img_files = list(sorted(glob.glob(os.path.join(self.images_dir, "*.jpg"))))
        if len(img_files) == 0:
            raise FileNotFoundError(f"No image files found in {self.images_dir}")

        # Swap the extension rather than str.replace(".jpg", ".txt"), which would corrupt
        # basenames containing ".jpg" in the middle (e.g. "a.jpg_crop.jpg").
        target_files = [os.path.join(self.labels_dir, os.path.splitext(os.path.basename(img_file))[0] + ".txt") for img_file in img_files]

        img_and_target_path_list = [(img_file, target_file) for img_file, target_file in zip(img_files, target_files) if os.path.exists(target_file)]
        if len(img_and_target_path_list) == 0:
            raise FileNotFoundError("No target files associated with the images were found")

        num_missing_files = len(img_files) - len(img_and_target_path_list)
        if num_missing_files > 0:
            logger.warning(f"{num_missing_files} label files were not loaded out of {len(img_files)} image files")

        self.img_and_target_path_list = img_and_target_path_list
        return len(self.img_and_target_path_list)

    def _load_annotation(self, sample_id: int) -> dict:
        """Load annotations for a given sample.

        :param sample_id: Index into `self.img_and_target_path_list`.
        :return: Annotation including:
                    - target in XYXY_LABEL format, rescaled to fit `input_dim` when it is set
                    - img_path
                    - resized_img_shape as (height, width)
        """
        img_path, target_path = self.img_and_target_path_list[sample_id]
        with open(target_path, "r") as file:
            target = np.array([x.split() for x in file.read().splitlines()], dtype=np.float32)
        if target.size == 0:
            # An empty label file yields a 1-D empty array; keep a 2-D (0, 5) shape so that
            # the slicing below and downstream consumers don't break on empty annotations.
            target = target.reshape(0, 5)

        height, width = get_image_size_from_path(img_path)
        if self.input_dim is not None:
            # input_dim is documented as (height, width): scale so the image fits inside it
            # while preserving aspect ratio (previous code swapped the indices, which was
            # only correct for square input_dim).
            r = min(self.input_dim[0] / height, self.input_dim[1] / width)
            target[:, :4] *= r
            resized_img_shape = (int(height * r), int(width * r))
        else:
            # input_dim defaults to None; in that case keep the native image size.
            resized_img_shape = (height, width)

        return {"img_path": img_path, "target": target, "resized_img_shape": resized_img_shape}

__init__(data_dir, images_dir, labels_dir, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Initialize the Pascal VOC Detection Dataset.

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_format_detection.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151