
Datasets

RandAugment

RandAugment is a variant of AutoAugment that randomly selects transformations from AutoAugment to apply to an image.

RandomAugmentation - Implementation adapted from: https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py

Papers: RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719

AugmentOp

A single auto augment operation.

Source code in src/super_gradients/training/datasets/auto_augment.py
class AugmentOp:
    """
    A single auto augment operation
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        hparams = hparams or _HPARAMS_DEFAULT
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        self.hparams = hparams.copy()
        self.kwargs = dict(
            fillcolor=hparams["img_mean"] if "img_mean" in hparams else _FILL,
            resample=hparams["interpolation"] if "interpolation" in hparams else _RANDOM_INTERPOLATION,
        )

        # If magnitude_std is > 0, introduce some randomness
        self.magnitude_std = self.hparams.get("magnitude_std", 0)

    def __call__(self, img):
        if self.prob < 1.0 and random.random() > self.prob:
            return img
        magnitude = self.magnitude
        if self.magnitude_std:
            if self.magnitude_std == float("inf"):
                magnitude = random.uniform(0, magnitude)
            elif self.magnitude_std > 0:
                magnitude = random.gauss(magnitude, self.magnitude_std)
        magnitude = min(_MAX_MAGNITUDE, max(0, magnitude))  # clip to valid range
        level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple()
        return self.aug_fn(img, *level_args, **self.kwargs)
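
A minimal usage sketch, assuming 'Rotate' is one of the operation names registered in NAME_TO_OP and that img is an existing PIL.Image:

>>> op = AugmentOp(name="Rotate", prob=0.5, magnitude=9)
>>> out = op(img)  # with probability 0.5 img is rotated by an angle derived from magnitude 9; otherwise it is returned unchanged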

RandAugment

Random auto augment class; selects auto augment transforms to apply according to per-op probability weights.

Source code in src/super_gradients/training/datasets/auto_augment.py
class RandAugment:
    """
    Random auto augment class; selects auto augment transforms to apply according to per-op probability weights
    """

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops = ops
        self.num_layers = num_layers
        self.choice_weights = choice_weights

    def __call__(self, img):
        # no replacement when using weighted choice
        ops = np.random.choice(self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights)
        for op in ops:
            img = op(img)
        return img

rand_augment_transform(config_str, crop_size, img_mean)

Create a RandAugment transform

Parameters:

Name Type Description Default
config_str

String defining the configuration of the random augmentation. It consists of multiple sections separated by dashes ('-'). The first section defines the specific variant of RandAugment (currently only 'rand'). The remaining sections are not order-specific and determine:

'm' - integer magnitude of RandAugment
'n' - integer number of layers (number of transform ops selected per image)
'w' - integer probability weight index (index of a set of weights to influence choice of op)
'mstd' - float standard deviation of the magnitude noise applied
'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)

Example: 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5; 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2.

required
crop_size int

The size of the image crop

required
img_mean List[float]

Per-channel image mean

required

Returns:

Type Description

A PyTorch compatible Transform

Source code in src/super_gradients/training/datasets/auto_augment.py
@register_transform(Transforms.RandAugmentTransform)
def rand_augment_transform(config_str, crop_size: int, img_mean: List[float]):
    """
    Create a RandAugment transform

    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, which are not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

    :param crop_size: The size of the image crop
    :param img_mean:  Per-channel image mean

    :return: A PyTorch compatible Transform
    """
    hparams = dict(translate_const=int(crop_size * 0.45), img_mean=tuple([min(255, round(255 * channel_mean)) for channel_mean in img_mean]))

    magnitude = _MAX_MAGNITUDE  # default to _MAX_MAGNITUDE for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS
    config = config_str.split("-")
    for c in config:
        cs = re.split(r"(\d.*)", c)
        if len(cs) < 2:
            continue
        key, val = cs[:2]
        if key == "mstd":
            # noise param injected via hparams for now
            hparams.setdefault("magnitude_std", float(val))
        elif key == "inc":
            if bool(val):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == "m":
            magnitude = int(val)
        elif key == "n":
            num_layers = int(val)
        elif key == "w":
            weight_idx = int(val)
        else:
            assert False, "Unknown RandAugment config section"
    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms)
    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)
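
A usage sketch; the import path is assumed from the source location shown above, and img is an existing PIL.Image:

>>> from super_gradients.training.datasets.auto_augment import rand_augment_transform
>>> # 'rand-m9-n3-mstd0.5': magnitude 9, 3 ops per image, magnitude noise std 0.5 (see config_str above)
>>> ra = rand_augment_transform("rand-m9-n3-mstd0.5", crop_size=224, img_mean=[0.485, 0.456, 0.406])
>>> augmented = ra(img)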

Cifar10

Bases: CIFAR10, HasPreprocessingParams

CIFAR10 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
@register_dataset(Datasets.CIFAR_10)
class Cifar10(CIFAR10, HasPreprocessingParams):
    """
    CIFAR10 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """

    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        # TO KEEP BACKWARD COMPATIBILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALIGN TORCHVISION/NATIVE TRANSFORMS
        # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
        if isinstance(transforms, list):
            transforms = Compose(transforms)

        super(Cifar10, self).__init__(
            root=root,
            train=train,
            transform=transforms,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """

        pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
        params = dict(
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            class_names=self.classes,
        )
        return params

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params
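
A short usage sketch; the import path is assumed from the source location shown above, and Cifar100 below exposes the same interface:

>>> from torchvision.transforms import ToTensor
>>> from super_gradients.training.datasets.classification_datasets.cifar import Cifar10
>>> train_set = Cifar10(root="./data", train=True, transforms=[ToTensor()], download=True)
>>> image, label = train_set[0]
>>> preprocessing = train_set.get_dataset_preprocessing_params()  # dict with 'image_processor' and 'class_names'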

Cifar100

Bases: CIFAR100, HasPreprocessingParams

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
@register_dataset(Datasets.CIFAR_100)
class Cifar100(CIFAR100, HasPreprocessingParams):
    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        """
        CIFAR100 Dataset

        :param root:                    Path for the data to be extracted
        :param train:                   Bool to load training (True) or validation (False) part of the dataset
        :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
        :param target_transform:        Transform to apply to target output
        :param download:                Download (True) the dataset from source
        """
        # TO KEEP BACKWARD COMPATIBILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALIGN TORCHVISION/NATIVE TRANSFORMS
        # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
        if isinstance(transforms, list):
            transforms = Compose(transforms)

        super(Cifar100, self).__init__(
            root=root,
            train=train,
            transform=transforms,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """

        pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
        params = dict(
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            class_names=self.classes,
        )
        return params

__init__(root, train=True, transforms=None, target_transform=None, download=False)

CIFAR100 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
@resolve_param("transforms", TransformsFactory())
def __init__(
    self,
    root: str,
    train: bool = True,
    transforms: Union[list, dict] = None,
    target_transform: Optional[Callable] = None,
    download: bool = False,
) -> None:
    """
    CIFAR100 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """
    # TO KEEP BACKWARD COMPATIBILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALIGN TORCHVISION/NATIVE TRANSFORMS
    # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
    if isinstance(transforms, list):
        transforms = Compose(transforms)

    super(Cifar100, self).__init__(
        root=root,
        train=train,
        transform=transforms,
        target_transform=target_transform,
        download=download,
    )

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

ImageNetDataset

Bases: torch_datasets.ImageFolder, HasPreprocessingParams

ImageNetDataset dataset.

To use this Dataset you need to:

  • Download the imagenet dataset (https://image-net.org/download.php) and arrange it as:

      Imagenet
       ├──train
       │  ├──n02093991
       │  │   ├──n02093991_1001.JPEG
       │  │   ├──n02093991_1004.JPEG
       │  │   └──...
       │  ├──n02093992
       │  └──...
       └──val
          ├──n02093991
          ├──n02093992
          └──...

  • Instantiate the dataset:

      >> train_set = ImageNetDataset(root='.../Imagenet/train', ...)
      >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)

Source code in src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
@register_dataset(Datasets.IMAGENET_DATASET)
class ImageNetDataset(torch_datasets.ImageFolder, HasPreprocessingParams):
    """ImageNetDataset dataset.

    To use this Dataset you need to:

    - Download imagenet dataset (https://image-net.org/download.php)
        Imagenet
         ├──train
         │  ├──n02093991
         │  │   ├──n02093991_1001.JPEG
         │  │   ├──n02093991_1004.JPEG
         │  │   └──...
         │  ├──n02093992
         │  └──...
         └──val
            ├──n02093991
            ├──n02093992
            └──...

    - Instantiate the dataset:
        >> train_set = ImageNetDataset(root='.../Imagenet/train', ...)
        >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)
    """

    @resolve_param("transforms", factory=TransformsFactory())
    def __init__(self, root: str, transforms: Union[list, dict] = [], *args, **kwargs):
        # TO KEEP BACKWARD COMPATIBILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALIGN TORCHVISION/NATIVE TRANSFORMS
        # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
        if isinstance(transforms, list):
            transforms = Compose(transforms)
        super(ImageNetDataset, self).__init__(root, transform=transforms, *args, **kwargs)

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """

        pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
        params = dict(
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            class_names=self.classes,
        )
        return params
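
A usage sketch following the directory layout above (the transform choices are illustrative):

>>> from torchvision.transforms import Resize, CenterCrop, ToTensor
>>> train_set = ImageNetDataset(root="/data/Imagenet/train", transforms=[Resize(256), CenterCrop(224), ToTensor()])
>>> image, label = train_set[0]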

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

get_torchvision_transforms_equivalent_processing(transforms)

Get the equivalent processing pipeline for torchvision transforms.

Returns:

Type Description
List[Dict[str, Any]]

List of Processings operations

Source code in src/super_gradients/training/datasets/classification_datasets/torchvision_utils.py
def get_torchvision_transforms_equivalent_processing(transforms: List[Any]) -> List[Dict[str, Any]]:
    """
    Get the equivalent processing pipeline for torchvision transforms.

    :return: List of Processings operations
    """
    # Since we are using cv2.imread to read images, our model is in fact trained on BGR images.
    # In our pipelines the convention is that input images are RGB, so we need to reverse the channels to get BGR
    # to match the expected input of the model.
    pipeline = []

    if isinstance(transforms, StandardTransform):
        transforms = transforms.transform

    if isinstance(transforms, Compose):
        transforms = transforms.transforms

    for transform in transforms:
        if isinstance(transform, ToTensor):
            pipeline.append({Processings.StandardizeImage: {"max_value": 255}})
        elif isinstance(transform, Normalize):
            pipeline.append({Processings.NormalizeImage: {"mean": tuple(map(float, transform.mean)), "std": tuple(map(float, transform.std))}})
        elif isinstance(transform, Resize):
            pipeline.append({Processings.Resize: {"size": int(transform.size)}})
        elif isinstance(transform, CenterCrop):
            pipeline.append({Processings.CenterCrop: {"size": int(transform.size)}})
        else:
            raise ValueError(f"Unsupported transform: {transform}")

    pipeline.append({Processings.ImagePermute: {"permutation": (2, 0, 1)}})
    return pipeline
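
A hedged sketch of the mapping this function performs for two of the supported transforms (values are illustrative):

>>> from torchvision.transforms import Compose, ToTensor, Normalize
>>> transforms = Compose([ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> pipeline = get_torchvision_transforms_equivalent_processing(transforms)
>>> # pipeline is expected to hold StandardizeImage (max_value 255), NormalizeImage (with the given mean/std),
>>> # and a trailing ImagePermute with permutation (2, 0, 1)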

Lighting

Bases: object

Lighting noise (AlexNet-style PCA-based noise). Taken from fastai Imagenet training - https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103

To use:

  • training_params = {"imagenet_pca_aug": 0.1}
  • The default training_params value is 0.0 ("don't use")
  • 0.1 is the default in the original paper

Source code in src/super_gradients/training/datasets/data_augmentation.py
@register_transform(Transforms.Lighting)
class Lighting(object):
    """
    Lighting noise (AlexNet-style PCA-based noise)
    Taken from fastai Imagenet training -
    https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103
    To use:
        - training_params = {"imagenet_pca_aug": 0.1}
        - Default training_params arg is 0.0 ("don't use")
        - 0.1 is the default in the original paper
    """

    def __init__(self, alphastd, eigval=IMAGENET_PCA["eigval"], eigvec=IMAGENET_PCA["eigvec"]):
        self.alphastd = alphastd
        self.eigval = eigval
        self.eigvec = eigvec

    def __call__(self, img):
        if self.alphastd == 0:
            return img
        alpha = img.new().resize_(3).normal_(0, self.alphastd)
        rgb = self.eigvec.type_as(img).clone().mul(alpha.view(1, 3).expand(3, 3)).mul(self.eigval.view(1, 3).expand(3, 3)).sum(1).squeeze()
        return img.add(rgb.view(3, 1, 1).expand_as(img))
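
A usage sketch; img is assumed to be a 3xHxW float tensor (e.g. the output of ToTensor), and alphastd=0.1 matches the paper default noted above:

>>> lighting = Lighting(alphastd=0.1)
>>> jittered = lighting(img)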

RandomErase

Bases: RandomErasing

A simple class that translates the parameters supported in SuperGradients' code base to torchvision's RandomErasing.

Source code in src/super_gradients/training/datasets/data_augmentation.py
@register_transform(Transforms.RandomErase)
class RandomErase(RandomErasing):
    """
    A simple class that translates the parameters supported in SuperGradients' code base to torchvision's RandomErasing
    """

    def __init__(self, probability: float, value: str):
        # value might be a string representing a float. First we try to convert to float and if fails,
        # pass it as-is to super
        try:
            value = float(value)
        except ValueError:
            pass
        super().__init__(p=probability, value=value)
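
A usage sketch; value may be a stringified float or torchvision's special "random" value, and the transform operates on tensor images:

>>> erase = RandomErase(probability=0.5, value="random")
>>> out = erase(img_tensor)  # erases a random rectangle in roughly half of the calls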

BoundingBoxFormat

Abstract class for describing a bounding box format. It exposes two methods, to_xyxy and from_xyxy, to convert whatever box format we are dealing with to the internal xyxy format and vice versa. Converting from and to the intermediate xyxy format has a subtle performance impact, but greatly reduces the amount of boilerplate code needed to support all conversion combinations xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh, cxcywh, yxyx.

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
class BoundingBoxFormat:
    """
    Abstract class for describing a bounding boxes format. It exposes two methods: to_xyxy and from_xyxy to convert
    whatever format of boxes we are dealing with to internal xyxy format and vice versa. This conversion from and to
    intermediate xyxy format has a subtle performance impact, but greatly reduces the amount of boilerplate code needed to support
    all combinations of conversion xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh, cxcywh, yxyx.
    """

    def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert input boxes to XYXY format
        :param bboxes: Input bounding boxes [..., 4]
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :return: Converted bounding boxes [..., 4] in XYXY format
        """
        return self.get_to_xyxy(inplace)(bboxes, image_shape)

    def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert XYXY boxes to target bboxes format
        :param bboxes: Input bounding boxes [..., 4] in XYXY format
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :return: Converted bounding boxes [..., 4] in target format
        """
        return self.get_from_xyxy(inplace)(bboxes, image_shape)

    @abstractmethod
    def get_to_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        raise NotImplementedError()

    @abstractmethod
    def get_from_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        raise NotImplementedError()

    def get_num_parameters(self) -> int:
        return 4

from_xyxy(bboxes, image_shape, inplace)

Convert XYXY boxes to target bboxes format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4] in XYXY format

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in target format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert XYXY boxes to target bboxes format
    :param bboxes: Input bounding boxes [..., 4] in XYXY format
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :return: Converted bounding boxes [..., 4] in target format
    """
    return self.get_from_xyxy(inplace)(bboxes, image_shape)

to_xyxy(bboxes, image_shape, inplace)

Convert input boxes to XYXY format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4]

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert input boxes to XYXY format
    :param bboxes: Input bounding boxes [..., 4]
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :return: Converted bounding boxes [..., 4] in XYXY format
    """
    return self.get_to_xyxy(inplace)(bboxes, image_shape)

convert_bboxes(bboxes, image_shape, source_format, target_format, inplace)

Convert bboxes from source to target format

Parameters:

Name Type Description Default
bboxes

Tensor of shape (..., 4) with input bounding boxes

required
image_shape Tuple[int, int]

Tuple of (rows, cols) corresponding to image shape

required
source_format BoundingBoxFormat

Format of the source bounding boxes

required
target_format BoundingBoxFormat

Format of the output bounding boxes

required

Returns:

Type Description

Tensor of shape (..., 4) with resulting bounding boxes

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
def convert_bboxes(bboxes, image_shape: Tuple[int, int], source_format: BoundingBoxFormat, target_format: BoundingBoxFormat, inplace: bool):
    """
    Convert bboxes from source to target format
    :param bboxes: Tensor of shape (..., 4) with input bounding boxes
    :param image_shape: Tuple of (rows, cols) corresponding to image shape
    :param source_format: Format of the source bounding boxes
    :param target_format: Format of the output bounding boxes
    :return: Tensor of shape (..., 4) with resulting bounding boxes
    """
    xyxy = source_format.to_xyxy(bboxes, image_shape, inplace)
    return target_format.from_xyxy(xyxy, image_shape, inplace)
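
A hedged numeric sketch using two of the formats documented below:

>>> import numpy as np
>>> boxes = np.array([[0.1, 0.2, 0.5, 0.6]], dtype=np.float32)  # normalized XYXY
>>> pixels = convert_bboxes(boxes, image_shape=(100, 200), source_format=NormalizedXYXYCoordinateFormat(), target_format=XYXYCoordinateFormat(), inplace=False)
>>> # pixels is approximately [[20., 20., 100., 60.]] for a 100x200 (rows, cols) image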

cxcywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from CX-CY-W-H format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
def cxcywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    cx, cy, w, h = bboxes[..., 0], bboxes[..., 1], bboxes[..., 2], bboxes[..., 3]
    x1 = cx - 0.5 * w
    y1 = cy - 0.5 * h
    x2 = x1 + w
    y2 = y1 + h

    if torch.jit.is_scripting():
        return torch.stack([x1, y1, x2, y2], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([x1, y1, x2, y2], dim=-1)
        if isinstance(bboxes, np.ndarray):
            return np.stack([x1, y1, x2, y2], axis=-1)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")
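
A quick numeric sketch (image_shape is part of the common signature but unused by this particular conversion):

>>> import numpy as np
>>> cxcywh_to_xyxy(np.array([[50., 40., 20., 10.]]), image_shape=(100, 200))
array([[40., 35., 60., 45.]])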

cxcywh_to_xyxy_inplace(bboxes, image_shape)

Transforms bboxes from CX-CY-W-H format to XYXY format. This function operates in-place. Note that the bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
def cxcywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format. This function operates in-place.
    Note that the bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
        if not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    bboxes[..., 0:2] -= bboxes[..., 2:4] * 0.5  # cxcy -> x1y1
    bboxes[..., 2:4] += bboxes[..., 0:2]  # x1y1 + wh -> x2y2
    return bboxes

xyxy_to_cxcywh(bboxes, image_shape)

Transforms bboxes from xyxy format to CX-CY-W-H format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
def xyxy_to_cxcywh(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from xyxy format to CX-CY-W-H format
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format
    """
    x1, y1, x2, y2 = bboxes[..., 0], bboxes[..., 1], bboxes[..., 2], bboxes[..., 3]
    w = x2 - x1
    h = y2 - y1
    cx = x1 + 0.5 * w
    cy = y1 + 0.5 * h
    if torch.jit.is_scripting():
        return torch.stack([cx, cy, w, h], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([cx, cy, w, h], dim=-1)
        elif isinstance(bboxes, np.ndarray):
            return np.stack([cx, cy, w, h], axis=-1)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_cxcywh_inplace(bboxes, image_shape)

Transforms bboxes from XYXY format to CX-CY-W-H format. This function operates in-place. Note that the bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
def xyxy_to_cxcywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from xyxy format to CX-CY-W-H format. This function operates in-place.
    Note that the bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format
    """
    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
        elif isinstance(bboxes, np.ndarray) and not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
    bboxes[..., 2:4] -= bboxes[..., 0:2]  # x2y2 - x1y2 -> wh
    bboxes[..., 0:2] += bboxes[..., 2:4] * 0.5  # cxcywh
    return bboxes
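
A numeric sketch of the in-place variant; note the input is floating-point, as the warning above recommends:

>>> import numpy as np
>>> boxes = np.array([[40., 35., 60., 45.]], dtype=np.float32)
>>> xyxy_to_cxcywh_inplace(boxes, image_shape=(100, 200))
array([[50., 40., 20., 10.]], dtype=float32)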

NormalizedXYXYCoordinateFormat

Bases: BoundingBoxFormat

Normalized X1,Y1,X2,Y2 bounding boxes format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
class NormalizedXYXYCoordinateFormat(BoundingBoxFormat):
    """
    Normalized X1,Y1,X2,Y2 bounding boxes format
    """

    def __init__(self):
        super().__init__()
        self.format = "normalized_xyxy"
        self.normalized = True

    def get_to_xyxy(self, inplace: bool):
        if inplace:
            return normalized_xyxy_to_xyxy_inplace
        else:
            return normalized_xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        if inplace:
            return xyxy_to_normalized_xyxy_inplace
        else:
            return xyxy_to_normalized_xyxy

normalized_xyxy_to_xyxy(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
def normalized_xyxy_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format
    """
    rows, cols = image_shape
    if torch.jit.is_scripting():
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
    else:
        if torch.is_tensor(bboxes):
            scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
            scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        elif isinstance(bboxes, np.ndarray):
            scale = np.array([cols, rows, cols, rows], dtype=bboxes.dtype)
            scale = scale.reshape([1] * (len(bboxes.shape) - 1) + [4])
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

    return bboxes * scale

normalized_xyxy_to_xyxy_inplace(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
def normalized_xyxy_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format
    """
    rows, cols = image_shape
    bboxes[..., 0:3:2] *= cols
    bboxes[..., 1:4:2] *= rows
    return bboxes

xyxy_to_normalized_xyxy(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format

Parameters:

Name Type Description Default
bboxes Tensor

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description
Tensor

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
def xyxy_to_normalized_xyxy(bboxes: Tensor, image_shape: Tuple[int, int]) -> Tensor:
    """
    Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format
    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    """
    rows, cols = image_shape
    if torch.jit.is_scripting():
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
    else:
        if torch.is_tensor(bboxes):
            scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
            scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        elif isinstance(bboxes, np.ndarray):
            scale = np.array([cols, rows, cols, rows], dtype=bboxes.dtype)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")
    return bboxes / scale
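
A numeric sketch for a 100x200 (rows, cols) image:

>>> import numpy as np
>>> normed = xyxy_to_normalized_xyxy(np.array([[20., 20., 100., 60.]], dtype=np.float32), image_shape=(100, 200))
>>> # normed is approximately [[0.1, 0.2, 0.5, 0.6]]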

xyxy_to_normalized_xyxy_inplace(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
def xyxy_to_normalized_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    """

    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_normalized_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
        if isinstance(bboxes, np.ndarray) and not np.issubdtype(bboxes.dtype, np.floating):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_normalized_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    rows, cols = image_shape
    bboxes[..., 0:3:2] /= cols
    bboxes[..., 1:4:2] /= rows
    return bboxes

xywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
def xywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYWH format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    x1, y1, w, h = bboxes[..., 0], bboxes[..., 1], bboxes[..., 2], bboxes[..., 3]
    x2 = x1 + w
    y2 = y1 + h

    if torch.jit.is_scripting():
        return torch.stack([x1, y1, x2, y2], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([x1, y1, x2, y2], dim=-1)
        elif isinstance(bboxes, np.ndarray):
            return np.stack([x1, y1, x2, y2], axis=-1)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xywh_to_xyxy_inplace(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
def xywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYWH format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    bboxes[..., 2:4] += bboxes[..., 0:2]
    return bboxes

xyxy_to_xywh(bboxes, image_shape)

Transforms bboxes from XYXY format to XYWH format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
def xyxy_to_xywh(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYXY format to XYWH format
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in XYWH format
    """
    x1, y1, x2, y2 = bboxes[..., 0], bboxes[..., 1], bboxes[..., 2], bboxes[..., 3]
    w = x2 - x1
    h = y2 - y1

    if torch.jit.is_scripting():
        return torch.stack([x1, y1, w, h], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([x1, y1, w, h], dim=-1)
        elif isinstance(bboxes, np.ndarray):
            return np.stack([x1, y1, w, h], axis=-1)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")
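
A numeric sketch:

>>> import numpy as np
>>> xyxy_to_xywh(np.array([[40., 35., 60., 45.]]), image_shape=(100, 200))
array([[40., 35., 20., 10.]])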

xyxy_to_xywh_inplace(bboxes, image_shape)

Transforms bboxes from XYXY format to XYWH format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
def xyxy_to_xywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYXY format to XYWH format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in XYWH format
    """
    bboxes[..., 2:4] -= bboxes[..., 0:2]
    return bboxes

XYXYCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format X1, Y1, X2, Y2

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xyxy.py
class XYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format X1, Y1, X2, Y2
    """

    def __init__(self):
        self.format = "xyxy"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        return xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        return xyxy_to_xyxy

YXYXCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format Y1, X1, Y2, X2

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/yxyx.py
class YXYXCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format Y1, X1, Y2, X2
    """

    def __init__(self):
        super().__init__()
        self.format = "yxyx"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        if inplace:
            return xyxy_to_yxyx_inplace
        else:
            return xyxy_to_yxyx

    def get_from_xyxy(self, inplace: bool):
        # XYXY <-> YXYX is an interchangeable operation, so we may reuse the same routine here
        if inplace:
            return xyxy_to_yxyx_inplace
        else:
            return xyxy_to_yxyx
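
A hedged sketch of converting through the format object, assuming the yxyx helpers it returns (not shown here) simply swap the x and y columns:

>>> import numpy as np
>>> yxyx = YXYXCoordinateFormat()
>>> boxes_yxyx = np.array([[35., 40., 45., 60.]])
>>> boxes_xyxy = yxyx.to_xyxy(boxes_yxyx, image_shape=(100, 200), inplace=False)
>>> # boxes_xyxy should be [[40., 35., 60., 45.]]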

ConcatenatedTensorFormatConverter

Source code in src/super_gradients/training/datasets/data_formats/format_converter.py
class ConcatenatedTensorFormatConverter:
    def __init__(
        self,
        input_format: ConcatenatedTensorFormat,
        output_format: ConcatenatedTensorFormat,
        image_shape: Union[Tuple[int, int], None],
    ):
        """
        Converts concatenated tensors from input format to output format.

        Example:
            >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
            >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
            >>> h, w = 100, 200
            >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
            >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
            >>>
            >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
            >>>
            >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
            >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        self.permutation_indexes = get_permutation_indexes(input_format, output_format)

        self.input_format = input_format
        self.output_format = output_format
        self.image_shape = image_shape
        self.input_length = input_format.num_channels

    def __call__(self, tensor: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        if tensor.shape[-1] != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({tensor.shape[-1]}) must be "
                f"equal to {self.input_length} as defined by input format."
            )
        tensor = tensor[:, self.permutation_indexes]
        tensor = apply_on_bboxes(fn=self._convert_bbox, tensor=tensor, tensor_format=self.output_format)
        return tensor

    def _convert_bbox(self, bboxes: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        return convert_bboxes(
            bboxes=bboxes,
            source_format=self.input_format.bboxes_format.format,
            target_format=self.output_format.bboxes_format.format,
            inplace=False,
            image_shape=self.image_shape,
        )

__init__(input_format, output_format, image_shape)

Converts concatenated tensors from input format to output format.

Example:

>>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
>>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
>>> h, w = 100, 200
>>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
>>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
>>>
>>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
>>>
>>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
>>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in src/super_gradients/training/datasets/data_formats/format_converter.py
def __init__(
    self,
    input_format: ConcatenatedTensorFormat,
    output_format: ConcatenatedTensorFormat,
    image_shape: Union[Tuple[int, int], None],
):
    """
    Converts concatenated tensors from input format to output format.

    Example:
        >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
        >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
        >>> h, w = 100, 200
        >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
        >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
        >>>
        >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
        >>>
        >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
        >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    self.permutation_indexes = get_permutation_indexes(input_format, output_format)

    self.input_format = input_format
    self.output_format = output_format
    self.image_shape = image_shape
    self.input_length = input_format.num_channels

ConcatenatedTensorFormat

Bases: DetectionOutputFormat

Defines an output format that returns a single tensor of shape [N,M] (N - number of detections, M - sum of bbox attributes) concatenated from bbox coordinates and other fields. A layout defines the order of the concatenated tensors. For instance:

  • layout: (bboxes, scores, labels) gives a Tensor that is the product of torch.cat([bboxes, scores, labels], dim=1)
  • layout: (labels, bboxes) produces a Tensor from torch.cat([labels, bboxes], dim=1)

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

custom_format = ConcatenatedTensorFormat(
    layout=(
        BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
        TensorSliceItem(name="label", length=1),
        TensorSliceItem(name="distance", length=1),
        TensorSliceItem(name="attributes", length=4),
    )
)

Source code in src/super_gradients/training/datasets/data_formats/formats.py
class ConcatenatedTensorFormat(DetectionOutputFormat):
    """
    Define the output format that returns a single tensor of shape [N,M] (N - number of detections,
    M - sum of bbox attributes) that is concatenated from bbox coordinates and other fields.
    A layout defines the order of concatenated tensors. For instance:
    - layout: (bboxes, scores, labels) gives a Tensor that is the product of torch.cat([bboxes, scores, labels], dim=1)
    - layout: (labels, bboxes) produces a Tensor from torch.cat([labels, bboxes], dim=1)


    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> custom_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>         TensorSliceItem(name="label", length=1),
    >>>         TensorSliceItem(name="distance", length=1),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>     )
    >>> )

    """

    layout: Mapping[str, TensorSliceItem]
    locations: Mapping[str, Tuple[int, int]]
    indexes: Mapping[str, List[int]]
    num_channels: int

    @property
    def bboxes_format(self) -> BoundingBoxesTensorSliceItem:
        bbox_items = [x for x in self.layout.values() if isinstance(x, BoundingBoxesTensorSliceItem)]
        return bbox_items[0]

    def __init__(self, layout: Union[List[TensorSliceItem], Tuple[TensorSliceItem, ...]]):
        bbox_items = [x for x in layout if isinstance(x, BoundingBoxesTensorSliceItem)]
        if len(bbox_items) != 1:
            raise RuntimeError("Number of bounding box items must be strictly equal to 1")

        _layout = []
        _locations = []
        _indexes = []

        offset = 0
        for item in layout:
            location_indexes = list(range(offset, offset + item.length))
            location_slice = offset, offset + item.length

            _layout.append((item.name, item))
            _locations.append((item.name, location_slice))
            _indexes.append((item.name, location_indexes))
            offset += item.length

        self.layout = collections.OrderedDict(_layout)
        self.locations = collections.OrderedDict(_locations)
        self.indexes = collections.OrderedDict(_indexes)
        self.num_channels = offset

    def __repr__(self):
        return str(self.layout)
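
As a quick illustration (not part of the library docs), the custom_format built in the example above yields the following slice bookkeeping, assuming the bounding-box item spans 4 channels (XYXY):

# Continuing with `custom_format` from the construction example above
# (assuming the bbox item spans 4 channels):
print(custom_format.locations)              # per-item (start, stop) slices: bboxes -> (0, 4), ..., attributes -> (6, 10)
print(custom_format.indexes["attributes"])  # [6, 7, 8, 9]
print(custom_format.num_channels)           # 10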

apply_on_bboxes(fn, tensor, tensor_format)

Apply a function, in place, only on the bboxes of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that includes, among other fields, the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying fn in place on the bboxes

Source code in src/super_gradients/training/datasets/data_formats/formats.py
def apply_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Apply inplace a function only on the bboxes of a concatenated tensor.

    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that include - among other - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    return apply_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=tensor_format.bboxes_format.name)
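
A short usage sketch, reusing the 10-channel custom_format built in the ConcatenatedTensorFormat example above: the function is applied only to the bbox columns, while the label, distance and attributes columns stay untouched. The dummy tensor and the 640-pixel image size are illustrative assumptions.

import torch
from super_gradients.training.datasets.data_formats.formats import apply_on_bboxes

targets = torch.zeros(8, custom_format.num_channels)  # 8 dummy detections, 10 channels each
targets[:, 0:4] = torch.rand(8, 4) * 640              # XYXY coordinates in pixels

# Normalize only the bbox coordinates to [0, 1]; all other columns are left as-is.
targets = apply_on_bboxes(fn=lambda bboxes: bboxes / 640.0, tensor=targets, tensor_format=custom_format)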

apply_on_layout(fn, tensor, tensor_format, layout_name)

Apply a function, in place, only on a specific layout of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the layout of interest.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that includes, among other fields, the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying fn in place on the layout

Source code in src/super_gradients/training/datasets/data_formats/formats.py
def apply_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Apply inplace a function only on a specific layout of a concatenated tensor.
    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that include - among other - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after applying INPLACE the fn on the layout
    """
    location = slice(*iter(tensor_format.locations[layout_name]))
    result = fn(tensor[..., location])
    tensor[..., location] = result
    return tensor

filter_on_bboxes(fn, tensor, tensor_format)

Filter the tensor according to a condition on the bboxes.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that includes, among other fields, the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after filtering the rows according to fn applied on the bboxes.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
def filter_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Filter the tensor according to a condition on the bboxes.

    :param fn:              Function to filter the bboxes (keep only True elements).
    :param tensor:          Concatenated tensor that include - among other - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    return filter_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=tensor_format.bboxes_format.name)
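
A usage sketch, again reusing custom_format (and the targets tensor) from the examples above, with XYXY boxes assumed: the predicate receives only the [N, 4] bbox slice and must return a boolean mask of length N; rows where the mask is False are dropped from the whole concatenated tensor.

import torch
from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes

def non_empty(bboxes: torch.Tensor) -> torch.Tensor:
    # Keep only boxes with strictly positive width and height (XYXY layout assumed).
    width = bboxes[:, 2] - bboxes[:, 0]
    height = bboxes[:, 3] - bboxes[:, 1]
    return (width > 0) & (height > 0)

targets = filter_on_bboxes(fn=non_empty, tensor=targets, tensor_format=custom_format)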

filter_on_layout(fn, tensor, tensor_format, layout_name)

Filter the tensor according to a condition on a specific layout.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that includes, among other fields, the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after filtering the bboxes according to fn.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
def filter_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Filter the tensor according to a condition on a specific layout.

    :param fn:              Function to filter the bboxes (keep only True elements).
    :param tensor:          Concatenated tensor that include - among other - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after filtering the bboxes according to fn.
    """
    location = slice(*tensor_format.locations[layout_name])
    mask = fn(tensor[..., location])
    tensor = tensor[mask]
    return tensor

get_permutation_indexes(input_format, output_format)

Compute the permutations required to change the format layout order.

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Input format to transform from

required
output_format ConcatenatedTensorFormat

Output format to transform to

required

Returns:

Type Description
List[int]

Permutation indexes to go from input to output format.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
def get_permutation_indexes(input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat) -> List[int]:
    """Compute the permutations required to change the format layout order.

    :param input_format:    Input format to transform from
    :param output_format:   Output format to transform to
    :return: Permutation indexes to go from input to output format.
    """
    output_indexes = []
    for output_name, output_spec in output_format.layout.items():
        if output_name not in input_format.layout:
            raise KeyError(f"Requested item '{output_name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

        input_spec = input_format.layout[output_name]
        if input_spec.length != output_spec.length:
            raise RuntimeError(
                f"Length of the output must match in input and output format. "
                f"Input spec size is {input_spec.length} for key '{output_name}' and output spec size is {output_spec.length}."
            )
        indexes = input_format.indexes[output_name]
        output_indexes.extend(indexes)
    return output_indexes
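
An illustrative sketch of the permutation computed for two layouts that contain the same items in a different order; it assumes the bounding-box item spans 4 channels (XYXY) plus a 1-channel label.

from super_gradients.training.datasets.data_formats.formats import (
    ConcatenatedTensorFormat,
    BoundingBoxesTensorSliceItem,
    TensorSliceItem,
    get_permutation_indexes,
)
from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat

input_format = ConcatenatedTensorFormat(
    layout=(
        BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
        TensorSliceItem(name="label", length=1),
    )
)
output_format = ConcatenatedTensorFormat(
    layout=(
        TensorSliceItem(name="label", length=1),
        BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    )
)

# Channel 4 (label) moves to the front, followed by the four bbox channels.
print(get_permutation_indexes(input_format, output_format))  # [4, 0, 1, 2, 3]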

ConvertBoundingBoxes

Bases: nn.Module

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
class ConvertBoundingBoxes(nn.Module):
    to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]
    from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]

    def __init__(
        self,
        location: Tuple[int, int],
        to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        image_shape: Tuple[int, int],
    ):
        super().__init__()
        self.to_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], to_xyxy)
        self.from_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], from_xyxy)
        self.image_shape = image_shape
        self.location = location

    def forward(self, x: Tensor) -> Tensor:
        """

        :param x:
        :param image_shape:
        :return:
        """
        location = slice(self.location[0], self.location[1])
        bboxes = x[..., location]
        xyxy = self.to_xyxy(bboxes, self.image_shape)
        x[..., location] = self.from_xyxy(xyxy, self.image_shape)
        return x

forward(x)

Parameters:

Name Type Description Default
x Tensor required
image_shape required

Returns:

Type Description
Tensor
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
def forward(self, x: Tensor) -> Tensor:
    """

    :param x:
    :param image_shape:
    :return:
    """
    location = slice(self.location[0], self.location[1])
    bboxes = x[..., location]
    xyxy = self.to_xyxy(bboxes, self.image_shape)
    x[..., location] = self.from_xyxy(xyxy, self.image_shape)
    return x

DetectionOutputAdapter

Bases: nn.Module

Adapter class for converting a model's object-detection predictions to a desired format. This adapter supports torch.jit tracing, scripting and ONNX conversion.

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

class CustomDetectionHead(nn.Module):
    num_classes: int = 123

    @property
    def format(self):
        '''
        Describe the semantics of the model's output. In this example the model's output consists of
         - Bounding boxes in XYXY format [4]
         - Predicted probas of N classes [N]
         - A distance prediction [1]
         - K additional labels [K]
        '''
        return ConcatenatedTensorFormat(
            layout=(
                BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
                TensorSliceItem(name="label", length=1),
                TensorSliceItem(name="distance", length=1),
                TensorSliceItem(name="attributes", length=4),
            )
        )

yolox = YoloX(head=CustomDetectionHead)

Suppose we want to return predictions in another format. Let it be:

- Bounding boxes in normalized XYWH [4]
- Predicted attributes [4]
- Predicted label [1]

output_format = ConcatenatedTensorFormat(
    layout=(
        # Note: For the output format it is not required to specify the location attribute, as it will be
        # computed with respect to the size of the "source name"; the order of items in the layout describes their order in the output tensor
        BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()),
        TensorSliceItem(name="attributes", length=4),
        TensorSliceItem(name="label", length=1),
    )
)

Now we can construct the output adapter and attach it to the model:

output_adapter = DetectionOutputAdapter(
    input_format=yolox.head.format,
    output_format=output_format,
    image_shape=(640, 640)
)

yolox = nn.Sequential(yolox, output_adapter)

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
class DetectionOutputAdapter(nn.Module):
    """
    Adapter class for converting model's predictions for object detection to a desired format.
    This adapter supports torch.jit tracing & scripting & onnx conversion.

    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> class CustomDetectionHead(nn.Module):
    >>>    num_classes: int = 123
    >>>
    >>>    @property
    >>>    def format(self):
    >>>        '''
    >>>        Describe the semantics of the model's output. In this example model's output consists of
    >>>         - Bounding boxes in XYXY format [4]
    >>>         - Predicted probas of N classes [N]
    >>>         - A distance predictions [1]
    >>>         - K additional labels [K]
    >>>        '''
    >>>        return ConcatenatedTensorFormat(
    >>>            layout=(
    >>>                BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>                TensorSliceItem(name="label", length=1),
    >>>                TensorSliceItem(name="distance", length=1),
    >>>                TensorSliceItem(name="attributes", length=4),
    >>>            )
    >>>        )
    >>>
    >>> yolox = YoloX(head=CustomDetectionHead)
    >>>
    >>> # Suppose we want to return predictions in another format.
    >>> # Let it be:
    >>> # - Bounding boxes in normalized XYWH [4]
    >>> # - Predicted attributes [4]
    >>> # - Predicted label [1]
    >>> output_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         # Note: For output format it is not required to specify location attribute as it will be
    >>>         # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>         TensorSliceItem(name="label", length=1),
    >>>     )
    >>> )
    >>>
    >>> # Now we can construct output adapter and attach it to the model
    >>> output_adapter = DetectionOutputAdapter(
    >>>     input_format=yolox.head.format,
    >>>     output_format=output_format,
    >>>     image_shape=(640, 640)
    >>> )
    >>>
    >>> yolox = nn.Sequential(yolox, output_adapter)
    >>>
    """

    def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
        """

        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        super().__init__()

        self.format_conversion: nn.Module = self.get_format_conversion_module(
            location=input_format.locations[input_format.bboxes_format.name],
            input_bbox_format=input_format.bboxes_format.format,
            output_bbox_format=output_format.bboxes_format.format,
            image_shape=image_shape,
        )

        self.rearrange_outputs, rearranged_format = self.get_rearrange_outputs_module(input_format, output_format)

        self.input_format = input_format
        self.output_format = output_format
        self.input_length = input_format.num_channels

    def forward(self, predictions: Tensor) -> Tensor:
        """
        Convert output detections to the user-specified format
        :param predictions:
        :return:
        """
        if predictions.size(-1) != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({predictions.size(-1)}) must be "
                f"equal to {self.input_length} as defined by input format."
            )

        predictions = self.format_conversion(predictions.clone())
        predictions = self.rearrange_outputs(predictions)
        return predictions

    @classmethod
    def get_rearrange_outputs_module(
        cls, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat
    ) -> Tuple[RearrangeOutput, ConcatenatedTensorFormat]:

        output_indexes = []
        rearranged_layout = []

        offset = 0
        for output_name, output_spec in output_format.layout.items():
            if output_name not in input_format.layout:
                raise KeyError(f"Requested item '{output_name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

            input_spec = input_format.layout[output_name]

            if input_spec.length != output_spec.length:
                raise RuntimeError(
                    f"Length of the output must match in input and output format. "
                    f"Input spec size is {input_spec.length} for key '{output_name}' and output spec size is {output_spec.length}."
                )
            indexes = input_format.indexes[output_name]
            output_indexes.extend(indexes)
            output_len = len(indexes)

            rearranged_item = copy.deepcopy(output_spec)
            offset += output_len

            rearranged_layout.append(rearranged_item)
        rearranged_format = ConcatenatedTensorFormat(rearranged_layout)
        return RearrangeOutput(torch.tensor(output_indexes).long()), rearranged_format

    @classmethod
    def get_format_conversion_module(
        cls, location: Tuple[int, int], input_bbox_format: BoundingBoxFormat, output_bbox_format: BoundingBoxFormat, image_shape: Union[Tuple[int, int], None]
    ) -> ConvertBoundingBoxes:
        return ConvertBoundingBoxes(
            location=location,
            to_xyxy=input_bbox_format.get_to_xyxy(False),
            from_xyxy=output_bbox_format.get_from_xyxy(True),
            image_shape=image_shape,
        )
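
A minimal usage sketch, under the assumption that input_format and output_format are ConcatenatedTensorFormat instances sharing the same items (for example, the two formats built in the get_permutation_indexes example above). The import path mirrors the source-file location shown here.

import torch
from super_gradients.training.datasets.data_formats.output_adapters.detection_adapter import DetectionOutputAdapter

adapter = DetectionOutputAdapter(input_format=input_format, output_format=output_format, image_shape=(640, 640))

predictions = torch.rand(2, 100, input_format.num_channels)  # [batch, num_detections, channels]
converted = adapter(predictions)                             # bbox coordinates converted + channels reordered

# The adapter is designed to survive scripting / tracing / ONNX export:
scripted_adapter = torch.jit.script(adapter)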

__init__(input_format, output_format, image_shape)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
    """

    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    super().__init__()

    self.format_conversion: nn.Module = self.get_format_conversion_module(
        location=input_format.locations[input_format.bboxes_format.name],
        input_bbox_format=input_format.bboxes_format.format,
        output_bbox_format=output_format.bboxes_format.format,
        image_shape=image_shape,
    )

    self.rearrange_outputs, rearranged_format = self.get_rearrange_outputs_module(input_format, output_format)

    self.input_format = input_format
    self.output_format = output_format
    self.input_length = input_format.num_channels

forward(predictions)

Convert output detections to the user-specified format

Parameters:

Name Type Description Default
predictions Tensor required

Returns:

Type Description
Tensor
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
def forward(self, predictions: Tensor) -> Tensor:
    """
    Convert output detections to the user-specified format
    :param predictions:
    :return:
    """
    if predictions.size(-1) != self.input_length:
        raise RuntimeError(
            f"Number of channels in last dimension of input tensor ({predictions.size(-1)}) must be "
            f"equal to {self.input_length} as defined by input format."
        )

    predictions = self.format_conversion(predictions.clone())
    predictions = self.rearrange_outputs(predictions)
    return predictions

RearrangeOutput

Bases: nn.Module

Rearrange elements in the last dimension of the input tensor with respect to the index argument.

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
class RearrangeOutput(nn.Module):
    """
    Rearrange elements in last dimension of input tensor with respect to index argument

    """

    def __init__(self, indexes: Tensor):
        super().__init__()
        self.indexes = indexes

    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: Input tensor of  [..., N] shape
        :return: Output tensor of [..., N[index]] shape
        """
        if torch.jit.is_scripting():
            # Workaround "Ellipses followed by tensor indexing is currently not supported"
            # https://github.com/pytorch/pytorch/issues/34837
            x = torch.moveaxis(x, -1, 0)
            x = x[self.indexes]
            x = torch.moveaxis(x, 0, -1)
            return x
        else:
            return x[..., self.indexes]
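
A tiny sketch of what the module does to the last dimension; the import path is assumed from the source file shown above.

import torch
from super_gradients.training.datasets.data_formats.output_adapters.detection_adapter import RearrangeOutput

x = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)   # [batch, anchors, channels]
rearrange = RearrangeOutput(indexes=torch.tensor([3, 1, 2, 0], dtype=torch.long))

y = rearrange(x)
assert torch.equal(y[..., 0], x[..., 3]) and torch.equal(y[..., 3], x[..., 0])  # channels 0 and 3 swapped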

forward(x)

Parameters:

Name Type Description Default
x Tensor

Input tensor of [..., N] shape

required

Returns:

Type Description
Tensor

Output tensor of [..., N[index]] shape

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
def forward(self, x: Tensor) -> Tensor:
    """
    :param x: Input tensor of  [..., N] shape
    :return: Output tensor of [..., N[index]] shape
    """
    if torch.jit.is_scripting():
        # Workaround "Ellipses followed by tensor indexing is currently not supported"
        # https://github.com/pytorch/pytorch/issues/34837
        x = torch.moveaxis(x, -1, 0)
        x = x[self.indexes]
        x = torch.moveaxis(x, 0, -1)
        return x
    else:
        return x[..., self.indexes]

AbstractCollateFunction

Bases: ABC

A collate function (for torch DataLoader)

Source code in src/super_gradients/training/datasets/datasets_utils.py
class AbstractCollateFunction(ABC):
    """
    A collate function (for torch DataLoader)
    """

    @abstractmethod
    def __call__(self, batch):
        pass

AbstractPrePredictionCallback

Bases: ABC

Abstract class for a forward-pass preprocessing function, to be used by passing one of its inheritors through the training_params pre_prediction_callback keyword arg.

Should implement __call__ and return images, targets after applying the desired preprocessing.

Source code in src/super_gradients/training/datasets/datasets_utils.py
class AbstractPrePredictionCallback(ABC):
    """
    Abstract class for forward pass preprocessing function, to be used by passing its inheritors through training_params
     pre_prediction_callback keyword arg.

    Should implement __call__ and return images, targets after applying the desired preprocessing.
    """

    @abstractmethod
    def __call__(self, inputs, targets, batch_idx):
        pass
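
A minimal, hypothetical inheritor, just to show the expected signature; the actual preprocessing is up to the user. It would be passed to the trainer via the training_params pre_prediction_callback key.

from super_gradients.training.datasets.datasets_utils import AbstractPrePredictionCallback


class ScaleImagesPrePredictionCallback(AbstractPrePredictionCallback):
    """Hypothetical example: rescale uint8 images to [0, 1] before every forward pass."""

    def __call__(self, inputs, targets, batch_idx):
        return inputs / 255.0, targets


# training_params = {..., "pre_prediction_callback": ScaleImagesPrePredictionCallback(), ...}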

ComposedCollateFunction

Bases: AbstractCollateFunction

A function (for torch DataLoader) which executes a sequence of sub collate functions

Source code in src/super_gradients/training/datasets/datasets_utils.py
@register_collate_function()
class ComposedCollateFunction(AbstractCollateFunction):
    """
    A function (for torch DataLoader) which executes a sequence of sub collate functions
    """

    def __init__(self, functions: list):
        self.functions = functions

    def __call__(self, batch):
        for f in self.functions:
            batch = f(batch)
        return batch
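
A sketch of chaining collate steps; it assumes the dataset yields (image, label) pairs and uses torch's default_collate for the batching step. MultiScaleCollateFunction (documented further below) expects an already-batched (images, labels) tuple, hence the small tuple(...) adapter in between.

from torch.utils.data.dataloader import default_collate
from super_gradients.training.datasets.datasets_utils import ComposedCollateFunction, MultiScaleCollateFunction

collate_fn = ComposedCollateFunction(
    functions=[
        default_collate,                             # list of (image, label) samples -> batched tensors
        lambda batch: tuple(batch),                  # ensure the next step receives a (images, labels) tuple
        MultiScaleCollateFunction(target_size=416),  # rescale the whole batch to a randomly sampled size
    ]
)
# loader = torch.utils.data.DataLoader(my_dataset, batch_size=16, collate_fn=collate_fn)  # `my_dataset` is assumed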

DatasetStatisticsTensorboardLogger

Source code in src/super_gradients/training/datasets/datasets_utils.py
class DatasetStatisticsTensorboardLogger:

    logger = get_logger(__name__)
    DEFAULT_SUMMARY_PARAMS = {
        "sample_images": 32,  # by default, 32 images will be sampled from each dataset
        "plot_class_distribution": True,
        "plot_box_size_distribution": True,
        "plot_anchors_coverage": True,
        "max_batches": 30,
    }

    def __init__(self, sg_logger, summary_params: dict = DEFAULT_SUMMARY_PARAMS):
        self.sg_logger = sg_logger
        self.summary_params = {**DatasetStatisticsTensorboardLogger.DEFAULT_SUMMARY_PARAMS, **summary_params}

    def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
        """
        :param data_loader: the dataset data loader
        :param dataset_params: the dataset parameters
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param anchors: the list of anchors used by the model. applicable only for detection datasets
        :param all_classes: the list of all classes names
        """
        # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
        # if isinstance(data_loader.dataset, DetectionDataSet):
        #     self._analyze_detection(data_loader=data_loader, title=title,
        #                             all_classes=all_classes, anchors=anchors)
        # else:
        #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
        DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

    def _analyze_detection(self, data_loader, title, all_classes, anchors=None):
        """
        Analyze a detection dataset

        :param data_loader: the dataset data loader
        :param dataset_params: the dataset parameters
        :param all_classes: the list of all classes names
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param anchors: the list of anchors used by the model. if not provided, anchors coverage will not be analyzed
        """
        try:
            color_mean = AverageMeter()
            color_std = AverageMeter()
            all_labels = []
            image_size = 0
            for i, (images, labels) in enumerate(tqdm(data_loader)):

                if i >= self.summary_params["max_batches"] > 0:
                    break

                if i == 0:
                    image_size = max(images[0].shape[1], images[0].shape[2])
                    if images.shape[0] > self.summary_params["sample_images"]:
                        samples = images[: self.summary_params["sample_images"]]
                    else:
                        samples = images

                    pred = [torch.zeros(size=(0, 6)) for _ in range(len(samples))]
                    try:
                        result_images = DetectionVisualization.visualize_batch(
                            image_tensor=samples,
                            pred_boxes=pred,
                            target_boxes=copy.deepcopy(labels),
                            batch_name=title,
                            class_names=all_classes,
                            box_thickness=1,
                            gt_alpha=1.0,
                        )

                        self.sg_logger.add_images(tag=f"{title} sample images", images=np.stack(result_images).transpose([0, 3, 1, 2])[:, ::-1, :, :])
                    except Exception as e:
                        DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at adding an example batch:\n{e}")
                        return

                all_labels.append(labels)
                color_mean.update(torch.mean(images, dim=[0, 2, 3]), 1)
                color_std.update(torch.std(images, dim=[0, 2, 3]), 1)

            all_labels = torch.cat(all_labels, dim=0)[1:].numpy()

            try:
                if self.summary_params["plot_class_distribution"]:
                    self._analyze_class_distribution(labels=all_labels, num_classes=len(all_classes), title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing class distributions.\n{e}")
                return

            try:
                if self.summary_params["plot_box_size_distribution"]:
                    self._analyze_object_size_distribution(labels=all_labels, title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing object size " f"distributions.\n{e}")
                return

            summary = ""
            summary += f"dataset size: {len(data_loader)}  \n"
            summary += f"color mean: {color_mean.average}  \n"
            summary += f"color std: {color_std.average}  \n"

            try:
                if anchors is not None and image_size > 0:
                    coverage = self._analyze_anchors_coverage(anchors=anchors, image_size=image_size, title=title, labels=all_labels)
                    summary += f"anchors: {anchors}  \n"
                    summary += f"anchors coverage: {coverage}  \n"
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing anchors " f"coverage.\n{e}")
                return

            self.sg_logger.add_text(tag=f"{title} Statistics", text_string=summary)
            self.sg_logger.flush()

        except Exception as e:
            DatasetStatisticsTensorboardLogger.logger.error(f"dataset analysis failed!\n{e}")

    def _analyze_class_distribution(self, labels: list, num_classes: int, title: str):
        hist, edges = np.histogram(labels[:, 0], num_classes)

        f = plt.figure(figsize=[10, 8])

        plt.bar(range(num_classes), hist, width=0.5, color="#0504aa", alpha=0.7)
        plt.xlim(-1, num_classes)
        plt.grid(axis="y", alpha=0.75)
        plt.xlabel("Value", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.ylabel("Frequency", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.xticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.yticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.title(f"{title} class distribution", fontsize=STAT_LOGGER_FONT_SIZE)

        self.sg_logger.add_figure(f"{title} class distribution", figure=f)
        text_dist = ""
        for i, val in enumerate(hist):
            text_dist += f"[{i}]: {val}, "

        self.sg_logger.add_text(tag=f"{title} class distribution", text_string=text_dist)

    def _analyze_object_size_distribution(self, labels: list, title: str):
        """
        This function will add two plots to the tensorboard.
        one is a 2D histogram and the other is a scatter plot. in both cases the X axis is the object width and Y axis
        is the object width (both normalized by image size)
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        """

        # histogram plot
        hist, xedges, yedges = np.histogram2d(labels[:, 4], labels[:, 3], 50)  # x and y are deliberately switched

        fig = plt.figure(figsize=(10, 6))
        fig.suptitle(f"{title} boxes w/h distribution")
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.imshow(np.log(hist + 1), interpolation="nearest", origin="lower", extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

        # scatter plot
        if len(labels) > 10000:
            # we randomly sample just 10000 objects so that the scatter plot will not get too dense
            labels = labels[np.random.randint(0, len(labels) - 1, 10000)]
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        plt.scatter(labels[:, 3], labels[:, 4], marker=".")

        self.sg_logger.add_figure(tag=f"{title} boxes w/h distribution", figure=fig)

    @staticmethod
    def _get_rect(w, h):
        min_w = w / 4.0
        min_h = h / 4.0
        return Rectangle((min_w, min_h), w * 4 - min_w, h * 4 - min_h, linewidth=1, edgecolor="b", facecolor="none")

    @staticmethod
    def _get_score(anchors: np.ndarray, points: np.ndarray, image_size: int):
        """
        Calculate the ratio (and 1/ratio) between each anchor width and height and each point (representing a possible
        object width and height).
        i.e. for an anchor with w=10,h=20 the point w=11,h=25 will have the ratios 11/10=1.1 and 25/20=1.25
        or 10/11=0.91 and 20/25=0.8 respectively

        :param anchors: array of anchors of the shape [2,N]
        :param points: array of points of the shape [2,M]
        :param image_size the size of the input image

        :returns: an array of size [image_size - 1, image_size - 1] where each cell i,j represent the minimum ratio
        for that cell (point) from all anchors
        """

        ratio = (
            anchors[:, :, None]
            / points[
                :,
            ]
        )
        inv_ratio = 1 / ratio
        min_ratio = 1 - np.minimum(ratio, inv_ratio)
        min_ratio = np.max(min_ratio, axis=1)
        to_closest_anchor = np.min(min_ratio, axis=0)
        to_closest_anchor[to_closest_anchor > 0.75] = 2
        return to_closest_anchor.reshape(image_size - 1, -1)

    def _analyze_anchors_coverage(self, anchors: Anchors, image_size: int, labels: list, title: str):
        """
        This function will add anchors coverage plots to the tensorboard.
        :param anchors: a list of anchors
        :param image_size: the input image size for this training
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        """

        fig = plt.figure(figsize=(12, 5))
        fig.suptitle(f"{title} anchors coverage")

        # box style plot
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_xlim([0, image_size])
        ax.set_ylim([0, image_size])

        anchors_boxes = anchors.anchors.cpu().numpy()
        anchors_len = anchors.num_anchors

        anchors_boxes = anchors_boxes.reshape(-1, 2)

        for i in range(anchors_len):
            rect = self._get_rect(anchors_boxes[i][0], anchors_boxes[i][1])
            rect.set_alpha(0.3)
            rect.set_facecolor([random.random(), random.random(), random.random(), 0.3])
            ax.add_patch(rect)

        # distance from anchor plot
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        x = np.arange(1, image_size, 1)
        y = np.arange(1, image_size, 1)

        xx, yy = np.meshgrid(x, y, sparse=False, indexing="xy")
        points = np.concatenate([xx.reshape(1, -1), yy.reshape(1, -1)])

        color = self._get_score(anchors_boxes, points, image_size)

        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.imshow(color, interpolation="nearest", origin="lower", extent=[0, image_size, 0, image_size])

        # calculate the coverage for the dataset labels
        cover_masks = []
        for i in range(anchors_len):
            w_max = (anchors_boxes[i][0] / image_size) * 4
            w_min = (anchors_boxes[i][0] / image_size) * 0.25
            h_max = (anchors_boxes[i][1] / image_size) * 4
            h_min = (anchors_boxes[i][1] / image_size) * 0.25
            cover_masks.append(
                np.logical_and(np.logical_and(np.logical_and(labels[:, 3] < w_max, labels[:, 3] > w_min), labels[:, 4] < h_max), labels[:, 4] > h_min)
            )
        cover_masks = np.stack(cover_masks)
        coverage = np.count_nonzero(np.any(cover_masks, axis=0)) / len(labels)

        self.sg_logger.add_figure(tag=f"{title} anchors coverage", figure=fig)
        return coverage
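
A usage sketch; sg_logger, train_loader and class_names are assumed to already exist (e.g. the trainer's logger, a detection dataloader and its class-name list). Note that, per the source above, analyze() currently only emits a warning until the detection-specific analysis is re-enabled.

from super_gradients.training.datasets.datasets_utils import DatasetStatisticsTensorboardLogger

stats_logger = DatasetStatisticsTensorboardLogger(sg_logger, summary_params={"max_batches": 10})
stats_logger.analyze(train_loader, title="COCO 2017 train", all_classes=class_names, anchors=None)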

analyze(data_loader, title, all_classes, anchors=None)

Parameters:

Name Type Description Default
data_loader torch.utils.data.DataLoader

the dataset data loader

required
dataset_params

the dataset parameters

required
title str

the title for this dataset (i.e. Coco 2017 test set)

required
anchors list

the list of anchors used by the model. applicable only for detection datasets

None
all_classes List[str]

the list of all classes names

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
    """
    :param data_loader: the dataset data loader
    :param dataset_params: the dataset parameters
    :param title: the title for this dataset (i.e. Coco 2017 test set)
    :param anchors: the list of anchors used by the model. applicable only for detection datasets
    :param all_classes: the list of all classes names
    """
    # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
    # if isinstance(data_loader.dataset, DetectionDataSet):
    #     self._analyze_detection(data_loader=data_loader, title=title,
    #                             all_classes=all_classes, anchors=anchors)
    # else:
    #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
    DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

DetectionMultiscalePrePredictionCallback

Bases: MultiscalePrePredictionCallback

Multiscale pre-prediction callback for object detection.

When passed through train_params, images and targets will be transformed by the below logic to support multi-scaling on the fly.

After each self.frequency forward passes, the input size is changed randomly, picked from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ..., input_size + self.multiscale_range * self.image_size_steps), and the same rescaling is applied to the box coordinates.

Parameters:

Name Type Description Default
multiscale_range

Range of values for resize sizes as discussed above (default=5)

required
image_size_steps

Image step sizes as discussed above (default=32)

required
change_frequency

The frequency to apply change in input size.

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
@register_callback(Callbacks.DETECTION_MULTISCALE_PREPREDICTION)
class DetectionMultiscalePrePredictionCallback(MultiscalePrePredictionCallback):
    """
    Mutiscalepre-prediction callback for object detection.

    When passed through train_params images, targets will be applied by the below transform to support multi scaling
    on the fly.

    After each self.frequency forward passes, change size randomly from
     (input_size-self.multiscale_range*self.image_size_steps, input_size-(self.multiscale_range-1)*self.image_size_steps,
     ...input_size+self.multiscale_range*self.image_size_steps) and apply the same rescaling to the box coordinates.


    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed abov (default=32)
    :param change_frequency: The frequency to apply change in input size.

    """

    def __call__(self, inputs, targets, batch_idx):
        # RESCALE THE IMAGE FIRST WITH SUPER(), AND IF RESCALING HAS ACTUALLY BEEN DONE APPLY TO BOXES AS WELL
        input_size = inputs.shape[2:]
        inputs, targets = super(DetectionMultiscalePrePredictionCallback, self).__call__(inputs, targets, batch_idx)
        new_input_size = inputs.shape[2:]
        scale_y = new_input_size[0] / input_size[0]
        scale_x = new_input_size[1] / input_size[1]
        if scale_x != 1 or scale_y != 1:
            targets[..., 2::2] = targets[..., 2::2] * scale_x
            targets[..., 3::2] = targets[..., 3::2] * scale_y
        return inputs, targets
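
A sketch of enabling multi-scale training; trainer, model and the dataloaders are assumed to already exist, and only the relevant training_params key is shown.

from super_gradients.training.datasets.datasets_utils import DetectionMultiscalePrePredictionCallback

multiscale_callback = DetectionMultiscalePrePredictionCallback(multiscale_range=5, image_size_steps=32, change_frequency=10)

training_params = {
    # ... the remaining training hyper-parameters ...
    "pre_prediction_callback": multiscale_callback,
}
# trainer.train(model=model, training_params=training_params, train_loader=train_loader, valid_loader=valid_loader)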

MultiScaleCollateFunction

Bases: AbstractCollateFunction

a collate function to implement multi-scale data augmentation according to https://arxiv.org/pdf/1612.08242.pdf

Source code in src/super_gradients/training/datasets/datasets_utils.py
@register_collate_function()
class MultiScaleCollateFunction(AbstractCollateFunction):
    """
    a collate function to implement multi-scale data augmentation
    according to https://arxiv.org/pdf/1612.08242.pdf
    """

    _counter = AtomicInteger(0)
    _current_size = AtomicInteger(0)
    _lock = Lock()

    def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
        """
        set parameters for the multi-scale collate function
        the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps
        a new size will be randomly selected every change_frequency calls to the collate_fn()
            :param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
            :param min_image_size: the minimum size to scale down to (in pixels)
            :param max_image_size: the maximum size to scale up to (in pixels)
            :param image_size_steps: typically, the stride of the net, which defines the possible image
                    size multiplications
            :param change_frequency:
        """
        assert target_size is not None or (
            max_image_size is not None and min_image_size is not None
        ), "either target_size or min_image_size and max_image_size has to be set"
        assert target_size is None or max_image_size is None, "target_size and max_image_size cannot be both defined"

        if target_size is not None:
            min_image_size = int(0.66 * target_size - ((0.66 * target_size) % image_size_steps) + image_size_steps)
            max_image_size = int(1.5 * target_size - ((1.5 * target_size) % image_size_steps))

        print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

        self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        self._current_size = random.choice(self.sizes)

    def __call__(self, batch):

        with self._lock:

            # Important: this implementation was tailored for a specific input. it assumes the batch is a tuple where
            # the images are the first item
            assert isinstance(batch, tuple), "this collate function expects the input to be a tuple (images, labels)"
            images = batch[0]
            if self._counter % self.frequency == 0:
                self._current_size = random.choice(self.sizes)
            self._counter += 1

            assert images.shape[2] % self.image_size_steps == 0 and images.shape[3] % self.image_size_steps == 0, (
                "images sized not divisible by %d. (resize images before calling multi_scale)" % self.image_size_steps
            )

            if self._current_size != max(images.shape[2:]):
                ratio = float(self._current_size) / max(images.shape[2:])
                new_size = (int(round(images.shape[2] * ratio)), int(round(images.shape[3] * ratio)))
                images = F.interpolate(images, size=new_size, mode="bilinear", align_corners=False)

            return images, batch[1]
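
A direct-call sketch showing the expected input contract: the function operates on an already-batched (images, labels) tuple whose image tensor is [B, C, H, W] with H and W divisible by image_size_steps. The sizes below are illustrative.

import torch
from super_gradients.training.datasets.datasets_utils import MultiScaleCollateFunction

collate = MultiScaleCollateFunction(min_image_size=320, max_image_size=608, image_size_steps=32, change_frequency=10)

images = torch.rand(4, 3, 416, 416)         # H and W must be divisible by image_size_steps
labels = torch.zeros(4, 6)                  # placeholder targets
images, labels = collate((images, labels))  # images are resized to the currently sampled scale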

__init__(target_size=None, min_image_size=None, max_image_size=None, image_size_steps=32, change_frequency=10)

Set parameters for the multi-scale collate function. The possible image sizes are in the range [min_image_size, max_image_size] in steps of image_size_steps; a new size will be randomly selected every change_frequency calls to the collate_fn().

:param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
:param min_image_size: the minimum size to scale down to (in pixels)
:param max_image_size: the maximum size to scale up to (in pixels)
:param image_size_steps: typically, the stride of the net, which defines the possible image size multiplications
:param change_frequency:

Source code in src/super_gradients/training/datasets/datasets_utils.py
def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
    """
    set parameters for the multi-scale collate function
    the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps
    a new size will be randomly selected every change_frequency calls to the collate_fn()
        :param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
        :param min_image_size: the minimum size to scale down to (in pixels)
        :param max_image_size: the maximum size to scale up to (in pixels)
        :param image_size_steps: typically, the stride of the net, which defines the possible image
                size multiplications
        :param change_frequency:
    """
    assert target_size is not None or (
        max_image_size is not None and min_image_size is not None
    ), "either target_size or min_image_size and max_image_size has to be set"
    assert target_size is None or max_image_size is None, "target_size and max_image_size cannot be both defined"

    if target_size is not None:
        min_image_size = int(0.66 * target_size - ((0.66 * target_size) % image_size_steps) + image_size_steps)
        max_image_size = int(1.5 * target_size - ((1.5 * target_size) % image_size_steps))

    print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

    self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
    self.image_size_steps = image_size_steps
    self.frequency = change_frequency
    self._current_size = random.choice(self.sizes)

MultiscalePrePredictionCallback

Bases: AbstractPrePredictionCallback

Multiscale pre-prediction callback function.

When passed through train_params, images and targets will be transformed by the below logic to support multi-scaling on the fly.

After each self.frequency forward passes, the input size is changed randomly, picked from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ..., input_size + self.multiscale_range * self.image_size_steps).

Parameters:

Name Type Description Default
multiscale_range int

Range of values for resize sizes as discussed above (default=5)

5
image_size_steps int

Image step sizes as discussed above (default=32)

32
change_frequency int

The frequency to apply change in input size.

10
Source code in src/super_gradients/training/datasets/datasets_utils.py
class MultiscalePrePredictionCallback(AbstractPrePredictionCallback):
    """
    Mutiscale pre-prediction callback pass function.

    When passed through train_params images, targets will be applied by the below transform to support multi scaling
    on the fly.

    After each self.frequency forward passes, change size randomly from
     (input_size-self.multiscale_range*self.image_size_steps, input_size-(self.multiscale_range-1)*self.image_size_steps,
     ...input_size+self.multiscale_range*self.image_size_steps)


    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed abov (default=32)
    :param change_frequency: The frequency to apply change in input size.
    """

    def __init__(self, multiscale_range: int = 5, image_size_steps: int = 32, change_frequency: int = 10):

        self.multiscale_range = multiscale_range
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        self.rank = None
        self.is_distributed = None
        self.sampled_imres_once = False
        self.new_input_size = None

    def __call__(self, inputs, targets, batch_idx):
        if self.rank is None:
            self.rank = get_local_rank()
        if self.is_distributed is None:
            self.is_distributed = get_world_size() > 1

        # GENERATE A NEW SIZE AND BROADCAST IT TO THE THE OTHER RANKS SO THEY HAVE THE SAME SCALE
        input_size = inputs.shape[2:]
        if batch_idx % self.frequency == 0:
            tensor = torch.LongTensor(2).to(inputs.device)

            if self.rank == 0:
                size_factor = input_size[1] * 1.0 / input_size[0]
                min_size = int(input_size[0] / self.image_size_steps) - self.multiscale_range
                max_size = int(input_size[0] / self.image_size_steps) + self.multiscale_range
                random_size = (min_size, max_size)
                if self.sampled_imres_once:
                    size = random.randint(*random_size)
                else:
                    # sample the biggest resolution first to make sure the run fits into the GPU memory
                    size = max_size
                    self.sampled_imres_once = True
                size = (int(self.image_size_steps * size), self.image_size_steps * int(size * size_factor))
                tensor[0] = size[0]
                tensor[1] = size[1]

            if self.is_distributed:
                dist.barrier()
                dist.broadcast(tensor, 0)

            self.new_input_size = (tensor[0].item(), tensor[1].item())

        scale_y = self.new_input_size[0] / input_size[0]
        scale_x = self.new_input_size[1] / input_size[1]
        if scale_x != 1 or scale_y != 1:
            inputs = torch.nn.functional.interpolate(inputs, size=self.new_input_size, mode="bilinear", align_corners=False)
        return inputs, targets

RandomResizedCropAndInterpolation

Bases: RandomResizedCrop

Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

A crop of random size (default: 0.08 to 1.0 of the original size) and of a random aspect ratio (default: 3/4 to 4/3 of the original aspect ratio) is made. This crop is finally resized to the given size. This is popularly used to train the Inception networks.

Parameters:

Name Type Description Default
size

Expected output size of each edge

required
scale

Range of size of the origin size cropped

(0.08, 1.0)
ratio

Range of aspect ratio of the origin aspect ratio cropped

(3.0 / 4.0, 4.0 / 3.0)
interpolation

Default: PIL.Image.BILINEAR

'default'
Source code in src/super_gradients/training/datasets/datasets_utils.py
@register_transform(Transforms.RandomResizedCropAndInterpolation)
class RandomResizedCropAndInterpolation(RandomResizedCrop):
    """
    Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
    is finally resized to given size.
    This is popularly used to train the Inception networks.

    :param size: Expected output size of each edge
    :param scale: Range of size of the origin size cropped
    :param ratio: Range of aspect ratio of the origin aspect ratio cropped
    :param interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation="default"):
        super(RandomResizedCropAndInterpolation, self).__init__(size=size, scale=scale, ratio=ratio, interpolation=interpolation)
        if interpolation == "random":
            self.interpolation = _RANDOM_INTERPOLATION
        elif interpolation == "default":
            self.interpolation = InterpolationMode.BILINEAR
        else:
            self.interpolation = _pil_interp(interpolation)

    def forward(self, img: Image) -> Image:
        """
        :param img: Image to be cropped and resized.
        :return: Image: Randomly cropped and resized image.
        """
        i, j, h, w = self.get_params(img, self.scale, self.ratio)
        if isinstance(self.interpolation, (tuple, list)):
            interpolation = random.choice(self.interpolation)
        else:
            interpolation = self.interpolation
        return torchvision.transforms.functional.resized_crop(img, i, j, h, w, self.size, interpolation)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            interpolate_str = " ".join([_pil_interpolation_to_str[x] for x in self.interpolation])
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        format_string = self.__class__.__name__ + "(size={0}".format(self.size)
        format_string += ", scale={0}".format(tuple(round(s, 4) for s in self.scale))
        format_string += ", ratio={0}".format(tuple(round(r, 4) for r in self.ratio))
        format_string += ", interpolation={0})".format(interpolate_str)
        return format_string
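
A usage sketch in a standard torchvision pipeline; interpolation="random" is what distinguishes this class from the stock RandomResizedCrop, and the image path in the comment is hypothetical.

from torchvision import transforms
from super_gradients.training.datasets.datasets_utils import RandomResizedCropAndInterpolation

train_transform = transforms.Compose(
    [
        RandomResizedCropAndInterpolation(size=224, scale=(0.08, 1.0), interpolation="random"),
        transforms.ToTensor(),
    ]
)
# from PIL import Image
# tensor = train_transform(Image.open("some_image.jpg"))  # hypothetical image path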

forward(img)

Parameters:

Name Type Description Default
img Image

Image to be cropped and resized.

required

Returns:

Type Description
Image

Image: Randomly cropped and resized image.

Source code in src/super_gradients/training/datasets/datasets_utils.py
def forward(self, img: Image) -> Image:
    """
    :param img: Image to be cropped and resized.
    :return: Image: Randomly cropped and resized image.
    """
    i, j, h, w = self.get_params(img, self.scale, self.ratio)
    if isinstance(self.interpolation, (tuple, list)):
        interpolation = random.choice(self.interpolation)
    else:
        interpolation = self.interpolation
    return torchvision.transforms.functional.resized_crop(img, i, j, h, w, self.size, interpolation)

get_color_augmentation(rand_augment_config_string, color_jitter, crop_size=224, img_mean=[0.485, 0.456, 0.406])

Returns a color augmentation transform. Since these augmentations cannot be applied on top of one another, only one is returned, according to rand_augment_config_string.

Parameters:

Name Type Description Default
rand_augment_config_string str

string which defines the auto augment configuration. If None, color jitter will be returned. For possible values see auto_augment.py

required
color_jitter tuple

tuple for color jitter value.

required
crop_size

relevant only for auto augment

224
img_mean

relevant only for auto augment

[0.485, 0.456, 0.406]

Returns:

Type Description

RandAugment transform or ColorJitter

Source code in src/super_gradients/training/datasets/datasets_utils.py
def get_color_augmentation(rand_augment_config_string: str, color_jitter: tuple, crop_size=224, img_mean=[0.485, 0.456, 0.406]):
    """
    Returns color augmentation class. As these augmentation cannot work on top one another, only one is returned
    according to rand_augment_config_string

    :param rand_augment_config_string: string which defines the auto augment configurations.
                                       If None, color jitter will be returned. For possible values see auto_augment.py
    :param color_jitter: tuple for color jitter value.
    :param crop_size: relevant only for auto augment
    :param img_mean: relevant only for auto augment
    :return: RandAugment transform or ColorJitter
    """
    if rand_augment_config_string:
        color_augmentation = rand_augment_transform(rand_augment_config_string, crop_size, img_mean)

    else:  # RandAugment includes colorjitter like augmentations, both cannot be applied together.
        color_augmentation = transforms.ColorJitter(*color_jitter)
    return color_augmentation
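
A short usage sketch (illustrative, not from the library docs). The rand-augment config string is just an example of the format accepted by rand_augment_transform, and the import path follows the source location above.

from super_gradients.training.datasets.datasets_utils import get_color_augmentation

# With a rand-augment config string -> returns a RandAugment transform.
rand_aug = get_color_augmentation("rand-m9-mstd0.5", color_jitter=None, crop_size=224, img_mean=[0.485, 0.456, 0.406])

# Without a config string -> falls back to torchvision ColorJitter (the two are mutually exclusive).
jitter = get_color_augmentation(None, color_jitter=(0.4, 0.4, 0.4))

Either result is then applied to PIL images like any other torchvision transform.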

get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224)

A function for getting the mean and std of large datasets using a PyTorch DataLoader and GPU functionality.

Parameters:

Name Type Description Default
data_dir

String, path to a non-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"

None
dataloader

a torch DataLoader, as it would feed the data into the trainer (including transforms etc).

None
RandomResizeSize

Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet, this value should be 224).

224

Returns:

Type Description

2 lists, mean and std, each of length 3 (one per channel)

Source code in src/super_gradients/training/datasets/datasets_utils.py
def get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224):
    """
    A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

    :param data_dir: String, path to a non-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"
    :param dataloader: a torch DataLoader, as it would feed the data into the trainer (including transforms etc).
    :param RandomResizeSize: Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet,
    this value should be 224).
    :return: 2 lists, mean and std, each of length 3 (one per channel)
    """
    assert data_dir is None or dataloader is None, "Please provide either path to data folder or DataLoader, not both."

    if dataloader is None:
        traindir = os.path.join(os.path.abspath(data_dir), "train")
        trainset = ImageFolder(
            traindir, transforms.Compose([transforms.RandomResizedCrop(RandomResizeSize), transforms.RandomHorizontalFlip(), transforms.ToTensor()])
        )
        dataloader = torch.utils.data.DataLoader(trainset, batch_size=1, num_workers=num_workers)

    print(f"Calculating on {len(dataloader.dataset.targets)} Training Samples")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    h, w = 0, 0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        if batch_idx == 0:
            h, w = inputs.size(2), inputs.size(3)
            print(f"Min: {inputs.min()}, Max: {inputs.max()}")
            chsum = inputs.sum(dim=(0, 2, 3), keepdim=True)
        else:
            chsum += inputs.sum(dim=(0, 2, 3), keepdim=True)
    mean = chsum / len(dataloader.dataset) / h / w  # use the dataloader's dataset so this also works when a DataLoader is passed directly
    print(f"mean: {mean.view(-1)}")

    chsum = None
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        if batch_idx == 0:
            chsum = (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
        else:
            chsum += (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
    std = torch.sqrt(chsum / (len(dataloader.dataset) * h * w - 1))
    print(f"std: {std.view(-1)}")
    return mean.view(-1).cpu().numpy().tolist(), std.view(-1).cpu().numpy().tolist()
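
For illustration, a minimal call sketch (paths are placeholders): the folder is expected to contain a train subdirectory readable by ImageFolder, and a DataLoader may be passed instead of a path, but not both.

from super_gradients.training.datasets.datasets_utils import get_mean_and_std_torch

mean, std = get_mean_and_std_torch(data_dir="/data/Imagenette", num_workers=4, RandomResizeSize=224)
print(mean, std)  # two lists of three floats, one value per RGB channel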

worker_init_reset_seed(worker_id)

Make sure each process has different random seed, especially for 'fork' method. Check https://github.com/pytorch/pytorch/issues/63311 for more details.

Parameters:

Name Type Description Default
worker_id

placeholder (needs to be passed to DataLoader init).

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
def worker_init_reset_seed(worker_id):
    """
    Make sure each process has different random seed, especially for 'fork' method.
    Check https://github.com/pytorch/pytorch/issues/63311 for more details.

    :param worker_id: placeholder (needs to be passed to DataLoader init).
    """
    seed = uuid.uuid4().int % 2**32
    random.seed(seed)
    torch.set_rng_state(torch.manual_seed(seed).get_state())
    np.random.seed(seed)
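
Typical usage is to pass the function as the worker_init_fn of a DataLoader. A minimal sketch with a dummy dataset:

import torch
from torch.utils.data import DataLoader, TensorDataset

from super_gradients.training.datasets.datasets_utils import worker_init_reset_seed

dataset = TensorDataset(torch.randn(128, 3, 32, 32), torch.randint(0, 10, (128,)))
loader = DataLoader(dataset, batch_size=16, num_workers=4, worker_init_fn=worker_init_reset_seed)

for images, labels in loader:
    pass  # each worker process now starts from a different random seed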

AbstractDepthEstimationDataset

Bases: Dataset

Abstract class for datasets for depth estimation task.

Attempting to follow principles provided in pose_estimation_dataset.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
class AbstractDepthEstimationDataset(Dataset):
    """
    Abstract class for datasets for depth estimation task.

    Attempting to follow principles provided in pose_estimation_dataset.
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(self, transforms: List[AbstractDepthEstimationTransform] = None):
        super().__init__()
        self.transforms = transforms or []

    @abc.abstractmethod
    def load_sample(self, index: int) -> DepthEstimationSample:
        """
        Load a depth estimation sample from the dataset.

        :param index: Index of the sample to load.
        :return: Instance of DepthEstimationSample.

        If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas,
        ensure that the same value is used as the `ignore_val` argument in your metric and loss functions.
        Fill the entries in the depth map that are supposed to be ignored with the `ignore_val` after loading the sample.
        """
        raise NotImplementedError()

    def load_random_sample(self) -> DepthEstimationSample:
        """
        Return a random sample from the dataset

        :return: Instance of DepthEstimationSample
        """
        num_samples = len(self)
        random_index = random.randrange(0, num_samples)
        return self.load_sample(random_index)

    def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get a transformed depth estimation sample from the dataset.

        :param index: Index of the sample to retrieve.
        :return: Tuple containing the transformed image and depth map as np.ndarrays.

        After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be
        a 2D array (e.g., Height x Width).

        Before returning the image and depth map, the image's channels are moved to CHW format and an additional
        dummy dimension is added to the depth map, resulting in a 1HW shape.
        """
        sample = self.load_sample(index)
        for transform in self.transforms:
            sample = transform(sample)
        return np.transpose(sample.image, (2, 0, 1)).astype(np.float32), np.expand_dims(sample.depth_map, axis=0).astype(np.float32)

    def plot(
        self,
        max_samples_per_plot: int = 8,
        n_plots: int = 1,
        plot_transformed_data: bool = True,
        color_scheme: Optional[int] = None,
        drop_extreme_percentage: float = 0,
        inverse: bool = False,
    ):
        """
        Combine samples of images with depth maps into plots and display the result.

        :param max_samples_per_plot:    Maximum number of samples (image with depth map) to be displayed per plot.
        :param n_plots:                 Number of plots to display.
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e., on __getitem__).
                                        If False, the plot will be over the raw samples (i.e., on load_sample).
        :param color_scheme:            OpenCV color scheme for the depth map visualization. If not specified:
                                        - If `inverse=True`, the default is COLORMAP_VIRIDIS.
                                        - If `inverse=False`, the default is COLORMAP_MAGMA.


        :param drop_extreme_percentage: Percentage of extreme values to drop on both ends of the depth spectrum.
        :param inverse:                 Apply inversion (1 / depth) if True to the depth map.

        :return: None
        """
        plot_counter = 0

        for plot_i in range(n_plots):
            fig, axes = plt.subplots(2, max_samples_per_plot, figsize=(15, 5))
            for img_i in range(max_samples_per_plot):
                index = img_i + plot_i * max_samples_per_plot
                if plot_transformed_data:
                    image, depth_map = self[index]

                    # Transpose to HWC format for visualization
                    image = image.transpose(1, 2, 0)
                    depth_map = depth_map.squeeze()  # Remove dummy dimension
                else:
                    sample = self.load_sample(index)
                    image, depth_map = sample.image, sample.depth_map

                # Plot the image
                axes[0, img_i].imshow(image)
                axes[0, img_i].axis("off")
                axes[0, img_i].set_title(f"Sample {index}")

                # Plot the depth map side by side with the selected color scheme
                depth_map = DepthVisualization.process_depth_map_for_visualization(depth_map, color_scheme, drop_extreme_percentage, inverse)
                axes[1, img_i].imshow(depth_map)
                axes[1, img_i].axis("off")
                axes[1, img_i].set_title(f"Depth Map {index}")

            plt.show()
            plt.close()

            plot_counter += 1
            if plot_counter == n_plots:
                return
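
Below is a minimal subclass sketch (illustrative only). It assumes a hypothetical folder layout with paired images/ and depth/ files, and it assumes DepthEstimationSample is importable from super_gradients.training.samples; the exact import paths may differ between versions.

import os

import cv2
import numpy as np

from super_gradients.training.datasets.depth_estimation_datasets.abstract_depth_estimation_dataset import AbstractDepthEstimationDataset
from super_gradients.training.samples import DepthEstimationSample  # assumed import path


class FolderDepthEstimationDataset(AbstractDepthEstimationDataset):
    """Hypothetical layout: <root>/images/<name>.png paired with <root>/depth/<name>.png."""

    def __init__(self, root: str, transforms=None):
        super().__init__(transforms=transforms)
        self.root = root
        self.names = sorted(os.listdir(os.path.join(root, "images")))

    def __len__(self) -> int:
        return len(self.names)

    def load_sample(self, index: int) -> DepthEstimationSample:
        # Load image as HWC BGR and depth as a 2D float32 map, as expected by the base class.
        image = cv2.imread(os.path.join(self.root, "images", self.names[index]), cv2.IMREAD_COLOR)
        depth_map = cv2.imread(os.path.join(self.root, "depth", self.names[index]), cv2.IMREAD_GRAYSCALE).astype(np.float32)
        return DepthEstimationSample(image=image, depth_map=depth_map)

The base class then handles the transform pipeline and returns (CHW image, 1HW depth map) from __getitem__.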

__getitem__(index)

Get a transformed depth estimation sample from the dataset.

Parameters:

Name Type Description Default
index int

Index of the sample to retrieve.

required

Returns:

Type Description
Tuple[np.ndarray, np.ndarray]

Tuple containing the transformed image and depth map as np.ndarrays. After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be a 2D array (e.g., Height x Width). Before returning the image and depth map, the image's channels are moved to CHW format and an additional dummy dimension is added to the depth map, resulting in a 1HW shape.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Get a transformed depth estimation sample from the dataset.

    :param index: Index of the sample to retrieve.
    :return: Tuple containing the transformed image and depth map as np.ndarrays.

    After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be
    a 2D array (e.g., Height x Width).

    Before returning the image and depth map, the image's channels are moved to CHW format and an additional
    dummy dimension is added to the depth map, resulting in a 1HW shape.
    """
    sample = self.load_sample(index)
    for transform in self.transforms:
        sample = transform(sample)
    return np.transpose(sample.image, (2, 0, 1)).astype(np.float32), np.expand_dims(sample.depth_map, axis=0).astype(np.float32)

load_random_sample()

Return a random sample from the dataset

Returns:

Type Description
DepthEstimationSample

Instance of DepthEstimationSample

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
def load_random_sample(self) -> DepthEstimationSample:
    """
    Return a random sample from the dataset

    :return: Instance of DepthEstimationSample
    """
    num_samples = len(self)
    random_index = random.randrange(0, num_samples)
    return self.load_sample(random_index)

load_sample(index) abstractmethod

Load a depth estimation sample from the dataset.

Parameters:

Name Type Description Default
index int

Index of the sample to load.

required

Returns:

Type Description
DepthEstimationSample

Instance of DepthEstimationSample. If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas, ensure that the same value is used as the ignore_val argument in your metric and loss functions. Fill the entries in the depth map that are supposed to be ignored with the ignore_val after loading the sample.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
@abc.abstractmethod
def load_sample(self, index: int) -> DepthEstimationSample:
    """
    Load a depth estimation sample from the dataset.

    :param index: Index of the sample to load.
    :return: Instance of DepthEstimationSample.

    If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas,
    ensure that the same value is used as the `ignore_val` argument in your metric and loss functions.
    Fill the entries in the depth map that are supposed to be ignored with the `ignore_val` after loading the sample.
    """
    raise NotImplementedError()

plot(max_samples_per_plot=8, n_plots=1, plot_transformed_data=True, color_scheme=None, drop_extreme_percentage=0, inverse=False)

Combine samples of images with depth maps into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of samples (image with depth map) to be displayed per plot.

8
n_plots int

Number of plots to display.

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e., on __getitem__). If False, the plot will be over the raw samples (i.e., on load_sample).

True
color_scheme Optional[int]

OpenCV color scheme for the depth map visualization. If not specified: - If inverse=True, the default is COLORMAP_VIRIDIS. - If inverse=False, the default is COLORMAP_MAGMA.

None
drop_extreme_percentage float

Percentage of extreme values to drop on both ends of the depth spectrum.

0
inverse bool

Apply inversion (1 / depth) if True to the depth map.

False

Returns:

Type Description

None

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
def plot(
    self,
    max_samples_per_plot: int = 8,
    n_plots: int = 1,
    plot_transformed_data: bool = True,
    color_scheme: Optional[int] = None,
    drop_extreme_percentage: float = 0,
    inverse: bool = False,
):
    """
    Combine samples of images with depth maps into plots and display the result.

    :param max_samples_per_plot:    Maximum number of samples (image with depth map) to be displayed per plot.
    :param n_plots:                 Number of plots to display.
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e., on __getitem__).
                                    If False, the plot will be over the raw samples (i.e., on load_sample).
    :param color_scheme:            OpenCV color scheme for the depth map visualization. If not specified:
                                    - If `inverse=True`, the default is COLORMAP_VIRIDIS.
                                    - If `inverse=False`, the default is COLORMAP_MAGMA.


    :param drop_extreme_percentage: Percentage of extreme values to drop on both ends of the depth spectrum.
    :param inverse:                 Apply inversion (1 / depth) if True to the depth map.

    :return: None
    """
    plot_counter = 0

    for plot_i in range(n_plots):
        fig, axes = plt.subplots(2, max_samples_per_plot, figsize=(15, 5))
        for img_i in range(max_samples_per_plot):
            index = img_i + plot_i * max_samples_per_plot
            if plot_transformed_data:
                image, depth_map = self[index]

                # Transpose to HWC format for visualization
                image = image.transpose(1, 2, 0)
                depth_map = depth_map.squeeze()  # Remove dummy dimension
            else:
                sample = self.load_sample(index)
                image, depth_map = sample.image, sample.depth_map

            # Plot the image
            axes[0, img_i].imshow(image)
            axes[0, img_i].axis("off")
            axes[0, img_i].set_title(f"Sample {index}")

            # Plot the depth map side by side with the selected color scheme
            depth_map = DepthVisualization.process_depth_map_for_visualization(depth_map, color_scheme, drop_extreme_percentage, inverse)
            axes[1, img_i].imshow(depth_map)
            axes[1, img_i].axis("off")
            axes[1, img_i].set_title(f"Depth Map {index}")

        plt.show()
        plt.close()

        plot_counter += 1
        if plot_counter == n_plots:
            return

NYUv2DepthEstimationDataset

Bases: AbstractDepthEstimationDataset

Dataset class for NYU Depth V2 dataset for depth estimation.

Parameters:

Name Type Description Default
root str

Root directory containing the dataset.

required
df_path str

Path to the CSV file containing image and depth map file paths, relative to root.

required
transforms

Transforms to be applied to the samples.

To use the NYUv2Dataset class, ensure that your dataset directory is organized as follows:

- Root directory (specified as 'root' when initializing the dataset)
  - nyu2_train (or any other split)
    - scene_category_1
      - image_1.jpg
      - image_2.png
      - ...
    - scene_category_2
      - image_1.jpg
      - image_2.png
      - ...
    - ...
  - nyu2_test (or any other split)
    - 00000_colors.png
    - 00001_colors.png
    - 00002_colors.png
    - ...

The CSV file (specified as 'df_path' when initializing the dataset) should contain two columns: path to the color image, path to the depth map (both relative to the root).

Example CSV content:
data/nyu2_train/scene_category_1/image_1.jpg, data/nyu2_train/scene_category_1/image_1_depth.png
data/nyu2_train/scene_category_1/image_2.jpg, data/nyu2_train/scene_category_1/image_2_depth.png
data/nyu2_train/scene_category_2/image_1.jpg, data/nyu2_train/scene_category_2/image_1_depth.png

Note: As of 14/12/2023 the official download link is broken. Data can be obtained at https://www.kaggle.com/code/shreydan/monocular-depth-estimation-nyuv2/input

None
Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
@register_dataset(Datasets.NYUV2_DEPTH_ESTIMATION_DATASET)
class NYUv2DepthEstimationDataset(AbstractDepthEstimationDataset):
    """
    Dataset class for NYU Depth V2 dataset for depth estimation.

    :param root: Root directory containing the dataset.
    :param df_path: Path to the CSV file containing image and depth map file paths, relative to root.
    :param transforms: Transforms to be applied to the samples.

    To use the NYUv2Dataset class, ensure that your dataset directory is organized as follows:

    - Root directory (specified as 'root' when initializing the dataset)
      - nyu2_train (or any other split)
        - scene_category_1
          - image_1.jpg
          - image_2.png
          - ...
        - scene_category_2
          - image_1.jpg
          - image_2.png
          - ...
        - ...
      - nyu2_test (or any other split)
        - 00000_colors.png
        - 00001_colors.png
        - 00002_colors.png
        ...

    The CSV file (specified as 'df_path' when initializing the dataset) should contain two columns:
     path to the color images,  path to depth maps (both relative to the root).

    Example CSV content:
    data/nyu2_train/scene_category_1/image_1.jpg,   data/nyu2_train/scene_category_1/image_1_depth.png
    data/nyu2_train/scene_category_1/image_2.jpg,   data/nyu2_train/scene_category_1/image_2_depth.png
    data/nyu2_train/scene_category_2/image_1.jpg,   data/nyu2_train/scene_category_2/image_1_depth.png

    Note: As of 14/12/2023 the official download link is broken.
     Data can be obtained at https://www.kaggle.com/code/shreydan/monocular-depth-estimation-nyuv2/input
    ...
    """

    def __init__(self, root: str, df_path: str, transforms=None):
        """
        Initialize NYUv2Dataset.

        :param root: Root directory containing the dataset.
        :param df_path: Path to the CSV file containing image and depth map file paths.
        :param transforms: Transforms to be applied to the samples.
        """
        super(NYUv2DepthEstimationDataset, self).__init__(transforms=transforms)
        self.root = root
        self.df = self._read_df(df_path)
        self._check_paths_exist()

    def _read_df(self, df_path: str) -> pd.DataFrame:
        """
        Read the CSV file containing image and depth map file paths.

        :param df_path: Path to the CSV file.

        :return: DataFrame containing image and depth map file paths.
        """
        df = pd.read_csv(df_path, header=None)
        df[0] = df[0].map(lambda x: os.path.join(self.root, x))
        df[1] = df[1].map(lambda x: os.path.join(self.root, x))
        return df

    def load_sample(self, index: int) -> DepthEstimationSample:
        """
        Load a depth estimation sample at the specified index.

        :param index: Index of the sample.

        :return: Loaded depth estimation sample.
        """
        sample_paths = self.df.iloc[index, :]
        image_path, dp_path = sample_paths[0], sample_paths[1]
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        depth_map = cv2.imread(dp_path, cv2.IMREAD_GRAYSCALE)
        return DepthEstimationSample(image=image, depth_map=depth_map)

    def __len__(self):
        """
        Get the number of samples in the dataset.

        :return: Number of samples in the dataset.
        """
        return len(self.df)

    def _check_paths_exist(self):
        """
        Check if the paths in self.df exist. Remove lines with missing paths and print information about removed paths.
        Raise an error if all lines are removed.
        """
        valid_paths = []
        for _, row in self.df.iterrows():
            paths_exist = all(os.path.exists(path) for path in row)
            if paths_exist:
                valid_paths.append(row)
            else:
                warnings.warn(f"Warning: Removed the following line as one or more paths do not exist: {row}")

        if not valid_paths:
            raise FileNotFoundError("All lines in the dataset have been removed as some paths do not exist. " "Please check the paths and dataset structure.")

        self.df = pd.DataFrame(valid_paths, columns=[0, 1])
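
A usage sketch with placeholder paths (the import path follows the source location shown above; the dataset is also registered as Datasets.NYUV2_DEPTH_ESTIMATION_DATASET via the decorator):

from super_gradients.training.datasets.depth_estimation_datasets.nyuv2_dataset import NYUv2DepthEstimationDataset

train_set = NYUv2DepthEstimationDataset(root="/data/nyuv2", df_path="/data/nyuv2/nyu2_train.csv")
image, depth_map = train_set[0]   # image: (3, H, W) float32, depth_map: (1, H, W) float32
train_set.plot(max_samples_per_plot=4, n_plots=1)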

__init__(root, df_path, transforms=None)

Initialize NYUv2Dataset.

Parameters:

Name Type Description Default
root str

Root directory containing the dataset.

required
df_path str

Path to the CSV file containing image and depth map file paths.

required
transforms

Transforms to be applied to the samples.

None
Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
def __init__(self, root: str, df_path: str, transforms=None):
    """
    Initialize NYUv2Dataset.

    :param root: Root directory containing the dataset.
    :param df_path: Path to the CSV file containing image and depth map file paths.
    :param transforms: Transforms to be applied to the samples.
    """
    super(NYUv2DepthEstimationDataset, self).__init__(transforms=transforms)
    self.root = root
    self.df = self._read_df(df_path)
    self._check_paths_exist()

__len__()

Get the number of samples in the dataset.

Returns:

Type Description

Number of samples in the dataset.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
def __len__(self):
    """
    Get the number of samples in the dataset.

    :return: Number of samples in the dataset.
    """
    return len(self.df)

load_sample(index)

Load a depth estimation sample at the specified index.

Parameters:

Name Type Description Default
index int

Index of the sample.

required

Returns:

Type Description
DepthEstimationSample

Loaded depth estimation sample.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
def load_sample(self, index: int) -> DepthEstimationSample:
    """
    Load a depth estimation sample at the specified index.

    :param index: Index of the sample.

    :return: Loaded depth estimation sample.
    """
    sample_paths = self.df.iloc[index, :]
    image_path, dp_path = sample_paths[0], sample_paths[1]
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    depth_map = cv2.imread(dp_path, cv2.IMREAD_GRAYSCALE)
    return DepthEstimationSample(image=image, depth_map=depth_map)

COCODetectionDataset

Bases: COCOFormatDetectionDataset

Dataset for COCO object detection.

To use this Dataset you need to:

- Download coco dataset:
    annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
    train2017: http://images.cocodataset.org/zips/train2017.zip
    val2017: http://images.cocodataset.org/zips/val2017.zip

- Unzip and organize it as below:
    coco
    ├── annotations
    │      ├─ instances_train2017.json
    │      ├─ instances_val2017.json
    │      └─ ...
    └── images
        ├── train2017
        │   ├─ 000000000001.jpg
        │   └─ ...
        └── val2017
            └─ ...

- Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

- Instantiate the dataset:
    >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
    >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
Source code in src/super_gradients/training/datasets/detection_datasets/coco_detection.py
@register_dataset(Datasets.COCO_DETECTION_DATASET)
class COCODetectionDataset(COCOFormatDetectionDataset):
    """Dataset for COCO object detection.

    To use this Dataset you need to:

        - Download coco dataset:
            annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
            train2017: http://images.cocodataset.org/zips/train2017.zip
            val2017: http://images.cocodataset.org/zips/val2017.zip

        - Unzip and organize it as below:
            coco
            ├── annotations
            │      ├─ instances_train2017.json
            │      ├─ instances_val2017.json
            │      └─ ...
            └── images
                ├── train2017
                │   ├─ 000000000001.jpg
                │   └─ ...
                └── val2017
                    └─ ...

        - Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

        - Instantiate the dataset:
            >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
            >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
    """

    def __init__(
        self,
        json_file: str = "instances_train2017.json",
        subdir: str = "images/train2017",
        *args,
        **kwargs,
    ):
        """
        :param json_file:           Name of the coco json file, that resides in data_dir/annotations/json_file.
        :param subdir:              Sub directory of data_dir containing the data.
        :param with_crowd: Add the crowd groundtruths to __getitem__

        kwargs:
            all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
        """
        super().__init__(json_annotation_file=os.path.join("annotations", json_file), images_dir=subdir, *args, **kwargs)
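
Expanding on the instantiation example from the docstring, a sketch of wrapping the dataset in a DataLoader. The input_dim argument comes from the DetectionDataset base class documented further below; paths and sizes are placeholders, and the exact return of __getitem__ follows the configured output_fields.

from torch.utils.data import DataLoader

from super_gradients.training.datasets.detection_datasets.coco_detection import COCODetectionDataset

train_set = COCODetectionDataset(
    data_dir="/data/coco",
    subdir="images/train2017",
    json_file="instances_train2017.json",
    input_dim=(640, 640),
)
image, targets, crowd_targets = train_set[0]  # per output_fields = ["image", "target", "crowd_target"]

# A detection-aware collate_fn is typically also needed to batch variable numbers of boxes.
loader = DataLoader(train_set, batch_size=16, num_workers=4)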

__init__(json_file='instances_train2017.json', subdir='images/train2017', *args, **kwargs)

Parameters:

Name Type Description Default
json_file str

Name of the coco json file, that resides in data_dir/annotations/json_file.

'instances_train2017.json'
subdir str

Sub directory of data_dir containing the data.

'images/train2017'
with_crowd

Add the crowd groundtruths to __getitem__. kwargs: all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.

required
Source code in src/super_gradients/training/datasets/detection_datasets/coco_detection.py
def __init__(
    self,
    json_file: str = "instances_train2017.json",
    subdir: str = "images/train2017",
    *args,
    **kwargs,
):
    """
    :param json_file:           Name of the coco json file, that resides in data_dir/annotations/json_file.
    :param subdir:              Sub directory of data_dir containing the data.
    :param with_crowd: Add the crowd groundtruths to __getitem__

    kwargs:
        all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
    """
    super().__init__(json_annotation_file=os.path.join("annotations", json_file), images_dir=subdir, *args, **kwargs)

COCOFormatDetectionDataset

Bases: DetectionDataset

Base dataset to load ANY dataset with a structure similar to the COCO dataset:
- Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh).
- One folder with all the images.

Output format: (x, y, x, y, class_id)

Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
@register_dataset("COCOFormatDetectionDataset")
class COCOFormatDetectionDataset(DetectionDataset):
    """Base dataset to load ANY dataset that is with a similar structure to the COCO dataset.
    - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh).
    - One folder with all the images.

    Output format: (x, y, x, y, class_id)
    """

    @deprecated_parameter(
        "tight_box_rotation",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `tight_box_rotation` has been removed. This parameter has no effect anymore.",
    )
    def __init__(
        self,
        data_dir: str,
        json_annotation_file: str,
        images_dir: str,
        with_crowd: bool = True,
        class_ids_to_ignore: Optional[List[int]] = None,
        tight_box_rotation=None,
        *args,
        **kwargs,
    ):
        """
        :param data_dir:                Where the data is stored.
        :param json_annotation_file:    Name of the coco json file. Path can be either absolute, or relative to data_dir.
        :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
        :param with_crowd:              Add the crowd groundtruths to __getitem__
        :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
        :param tight_box_rotation:      This parameter is deprecated and will be removed in a SuperGradients 3.8.
        """
        if tight_box_rotation is not None:
            logger.warning(
                "Parameter `tight_box_rotation` is deprecated and will be removed in a SuperGradients 3.8." "Please remove this parameter from your code."
            )
        self.images_dir = images_dir
        self.json_annotation_file = json_annotation_file
        self.with_crowd = with_crowd
        self.class_ids_to_ignore = class_ids_to_ignore or []

        target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
        kwargs["target_fields"] = target_fields
        kwargs["output_fields"] = ["image", *target_fields]
        kwargs["original_target_format"] = XYXY_LABEL
        super().__init__(data_dir=data_dir, *args, **kwargs)

        if len(self.original_classes) != len(self.all_classes_list):
            if set(self.all_classes_list).issubset(set(self.original_classes)):
                raise ParameterMismatchException(
                    "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                    "Please use `class_inclusion_list` to train with reduced number of classes",
                )
            else:
                raise DatasetValidationException(
                    "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                    "Most likely this indicates an error in your all_classes_list parameter"
                )

    def _setup_data_source(self) -> int:
        """
        Parse COCO annotation file
        :return: Number of images in annotation JSON
        """
        if os.path.isabs(self.json_annotation_file):
            annotation_file_path = self.json_annotation_file
        else:
            annotation_file_path = os.path.join(self.data_dir, self.json_annotation_file)
        if not os.path.exists(annotation_file_path):
            raise ValueError("Could not find annotation file under " + str(annotation_file_path))

        all_class_names, annotations = parse_coco_into_detection_annotations(
            annotation_file_path,
            exclude_classes=None,
            include_classes=None,
            # This parameter exists solely for the purpose of keeping the backward compatibility with the old code.
            # Once we refactor base dataset, we can remove this parameter and use only exclude_classes/include_classes
            # at parsing time instead.
            class_ids_to_ignore=self.class_ids_to_ignore,
            image_path_prefix=os.path.join(self.data_dir, self.images_dir),
        )

        self.original_classes = list(all_class_names)
        self.classes = copy.deepcopy(self.original_classes)
        self._annotations = annotations
        return len(annotations)

    @property
    def _all_classes(self) -> List[str]:
        return self.original_classes

    def _load_annotation(self, sample_id: int) -> dict:
        """
        Load relevant information of a specific image.

        :param sample_id:               Sample_id in the dataset
        :return target:                 Target Bboxes (detection) in XYXY_LABEL format
        :return crowd_target:           Crowd target Bboxes (detection) in XYXY_LABEL format
        :return target_segmentation:    Segmentation
        :return initial_img_shape:      Image (height, width)
        :return resized_img_shape:      Resized image (height, width)
        :return img_path:               Path to the associated image
        """

        annotation = self._annotations[sample_id]

        width = annotation.image_width
        height = annotation.image_height

        # Make a copy of the annotations, so that we can modify them
        boxes_xyxy = change_bbox_bounds_for_image_size(annotation.ann_boxes_xyxy, img_shape=(height, width), inplace=False)
        iscrowd = annotation.ann_is_crowd.copy()
        labels = annotation.ann_labels.copy()

        # Exclude boxes with invalid dimensions (x1 > x2 or y1 > y2)
        mask = np.logical_and(boxes_xyxy[:, 2] >= boxes_xyxy[:, 0], boxes_xyxy[:, 3] >= boxes_xyxy[:, 1])
        boxes_xyxy = boxes_xyxy[mask]
        iscrowd = iscrowd[mask]
        labels = labels[mask]

        # Currently, the base class includes a feature to resize the image, so we need to resize the target as well when self.input_dim is set.
        initial_img_shape = (height, width)
        if self.input_dim is not None:
            scale_factor = min(self.input_dim[0] / height, self.input_dim[1] / width)
            resized_img_shape = (int(height * scale_factor), int(width * scale_factor))
        else:
            resized_img_shape = initial_img_shape
            scale_factor = 1

        targets = np.concatenate([boxes_xyxy[~iscrowd] * scale_factor, labels[~iscrowd, None]], axis=1).astype(np.float32)
        crowd_targets = np.concatenate([boxes_xyxy[iscrowd] * scale_factor, labels[iscrowd, None]], axis=1).astype(np.float32)

        annotation = {
            "target": targets,
            "crowd_target": crowd_targets,
            "initial_img_shape": initial_img_shape,
            "resized_img_shape": resized_img_shape,
            "img_path": annotation.image_path,
        }
        return annotation
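
A sketch for loading a custom dataset stored in COCO format (paths and class names are placeholders; all_classes_list must match the categories in the annotation JSON, otherwise the validation in __init__ above raises):

from super_gradients.training.datasets.detection_datasets.coco_format_detection import COCOFormatDetectionDataset

dataset = COCOFormatDetectionDataset(
    data_dir="/data/my_dataset",
    json_annotation_file="annotations/train.json",      # absolute path, or relative to data_dir
    images_dir="images/train",
    with_crowd=False,
    all_classes_list=["car", "pedestrian", "cyclist"],   # must match the categories in the JSON
    input_dim=(640, 640),
)
print(len(dataset), dataset.classes)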

__init__(data_dir, json_annotation_file, images_dir, with_crowd=True, class_ids_to_ignore=None, tight_box_rotation=None, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
json_annotation_file str

Name of the coco json file. Path can be either absolute, or relative to data_dir.

required
images_dir str

Name of the directory that includes all the images. Path relative to data_dir.

required
with_crowd bool

Add the crowd groundtruths to __getitem__

True
class_ids_to_ignore Optional[List[int]]

List of class ids to ignore in the dataset. By default, doesn't ignore any class.

None
tight_box_rotation

This parameter is deprecated and will be removed in SuperGradients 3.8.

None
Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
@deprecated_parameter(
    "tight_box_rotation",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `tight_box_rotation` has been removed. This parameter has no effect anymore.",
)
def __init__(
    self,
    data_dir: str,
    json_annotation_file: str,
    images_dir: str,
    with_crowd: bool = True,
    class_ids_to_ignore: Optional[List[int]] = None,
    tight_box_rotation=None,
    *args,
    **kwargs,
):
    """
    :param data_dir:                Where the data is stored.
    :param json_annotation_file:    Name of the coco json file. Path can be either absolute, or relative to data_dir.
    :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
    :param with_crowd:              Add the crowd groundtruths to __getitem__
    :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
    :param tight_box_rotation:      This parameter is deprecated and will be removed in a SuperGradients 3.8.
    """
    if tight_box_rotation is not None:
        logger.warning(
            "Parameter `tight_box_rotation` is deprecated and will be removed in a SuperGradients 3.8." "Please remove this parameter from your code."
        )
    self.images_dir = images_dir
    self.json_annotation_file = json_annotation_file
    self.with_crowd = with_crowd
    self.class_ids_to_ignore = class_ids_to_ignore or []

    target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
    kwargs["target_fields"] = target_fields
    kwargs["output_fields"] = ["image", *target_fields]
    kwargs["original_target_format"] = XYXY_LABEL
    super().__init__(data_dir=data_dir, *args, **kwargs)

    if len(self.original_classes) != len(self.all_classes_list):
        if set(self.all_classes_list).issubset(set(self.original_classes)):
            raise ParameterMismatchException(
                "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                "Please use `class_inclusion_list` to train with reduced number of classes",
            )
        else:
            raise DatasetValidationException(
                "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                "Most likely this indicates an error in your all_classes_list parameter"
            )

parse_coco_into_detection_annotations(ann, exclude_classes=None, include_classes=None, class_ids_to_ignore=None, image_path_prefix=None)

Load COCO detection dataset from annotation file.

Parameters:

Name Type Description Default
ann str

A path to the JSON annotation file in COCO format.

required
exclude_classes Optional[List[str]]

List of classes to exclude from the dataset. All other classes will be included. This parameter is mutually exclusive with include_classes and class_ids_to_ignore.

None
include_classes Optional[List[str]]

List of classes to include in the dataset. All other classes will be excluded. This parameter is mutually exclusive with exclude_classes and class_ids_to_ignore.

None
class_ids_to_ignore Optional[List[int]]

List of category ids to ignore in the dataset. All other classes will be included. This parameter was added for backward compatibility with the class_ids_to_ignore argument of COCOFormatDetectionDataset but will be removed in the future in favor of include_classes/exclude_classes. This parameter is mutually exclusive with exclude_classes and include_classes.

None
image_path_prefix

A prefix to add to the image paths in the annotation file.

None

Returns:

Type Description
Tuple[List[str], List[DetectionAnnotation]]

Tuple (class_names, annotations) where class_names is a list of class names (respecting include_classes/exclude_classes/class_ids_to_ignore) and annotations is a list of DetectionAnnotation objects.

Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
def parse_coco_into_detection_annotations(
    ann: str,
    exclude_classes: Optional[List[str]] = None,
    include_classes: Optional[List[str]] = None,
    class_ids_to_ignore: Optional[List[int]] = None,
    image_path_prefix=None,
) -> Tuple[List[str], List[DetectionAnnotation]]:
    """
    Load COCO detection dataset from annotation file.
    :param ann: A path to the JSON annotation file in COCO format.
    :param exclude_classes: List of classes to exclude from the dataset. All other classes will be included.
                                This parameter is mutually exclusive with include_classes and class_ids_to_ignore.

    :param include_classes:     List of classes to include in the dataset. All other classes will be excluded.
                                This parameter is mutually exclusive with exclude_classes and class_ids_to_ignore.
    :param class_ids_to_ignore: List of category ids to ignore in the dataset. All other classes will be included.
                                This parameter added for the purpose of backward compatibility with the class_ids_to_ignore
                                argument of COCOFormatDetectionDataset but will be
                                removed in future in favor of include_classes/exclude_classes.
                                This parameter is mutually exclusive with exclude_classes and include_classes.
    :param image_path_prefix:   A prefix to add to the image paths in the annotation file.
    :return:                    Tuple (class_names, annotations) where class_names is a list of class names
                                (respecting include_classes/exclude_classes/class_ids_to_ignore) and
                                annotations is a list of DetectionAnnotation objects.
    """
    with open(ann, "r") as f:
        coco = json.load(f)

    # Extract class names and class ids
    category_ids = np.array([category["id"] for category in coco["categories"]], dtype=int)
    category_names = np.array([category["name"] for category in coco["categories"]], dtype=str)

    # Extract box annotations
    ann_box_xyxy = xywh_to_xyxy_inplace(np.array([annotation["bbox"] for annotation in coco["annotations"]], dtype=np.float32).reshape(-1, 4), image_shape=None)

    ann_category_id = np.array([annotation["category_id"] for annotation in coco["annotations"]], dtype=int).reshape(-1)
    ann_iscrowd = np.array([annotation["iscrowd"] for annotation in coco["annotations"]], dtype=bool).reshape(-1)
    ann_image_ids = np.array([annotation["image_id"] for annotation in coco["annotations"]], dtype=int).reshape(-1)

    # Extract image stuff
    img_ids = [img["id"] for img in coco["images"]]
    img_paths = [img["file_name"] if "file_name" in img else "{:012}".format(img["id"]) + ".jpg" for img in coco["images"]]
    img_width_height = [(img["width"], img["height"]) for img in coco["images"]]

    # Now, we can drop the annotations that belongs to the excluded classes
    if int(class_ids_to_ignore is not None) + int(exclude_classes is not None) + int(include_classes is not None) > 1:
        raise ValueError("Only one of exclude_classes, class_ids_to_ignore or include_classes can be specified")
    elif exclude_classes is not None:
        if len(exclude_classes) != len(set(exclude_classes)):
            raise ValueError("The excluded classes must be unique")
        classes_not_in_dataset = set(exclude_classes).difference(set(category_names))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the excluded classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_names, exclude_classes, invert=True)
    elif class_ids_to_ignore is not None:
        if len(class_ids_to_ignore) != len(set(class_ids_to_ignore)):
            raise ValueError("The ignored classes must be unique")
        classes_not_in_dataset = set(class_ids_to_ignore).difference(set(category_ids))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the ignored classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_ids, class_ids_to_ignore, invert=True)
    elif include_classes is not None:
        if len(include_classes) != len(set(include_classes)):
            raise ValueError("The included classes must be unique")
        classes_not_in_dataset = set(include_classes).difference(set(category_names))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the included classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_names, include_classes)
    else:
        keep_classes_mask = None

    if keep_classes_mask is not None:
        category_ids = category_ids[keep_classes_mask]
        category_names = category_names[keep_classes_mask]

        keep_anns_mask = np.isin(ann_category_id, category_ids)
        ann_category_id = ann_category_id[keep_anns_mask]

    # category_ids can be non-sequential and not ordered
    num_categories = len(category_ids)

    # Make sequential
    order = np.argsort(category_ids, kind="stable")
    category_ids = category_ids[order]  #
    category_names = category_names[order]

    # Remap category ids to be in range [0, num_categories)
    class_label_table = np.zeros(np.max(category_ids) + 1, dtype=int) - 1
    new_class_ids = np.arange(num_categories, dtype=int)
    class_label_table[category_ids] = new_class_ids

    # Remap category ids in annotations
    ann_category_id = class_label_table[ann_category_id]
    if (ann_category_id < 0).any():
        raise ValueError("Some annotations have class ids that are not in the list of classes. This probably indicates a bug in the annotation file")

    annotations = []

    img_id2ann_box_xyxy = defaultdict(list)
    img_id2ann_iscrowd = defaultdict(list)
    img_id2ann_category_id = defaultdict(list)
    for ann_image_id, _ann_box_xyxy, _ann_iscrowd, _ann_category_id in zip(ann_image_ids, ann_box_xyxy, ann_iscrowd, ann_category_id):
        img_id2ann_box_xyxy[ann_image_id].append(_ann_box_xyxy)
        img_id2ann_iscrowd[ann_image_id].append(_ann_iscrowd)
        img_id2ann_category_id[ann_image_id].append(_ann_category_id)

    for img_id, image_path, (image_width, image_height) in zip(img_ids, img_paths, img_width_height):
        if image_path_prefix is not None:
            image_path = os.path.join(image_path_prefix, image_path)

        ann = DetectionAnnotation(
            image_id=img_id,
            image_path=image_path,
            image_width=image_width,
            image_height=image_height,
            ann_boxes_xyxy=np.asarray(img_id2ann_box_xyxy[img_id], dtype=np.float32).reshape(-1, 4),
            ann_is_crowd=np.asarray(img_id2ann_iscrowd[img_id], dtype=bool).reshape(-1),
            ann_labels=np.asarray(img_id2ann_category_id[img_id], dtype=int).reshape(-1),
        )
        annotations.append(ann)

    return category_names, annotations
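
The parser can also be used standalone, e.g. to inspect an annotation file (paths are placeholders):

from super_gradients.training.datasets.detection_datasets.coco_format_detection import parse_coco_into_detection_annotations

class_names, annotations = parse_coco_into_detection_annotations(
    "/data/coco/annotations/instances_val2017.json",
    image_path_prefix="/data/coco/images/val2017",
)
first = annotations[0]
print(len(class_names), first.image_path, first.ann_boxes_xyxy.shape, first.ann_labels.shape)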

DetectionDataset

Bases: Dataset, HasPreprocessingParams, HasClassesInformation

Detection dataset.

This is a boilerplate class to facilitate the implementation of datasets.

HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataset?
- Inherit from DetectionDataset
- Implement the method self._load_annotation to return at least the fields "target" and "img_path"
- Call super().__init__ with the required params. Note that super().__init__ calls self._load_annotation, so make sure that every required attribute is set up before calling super().__init__ (ideally just call it last).

A minimal subclass sketch is shown after the terminology notes below.

WORKFLOW:
- On instantiation:
    - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.

- On call (__getitem__) for a specific image index:
    - The image and annotations are grouped together in a dict called SAMPLE
    - the sample is processed according to the transforms
    - Only the specified fields are returned by __getitem__

TERMINOLOGY
- TARGET: Groundtruth, made of bboxes. The format can vary from one dataset to another.
- ANNOTATION: Combination of targets (groundtruth) and metadata of the image, but without the image itself.
    > Has to include the fields "target" and "img_path"
    > Can include other fields like "crowd_target", "image_info", "segmentation", ...
- SAMPLE: Output of the dataset:
    > Has to include the fields "target" and "image"
    > Can include other fields like "crowd_target", "image_info", "segmentation", ...
- Index: Index of the sample in the dataset, AFTER filtering (if relevant). 0 <= index <= len(dataset) - 1
- Sample ID: Index of the sample in the dataset, WITHOUT considering any filtering. 0 <= sample_id <= len(source) - 1
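
The sketch below illustrates the subclassing steps above for a hypothetical dataset whose labels live in per-image .txt files with rows "x1 y1 x2 y2 class_id". The import paths and the location of the XYXY_LABEL format constant are assumptions derived from the source paths shown on this page, and a real subclass may also need to implement _setup_data_source, as COCOFormatDetectionDataset does above.

import os

import numpy as np

# Assumed import paths, based on the source locations shown on this page.
from super_gradients.training.datasets.detection_datasets.detection_dataset import DetectionDataset
from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL  # assumed location of the format constant


class TxtDetectionDataset(DetectionDataset):
    """Hypothetical dataset: <data_dir>/images/<name>.jpg paired with <data_dir>/labels/<name>.txt."""

    def __init__(self, data_dir: str, all_classes_list, *args, **kwargs):
        # Attributes used by _setup_data_source / _load_annotation must exist before super().__init__,
        # because the base class loads and caches annotations during construction.
        self._image_names = sorted(os.listdir(os.path.join(data_dir, "images")))
        super().__init__(data_dir=data_dir, original_target_format=XYXY_LABEL, all_classes_list=all_classes_list, *args, **kwargs)

    def _setup_data_source(self) -> int:
        return len(self._image_names)

    def _load_annotation(self, sample_id: int) -> dict:
        name = self._image_names[sample_id]
        label_path = os.path.join(self.data_dir, "labels", os.path.splitext(name)[0] + ".txt")
        # Each row: x1 y1 x2 y2 class_id; empty/missing files yield an empty target array.
        target = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 5) if os.path.exists(label_path) else np.zeros((0, 5), dtype=np.float32)
        return {"target": target, "img_path": os.path.join(self.data_dir, "images", name)}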

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
@register_dataset(Datasets.DETECTION_DATASET)
class DetectionDataset(Dataset, HasPreprocessingParams, HasClassesInformation):
    """Detection dataset.

    This is a boilerplate class to facilitate the implementation of datasets.

    HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ?
        - Inherit from DetectionDataSet
        - implement the method self._load_annotation to return at least the fields "target" and "img_path"
        - Call super().__init__ with the required params.
                //!\\ super().__init__ will call self._load_annotation, so make sure that every required
                      attribute is set up before calling super().__init__ (ideally just call it last)

    WORKFLOW:
        - On instantiation:
            - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.

        - On call (__getitem__) for a specific image index:
            - The image and annotations are grouped together in a dict called SAMPLE
            - The sample is processed according to the transforms
            - Only the specified fields are returned by __getitem__

    TERMINOLOGY
        - TARGET:       Groundtruth, made of bboxes. The format can vary from one dataset to another
        - ANNOTATION:   Combination of targets (groundtruth) and metadata of the image, but without the image itself.
                            > Has to include the fields "target" and "img_path"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - SAMPLE:       Output of the dataset:
                            > Has to include the fields "target" and "image"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - Index:        Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        - Sample ID:    Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(
        self,
        data_dir: str,
        original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
        max_num_samples: int = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: List[AbstractDetectionTransform] = [],
        all_classes_list: Optional[List[str]] = [],
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        target_fields: List[str] = None,
        output_fields: List[str] = None,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """Detection dataset.

        :param data_dir:                Where the data is stored
        :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                        None means that the image will be loaded as is.
                                        Scalar (size) - Image will be resized to (size, size)
                                        Tuple (rows,cols) - Image will be resized to (rows, cols)
        :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                        differ based on transforms.
        :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
        :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                        but requires more RAM and more time to instantiate the dataset when working on very large datasets.
        :param transforms:              List of transforms to apply sequentially on sample.
        :param all_classes_list:        All the class names.
        :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                        Classes not in this list will be excluded from training.
                                        Thus, the number of classes in the model must be adjusted accordingly.
        :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                                will be ignored.
        :param target_fields:                   List of the target fields. This has to include the regular target,
                                                but can also include crowd target, segmentation target, ...
                                                It has to include at least "target" but can include others.
        :param output_fields:                   Fields that will be output by __getitem__.
                                                It has to include at least "image" and "target" but can include others.
        :param verbose:                 Whether to show additional information or not, such as loading progress. (does not include warnings)
        :param show_all_warnings:       Whether to show all warnings or not.
        :param cache:                   Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
        :param cache_dir:               Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
        """
        if cache is not None:
            warnings.warn(
                "cache parameter has been marked as deprecated and setting it has no effect. "
                "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
                DeprecationWarning,
            )
        if cache_dir is not None:
            warnings.warn(
                "cache_dir parameter has been marked as deprecated and setting it has no effect. "
                "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
                DeprecationWarning,
            )

        super().__init__()
        self.verbose = verbose
        self.show_all_warnings = show_all_warnings

        if isinstance(original_target_format, DetectionTargetsFormat):
            logger.warning(
                "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
                "Support for DetectionTargetsFormat will be removed in 3.1"
            )

        self.data_dir = data_dir
        if not Path(data_dir).exists():
            raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

        # Number of images that are available (regardless of ignored images)
        n_dataset_samples = self._setup_data_source()
        if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
            raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
        n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

        self.input_dim = ensure_is_tuple_of_two(input_dim)
        self.original_target_format = original_target_format

        if len(all_classes_list) != len(set(all_classes_list)):
            raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

        if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
            raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

        self.all_classes_list = all_classes_list or self._all_classes
        self.all_classes_list = list(self.all_classes_list) if self.all_classes_list is not None else None
        self.class_inclusion_list = list(class_inclusion_list) if class_inclusion_list is not None else None
        self.classes = self.class_inclusion_list or self.all_classes_list
        if len(set(self.classes) - set(self.all_classes_list)) > 0:
            wrong_classes = set(self.classes) - set(all_classes_list)
            raise DatasetValidationException(
                f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
            )

        self.ignore_empty_annotations = ignore_empty_annotations
        self.target_fields = target_fields or ["target"]
        if "target" not in self.target_fields:
            raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

        self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

        self.transforms = transforms

        self.output_fields = output_fields or ["image", "target"]
        if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
            raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

        self._cache_annotations = cache_annotations
        self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

        # Maps (dataset index) -> (non-empty sample ids)
        self._non_empty_sample_ids: Optional[List[int]] = None

        # Some transform may require non-empty annotations to be indexed.
        transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

        # Iterate over the whole dataset to index the images with/without annotations.
        if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:
            if self._cache_annotations:
                logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
            elif self.ignore_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
                )
            elif transform_require_non_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. "
                    "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
                )

            # Map indexes to sample annotations.
            non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
            if self._cache_annotations:
                if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                    self._cached_annotations = non_empty_annotations
                else:
                    # Non-overlapping dicts, since they map unique sample_ids -> sample
                    self._cached_annotations = {**non_empty_annotations, **empty_annotations}

            if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
                raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

            self._non_empty_sample_ids = list(non_empty_annotations.keys())

        self._n_samples = n_samples  # Regardless of any filtering

    @property
    def _all_classes(self):
        """Placeholder to setup the class names. This is an alternative to passing "all_classes_list" to __init__.
        This is usefull when all_classes_list is not known in advance, only after loading the dataset."""
        raise NotImplementedError

    def _setup_data_source(self) -> int:
        """Set up the data source and store relevant objects as attributes.

        :return: Number of available samples, (i.e. how many images we have, regardless of any filter we might want to use)"""
        raise NotImplementedError

    def _load_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load annotations associated to a specific sample.
        Please note that the targets should be resized according to self.input_dim!

        :param sample_id:   Sample ID refers to the index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
        :return:            Annotation, a dict with any field but has to include at least the fields specified in self._required_annotation_fields.
        """
        raise NotImplementedError

    def _get_sample_annotations(self, index: int, ignore_empty_annotations: bool) -> Dict[str, Union[np.ndarray, Any]]:
        """Get the annotation associated to a specific sample. Use cache if enabled.
        :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    Whether to ignore empty annotations or not.
        :return:                            Dict representing the annotation of a specific image
        """
        sample_id = self._non_empty_sample_ids[index] if ignore_empty_annotations else index
        if self._cache_annotations:
            return self._cached_annotations[sample_id]
        else:
            return self._load_sample_annotation(sample_id=sample_id)

    def _load_sample_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load the annotation associated to a specific sample and apply subclassing.
        :param sample_id:   Sample ID refers to the index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
        """
        sample_annotations = self._load_annotation(sample_id=sample_id)
        if not self._required_annotation_fields.issubset(set(sample_annotations.keys())):
            raise KeyError(
                f"_load_annotation is expected to return at least the fields {self._required_annotation_fields}, but got {set(sample_annotations.keys())}"
            )

        # Filter out classes that are not in self.class_inclusion_list
        if self.class_inclusion_list is not None:
            sample_annotations = self._sub_class_annotation(annotation=sample_annotations)

        return sample_annotations

    def _load_all_annotations(self, n_samples: int) -> Tuple[Dict[int, Dict[str, Any]], Dict[int, Dict[str, Any]]]:
        """Load ALL the annotations into memory. This is usually required when `ignore_empty_annotations=True`,
        because we have to iterate over the whole dataset once in order to know which sample is empty and which is not.
        Question: Why not just check if annotation is empty on the fly ?
        Answer: When running with DDP, we split the dataset into small chunks.
                Therefore, we need to make sure that each chunk includes a similar subset of indexes.
                If we were to check on the fly, we would not know in advance the size of dataset/chunks
                and this means that some chunks would be smaller than others

        :param n_samples:   Number of samples in the datasets (including samples without annotations).
        :return:            A tuple of two dicts, one for non-empty annotations and one for empty annotations
                                - non_empty_annotations: Dict mapping dataset index -> non-empty annotations
                                - empty_annotations:     Dict mapping dataset index -> empty annotations
        """
        n_invalid_bbox = 0
        non_empty_annotations, empty_annotations = {}, {}

        for index in tqdm(range(n_samples), desc="Indexing dataset annotations", disable=not self.verbose):
            sample_annotations = self._load_sample_annotation(sample_id=index)
            n_invalid_bbox += sample_annotations.get("n_invalid_labels", 0)

            is_annotation_non_empty = any(len(sample_annotations[field]) != 0 for field in self.target_fields)
            if is_annotation_non_empty:
                non_empty_annotations[index] = sample_annotations if self._cache_annotations else None
            else:
                empty_annotations[index] = sample_annotations if self._cache_annotations else None

        if len(non_empty_annotations) + len(empty_annotations) == 0:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        if n_invalid_bbox > 0:
            logger.warning(f"Found {n_invalid_bbox} invalid bbox that were ignored. For more information, please set `show_all_warnings=True`.")

        return non_empty_annotations, empty_annotations

    def _sub_class_annotation(self, annotation: dict) -> Union[dict, None]:
        """Subclass every field listed in self.target_fields. It could be targets, crowd_targets, ...

        :param annotation: Dict representing the annotation of a specific image
        :return:           Subclassed annotation if non-empty after subclassing, otherwise None
        """
        class_index = _get_class_index_in_target(target_format=self.original_target_format)
        for field in self.target_fields:
            annotation[field] = self._sub_class_target(targets=annotation[field], class_index=class_index)
        return annotation

    def _sub_class_target(self, targets: np.ndarray, class_index: int) -> np.ndarray:
        """Sublass targets of a specific image.

        :param targets:     Target array to subclass of shape [n_targets, 5], 5 representing a bbox
        :param class_index:    Position of the class id in a bbox
                                ex: 0 if bbox of format label_xyxy | -1 if bbox of format xyxy_label
        :return:            Subclassed target
        """
        targets_kept = []
        for target in targets:
            cls_id = int(target[class_index])
            cls_name = self.all_classes_list[cls_id]
            if cls_name in self.class_inclusion_list:
                # Replace the target cls_id in self.all_classes_list by cls_id in self.class_inclusion_list
                target[class_index] = self.class_inclusion_list.index(cls_name)
                targets_kept.append(target)

        return np.array(targets_kept) if len(targets_kept) > 0 else np.zeros((0, 5), dtype=np.float32)

    def _load_resized_img(self, image_path: str) -> np.ndarray:
        """Load an image and resize it to the desired size (If relevant).
        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        """
        img = self._load_image(image_path=image_path)

        if self.input_dim is not None:
            r = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
            desired_size = (int(img.shape[1] * r), int(img.shape[0] * r))
            img = cv2.resize(src=img, dsize=desired_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)

        return img

    def _load_image(self, image_path: str) -> np.ndarray:
        """Load an image.
        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        """
        img_file = os.path.join(image_path)
        img = cv2.imread(img_file)

        if img is None:
            raise FileNotFoundError(f"{img_file} was not found. Please make sure that the dataset was " f"downloaded and that the path is correct")
        return img

    def __len__(self) -> int:
        """Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant)."""
        return len(self._non_empty_sample_ids) if self.ignore_empty_annotations else self._n_samples

    def __getitem__(self, index: int) -> Tuple:
        """Get the sample post transforms at a specific index of the dataset.
        The output of this function will be collated to form batches.

        :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :return:        Sample, i.e. a dictionary including at least "image" and "target"
        """
        sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
        sample = self.apply_transforms(sample)
        for field in self.output_fields:
            if field not in sample.keys():
                raise KeyError(f"The field {field} must be present in the sample but was not found." "Please check the output fields of your transforms.")
        return tuple(sample[field] for field in self.output_fields)

    def get_random_item(self):
        return self[self.get_random_sample(ignore_empty_annotations=self.ignore_empty_annotations)]

    def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Get raw sample, before any transform (beside subclassing).
        :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    If True, empty annotations will be ignored
        :return:                            Sample, i.e. a dictionary including at least "image" and "target"
        """
        sample_annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)
        image = self._load_resized_img(image_path=sample_annotations["img_path"])
        return {"image": image, **deepcopy(sample_annotations)}

    def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
        """
        Applies self.transforms sequentially to sample

        If a transform has the attribute 'additional_samples_count', additional samples will be loaded and stored in
         sample["additional_samples"] prior to applying it. Combining it with the attribute "non_empty_annotations" will load
         only additional samples with objects in them.

        :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
        :return: Transformed sample
        """

        has_crowd_target = "crowd_target" in sample
        detection_sample = LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(sample).sanitize_sample()
        target_format_transform: Optional[DetectionTargetsFormatTransform] = None

        for transform in self.transforms:
            detection_sample.additional_samples = [
                LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(s) for s in self._get_additional_inputs_for_transform(transform=transform)
            ]
            detection_sample = transform.apply_to_sample(sample=detection_sample)

            detection_sample.additional_samples = None
            if isinstance(transform, DetectionTargetsFormatTransform):
                target_format_transform = transform

        transformed_dict = LegacyDetectionTransformMixin.convert_detection_sample_to_dict(detection_sample, include_crowd_target=has_crowd_target)
        if target_format_transform is not None:
            transformed_dict = target_format_transform(sample=transformed_dict)
        return transformed_dict

    def _get_additional_inputs_for_transform(self, transform: AbstractDetectionTransform) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Add additional inputs required by a transform to the sample"""
        additional_samples_count = transform.additional_samples_count if hasattr(transform, "additional_samples_count") else 0
        non_empty_annotations = transform.non_empty_annotations if hasattr(transform, "non_empty_annotations") else False
        return self.get_random_samples(count=additional_samples_count, ignore_empty_annotations=non_empty_annotations)

    def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Load random samples.

        :param count: The number of samples wanted
        :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
        :return: A list of samples satisfying input params
        """
        return [self.get_random_sample(ignore_empty_annotations) for _ in range(count)]

    def get_random_sample(self, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        n_relevant_samples = len(self._non_empty_sample_ids) if ignore_empty_annotations else self._n_samples
        random_index = random.randint(0, n_relevant_samples - 1)
        return self.get_sample(index=random_index, ignore_empty_annotations=ignore_empty_annotations)

    @property
    def output_target_format(self):
        target_format = self.original_target_format
        for transform in self.transforms:
            if isinstance(transform, DetectionTargetsFormatTransform):
                target_format = transform.output_format
        return target_format

    @staticmethod
    def _standardize_image(image):
        # Normalize the image to have minimum of 0 and maximum of 1
        image_min = image.min()
        image_max = image.max()
        normalized_image = (image - image_min) / (image_max - image_min + 1e-8)

        # Rescale the normalized image to 0-255
        standardized_image = (normalized_image * 255).astype(np.uint8)

        return standardized_image

    def plot(
        self,
        max_samples_per_plot: int = 16,
        n_plots: int = 1,
        plot_transformed_data: bool = True,
        box_thickness: int = 2,
    ):
        """Combine samples of images with bbox into plots and display the result.

        :param max_samples_per_plot:    Maximum number of images to be displayed per plot
        :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                        If False, the plot will be over the raw samples (i.e. on get_sample)
        :return:
        """
        plot_counter = 0
        input_format = self.output_target_format if plot_transformed_data else self.original_target_format
        if isinstance(input_format, DetectionTargetsFormat):
            raise ValueError(
                "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenateTransform instead."
            )

        for plot_i in range(n_plots):
            fig = plt.figure(figsize=(10, 10))

            n_subplot = int(np.ceil(max_samples_per_plot**0.5))

            # Plot `max_samples_per_plot` images.
            for img_i in range(max_samples_per_plot):
                index = img_i + plot_i * max_samples_per_plot

                # LOAD IMAGE/TARGETS
                if plot_transformed_data:
                    # Access to the image and the target AFTER self.transforms
                    image, targets, *_ = self[index]
                else:
                    # Access to the image and the target BEFORE self.transforms
                    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                    image, targets = sample["image"], sample["target"]

                # FORMAT TARGETS
                if image.shape[0] in (1, 3):  # (C, H, W) -> (H, W, C)
                    image = image.transpose((1, 2, 0))

                image = self._standardize_image(image)
                image = image.astype(np.uint8)
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Detection dataset works with BGR images, so we have to convert to RGB

                # Convert to XYXY_LABEL format
                targets_format_converter = ConcatenatedTensorFormatConverter(input_format=input_format, output_format=LABEL_XYXY, image_shape=image.shape)
                targets_label_xyxy = targets_format_converter(targets)

                image = DetectionVisualization.visualize_image(image_np=image, class_names=self.classes, target_boxes=targets_label_xyxy, gt_alpha=1)

                plt.subplot(n_subplot, n_subplot, img_i + 1)
                plt.imshow(image)
                plt.axis("off")

            fig.tight_layout()
            plt.show()
            plt.close()

            plot_counter += 1
            if plot_counter == n_plots:
                return

    def get_dataset_preprocessing_params(self):
        """
        Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB).
         image_processor is returned as a list of dicts to be resolved by the processing factory.
        :return:
        """
        pipeline = [Processings.ReverseImageChannels]
        if self.input_dim is not None:
            pipeline += [{Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}}]
        for t in self.transforms:
            pipeline += t.get_equivalent_preprocessing()
        params = dict(
            class_names=self.classes,
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            iou=0.65,
            conf=0.5,
        )
        return params

    def get_sample_classes_information(self, index) -> np.ndarray:
        target = self._get_sample_annotations(index=index, ignore_empty_annotations=self.ignore_empty_annotations)["target"]
        if len(target) == 0:  # in case of no objects in the sample
            return np.zeros(len(self.classes))

        target_class_index = _get_class_index_in_target(target_format=self.original_target_format)  # can be sped-up with a property rather computing per index
        classes = target[:, target_class_index].astype(int)

        return np.bincount(classes, minlength=len(self.classes))

    def get_dataset_classes_information(self) -> np.ndarray:
        return np.row_stack([self.get_sample_classes_information(index=index) for index in range(len(self))])

__getitem__(index)

Get the sample post transforms at a specific index of the dataset. The output of this function will be collated to form batches.

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required

Returns:

Type Description
Tuple

Sample, i.e. a dictionary including at least "image" and "target"

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def __getitem__(self, index: int) -> Tuple:
    """Get the sample post transforms at a specific index of the dataset.
    The output of this function will be collated to form batches.

    :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :return:        Sample, i.e. a dictionary including at least "image" and "target"
    """
    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
    sample = self.apply_transforms(sample)
    for field in self.output_fields:
        if field not in sample.keys():
            raise KeyError(f"The field {field} must be present in the sample but was not found." "Please check the output fields of your transforms.")
    return tuple(sample[field] for field in self.output_fields)
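For illustration, with the default output_fields=["image", "target"] a single item unpacks into an image and a target array. Since the number of boxes varies per image, batching with a DataLoader additionally requires a detection-aware collate function; exact shapes and dtypes depend on the configured transforms. This snippet assumes "dataset" is any DetectionDataset subclass with the default output fields.

image, target = dataset[0]
print(type(image), getattr(image, "shape", None))  # image after transforms (e.g. HWC numpy array or CHW tensor, depending on transforms)
print(target.shape)                                # typically (n_boxes, 5); exact layout depends on the target-format transform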

__init__(data_dir, original_target_format, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, target_fields=None, output_fields=None, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Detection dataset.

Parameters:

Name Type Description Default
data_dir str

Where the data is stored

required
input_dim Union[int, Tuple[int, int], None]

Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols). None means that the image will be loaded as is. Scalar (size) - Image will be resized to (size, size) Tuple (rows,cols) - Image will be resized to (rows, cols)

None
original_target_format Union[ConcatenatedTensorFormat, DetectionTargetsFormat]

Format of targets stored on disk. raw data format, the output format might differ based on transforms.

required
max_num_samples int

If not None, set the maximum size of the dataset by only indexing the first n annotations/images.

None
cache_annotations bool

Whether to cache annotations or not. This reduces training time by pre-loading all the annotations, but requires more RAM and more time to instantiate the dataset when working on very large datasets.

True
transforms List[AbstractDetectionTransform]

List of transforms to apply sequentially on sample.

[]
all_classes_list Optional[List[str]]

All the class names.

[]
class_inclusion_list Optional[List[str]]

If not None, define the subset of classes to be included as targets. Classes not in this list will be excluded from training. Thus, the number of classes in the model must be adjusted accordingly.

None
ignore_empty_annotations bool

If True and class_inclusion_list not None, images without any target will be ignored.

True
target_fields List[str]

List of the target fields. This has to include the regular target, but can also include crowd target, segmentation target, ... It has to include at least "target" but can include others.

None
output_fields List[str]

Fields that will be output by __getitem__. It has to include at least "image" and "target" but can include others.

None
verbose bool

Whether to show additional information or not, such as loading progress. (does not include warnings)

True
show_all_warnings bool

Whether to show all warnings or not.

False
cache

Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8

None
cache_dir

Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8

None
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
@resolve_param("transforms", ListFactory(TransformsFactory()))
def __init__(
    self,
    data_dir: str,
    original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
    max_num_samples: int = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: List[AbstractDetectionTransform] = [],
    all_classes_list: Optional[List[str]] = [],
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    target_fields: List[str] = None,
    output_fields: List[str] = None,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """Detection dataset.

    :param data_dir:                Where the data is stored
    :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                    None means that the image will be loaded as is.
                                    Scalar (size) - Image will be resized to (size, size)
                                    Tuple (rows,cols) - Image will be resized to (rows, cols)
    :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                    differ based on transforms.
    :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
    :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                    but requires more RAM and more time to instantiate the dataset when working on very large datasets.
    :param transforms:              List of transforms to apply sequentially on sample.
    :param all_classes_list:        All the class names.
    :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                    Classes not in this list will be excluded from training.
                                    Thus, the number of classes in the model must be adjusted accordingly.
    :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                            will be ignored.
    :param target_fields:                   List of the target fields. This has to include the regular target,
                                            but can also include crowd target, segmentation target, ...
                                            It has to include at least "target" but can include others.
    :param output_fields:                   Fields that will be output by __getitem__.
                                            It has to include at least "image" and "target" but can include others.
    :param verbose:                 Whether to show additional information or not, such as loading progress. (does not include warnings)
    :param show_all_warnings:       Whether to show all warnings or not.
    :param cache:                   Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
    :param cache_dir:               Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
    """
    if cache is not None:
        warnings.warn(
            "cache parameter has been marked as deprecated and setting it has no effect. "
            "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
            DeprecationWarning,
        )
    if cache_dir is not None:
        warnings.warn(
            "cache_dir parameter has been marked as deprecated and setting it has no effect. "
            "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
            DeprecationWarning,
        )

    super().__init__()
    self.verbose = verbose
    self.show_all_warnings = show_all_warnings

    if isinstance(original_target_format, DetectionTargetsFormat):
        logger.warning(
            "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
            "Support for DetectionTargetsFormat will be removed in 3.1"
        )

    self.data_dir = data_dir
    if not Path(data_dir).exists():
        raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

    # Number of images that are available (regardless of ignored images)
    n_dataset_samples = self._setup_data_source()
    if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
        raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
    n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

    self.input_dim = ensure_is_tuple_of_two(input_dim)
    self.original_target_format = original_target_format

    if len(all_classes_list) != len(set(all_classes_list)):
        raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

    if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
        raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

    self.all_classes_list = all_classes_list or self._all_classes
    self.all_classes_list = list(self.all_classes_list) if self.all_classes_list is not None else None
    self.class_inclusion_list = list(class_inclusion_list) if class_inclusion_list is not None else None
    self.classes = self.class_inclusion_list or self.all_classes_list
    if len(set(self.classes) - set(self.all_classes_list)) > 0:
        wrong_classes = set(self.classes) - set(all_classes_list)
        raise DatasetValidationException(
            f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
        )

    self.ignore_empty_annotations = ignore_empty_annotations
    self.target_fields = target_fields or ["target"]
    if "target" not in self.target_fields:
        raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

    self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

    self.transforms = transforms

    self.output_fields = output_fields or ["image", "target"]
    if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
        raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

    self._cache_annotations = cache_annotations
    self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

    # Maps (dataset index) -> (non-empty sample ids)
    self._non_empty_sample_ids: Optional[List[int]] = None

    # Some transform may require non-empty annotations to be indexed.
    transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

    # Iterate over the whole dataset to index the images with/without annotations.
    if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:
        if self._cache_annotations:
            logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
        elif self.ignore_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
            )
        elif transform_require_non_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. "
                "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
            )

        # Map indexes to sample annotations.
        non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
        if self._cache_annotations:
            if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                self._cached_annotations = non_empty_annotations
            else:
                # Non-overlapping dicts, since they map unique sample_ids -> sample
                self._cached_annotations = {**non_empty_annotations, **empty_annotations}

        if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        self._non_empty_sample_ids = list(non_empty_annotations.keys())

    self._n_samples = n_samples  # Regardless of any filtering

__len__()

Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant).

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def __len__(self) -> int:
    """Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant)."""
    return len(self._non_empty_sample_ids) if self.ignore_empty_annotations else self._n_samples

apply_transforms(sample)

Applies self.transforms sequentially to sample

If a transform has the attribute 'additional_samples_count', additional samples will be loaded and stored in sample["additional_samples"] prior to applying it. Combining it with the attribute "non_empty_annotations" will load only additional samples with objects in them.

Parameters:

Name Type Description Default
sample Dict[str, Union[np.ndarray, Any]]

Sample to apply the transforms on to (loaded with self.get_sample)

required

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Transformed sample

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
    """
    Applies self.transforms sequentially to sample

    If a transform has the attribute 'additional_samples_count', additional samples will be loaded and stored in
     sample["additional_samples"] prior to applying it. Combining it with the attribute "non_empty_annotations" will load
     only additional samples with objects in them.

    :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
    :return: Transformed sample
    """

    has_crowd_target = "crowd_target" in sample
    detection_sample = LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(sample).sanitize_sample()
    target_format_transform: Optional[DetectionTargetsFormatTransform] = None

    for transform in self.transforms:
        detection_sample.additional_samples = [
            LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(s) for s in self._get_additional_inputs_for_transform(transform=transform)
        ]
        detection_sample = transform.apply_to_sample(sample=detection_sample)

        detection_sample.additional_samples = None
        if isinstance(transform, DetectionTargetsFormatTransform):
            target_format_transform = transform

    transformed_dict = LegacyDetectionTransformMixin.convert_detection_sample_to_dict(detection_sample, include_crowd_target=has_crowd_target)
    if target_format_transform is not None:
        transformed_dict = target_format_transform(sample=transformed_dict)
    return transformed_dict
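Note that the dataset only relies on duck typing here: it checks each transform for an additional_samples_count attribute (how many extra samples to load into sample.additional_samples) and a non_empty_annotations attribute (whether those extras must contain at least one target). A hypothetical transform using this mechanism could look like the sketch below; in practice it would subclass AbstractDetectionTransform, and the mixing logic is left as a placeholder.

class NeedsExtraSampleTransform:
    """Hypothetical transform that asks the dataset for one extra, non-empty sample."""

    def __init__(self):
        self.additional_samples_count = 1  # apply_transforms() will load 1 extra sample
        self.non_empty_annotations = True  # ...picked only among samples that contain targets

    def apply_to_sample(self, sample):
        extra = sample.additional_samples[0]  # populated by DetectionDataset.apply_transforms
        # ... combine `sample` and `extra` here (e.g. mosaic/mixup-style logic) ...
        return sample

    def get_equivalent_preprocessing(self):
        # Random mixing has no deterministic inference-time equivalent.
        return []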

get_dataset_preprocessing_params()

Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB). image_processor is returned as a list of dicts to be resolved by the processing factory.

Returns:

Type Description
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def get_dataset_preprocessing_params(self):
    """
    Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB).
     image_processor is returned as a list of dicts to be resolved by the processing factory.
    :return:
    """
    pipeline = [Processings.ReverseImageChannels]
    if self.input_dim is not None:
        pipeline += [{Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}}]
    for t in self.transforms:
        pipeline += t.get_equivalent_preprocessing()
    params = dict(
        class_names=self.classes,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        iou=0.65,
        conf=0.5,
    )
    return params
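For example, the returned dictionary can be inspected directly; the exact processing steps depend on input_dim and on which transforms implement get_equivalent_preprocessing.

params = dataset.get_dataset_preprocessing_params()
print(params["class_names"])          # same as dataset.classes
print(params["image_processor"])      # a ComposeProcessing config wrapping the processing pipeline
print(params["iou"], params["conf"])  # default NMS IoU and confidence thresholds: 0.65, 0.5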

get_random_samples(count, ignore_empty_annotations=False)

Load random samples.

Parameters:

Name Type Description Default
count int

The number of samples wanted

required
ignore_empty_annotations bool

If true, only return samples with at least 1 annotation

False

Returns:

Type Description
List[Dict[str, Union[np.ndarray, Any]]]

A list of samples satisfying input params

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
    """Load random samples.

    :param count: The number of samples wanted
    :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
    :return: A list of samples satisfying input params
    """
    return [self.get_random_sample(ignore_empty_annotations) for _ in range(count)]

get_sample(index, ignore_empty_annotations=False)

Get raw sample, before any transform (beside subclassing).

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required
ignore_empty_annotations bool

If True, empty annotations will be ignored

False

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Sample, i.e. a dictionary including at least "image" and "target"

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
    """Get raw sample, before any transform (beside subclassing).
    :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :param ignore_empty_annotations:    If True, empty annotations will be ignored
    :return:                            Sample, i.e. a dictionary including at least "image" and "target"
    """
    sample_annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)
    image = self._load_resized_img(image_path=sample_annotations["img_path"])
    return {"image": image, **deepcopy(sample_annotations)}

plot(max_samples_per_plot=16, n_plots=1, plot_transformed_data=True, box_thickness=2)

Combine samples of images with bbox into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of images to be displayed per plot

16
n_plots int

Number of plots to display (each plot being a combination of img with bbox)

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e. on __getitem__). If False, the plot will be over the raw samples (i.e. on get_sample)

True

Returns:

Type Description
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
def plot(
    self,
    max_samples_per_plot: int = 16,
    n_plots: int = 1,
    plot_transformed_data: bool = True,
    box_thickness: int = 2,
):
    """Combine samples of images with bbox into plots and display the result.

    :param max_samples_per_plot:    Maximum number of images to be displayed per plot
    :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                    If False, the plot will be over the raw samples (i.e. on get_sample)
    :return:
    """
    plot_counter = 0
    input_format = self.output_target_format if plot_transformed_data else self.original_target_format
    if isinstance(input_format, DetectionTargetsFormat):
        raise ValueError(
            "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenateTransform instead."
        )

    for plot_i in range(n_plots):
        fig = plt.figure(figsize=(10, 10))

        n_subplot = int(np.ceil(max_samples_per_plot**0.5))

        # Plot `max_samples_per_plot` images.
        for img_i in range(max_samples_per_plot):
            index = img_i + plot_i * max_samples_per_plot

            # LOAD IMAGE/TARGETS
            if plot_transformed_data:
                # Access to the image and the target AFTER self.transforms
                image, targets, *_ = self[index]
            else:
                # Access to the image and the target BEFORE self.transforms
                sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                image, targets = sample["image"], sample["target"]

            # FORMAT TARGETS
            if image.shape[0] in (1, 3):  # (C, H, W) -> (H, W, C)
                image = image.transpose((1, 2, 0))

            image = self._standardize_image(image)
            image = image.astype(np.uint8)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Detection dataset works with BGR images, so we have to convert to RGB

            # Convert to XYXY_LABEL format
            targets_format_converter = ConcatenatedTensorFormatConverter(input_format=input_format, output_format=LABEL_XYXY, image_shape=image.shape)
            targets_label_xyxy = targets_format_converter(targets)

            image = DetectionVisualization.visualize_image(image_np=image, class_names=self.classes, target_boxes=targets_label_xyxy, gt_alpha=1)

            plt.subplot(n_subplot, n_subplot, img_i + 1)
            plt.imshow(image)
            plt.axis("off")

        fig.tight_layout()
        plt.show()
        plt.close()

        plot_counter += 1
        if plot_counter == n_plots:
            return
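Typical usage is simply calling the method on an instantiated dataset, e.g.:

# Show two grids of 9 samples each, after transforms (one matplotlib figure per plot).
dataset.plot(max_samples_per_plot=9, n_plots=2, plot_transformed_data=True)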

PascalVOCDetectionDataset

Bases: PascalVOCFormatDetectionDataset

Dataset for Pascal VOC object detection

Parameters:
    data_dir (str): Base directory where the dataset is stored.
    images_dir (str, optional): Directory containing all the images, relative to `data_dir`. Defaults to None.
    labels_dir (str, optional): Directory containing all the labels, relative to `data_dir`. Defaults to None.
    images_sub_directory (str, optional): Deprecated. Subdirectory within data_dir that includes images. Defaults to None.
    download (bool, optional): If True, download the dataset to `data_dir`. Defaults to False.

Dataset structure:

./data/pascal_voc
├─images
│   ├─ train2012
│   ├─ val2012
│   ├─ VOCdevkit
│   │    ├─ VOC2007
│   │    │  ├──JPEGImages
│   │    │  ├──SegmentationClass
│   │    │  ├──ImageSets
│   │    │  ├──ImageSets/Segmentation
│   │    │  ├──ImageSets/Main
│   │    │  ├──ImageSets/Layout
│   │    │  ├──Annotations
│   │    │  └──SegmentationObject
│   │    └──VOC2012
│   │       ├──JPEGImages
│   │       ├──SegmentationClass
│   │       ├──ImageSets
│   │       ├──ImageSets/Segmentation
│   │       ├──ImageSets/Main
│   │       ├──ImageSets/Action
│   │       ├──ImageSets/Layout
│   │       ├──Annotations
│   │       └──SegmentationObject
│   ├─train2007
│   ├─test2007
│   └─val2007
└─labels
    ├─train2012
    ├─val2012
    ├─train2007
    ├─test2007
    └─val2007

Note: If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

Usage: voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc", images_dir="images/train2012/JPEGImages", labels_dir="labels/train2012/Annotations", download=True)

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
@register_dataset(Datasets.PASCAL_VOC_DETECTION_DATASET)
class PascalVOCDetectionDataset(PascalVOCFormatDetectionDataset):
    """Dataset for Pascal VOC object detection

        Parameters:
            data_dir (str): Base directory where the dataset is stored.
            images_dir (str, optional): Directory containing all the images, relative to `data_dir`. Defaults to None.
            labels_dir (str, optional): Directory containing all the labels, relative to `data_dir`. Defaults to None.
            images_sub_directory (str, optional): Deprecated. Subdirectory within data_dir that includes images. Defaults to None.
            download (bool, optional): If True, download the dataset to `data_dir`. Defaults to False.

        Dataset structure:

        ./data/pascal_voc
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Note:
        If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

    Usage:
        voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc",
                                            images_dir="images/train2012/JPEGImages",
                                            labels_dir="labels/train2012/Annotations",
                                            download=True)
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        images_sub_directory: Optional[str] = None,
        images_dir: Optional[str] = None,
        labels_dir: Optional[str] = None,
        download: bool = False,
        max_num_samples: int = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: List[AbstractDetectionTransform] = [],
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """
        Initialize the Pascal VOC Detection Dataset.

        """

        # Adding a check for deprecated usage alongside new parameters
        if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
            logger.warning(
                "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
                "Prefer using 'images_dir' and 'labels_dir' for future compatibility.",
                DeprecationWarning,
            )

        elif images_sub_directory is not None:
            images_dir = images_sub_directory
            labels_dir = images_sub_directory.replace("images", "labels")
        elif images_dir is None or labels_dir is None:
            raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

        if download:
            self.download(data_dir)

        super().__init__(
            data_dir=data_dir,
            images_dir=images_dir,
            labels_dir=labels_dir,
            max_num_samples=max_num_samples,
            cache_annotations=cache_annotations,
            input_dim=input_dim,
            transforms=transforms,
            class_inclusion_list=class_inclusion_list,
            ignore_empty_annotations=ignore_empty_annotations,
            verbose=verbose,
            show_all_warnings=show_all_warnings,
            cache=cache,
            cache_dir=cache_dir,
            all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
        )

    @staticmethod
    def download(data_dir: str) -> None:
        """Download Pascal dataset in XYXY_LABEL format.

        Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/
        """

        def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
            """Parse and save the labels of an image in XYXY_LABEL format."""

            with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
                xml_parser = ElementTree.parse(f).getroot()

            labels = []
            for obj in xml_parser.iter("object"):
                cls = obj.find("name").text
                if cls in PASCAL_VOC_2012_CLASSES_LIST and not int(obj.find("difficult").text) == 1:
                    xml_box = obj.find("bndbox")

                    def get_coord(box_coord):
                        return xml_box.find(box_coord).text

                    xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                    labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

            with open(new_label_path, "w") as f:
                f.write("\n".join(labels))

        urls = [
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
        ]  # 1.86G, 17125 images
        data_dir = Path(data_dir)
        download_and_untar_from_url(urls, dir=data_dir / "images")

        # Convert
        data_path = data_dir / "images" / "VOCdevkit"
        for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
            dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
            dest_imgs_path.mkdir(exist_ok=True, parents=True)

            dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
            dest_labels_path.mkdir(exist_ok=True, parents=True)

            with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
                image_ids = f.read().strip().split()

            for id in tqdm(image_ids, desc=f"{image_set}{year}"):
                img_path = data_path / f"VOC{year}/JPEGImages/{id}.jpg"
                new_img_path = dest_imgs_path / img_path.name
                new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
                img_path.rename(new_img_path)  # Move image to dest folder
                _parse_and_save_labels(data_path, new_label_path, year, id)

__init__(data_dir, images_sub_directory=None, images_dir=None, labels_dir=None, download=False, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], class_inclusion_list=None, ignore_empty_annotations=True, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Initialize the Pascal VOC Detection Dataset.

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
@deprecated_parameter(
    "images_sub_directory",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
)
def __init__(
    self,
    data_dir: str,
    images_sub_directory: Optional[str] = None,
    images_dir: Optional[str] = None,
    labels_dir: Optional[str] = None,
    download: bool = False,
    max_num_samples: int = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: List[AbstractDetectionTransform] = [],
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """
    Initialize the Pascal VOC Detection Dataset.

    """

    # Adding a check for deprecated usage alongside new parameters
    if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
        logger.warning(
            "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
            "Prefer using 'images_dir' and 'labels_dir' for future compatibility.",
            DeprecationWarning,
        )

    elif images_sub_directory is not None:
        images_dir = images_sub_directory
        labels_dir = images_sub_directory.replace("images", "labels")
    elif images_dir is None or labels_dir is None:
        raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

    if download:
        self.download(data_dir)

    super().__init__(
        data_dir=data_dir,
        images_dir=images_dir,
        labels_dir=labels_dir,
        max_num_samples=max_num_samples,
        cache_annotations=cache_annotations,
        input_dim=input_dim,
        transforms=transforms,
        class_inclusion_list=class_inclusion_list,
        ignore_empty_annotations=ignore_empty_annotations,
        verbose=verbose,
        show_all_warnings=show_all_warnings,
        cache=cache,
        cache_dir=cache_dir,
        all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
    )

download(data_dir) staticmethod

Download Pascal dataset in XYXY_LABEL format.

Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/
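
As a concrete illustration (the file name and numbers are made up), `_parse_and_save_labels` in the source below writes one object per line as space-separated `xmin ymin xmax ymax class_id` in absolute pixel coordinates, e.g. `./data/pascal_voc/labels/train2012/2008_000008.txt`. A small helper like the following could read such a file back; it is a sketch, not part of the library.

    import numpy as np

    # Hypothetical contents of a converted label file:
    #   53 87 471 420 12
    #   158 44 289 167 14
    def read_voc_label_file(path: str) -> np.ndarray:
        """Read a converted VOC label file into an (N, 5) array of [xmin, ymin, xmax, ymax, class_id]."""
        with open(path) as f:
            rows = [line.split() for line in f.read().splitlines() if line.strip()]
        return np.array(rows, dtype=np.float32) if rows else np.zeros((0, 5), dtype=np.float32)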

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
@staticmethod
def download(data_dir: str) -> None:
    """Download Pascal dataset in XYXY_LABEL format.

    Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/
    """

    def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
        """Parse and save the labels of an image in XYXY_LABEL format."""

        with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
            xml_parser = ElementTree.parse(f).getroot()

        labels = []
        for obj in xml_parser.iter("object"):
            cls = obj.find("name").text
            if cls in PASCAL_VOC_2012_CLASSES_LIST and not int(obj.find("difficult").text) == 1:
                xml_box = obj.find("bndbox")

                def get_coord(box_coord):
                    return xml_box.find(box_coord).text

                xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

        with open(new_label_path, "w") as f:
            f.write("\n".join(labels))

    urls = [
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
    ]  # 1.86G, 17125 images
    data_dir = Path(data_dir)
    download_and_untar_from_url(urls, dir=data_dir / "images")

    # Convert
    data_path = data_dir / "images" / "VOCdevkit"
    for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
        dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
        dest_imgs_path.mkdir(exist_ok=True, parents=True)

        dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
        dest_labels_path.mkdir(exist_ok=True, parents=True)

        with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
            image_ids = f.read().strip().split()

        for id in tqdm(image_ids, desc=f"{image_set}{year}"):
            img_path = data_path / f"VOC{year}/JPEGImages/{id}.jpg"
            new_img_path = dest_imgs_path / img_path.name
            new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
            img_path.rename(new_img_path)  # Move image to dest folder
            _parse_and_save_labels(data_path, new_label_path, year, id)

PascalVOCUnifiedDetectionTrainDataset

Bases: ConcatDataset

Unified Dataset for Pascal VOC object detection.

Unified Dataset class for training on Pascal VOC object detection datasets.

This class combines datasets from multiple years (e.g., 2007, 2012) into a single dataset for training purposes.

Parameters:
    data_dir (str): Base directory where the dataset is stored.
    input_dim (tuple): Input dimension that the images should be resized to.
    cache (optional): Cache configuration.
    cache_dir (optional): Directory for cache.
    transforms (List[AbstractDetectionTransform], optional): List of transforms to apply.
    class_inclusion_list (Optional[List[str]], optional): List of classes to include.
    max_num_samples (int, optional): Maximum number of samples to include from each dataset part.
    download (bool, optional): If True, downloads the dataset parts to `data_dir`. Defaults to False.
    images_dir (Optional[str], optional): Directory containing all the images, relative to `data_dir`. Should only be used without 'images_sub_directory'.
    labels_dir (Optional[str], optional): Directory containing all the labels, relative to `data_dir`. Should only be used without 'images_sub_directory'.
    images_sub_directory (Optional[str], optional): Deprecated. Use 'images_dir' and 'labels_dir' instead for future compatibility.

Example Dataset structure:

    ./data/pascal_voc/
    ├─images
    │   ├─ train2012
    │   ├─ val2012
    │   ├─ VOCdevkit
    │   │    ├─ VOC2007
    │   │    │  ├──JPEGImages
    │   │    │  ├──SegmentationClass
    │   │    │  ├──ImageSets
    │   │    │  ├──ImageSets/Segmentation
    │   │    │  ├──ImageSets/Main
    │   │    │  ├──ImageSets/Layout
    │   │    │  ├──Annotations
    │   │    │  └──SegmentationObject
    │   │    └──VOC2012
    │   │       ├──JPEGImages
    │   │       ├──SegmentationClass
    │   │       ├──ImageSets
    │   │       ├──ImageSets/Segmentation
    │   │       ├──ImageSets/Main
    │   │       ├──ImageSets/Action
    │   │       ├──ImageSets/Layout
    │   │       ├──Annotations
    │   │       └──SegmentationObject
    │   ├─train2007
    │   ├─test2007
    │   └─val2007
    └─labels
        ├─train2012
        ├─val2012
        ├─train2007
        ├─test2007
        └─val2007
    Usage:
unified_dataset = PascalVOCUnifiedDetectionTrainDataset(data_dir="./data/pascal_voc",
                                                        input_dim=(512, 512),
                                                        download=True,
                                                        images_dir="images",
                                                        labels_dir="labels")
Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
class PascalVOCUnifiedDetectionTrainDataset(ConcatDataset):
    """Unified Dataset for Pascal VOC object detection.

    Unified Dataset class for training on Pascal VOC object detection datasets.

    This class combines datasets from multiple years (e.g., 2007, 2012) into a single dataset for training purposes.

    Parameters:
        data_dir (str): Base directory where the dataset is stored.
        input_dim (tuple): Input dimension that the images should be resized to.
        cache (optional): Cache configuration.
        cache_dir (optional): Directory for cache.
        transforms (List[AbstractDetectionTransform], optional): List of transforms to apply.
        class_inclusion_list (Optional[List[str]], optional): List of classes to include.
        max_num_samples (int, optional): Maximum number of samples to include from each dataset part.
        download (bool, optional): If True, downloads the dataset parts to `data_dir`. Defaults to False.
        images_dir (Optional[str], optional): Directory containing all the images, relative to `data_dir`. Should only be used without 'images_sub_directory'.
        labels_dir (Optional[str], optional): Directory containing all the labels, relative to `data_dir`. Should only be used without 'images_sub_directory'.
        images_sub_directory (Optional[str], optional): Deprecated. Use 'images_dir' and 'labels_dir' instead for future compatibility.


        Example Dataset structure:

            ./data/pascal_voc/
            ├─images
            │   ├─ train2012
            │   ├─ val2012
            │   ├─ VOCdevkit
            │   │    ├─ VOC2007
            │   │    │  ├──JPEGImages
            │   │    │  ├──SegmentationClass
            │   │    │  ├──ImageSets
            │   │    │  ├──ImageSets/Segmentation
            │   │    │  ├──ImageSets/Main
            │   │    │  ├──ImageSets/Layout
            │   │    │  ├──Annotations
            │   │    │  └──SegmentationObject
            │   │    └──VOC2012
            │   │       ├──JPEGImages
            │   │       ├──SegmentationClass
            │   │       ├──ImageSets
            │   │       ├──ImageSets/Segmentation
            │   │       ├──ImageSets/Main
            │   │       ├──ImageSets/Action
            │   │       ├──ImageSets/Layout
            │   │       ├──Annotations
            │   │       └──SegmentationObject
            │   ├─train2007
            │   ├─test2007
            │   └─val2007
            └─labels
                ├─train2012
                ├─val2012
                ├─train2007
                ├─test2007
                └─val2007
            Usage:
        unified_dataset = PascalVOCUnifiedDetectionTrainDataset(data_dir="./data/pascal_voc",
                                                                input_dim=(512, 512),
                                                                download=True,
                                                                images_dir="images",
                                                                labels_dir="labels")

    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility. Please use " "'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        input_dim: tuple,
        cache=None,
        cache_dir=None,
        transforms: List[AbstractDetectionTransform] = [],
        class_inclusion_list: Optional[List[str]] = None,
        max_num_samples: int = None,
        download: bool = False,
        images_dir: Optional[str] = None,
        labels_dir: Optional[str] = None,
        images_sub_directory: Optional[str] = None,  # Marked for deprecation.
    ):
        if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
            logger.warning(
                "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
                "Prefer using 'images_dir' and 'labels_dir' for future compatibility.",
                DeprecationWarning,
            )
        if download:
            PascalVOCDetectionDataset.download(data_dir=data_dir)

        train_dataset_names = ["train2007", "val2007", "train2012", "val2012"]
        if max_num_samples:
            max_num_samples_per_train_dataset = [len(segment) for segment in np.array_split(range(max_num_samples), len(train_dataset_names))]
        else:
            max_num_samples_per_train_dataset = [None] * len(train_dataset_names)

        train_sets = []
        for i, trainset_name in enumerate(train_dataset_names):
            dataset_kwargs = {
                "data_dir": data_dir,
                "input_dim": input_dim,
                "cache": cache,
                "cache_dir": cache_dir,
                "transforms": transforms,
                "class_inclusion_list": class_inclusion_list,
                "max_num_samples": max_num_samples_per_train_dataset[i],
            }
            if images_dir is not None and labels_dir is not None:
                dataset_kwargs["images_dir"] = os.path.join(images_dir, trainset_name)
                dataset_kwargs["labels_dir"] = os.path.join(labels_dir, trainset_name)
            elif images_sub_directory is not None:
                deprecated_images_path = os.path.join("images", trainset_name)
                deprecated_labels_path = os.path.join("labels", trainset_name)
                dataset_kwargs["images_dir"] = deprecated_images_path
                dataset_kwargs["labels_dir"] = deprecated_labels_path
            else:
                raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

            train_sets.append(PascalVOCDetectionDataset(**dataset_kwargs))

        super(PascalVOCUnifiedDetectionTrainDataset, self).__init__(train_sets)

PascalVOCFormatDetectionDataset

Bases: DetectionDataset

Dataset for Pascal VOC object detection

Parameters:
    data_dir (str): Base directory where the dataset is stored.
    images_dir (Optional[str]): Directory containing all the images, relative to `data_dir`. Defaults to None.
    labels_dir (Optional[str]): Directory containing all the labels, relative to `data_dir`. Defaults to None.
    max_num_samples (Optional[int]): If not None, sets the maximum size of the dataset by only indexing the first n annotations/images. Defaults to None.
    cache_annotations (bool): Whether to cache annotations. Reduces training time by pre-loading all annotations but requires more RAM. Defaults to True.
    input_dim (Optional[Union[int, Tuple[int, int]]]): Image size when loaded, before transforms. Can be None, a scalar, or a tuple (height, width). Defaults to None.
    transforms (List[AbstractDetectionTransform]): List of transforms to apply sequentially on each sample. Defaults to an empty list.
    all_classes_list (Optional[List[str]]): All class names in the dataset. Defaults to an empty list.
    class_inclusion_list (Optional[List[str]]): Subset of classes to include. Classes not in this list will be excluded. Adjust the number of model classes accordingly. Defaults to None.
    ignore_empty_annotations (bool): If True and class_inclusion_list is not None, images without any target will be ignored. Defaults to True.
    verbose (bool): If True, displays additional information (does not include warnings). Defaults to True.
    show_all_warnings (bool): If True, displays all warnings. Defaults to False.
    cache (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a future version.
    cache_dir (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a future version.



Dataset structure:

./data/pascal_voc
├─images
│   ├─ train2012
│   ├─ val2012
│   ├─ VOCdevkit
│   │    ├─ VOC2007
│   │    │  ├──JPEGImages
│   │    │  ├──SegmentationClass
│   │    │  ├──ImageSets
│   │    │  ├──ImageSets/Segmentation
│   │    │  ├──ImageSets/Main
│   │    │  ├──ImageSets/Layout
│   │    │  ├──Annotations
│   │    │  └──SegmentationObject
│   │    └──VOC2012
│   │       ├──JPEGImages
│   │       ├──SegmentationClass
│   │       ├──ImageSets
│   │       ├──ImageSets/Segmentation
│   │       ├──ImageSets/Main
│   │       ├──ImageSets/Action
│   │       ├──ImageSets/Layout
│   │       ├──Annotations
│   │       └──SegmentationObject
│   ├─train2007
│   ├─test2007
│   └─val2007
└─labels
    ├─train2012
    ├─val2012
    ├─train2007
    ├─test2007
    └─val2007

Note: If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

Usage: voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc", images_dir="images/train2012/JPEGImages", labels_dir="labels/train2012/Annotations", download=True)
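
Since this class is the format-level base (it takes an explicit `all_classes_list` instead of the fixed VOC classes), a hypothetical sketch of pointing it at a custom VOC-style dataset could look as follows; all paths and class names are placeholders.

    from super_gradients.training.datasets.detection_datasets.pascal_voc_format_detection import PascalVOCFormatDetectionDataset

    # Placeholder layout: <data_dir>/<images_dir>/*.jpg and <data_dir>/<labels_dir>/*.txt (XYXY_LABEL rows).
    dataset = PascalVOCFormatDetectionDataset(
        data_dir="./data/my_voc_style_dataset",
        images_dir="images/train",
        labels_dir="labels/train",
        all_classes_list=["cat", "dog"],  # placeholder class names
        input_dim=(640, 640),             # targets are rescaled to this size in _load_annotation
    )
    print(len(dataset))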

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_format_detection.py
@register_dataset("PascalVOCFormatDetectionDataset")
class PascalVOCFormatDetectionDataset(DetectionDataset):
    """Dataset for Pascal VOC object detection

    Parameters:
        data_dir (str): Base directory where the dataset is stored.

        images_dir (Optional[str]): Directory containing all the images, relative to `data_dir`. Defaults to None.

        labels_dir (Optional[str]): Directory containing all the labels, relative to `data_dir`. Defaults to None.

        max_num_samples (Optional[int]): If not None, sets the maximum size of the dataset by only indexing the first
         n annotations/images. Defaults to None.

        cache_annotations (bool): Whether to cache annotations. Reduces training time by pre-loading all annotations
         but requires more RAM. Defaults to True.

        input_dim (Optional[Union[int, Tuple[int, int]]]): Image size when loaded, before transforms. Can be None, a scalar,
         or a tuple (height, width). Defaults to None.

        transforms (List[AbstractDetectionTransform]): List of transforms to apply sequentially on each sample.
         Defaults to an empty list.

        all_classes_list (Optional[List[str]]): All class names in the dataset. Defaults to an empty list.

        class_inclusion_list (Optional[List[str]]): Subset of classes to include. Classes not in this list will be excluded.
         Adjust the number of model classes accordingly. Defaults to None.

        ignore_empty_annotations (bool): If True and class_inclusion_list is not None, images without any target will be
         ignored. Defaults to True.

        verbose (bool): If True, displays additional information (does not include warnings). Defaults to True.

        show_all_warnings (bool): If True, displays all warnings. Defaults to False.

        cache (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a
         future version.

        cache_dir (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in
         a future version.



        Dataset structure:

        ./data/pascal_voc
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Note:
        If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

    Usage:
        voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc",
                                            images_dir="images/train2012/JPEGImages",
                                            labels_dir="labels/train2012/Annotations",
                                            download=True)
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        labels_dir: str,
        max_num_samples: int = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: List[AbstractDetectionTransform] = [],
        all_classes_list: Optional[List[str]] = [],
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """
        Initialize the Pascal VOC Detection Dataset.

        """

        self.data_dir = data_dir

        self.images_dir = os.path.join(data_dir, images_dir)
        self.labels_dir = os.path.join(data_dir, labels_dir)

        super(PascalVOCFormatDetectionDataset, self).__init__(
            data_dir=data_dir,
            original_target_format=XYXY_LABEL,
            max_num_samples=max_num_samples,
            cache_annotations=cache_annotations,
            input_dim=input_dim,
            transforms=transforms,
            all_classes_list=all_classes_list,
            class_inclusion_list=class_inclusion_list,
            ignore_empty_annotations=ignore_empty_annotations,
            verbose=verbose,
            show_all_warnings=show_all_warnings,
            cache=cache,
            cache_dir=cache_dir,
        )

    def _setup_data_source(self) -> int:
        """Initialize img_and_target_path_list and warn if label file is missing

        :return: List of tuples made of (img_path,target_path)
        """
        if not Path(self.images_dir).exists():
            raise FileNotFoundError(f"{self.images_dir} not found.")

        img_files = list(sorted(glob.glob(os.path.join(self.images_dir, "*.jpg"))))
        if len(img_files) == 0:
            raise FileNotFoundError(f"No image files found in {self.images_dir}")

        target_files = [os.path.join(self.labels_dir, os.path.basename(img_file).replace(".jpg", ".txt")) for img_file in img_files]

        img_and_target_path_list = [(img_file, target_file) for img_file, target_file in zip(img_files, target_files) if os.path.exists(target_file)]
        if len(img_and_target_path_list) == 0:
            raise FileNotFoundError("No target files associated with the images were found")

        num_missing_files = len(img_files) - len(img_and_target_path_list)
        if num_missing_files > 0:
            logger.warning(f"{num_missing_files} label files were not loaded out of {len(img_files)} image files")

        self.img_and_target_path_list = img_and_target_path_list
        return len(self.img_and_target_path_list)

    def _load_annotation(self, sample_id: int) -> dict:
        """Load annotations for a given sample.

        :return: Annotation including:
                    - target in XYXY_LABEL format
                    - img_path
        """
        img_path, target_path = self.img_and_target_path_list[sample_id]
        with open(target_path, "r") as file:
            target = np.array([x.split() for x in file.read().splitlines()], dtype=np.float32)

        height, width = get_image_size_from_path(img_path)
        r = min(self.input_dim[1] / height, self.input_dim[0] / width)
        target[:, :4] *= r
        resized_img_shape = (int(height * r), int(width * r))

        return {"img_path": img_path, "target": target, "resized_img_shape": resized_img_shape}

__init__(data_dir, images_dir, labels_dir, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Initialize the Pascal VOC Detection Dataset.

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_format_detection.py
@deprecated_parameter(
    "images_sub_directory",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
)
def __init__(
    self,
    data_dir: str,
    images_dir: str,
    labels_dir: str,
    max_num_samples: int = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: List[AbstractDetectionTransform] = [],
    all_classes_list: Optional[List[str]] = [],
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """
    Initialize the Pascal VOC Detection Dataset.

    """

    self.data_dir = data_dir

    self.images_dir = os.path.join(data_dir, images_dir)
    self.labels_dir = os.path.join(data_dir, labels_dir)

    super(PascalVOCFormatDetectionDataset, self).__init__(
        data_dir=data_dir,
        original_target_format=XYXY_LABEL,
        max_num_samples=max_num_samples,
        cache_annotations=cache_annotations,
        input_dim=input_dim,
        transforms=transforms,
        all_classes_list=all_classes_list,
        class_inclusion_list=class_inclusion_list,
        ignore_empty_annotations=ignore_empty_annotations,
        verbose=verbose,
        show_all_warnings=show_all_warnings,
        cache=cache,
        cache_dir=cache_dir,
    )

RoboflowDetectionDataset

Bases: COCOFormatDetectionDataset

Dataset that can be used with ANY of the Roboflow100 benchmark datasets for object detection. Check out the datasets at https://universe.roboflow.com/roboflow-100?ref=blog.roboflow.com

To use this Dataset you need to:

- Follow the official instructions to download Roboflow100: https://github.com/roboflow/roboflow-100-benchmark?ref=roboflow-blog
    //!\ To use this dataset, you have to download the "coco" format, NOT the yolov5.

- Your dataset should look like this:
    rf100
    ├── 4-fold-defect
    │      ├─ train
    │      │    ├─ 000000000001.jpg
    │      │    ├─ ...
    │      │    └─ _annotations.coco.json
    │      ├─ valid
    │      │    └─ ...
    │      └─ test
    │           └─ ...
    ├── abdomen-mri
    │      └─ ...
    └── ...

- Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

- Instantiate the dataset (in this case we load the dataset called "digits-t2eg6"):
    >> train_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="train")
    >> valid_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="valid")

Note: dataset_name refers to the official name of the dataset. You can run RoboflowDetectionDataset.list_datasets() to see all available datasets, OR you can find it in the URL of the dataset: https://universe.roboflow.com/roboflow-100/digits-t2eg6 -> digits-t2eg6
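
Putting the steps above together, a hypothetical end-to-end usage might look like the sketch below; `<path-to>/rf100` is the same placeholder used in the examples above.

    from super_gradients.training.datasets.detection_datasets.roboflow.roboflow100 import RoboflowDetectionDataset

    # Pick a dataset name either from its URL or programmatically.
    print(RoboflowDetectionDataset.list_datasets()[:5])  # first few of the 100 dataset names

    train_set = RoboflowDetectionDataset(data_dir="<path-to>/rf100", dataset_name="digits-t2eg6", split="train")
    valid_set = RoboflowDetectionDataset(data_dir="<path-to>/rf100", dataset_name="digits-t2eg6", split="valid")

    print(train_set.metadata)  # per-dataset metadata; each dataset belongs to exactly one category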

Source code in src/super_gradients/training/datasets/detection_datasets/roboflow/roboflow100.py
class RoboflowDetectionDataset(COCOFormatDetectionDataset):
    """Dataset that can be used with ANY of the Roboflow100 benchmark datasets for object detection.
    Checkout the datasets at https://universe.roboflow.com/roboflow-100?ref=blog.roboflow.com

    To use this Dataset you need to:

        - Follow the official instructions to download Roboflow100: https://github.com/roboflow/roboflow-100-benchmark?ref=roboflow-blog
            //!\\ To use this dataset, you have to download the "coco" format, NOT the yolov5.

        - Your dataset should look like this:
            rf100
            ├── 4-fold-defect
            │      ├─ train
            │      │    ├─ 000000000001.jpg
            │      │    ├─ ...
            │      │    └─ _annotations.coco.json
            │      ├─ valid
            │      │    └─ ...
            │      └─ test
            │           └─ ...
            ├── abdomen-mri
            │      └─ ...
            └── ...

        - Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

        - Instantiate the dataset (in this case we load the dataset called "digits-t2eg6")"
            >> train_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="train")
            >> valid_set = RoboflowDetectionDataset(data_dir='<path-to>/rf100', dataset_name="digits-t2eg6", split="valid")

    Note: `dataset_name` refers to the official name of the dataset. You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)
          OR you can find it in the url of the dataset: https://universe.roboflow.com/roboflow-100/digits-t2eg6 -> digits-t2eg6
    """

    def __init__(self, data_dir: str, dataset_name: str, split: str, *args, **kwargs):
        """
        :param data_dir:        Where the data is stored.
        :param dataset_name:    One of the 100 dataset name. (You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)
        :param split:           train, valid or test.
        """
        if split not in ("train", "valid", "test"):
            raise ValueError(f"split must be one of ('train', 'valid', 'test'). Got '{split}'.")

        self.dataset_name = dataset_name
        dataset_split_dir = os.path.join(dataset_name, split)
        json_annotation_file = os.path.join(dataset_split_dir, "_annotations.coco.json")

        super().__init__(data_dir=data_dir, json_annotation_file=json_annotation_file, images_dir=dataset_split_dir, class_ids_to_ignore=[0], *args, **kwargs)

    @staticmethod
    def list_datasets(categories: Optional[List[str]] = None) -> List[str]:
        """List all available datasets of specified categories. By default, list all the datasets."""
        return list_datasets(categories=categories)

    @property
    def metadata(self) -> Optional[Dict[str, Union[str, int]]]:
        """Category of the dataset. Note that each dataset has one and only one category."""
        return get_dataset_metadata(self.dataset_name)

metadata: Optional[Dict[str, Union[str, int]]] property

Category of the dataset. Note that each dataset has one and only one category.

__init__(data_dir, dataset_name, split, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
dataset_name str

One of the 100 dataset names. (You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)

required
split str

train, valid or test.

required
Source code in src/super_gradients/training/datasets/detection_datasets/roboflow/roboflow100.py
def __init__(self, data_dir: str, dataset_name: str, split: str, *args, **kwargs):
    """
    :param data_dir:        Where the data is stored.
    :param dataset_name:    One of the 100 dataset name. (You can run RoboflowDetectionDataset.list_datasets() to see all available datasets)
    :param split:           train, valid or test.
    """
    if split not in ("train", "valid", "test"):
        raise ValueError(f"split must be one of ('train', 'valid', 'test'). Got '{split}'.")

    self.dataset_name = dataset_name
    dataset_split_dir = os.path.join(dataset_name, split)
    json_annotation_file = os.path.join(dataset_split_dir, "_annotations.coco.json")

    super().__init__(data_dir=data_dir, json_annotation_file=json_annotation_file, images_dir=dataset_split_dir, class_ids_to_ignore=[0], *args, **kwargs)

list_datasets(categories=None) staticmethod

List all available datasets of specified categories. By default, list all the datasets.

Source code in src/super_gradients/training/datasets/detection_datasets/roboflow/roboflow100.py
@staticmethod
def list_datasets(categories: Optional[List[str]] = None) -> List[str]:
    """List all available datasets of specified categories. By default, list all the datasets."""
    return list_datasets(categories=categories)

get_dataset_metadata(dataset_name)

Get the metadata of a specific roboflow dataset.

Parameters:

Name Type Description Default
dataset_name str

Name of the dataset, as listed in the official repo - https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv

required

Returns:

Type Description
Optional[Dict[str, Union[str, int]]]

Metadata of the dataset

Source code in src/super_gradients/training/datasets/detection_datasets/roboflow/utils.py
def get_dataset_metadata(dataset_name: str) -> Optional[Dict[str, Union[str, int]]]:
    """Get the metadata of a specific roboflow dataset.
    :param dataset_name: Name of the dataset, as listed in the official repo -
                            https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv
    :return:             Metadata of the dataset
    """
    dataset_metadata = DATASETS_METADATA.get(dataset_name)
    if dataset_metadata is None:
        logger.warning(f"No metadata found for dataset_name={dataset_name}. This might be due to a recent change in the dataset name.")
    return dataset_metadata

get_dataset_num_classes(dataset_name)

Get the number of classes of a specific roboflow dataset.

Parameters:

Name Type Description Default
dataset_name str

Name of the dataset, as listed in the official repo - https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv

required

Returns:

Type Description
int

Number of classes of the dataset. Note that the number of classes in the official documentation is different to the actual one.

Source code in src/super_gradients/training/datasets/detection_datasets/roboflow/utils.py
def get_dataset_num_classes(dataset_name: str) -> int:
    """Get the number of classes of a specific roboflow dataset.
    :param dataset_name: Name of the dataset, as listed in the official repo -
                            https://github.com/roboflow/roboflow-100-benchmark/blob/main/metadata/datasets_stats.csv
    :return:             Number of classes of the dataset. Note that the number of classes in the official documentation is different to the actual one.
    """
    metadata = get_dataset_metadata(dataset_name)
    if metadata is None:
        raise ValueError(f"No num_classes found for dataset_name={dataset_name}. This might be due to a recent change in the dataset name.")
    return metadata["num_classes_found"]

list_datasets(categories=None)

List all available datasets of specified categories. By default, list all the datasets.

Source code in src/super_gradients/training/datasets/detection_datasets/roboflow/utils.py
def list_datasets(categories: List[str] = None) -> List[str]:
    """List all available datasets of specified categories. By default, list all the datasets."""
    categories = categories or DATASETS_CATEGORIES
    return [dataset_name for dataset_name, metadata in DATASETS_METADATA.items() if metadata["category"] in categories]
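
A short, hypothetical sketch that ties the three helpers above together; the dataset name is the example used elsewhere on this page.

    from super_gradients.training.datasets.detection_datasets.roboflow.utils import (
        get_dataset_metadata,
        get_dataset_num_classes,
        list_datasets,
    )

    all_names = list_datasets()               # every Roboflow100 dataset name (optionally filter by categories)
    print(len(all_names), all_names[:5])

    name = "digits-t2eg6"                     # example dataset name from the docs above
    print(get_dataset_metadata(name))         # full metadata row, or None if the name is unknown
    print(get_dataset_num_classes(name))      # number of classes actually found in the annotations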

YoloDarknetFormatDetectionDataset

Bases: DetectionDataset

Base dataset to load ANY dataset that is with a similar structure to the Yolo/Darknet dataset.

Note: For compatibility reasons, the dataset returns labels in Coco format (XYXY_LABEL) and NOT in Yolo format (LABEL_CXCYWH).

The dataset can have any structure, as long as `images_dir` and `labels_dir` are located inside `data_dir`. Each image is expected to have a label file with the same base name.

Example1:
    data_dir
    ├── images
    │      ├─ 0001.jpg
    │      ├─ 0002.jpg
    │      └─ ...
    └── labels
           ├─ 0001.txt
           ├─ 0002.txt
           └─ ...

    >> data_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="images", labels_dir="labels", classes=[<to-fill>])

Example2:
    data_dir
    ├── train
    │   ├── images
    │   │      ├─ 0001.jpg
    │   │      ├─ 0002.jpg
    │   │      └─ ...
    │   └── labels
    │          ├─ 0001.txt
    │          ├─ 0002.txt
    │          └─ ...
    └── val
        ├── images
        │      ├─ 434343.jpg
        │      ├─ 434344.jpg
        │      └─ ...
        └── labels
               ├─ 434343.txt
               ├─ 434344.txt
               └─ ...

>> train_set = YoloDarknetFormatDetectionDataset(
        data_dir='<path-to>/data_dir', images_dir="train/images", labels_dir="train/labels", classes=[<to-fill>]
    )
>> val_set = YoloDarknetFormatDetectionDataset(
        data_dir='<path-to>/data_dir', images_dir="val/images", labels_dir="val/labels", classes=[<to-fill>]
    )

Example3:
    data_dir
    ├── train
    │      ├─ 0001.jpg
    │      ├─ 0001.txt
    │      ├─ 0002.jpg
    │      ├─ 0002.txt
    │      └─ ...
    └── val
           ├─ 4343.jpg
           ├─ 4343.txt
           ├─ 4344.jpg
           ├─ 4344.txt
           └─ ...

>> train_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="train", labels_dir="train", classes=[<to-fill>])
>> val_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="val", labels_dir="val", classes=[<to-fill>])

Each label file is in LABEL_NORMALIZED_CXCYWH format:

    0 0.33 0.33 0.50 0.44
    1 0.21 0.54 0.30 0.60
    ...

Output format: XYXY_LABEL (x, y, x, y, class_id)
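
To make the conversion concrete, here is a small worked sketch (not the library's converter) that turns one normalized `label cx cy w h` row into the `x1 y1 x2 y2 class_id` layout returned by the dataset; the 480x640 image size is a made-up example.

    import numpy as np

    def yolo_row_to_xyxy_label(row, image_height, image_width):
        """Convert one LABEL_NORMALIZED_CXCYWH row to XYXY_LABEL in absolute pixels."""
        label, cx, cy, w, h = row
        x1 = (cx - w / 2) * image_width
        y1 = (cy - h / 2) * image_height
        x2 = (cx + w / 2) * image_width
        y2 = (cy + h / 2) * image_height
        return np.array([x1, y1, x2, y2, label], dtype=np.float32)

    # First row of the example label file above, on a hypothetical 480x640 (height x width) image:
    print(yolo_row_to_xyxy_label([0, 0.33, 0.33, 0.50, 0.44], image_height=480, image_width=640))
    # approximately [ 51.2  52.8 371.2 264.    0. ]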

Source code in src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py
@register_dataset("YoloDarknetFormatDetectionDataset")
class YoloDarknetFormatDetectionDataset(DetectionDataset):
    """Base dataset to load ANY dataset that is with a similar structure to the Yolo/Darknet dataset.

    **Note**: For compatibility reasons, the dataset returns labels in Coco format (XYXY_LABEL) and NOT in Yolo format (LABEL_CXCYWH).

    The dataset can have any structure, as long as `images_dir` and `labels_dir` inside `data_dir`.
    Each image is expected to have a file with the same name as the label.

    Example1:
        data_dir
        ├── images
        │      ├─ 0001.jpg
        │      ├─ 0002.jpg
        │      └─ ...
        └── labels
               ├─ 0001.txt
               ├─ 0002.txt
               └─ ...
        >> data_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="images", labels_dir="labels", classes=[<to-fill>])

    Example2:
        data_dir
        ├── train
        │   ├── images
        │   │      ├─ 0001.jpg
        │   │      ├─ 0002.jpg
        │   │      └─ ...
        │   └── labels
        │          ├─ 0001.txt
        │          ├─ 0002.txt
        │          └─ ...
        └── val
            ├── images
            │      ├─ 434343.jpg
            │      ├─ 434344.jpg
            │      └─ ...
            └── labels
                   ├─ 434343.txt
                   ├─ 434344.txt
                   └─ ...

        >> train_set = YoloDarknetFormatDetectionDataset(
                data_dir='<path-to>/data_dir', images_dir="train/images", labels_dir="train/labels", classes=[<to-fill>]
            )
        >> val_set = YoloDarknetFormatDetectionDataset(
                data_dir='<path-to>/data_dir', images_dir="val/images", labels_dir="val/labels", classes=[<to-fill>]
            )

    Example3:
        data_dir
        ├── train
        │      ├─ 0001.jpg
        │      ├─ 0001.txt
        │      ├─ 0002.jpg
        │      ├─ 0002.txt
        │      └─ ...
        └── val
               ├─ 4343.jpg
               ├─ 4343.txt
               ├─ 4344.jpg
               ├─ 4344.txt
               └─ ...

        >> train_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="train", labels_dir="train", classes=[<to-fill>])
        >> val_set = YoloDarknetFormatDetectionDataset(data_dir='<path-to>/data_dir', images_dir="val", labels_dir="val", classes=[<to-fill>])

    Each label file being in LABEL_NORMALIZED_CXCYWH format:
        0 0.33 0.33 0.50 0.44
        1 0.21 0.54 0.30 0.60
        ...


    Output format: XYXY_LABEL (x, y, x, y, class_id)
    """

    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        labels_dir: str,
        classes: List[str],
        class_ids_to_ignore: Optional[List[int]] = None,
        ignore_invalid_labels: bool = True,
        show_all_warnings: bool = False,
        *args,
        **kwargs,
    ):
        """
        :param data_dir:                Where the data is stored.
        :param images_dir:              Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
        :param labels_dir:              Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
        :param classes:                 List of class names.
        :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
        :param ignore_invalid_labels:   Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.
        :param show_all_warnings:       Whether to show every yolo format parser warnings or not.
        """
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.class_ids_to_ignore = class_ids_to_ignore or []
        self.classes = classes
        self.ignore_invalid_labels = ignore_invalid_labels
        self.show_all_warnings = show_all_warnings

        kwargs["target_fields"] = ["target"]
        kwargs["output_fields"] = ["image", "target"]
        kwargs["original_target_format"] = XYXY_LABEL  # We convert yolo format (LABEL_CXCYWH) to Coco format (XYXY_LABEL) when loading the annotation
        super().__init__(data_dir=data_dir, show_all_warnings=show_all_warnings, *args, **kwargs)

    @property
    def _all_classes(self) -> List[str]:
        return self.classes

    def _setup_data_source(self) -> int:
        """Initialize img_and_target_path_list and warn if label file is missing

        :return: number of images in the dataset
        """
        self.images_folder = os.path.join(self.data_dir, self.images_dir)
        self.labels_folder = os.path.join(self.data_dir, self.labels_dir)

        all_images_file_names = list(image_name for image_name in os.listdir(self.images_folder) if is_image(image_name))
        all_labels_file_names = list(label_name for label_name in os.listdir(self.labels_folder) if label_name.endswith(".txt"))

        remove_file_extension = lambda file_name: os.path.splitext(os.path.basename(file_name))[0]
        unique_image_file_base_names = set(remove_file_extension(image_file_name) for image_file_name in all_images_file_names)
        unique_label_file_base_names = set(remove_file_extension(label_file_name) for label_file_name in all_labels_file_names)

        images_not_in_labels = unique_image_file_base_names - unique_label_file_base_names
        if images_not_in_labels:
            logger.warning(f"{len(images_not_in_labels)} images are note associated to any label file")

        labels_not_in_images = unique_label_file_base_names - unique_image_file_base_names
        if labels_not_in_images:
            logger.warning(f"{len(labels_not_in_images)} label files are not associated to any image.")

        # Only keep names that are in both the images and the labels
        valid_base_names = unique_image_file_base_names & unique_label_file_base_names
        if len(valid_base_names) != len(all_images_file_names):
            logger.warning(
                f"As a consequence, "
                f"{len(valid_base_names)}/{len(all_images_file_names)} images and "
                f"{len(valid_base_names)}/{len(all_labels_file_names)} label files will be used."
            )

        self.images_file_names = []
        self.labels_file_names = []
        for image_full_name in all_images_file_names:
            base_name = remove_file_extension(image_full_name)
            if base_name in valid_base_names:
                self.images_file_names.append(image_full_name)
                self.labels_file_names.append(base_name + ".txt")
        return len(self.images_file_names)

    def _load_annotation(self, sample_id: int) -> dict:
        """Load relevant information of a specific image.

        :param sample_id:   Sample_id in the dataset
        :return:            Dictionary with the following keys:
            - "target":             Target Bboxes (detection) in XYXY_LABEL format
            - "initial_img_shape":  Image (height, width)
            - "resized_img_shape":  Resides image (height, width)
            - "img_path":           Path to the associated image
        """
        image_path = os.path.join(self.images_folder, self.images_file_names[sample_id])
        label_path = os.path.join(self.labels_folder, self.labels_file_names[sample_id])

        image_width, image_height = imagesize.get(image_path)
        image_shape = (image_height, image_width)

        yolo_format_target, invalid_labels = self._parse_yolo_label_file(
            label_file_path=label_path,
            num_classes=len(self.all_classes_list),
            ignore_invalid_labels=self.ignore_invalid_labels,
            show_warnings=self.show_all_warnings,
        )

        converter = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_CXCYWH, output_format=XYXY_LABEL, image_shape=image_shape)
        target = converter(yolo_format_target)

        # The base class includes a feature to resize the image, so we need to resize the target as well when self.input_dim is set.
        if self.input_dim is not None:
            r = min(self.input_dim[0] / image_height, self.input_dim[1] / image_width)
            target[:, :4] *= r
            resized_img_shape = (int(image_height * r), int(image_width * r))
        else:
            resized_img_shape = image_shape

        annotation = {
            "target": target,
            "initial_img_shape": image_shape,
            "resized_img_shape": resized_img_shape,
            "img_path": image_path,
            "id": np.array([sample_id]),
            "n_invalid_labels": len(invalid_labels),
        }
        return annotation

    @staticmethod
    def _parse_yolo_label_file(
        label_file_path: str,
        ignore_invalid_labels: bool = True,
        show_warnings: bool = True,
        num_classes: Optional[int] = None,
    ) -> Tuple[np.ndarray, List[str]]:
        """Parse a single label file in yolo format.

        #TODO: Add support for additional fields (with ConcatenatedTensorFormat)
        :param label_file_path:         Path to the label file in yolo format.
        :param ignore_invalid_labels:   Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.
        :param show_warnings:           Whether to show the warnings or not.
        :param num_classes:             Number of classes in the dataset. Used to ensure that class ids are within the range [0, num_classes - 1].
                                        If None, ignore.

        :return:
            - labels:           np.ndarray of shape (n_labels, 5) in yolo format (LABEL_NORMALIZED_CXCYWH)
            - invalid_labels:   List of lines that failed to be parsed
        """
        with open(label_file_path, "r") as f:
            lines = f.readlines()

        labels_yolo_format, invalid_labels = [], []
        for line in filter(lambda x: x != "\n", lines):
            try:
                label_id, cx, cy, w, h = line.split()
                label_id, cx, cy, w, h = int(label_id), float(cx), float(cy), float(w), float(h)

                if (num_classes is not None) and (label_id not in range(num_classes)):
                    raise ValueError(f"`class_id={label_id}` invalid. It should be between (0 - {num_classes - 1}).")

                labels_yolo_format.append([label_id, cx, cy, w, h])
            except Exception as e:
                error_msg = (
                    f"Line `{line}` of file {label_file_path} will be ignored because not cannot be parsed to (label, cx, cy, w, h) format, "
                    f"with Exception:\n{e}"
                )
                if ignore_invalid_labels:
                    invalid_labels.append(line)
                    if show_warnings:
                        logger.warning(error_msg)
                else:
                    raise RuntimeError(error_msg)
        return np.array(labels_yolo_format) if labels_yolo_format else np.zeros((0, 5)), invalid_labels
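
For reference, each line of a label file in this format describes one object as `<class_id> <cx> <cy> <w> <h>`, with coordinates normalized to [0, 1]. Below is a minimal, self-contained sketch of the same parsing logic on hypothetical in-memory lines (it does not call the dataset class itself):

import numpy as np

# Hypothetical content of a YOLO-format label file: "class_id cx cy w h" (normalized)
label_lines = ["0 0.50 0.40 0.20 0.30", "2 0.25 0.75 0.10 0.10"]

labels = []
for line in label_lines:
    class_id, cx, cy, w, h = line.split()
    labels.append([int(class_id), float(cx), float(cy), float(w), float(h)])

labels = np.array(labels)  # shape (2, 5), i.e. LABEL_NORMALIZED_CXCYWH
print(labels.shape)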

__init__(data_dir, images_dir, labels_dir, classes, class_ids_to_ignore=None, ignore_invalid_labels=True, show_all_warnings=False, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
images_dir str

Local path to directory that includes all the images. Path relative to data_dir. Can be the same as labels_dir.

required
labels_dir str

Local path to directory that includes all the labels. Path relative to data_dir. Can be the same as images_dir.

required
classes List[str]

List of class names.

required
class_ids_to_ignore Optional[List[int]]

List of class ids to ignore in the dataset. By default, doesn't ignore any class.

None
ignore_invalid_labels bool

Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.

True
show_all_warnings bool

Whether to show every YOLO-format parser warning.

False
Source code in src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py, lines 93-124
def __init__(
    self,
    data_dir: str,
    images_dir: str,
    labels_dir: str,
    classes: List[str],
    class_ids_to_ignore: Optional[List[int]] = None,
    ignore_invalid_labels: bool = True,
    show_all_warnings: bool = False,
    *args,
    **kwargs,
):
    """
    :param data_dir:                Where the data is stored.
    :param images_dir:              Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
    :param labels_dir:              Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
    :param classes:                 List of class names.
    :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesn't ignore any class.
    :param ignore_invalid_labels:   Whether to ignore labels that fail to be parsed. If True ignores and logs a warning, otherwise raise an error.
    :param show_all_warnings:       Whether to show every YOLO-format parser warning.
    """
    self.images_dir = images_dir
    self.labels_dir = labels_dir
    self.class_ids_to_ignore = class_ids_to_ignore or []
    self.classes = classes
    self.ignore_invalid_labels = ignore_invalid_labels
    self.show_all_warnings = show_all_warnings

    kwargs["target_fields"] = ["target"]
    kwargs["output_fields"] = ["image", "target"]
    kwargs["original_target_format"] = XYXY_LABEL  # We convert yolo format (LABEL_CXCYWH) to Coco format (XYXY_LABEL) when loading the annotation
    super().__init__(data_dir=data_dir, show_all_warnings=show_all_warnings, *args, **kwargs)
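
As a usage sketch: assuming a layout of `./my_dataset/images/*.jpg` and `./my_dataset/labels/*.txt`, and that this class is exposed as `YoloDarknetFormatDetectionDataset` (check the exact class name and import path in your installed version):

from super_gradients.training.datasets.detection_datasets.yolo_format_detection import YoloDarknetFormatDetectionDataset

dataset = YoloDarknetFormatDetectionDataset(
    data_dir="./my_dataset",   # hypothetical paths
    images_dir="images",
    labels_dir="labels",
    classes=["person", "car"],
)
print(len(dataset))  # number of image/label pairs found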

Mixup and Cutmix

Papers: mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412)

CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899)

Code Reference: CutMix: https://github.com/clovaai/CutMix-PyTorch CutMix by timm: https://github.com/rwightman/pytorch-image-models/timm

CollateMixup

Collate with Mixup/Cutmix that applies different params to each element or the whole batch. A Mixup implementation that is performed while collating the batches.

Source code in src/super_gradients/training/datasets/mixup.py, lines 103-313
@register_collate_function()
class CollateMixup:
    """
    Collate with Mixup/Cutmix that applies different params to each element or whole batch
    A Mixup impl that's performed while collating the batches.
    """

    def __init__(
        self,
        mixup_alpha: float = 1.0,
        cutmix_alpha: float = 0.0,
        cutmix_minmax: List[float] = None,
        prob: float = 1.0,
        switch_prob: float = 0.5,
        mode: str = "batch",
        correct_lam: bool = True,
        label_smoothing: float = 0.1,
        num_classes: int = 1000,
    ):
        """
        Mixup/Cutmix that applies different params to each element or whole batch

        :param mixup_alpha: mixup alpha value, mixup is active if > 0.
        :param cutmix_alpha: cutmix alpha value, cutmix is active if > 0.
        :param cutmix_minmax: cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
        :param prob: probability of applying mixup or cutmix per batch or element
        :param switch_prob: probability of switching to cutmix instead of mixup when both are active
        :param mode: how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element))
        :param correct_lam: apply lambda correction when cutmix bbox clipped by image borders
        :param label_smoothing: apply label smoothing to the mixed target tensor
        :param num_classes: number of classes for target
        """
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha
        self.cutmix_minmax = cutmix_minmax
        if self.cutmix_minmax is not None:
            assert len(self.cutmix_minmax) == 2
            # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
            self.cutmix_alpha = 1.0
        self.mix_prob = prob
        self.switch_prob = switch_prob
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.mode = mode
        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
        self.mixup_enabled = True  # set to false to disable mixing (intended to be set by the train loop)

    def _params_per_elem(self, batch_size):
        """
        Generate two random masks to define which elements of the batch will be mixed and how (depending on the
        self.mixup_enabled, self.mixup_alpha, self.cutmix_alpha parameters).

        :param batch_size:
        :return: two tensors with shape=batch_size - the first contains the lambda value per batch element
        and the second is a binary flag indicating use of cutmix per batch element
        """
        lam = torch.ones(batch_size, dtype=torch.float32)
        use_cutmix = torch.zeros(batch_size, dtype=torch.bool)
        if self.mixup_enabled:
            if self.mixup_alpha > 0.0 and self.cutmix_alpha > 0.0:
                use_cutmix = torch.rand(batch_size) < self.switch_prob
                lam_mix = torch.where(
                    use_cutmix,
                    torch.distributions.beta.Beta(self.cutmix_alpha, self.cutmix_alpha).sample(sample_shape=batch_size),
                    torch.distributions.beta.Beta(self.mixup_alpha, self.mixup_alpha).sample(sample_shape=batch_size),
                )
            elif self.mixup_alpha > 0.0:
                lam_mix = torch.distributions.beta.Beta(self.mixup_alpha, self.mixup_alpha).sample(sample_shape=batch_size)
            elif self.cutmix_alpha > 0.0:
                use_cutmix = torch.ones(batch_size, dtype=torch.bool)
                lam_mix = torch.distributions.beta.Beta(self.cutmix_alpha, self.cutmix_alpha).sample(sample_shape=batch_size)
            else:
                raise IllegalDatasetParameterException("One of mixup_alpha > 0., cutmix_alpha > 0., " "cutmix_minmax not None should be true.")
            lam = torch.where(torch.rand(batch_size) < self.mix_prob, lam_mix.type(torch.float32), lam)
        return lam, use_cutmix

    def _params_per_batch(self):
        """
        Generate two random parameters to define if the batch will be mixed and how (depending on the
        self.mixup_enabled, self.mixup_alpha, self.cutmix_alpha parameters).

        :return: two parameters - the first contains the lambda value for the whole batch
        and the second is a binary flag indicating use of cutmix for the batch
        """
        lam = 1.0
        use_cutmix = False

        if self.mixup_enabled and torch.rand(1) < self.mix_prob:
            if self.mixup_alpha > 0.0 and self.cutmix_alpha > 0.0:
                use_cutmix = torch.rand(1) < self.switch_prob
                lam_mix = (
                    torch.distributions.beta.Beta(self.cutmix_alpha, self.cutmix_alpha).sample()
                    if use_cutmix
                    else torch.distributions.beta.Beta(self.mixup_alpha, self.mixup_alpha).sample()
                )
            elif self.mixup_alpha > 0.0:
                lam_mix = torch.distributions.beta.Beta(self.mixup_alpha, self.mixup_alpha).sample()
            elif self.cutmix_alpha > 0.0:
                use_cutmix = True
                lam_mix = torch.distributions.beta.Beta(self.cutmix_alpha, self.cutmix_alpha).sample()
            else:
                raise IllegalDatasetParameterException("One of mixup_alpha > 0., cutmix_alpha > 0., " "cutmix_minmax not None should be true.")
            lam = float(lam_mix)
        return lam, use_cutmix

    def _mix_elem_collate(self, output: torch.Tensor, batch: list, half: bool = False):
        """
        This is the implementation for 'elem' or 'half' modes
        :param output: the output tensor to fill
        :param batch: list of the batch items
        :return: a tensor containing the lambda values used for the mixing (this vector can be used for
        mixing the labels as well)
        """
        batch_size = len(batch)
        num_elem = batch_size // 2 if half else batch_size
        assert len(output) == num_elem
        lam_batch, use_cutmix = self._params_per_elem(num_elem)
        for i in range(num_elem):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed = batch[i][0]
            if lam != 1.0:
                if use_cutmix[i]:
                    if not half:
                        mixed = torch.clone(mixed)
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    mixed = mixed * lam + batch[j][0] * (1 - lam)
            output[i] += mixed
        if half:
            lam_batch = torch.cat((lam_batch, torch.ones(num_elem)))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_pair_collate(self, output: torch.Tensor, batch: list):
        """
        This is the implementation for 'pair' mode
        :param output: the output tensor to fill
        :param batch: list of the batch items
        :return: a tensor containing the lambda values used for the mixing (this vector can be used for
        mixing the labels as well)
        """
        batch_size = len(batch)
        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
        for i in range(batch_size // 2):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed_i = batch[i][0]
            mixed_j = batch[j][0]
            assert 0 <= lam <= 1.0
            if lam < 1.0:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    patch_i = torch.clone(mixed_i[:, yl:yh, xl:xh])
                    mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh]
                    mixed_j[:, yl:yh, xl:xh] = patch_i
                    lam_batch[i] = lam
                else:
                    mixed_temp = mixed_i.type(torch.float32) * lam + mixed_j.type(torch.float32) * (1 - lam)
                    mixed_j = mixed_j.type(torch.float32) * lam + mixed_i.type(torch.float32) * (1 - lam)
                    mixed_i = mixed_temp
                    torch.rint(mixed_j, out=mixed_j)
                    torch.rint(mixed_i, out=mixed_i)
            output[i] += mixed_i
            output[j] += mixed_j
        lam_batch = torch.cat((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_batch_collate(self, output: torch.Tensor, batch: list):
        """
        This is the implementation for 'batch' mode
        :param output: the output tensor to fill
        :param batch: list of the batch items
        :return: the lambda value used for the mixing
        """
        batch_size = len(batch)
        lam, use_cutmix = self._params_per_batch()
        if use_cutmix:
            (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
        for i in range(batch_size):
            j = batch_size - i - 1
            mixed = batch[i][0]
            if lam != 1.0:
                if use_cutmix:
                    mixed = torch.clone(mixed)  # don't want to modify the original while iterating
                    mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                else:
                    mixed = mixed * lam + batch[j][0] * (1 - lam)
            output[i] += mixed
        return lam

    def __call__(self, batch, _=None):
        batch_size = len(batch)
        if batch_size % 2 != 0:
            raise IllegalDatasetParameterException("Batch size should be even when using this")
        half = "half" in self.mode
        if half:
            batch_size //= 2
        output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.float32)
        if self.mode == "elem" or self.mode == "half":
            lam = self._mix_elem_collate(output, batch, half=half)
        elif self.mode == "pair":
            lam = self._mix_pair_collate(output, batch)
        else:
            lam = self._mix_batch_collate(output, batch)
        target = torch.tensor([b[1] for b in batch], dtype=torch.int32)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device="cpu")
        target = target[:batch_size]

        return output, target

__init__(mixup_alpha=1.0, cutmix_alpha=0.0, cutmix_minmax=None, prob=1.0, switch_prob=0.5, mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000)

Mixup/Cutmix that applies different params to each element or whole batch

Parameters:

Name Type Description Default
mixup_alpha float

mixup alpha value, mixup is active if > 0.

1.0
cutmix_alpha float

cutmix alpha value, cutmix is active if > 0.

0.0
cutmix_minmax List[float]

cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.

None
prob float

probability of applying mixup or cutmix per batch or element

1.0
switch_prob float

probability of switching to cutmix instead of mixup when both are active

0.5
mode str

how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element))

'batch'
correct_lam bool

apply lambda correction when cutmix bbox clipped by image borders

True
label_smoothing float

apply label smoothing to the mixed target tensor

0.1
num_classes int

number of classes for target

1000
Source code in src/super_gradients/training/datasets/mixup.py, lines 110-148
def __init__(
    self,
    mixup_alpha: float = 1.0,
    cutmix_alpha: float = 0.0,
    cutmix_minmax: List[float] = None,
    prob: float = 1.0,
    switch_prob: float = 0.5,
    mode: str = "batch",
    correct_lam: bool = True,
    label_smoothing: float = 0.1,
    num_classes: int = 1000,
):
    """
    Mixup/Cutmix that applies different params to each element or whole batch

    :param mixup_alpha: mixup alpha value, mixup is active if > 0.
    :param cutmix_alpha: cutmix alpha value, cutmix is active if > 0.
    :param cutmix_minmax: cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
    :param prob: probability of applying mixup or cutmix per batch or element
    :param switch_prob: probability of switching to cutmix instead of mixup when both are active
    :param mode: how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element))
    :param correct_lam: apply lambda correction when cutmix bbox clipped by image borders
    :param label_smoothing: apply label smoothing to the mixed target tensor
    :param num_classes: number of classes for target
    """
    self.mixup_alpha = mixup_alpha
    self.cutmix_alpha = cutmix_alpha
    self.cutmix_minmax = cutmix_minmax
    if self.cutmix_minmax is not None:
        assert len(self.cutmix_minmax) == 2
        # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
        self.cutmix_alpha = 1.0
    self.mix_prob = prob
    self.switch_prob = switch_prob
    self.label_smoothing = label_smoothing
    self.num_classes = num_classes
    self.mode = mode
    self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
    self.mixup_enabled = True  # set to false to disable mixing (intended to be set by the train loop)
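
A usage sketch wiring the collate function into a classification DataLoader (`train_dataset` is a placeholder that yields `(image_tensor, int_label)` pairs; the batch size must be even; the import path is taken from the source location above):

from torch.utils.data import DataLoader
from super_gradients.training.datasets.mixup import CollateMixup

collate_fn = CollateMixup(mixup_alpha=0.2, cutmix_alpha=1.0, mode="batch", num_classes=1000)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

images, targets = next(iter(train_loader))  # targets are mixed, label-smoothed one-hot vectors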

cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None)

Generate bbox and apply lambda correction.

Source code in src/super_gradients/training/datasets/mixup.py, lines 89-100
def cutmix_bbox_and_lam(img_shape: tuple, lam: float, ratio_minmax: Union[tuple, list] = None, correct_lam: bool = True, count: int = None):
    """
    Generate bbox and apply lambda correction.
    """
    if ratio_minmax is not None:
        yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count)
    else:
        yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count)
    if correct_lam or ratio_minmax is not None:
        bbox_area = (yu - yl) * (xu - xl)
        lam = 1.0 - bbox_area / float(img_shape[-2] * img_shape[-1])
    return (yl, yu, xl, xu), lam
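
A small sketch showing how the returned lambda relates to the cut region when `correct_lam=True` (the box position is random on every call; the import path is taken from the source location above):

from super_gradients.training.datasets.mixup import cutmix_bbox_and_lam

img_shape = (3, 224, 224)  # (C, H, W); only the last two dims are used
(yl, yu, xl, xu), lam = cutmix_bbox_and_lam(img_shape, lam=0.7, correct_lam=True)

cut_area = (yu - yl) * (xu - xl)
print(lam, 1.0 - cut_area / (224 * 224))  # the two values match when correct_lam=True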

mixup_target(target, num_classes, lam=1.0, smoothing=0.0, device='cuda')

generate a smooth target (label) two-hot tensor to support the mixed images with different labels

Parameters:

Name Type Description Default
target torch.Tensor

the targets tensor

required
num_classes int

number of classes (to set the final tensor size)

required
lam float

Weight of label A (the original label), in the range [0, 1], in the mixing.

1.0
smoothing float

the smoothing multiplier

0.0
device str

usable device ['cuda', 'cpu']

'cuda'

Returns:

Type Description
Source code in src/super_gradients/training/datasets/mixup.py, lines 27-41
def mixup_target(target: torch.Tensor, num_classes: int, lam: float = 1.0, smoothing: float = 0.0, device: str = "cuda"):
    """
    generate a smooth target (label) two-hot tensor to support the mixed images with different labels
    :param target: the targets tensor
    :param num_classes: number of classes (to set the final tensor size)
    :param lam: weight of label A (the original label), in the range [0, 1], in the mixing
    :param smoothing: the smoothing multiplier
    :param device: usable device ['cuda', 'cpu']
    :return:
    """
    off_value = smoothing / num_classes
    on_value = 1.0 - smoothing + off_value
    y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device)
    y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device)
    return y1 * lam + y2 * (1.0 - lam)
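
The same computation written out as a self-contained sketch, re-implementing the one-hot step explicitly instead of calling the module's `one_hot` helper:

import torch

def mixup_target_sketch(target: torch.Tensor, num_classes: int, lam: float = 1.0, smoothing: float = 0.0) -> torch.Tensor:
    off_value = smoothing / num_classes
    on_value = 1.0 - smoothing + off_value
    # Two-hot construction: the original labels and the batch flipped along dim 0
    y1 = torch.full((target.size(0), num_classes), off_value).scatter_(1, target.view(-1, 1), on_value)
    y2 = torch.full((target.size(0), num_classes), off_value).scatter_(1, target.flip(0).view(-1, 1), on_value)
    return y1 * lam + y2 * (1.0 - lam)

print(mixup_target_sketch(torch.tensor([1, 3]), num_classes=4, lam=0.6, smoothing=0.1))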

rand_bbox(img_shape, lam, margin=0.0, count=None)

Standard CutMix bounding-box. Generates a random square bbox based on the lambda value. This implementation includes support for enforcing a border margin as a percentage of bbox dimensions.

Parameters:

Name Type Description Default
img_shape tuple

Image shape as tuple

required
lam float

Cutmix lambda value

required
margin float

Percentage of bbox dimension to enforce as margin (reduce amount of box outside image)

0.0
count int

Number of bbox to generate

None
Source code in src/super_gradients/training/datasets/mixup.py, lines 44-64
def rand_bbox(img_shape: tuple, lam: float, margin: float = 0.0, count: int = None):
    """Standard CutMix bounding-box
    Generates a random square bbox based on lambda value. This impl includes
    support for enforcing a border margin as percent of bbox dimensions.

    :param img_shape: Image shape as tuple
    :param lam: Cutmix lambda value
    :param margin: Percentage of bbox dimension to enforce as margin (reduce amount of box outside image)
    :param count: Number of bbox to generate
    """
    ratio = np.sqrt(1 - lam)
    img_h, img_w = img_shape[-2:]
    cut_h, cut_w = int(img_h * ratio), int(img_w * ratio)
    margin_y, margin_x = int(margin * cut_h), int(margin * cut_w)
    cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count)
    cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count)
    yl = np.clip(cy - cut_h // 2, 0, img_h)
    yh = np.clip(cy + cut_h // 2, 0, img_h)
    xl = np.clip(cx - cut_w // 2, 0, img_w)
    xh = np.clip(cx + cut_w // 2, 0, img_w)
    return yl, yh, xl, xh
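
Since each side of the box is scaled by sqrt(1 - lam), the cut covers roughly a (1 - lam) fraction of the image area. A quick numeric sketch (the box centre is random, so the clipped area can come out smaller; the import path is taken from the source location above):

from super_gradients.training.datasets.mixup import rand_bbox

lam = 0.75
yl, yh, xl, xh = rand_bbox(img_shape=(3, 224, 224), lam=lam)
print((yh - yl) * (xh - xl) / (224 * 224))  # close to 1 - lam = 0.25 unless the box was clipped at a border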

rand_bbox_minmax(img_shape, minmax, count=None)

Min-Max CutMix bounding-box. Inspired by the Darknet cutmix implementation, generates a random rectangular bbox based on min/max percent values applied to each dimension of the input image.

Typical values for minmax are in the 0.2-0.3 range for min and the 0.8-0.9 range for max.

Parameters:

Name Type Description Default
img_shape tuple

Image shape as tuple

required
minmax Union[tuple, list]

Min and max bbox ratios (as percent of image size)

required
count int

Number of bbox to generate

None
Source code in src/super_gradients/training/datasets/mixup.py, lines 67-86
def rand_bbox_minmax(img_shape: tuple, minmax: Union[tuple, list], count: int = None):
    """Min-Max CutMix bounding-box
    Inspired by Darknet cutmix impl, generates a random rectangular bbox
    based on min/max percent values applied to each dimension of the input image.

    Typical values for minmax are in the 0.2-0.3 range for min and the 0.8-0.9 range for max.

    :param img_shape: Image shape as tuple
    :param minmax: Min and max bbox ratios (as percent of image size)
    :param count: Number of bbox to generate
    """
    assert len(minmax) == 2
    img_h, img_w = img_shape[-2:]
    cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count)
    cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count)
    yl = np.random.randint(0, img_h - cut_h, size=count)
    xl = np.random.randint(0, img_w - cut_w, size=count)
    yu = yl + cut_h
    xu = xl + cut_w
    return yl, yu, xl, xu

AbstractPoseEstimationDataset

Bases: Dataset, HasPreprocessingParams

Abstract class for strongly typed dataset classes for the pose estimation task. This new concept was introduced in SG 3.3 and will be used in the future to replace the old BaseKeypointsDataset. The reasoning behind strongly typed datasets includes: 1. Introduction of a new concept of a "data sample" with a clear definition (via @dataclass), thus reducing the chance of bugs/confusion. 2. The data sample becomes a central concept in data augmentation transforms and metrics. 3. The dataset implementation is decoupled from the model & loss - the dataset returns data sample objects, and model/loss-specific conversion happens only in the collate function.

Descendants should implement the load_sample method to read a sample from the disk and return PoseEstimationSample object.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/abstract_pose_estimation_dataset.py, lines 19-112
class AbstractPoseEstimationDataset(Dataset, HasPreprocessingParams):
    """
    Abstract class for strongly typed dataset classes for pose estimation task.
    This new concept was introduced in SG 3.3 and will be used in the future to replace the old BaseKeypointsDataset.
    The reasoning behind strongly typed datasets includes:
    1. Introduction of a new concept of "data sample" that has a clear definition (via @dataclass), thus reducing the chance of bugs/confusion.
    2. Data sample becomes a central concept in data augmentation transforms and metrics.
    3. Dataset implementation decoupled from the model & loss - now the dataset returns the data sample objects
       and model/loss specific conversion happens only in collate function.

    Descendants should implement the load_sample method to read a sample from the disk and return PoseEstimationSample object.
    """

    def __init__(
        self,
        transforms: List[AbstractKeypointTransform],
        num_joints: int,
        edge_links: Union[ListConfig, List[Tuple[int, int]], np.ndarray],
        edge_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
        keypoint_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
    ):
        """

        :param transforms: Transforms to be applied to the image & keypoints
        :param num_joints: Number of joints to be predicted
        :param edge_links: Edge links between joints
        :param edge_colors: Color of the edge links. If None, the color will be generated randomly.
        :param keypoint_colors: Color of the keypoints. If None, the color will be generated randomly.
        """
        super().__init__()
        self.transforms = KeypointsCompose(
            transforms,
            load_sample_fn=self.load_random_sample,
        )
        self.num_joints = num_joints

        # Explicitly convert edge_links, keypoint_colors and edge_colors to lists of tuples
        # This is necessary to ensure ListConfig objects do not leak to these properties
        # and from there - to checkpoint's state_dict.
        # Otherwise, through ListConfig instances a whole configuration file will leak to state_dict
        # and torch.load will attempt to unpickle lot of unnecessary classes.
        edge_links = [(int(from_idx), int(to_idx)) for from_idx, to_idx in edge_links]
        if edge_colors is not None:
            edge_colors = [(int(r), int(g), int(b)) for r, g, b in edge_colors]
        if keypoint_colors is not None:
            keypoint_colors = [(int(r), int(g), int(b)) for r, g, b in keypoint_colors]

        self.edge_links = edge_links
        self.edge_colors = edge_colors or generate_color_mapping(len(edge_links))
        self.keypoint_colors = keypoint_colors or generate_color_mapping(num_joints)

    @abc.abstractmethod
    def __len__(self) -> int:
        raise NotImplementedError()

    @abc.abstractmethod
    def load_sample(self, index: int) -> PoseEstimationSample:
        """
        Read a sample from the disk and return a PoseEstimationSample
        :param index: Sample index
        :return:      Returns an instance of PoseEstimationSample that holds complete sample (image and annotations)
        """
        raise NotImplementedError()

    def load_random_sample(self) -> PoseEstimationSample:
        """
        Return a random sample from the dataset

        :return: Instance of PoseEstimationSample
        """
        num_samples = len(self)
        random_index = random.randrange(0, num_samples)
        return self.load_sample(random_index)

    def __getitem__(self, index: int) -> PoseEstimationSample:
        sample = self.load_sample(index)
        sample = self.transforms.apply_to_sample(sample)
        return sample

    def get_dataset_preprocessing_params(self):
        """

        :return:
        """
        image_to_tensor = {Processings.ImagePermute: {"permutation": (2, 0, 1)}}
        pipeline = self.transforms.get_equivalent_preprocessing() + [image_to_tensor]
        params = dict(
            conf=0.05,
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            edge_links=self.edge_links,
            edge_colors=self.edge_colors,
            keypoint_colors=self.keypoint_colors,
        )
        return params

__init__(transforms, num_joints, edge_links, edge_colors, keypoint_colors)

Parameters:

Name Type Description Default
transforms List[AbstractKeypointTransform]

Transforms to be applied to the image & keypoints

required
num_joints int

Number of joints to be predicted

required
edge_links Union[ListConfig, List[Tuple[int, int]], np.ndarray]

Edge links between joints

required
edge_colors Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None]

Color of the edge links. If None, the color will be generated randomly.

required
keypoint_colors Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None]

Color of the keypoints. If None, the color will be generated randomly.

required
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/abstract_pose_estimation_dataset.py, lines 32-68
def __init__(
    self,
    transforms: List[AbstractKeypointTransform],
    num_joints: int,
    edge_links: Union[ListConfig, List[Tuple[int, int]], np.ndarray],
    edge_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
    keypoint_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
):
    """

    :param transforms: Transforms to be applied to the image & keypoints
    :param num_joints: Number of joints to be predicted
    :param edge_links: Edge links between joints
    :param edge_colors: Color of the edge links. If None, the color will be generated randomly.
    :param keypoint_colors: Color of the keypoints. If None, the color will be generated randomly.
    """
    super().__init__()
    self.transforms = KeypointsCompose(
        transforms,
        load_sample_fn=self.load_random_sample,
    )
    self.num_joints = num_joints

    # Explicitly convert edge_links, keypoint_colors and edge_colors to lists of tuples
    # This is necessary to ensure ListConfig objects do not leak to these properties
    # and from there - to checkpoint's state_dict.
    # Otherwise, through ListConfig instances a whole configuration file will leak to state_dict
    # and torch.load will attempt to unpickle lot of unnecessary classes.
    edge_links = [(int(from_idx), int(to_idx)) for from_idx, to_idx in edge_links]
    if edge_colors is not None:
        edge_colors = [(int(r), int(g), int(b)) for r, g, b in edge_colors]
    if keypoint_colors is not None:
        keypoint_colors = [(int(r), int(g), int(b)) for r, g, b in keypoint_colors]

    self.edge_links = edge_links
    self.edge_colors = edge_colors or generate_color_mapping(len(edge_links))
    self.keypoint_colors = keypoint_colors or generate_color_mapping(num_joints)

get_dataset_preprocessing_params()

Returns:

Type Description
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/abstract_pose_estimation_dataset.py, lines 98-112
def get_dataset_preprocessing_params(self):
    """

    :return:
    """
    image_to_tensor = {Processings.ImagePermute: {"permutation": (2, 0, 1)}}
    pipeline = self.transforms.get_equivalent_preprocessing() + [image_to_tensor]
    params = dict(
        conf=0.05,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        edge_links=self.edge_links,
        edge_colors=self.edge_colors,
        keypoint_colors=self.keypoint_colors,
    )
    return params

load_random_sample()

Return a random sample from the dataset

Returns:

Type Description
PoseEstimationSample

Instance of PoseEstimationSample

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/abstract_pose_estimation_dataset.py, lines 83-91
def load_random_sample(self) -> PoseEstimationSample:
    """
    Return a random sample from the dataset

    :return: Instance of PoseEstimationSample
    """
    num_samples = len(self)
    random_index = random.randrange(0, num_samples)
    return self.load_sample(random_index)

load_sample(index) abstractmethod

Read a sample from the disk and return a PoseEstimationSample

Parameters:

Name Type Description Default
index int

Sample index

required

Returns:

Type Description
PoseEstimationSample

Returns an instance of PoseEstimationSample that holds complete sample (image and annotations)

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/abstract_pose_estimation_dataset.py, lines 74-81
@abc.abstractmethod
def load_sample(self, index: int) -> PoseEstimationSample:
    """
    Read a sample from the disk and return a PoseEstimationSample
    :param index: Sample index
    :return:      Returns an instance of PoseEstimationSample that holds complete sample (image and annotations)
    """
    raise NotImplementedError()
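
A minimal subclass sketch that serves pre-built samples, just to illustrate the contract (the `PoseEstimationSample` objects are assumed to be constructed elsewhere; the import path is taken from the source location above, and `transforms`, `edge_links` etc. are passed through to the base class unchanged):

from super_gradients.training.datasets.pose_estimation_datasets.abstract_pose_estimation_dataset import AbstractPoseEstimationDataset

class ToyPoseDataset(AbstractPoseEstimationDataset):
    """Toy dataset holding a list of ready-made PoseEstimationSample objects."""

    def __init__(self, samples, transforms, num_joints, edge_links, edge_colors=None, keypoint_colors=None):
        super().__init__(
            transforms=transforms,
            num_joints=num_joints,
            edge_links=edge_links,
            edge_colors=edge_colors,
            keypoint_colors=keypoint_colors,
        )
        self.samples = samples  # list of PoseEstimationSample built elsewhere

    def __len__(self) -> int:
        return len(self.samples)

    def load_sample(self, index: int) -> "PoseEstimationSample":
        return self.samples[index]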

BaseKeypointsDataset

Bases: Dataset, HasPreprocessingParams

Base class for pose estimation datasets. Descendants should implement the load_sample method to read a sample from the disk and return (image, mask, joints, extras) tuple.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 20-139
class BaseKeypointsDataset(Dataset, HasPreprocessingParams):
    """
    Base class for pose estimation datasets.
    Descendants should implement the load_sample method to read a sample from the disk and return (image, mask, joints, extras) tuple.
    """

    def __init__(
        self,
        target_generator: KeypointsTargetsGenerator,
        transforms: List[KeypointTransform],
        min_instance_area: float,
        num_joints: int,
        edge_links: Union[ListConfig, List[Tuple[int, int]], np.ndarray],
        edge_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
        keypoint_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
    ):
        """

        :param target_generator: Target generator that will be used to generate the targets for the model.
            See DEKRTargetsGenerator for an example.
        :param transforms: Transforms to be applied to the image & keypoints
        :param min_instance_area: Minimum area of an instance to be included in the dataset
        :param num_joints: Number of joints to be predicted
        :param edge_links: Edge links between joints
        :param edge_colors: Color of the edge links. If None, the color will be generated randomly.
        :param keypoint_colors: Color of the keypoints. If None, the color will be generated randomly.
        """
        super().__init__()
        self.target_generator = target_generator
        self.transforms = KeypointsCompose(transforms)
        self.min_instance_area = min_instance_area
        self.num_joints = num_joints

        # Explicitly convert edge_links, keypoint_colors and edge_colors to lists of tuples
        # This is necessary to ensure ListConfig objects do not leak to these properties
        # and from there - to checkpoint's state_dict.
        # Otherwise, through ListConfig instances a whole configuration file will leak to state_dict
        # and torch.load will attempt to unpickle lot of unnecessary classes.
        edge_links = [(int(from_idx), int(to_idx)) for from_idx, to_idx in edge_links]
        if edge_colors is not None:
            edge_colors = [(int(r), int(g), int(b)) for r, g, b in edge_colors]
        if keypoint_colors is not None:
            keypoint_colors = [(int(r), int(g), int(b)) for r, g, b in keypoint_colors]

        self.edge_links = edge_links
        self.edge_colors = edge_colors or generate_color_mapping(len(edge_links))
        self.keypoint_colors = keypoint_colors or generate_color_mapping(num_joints)

    @abc.abstractmethod
    def __len__(self) -> int:
        raise NotImplementedError()

    @abc.abstractmethod
    def load_sample(self, index) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, Any]]:
        """
        Read a sample from the disk and return (image, mask, joints, extras) tuple
        :param index: Sample index
        :return: Tuple of (image, mask, joints, extras)
            image - Numpy array of [H,W,3] shape, which represents input RGB image
            mask - Numpy array of [H,W] shape, which represents a binary mask with zero values corresponding to an
                    ignored region which should not be used for training (contribute to loss)
            joints - Numpy array of [Num Instances, Num Joints, 3] shape, which represents the skeletons of the instances
            extras - Dictionary of extra information about the sample that should be included in `extras` dictionary.
        """
        raise NotImplementedError()

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, Any, Mapping[str, Any]]:
        img, mask, joints, extras = self.load_sample(index)
        img, mask, joints, _, _ = self.transforms(img, mask, joints, areas=None, bboxes=None)

        joints = self.filter_joints(joints, img)

        targets = self.target_generator(img, joints, mask)
        return img, targets, {"gt_joints": joints, **extras}

    def compute_area(self, joints: np.ndarray) -> np.ndarray:
        """
        Compute area of a bounding box for each instance.
        :param joints:  [Num Instances, Num Joints, 3]
        :return: [Num Instances]
        """
        w = np.max(joints[:, :, 0], axis=-1) - np.min(joints[:, :, 0], axis=-1)
        h = np.max(joints[:, :, 1], axis=-1) - np.min(joints[:, :, 1], axis=-1)
        return w * h

    def filter_joints(self, joints: np.ndarray, image: np.ndarray) -> np.ndarray:
        """
        Filter instances that are either too small or do not have visible keypoints
        :param joints: Array of shape [Num Instances, Num Joints, 3]
        :param image:
        :return: [New Num Instances, Num Joints, 3], New Num Instances <= Num Instances
        """
        # Update visibility of joints for those that are outside the image
        outside_image_mask = (joints[:, :, 0] < 0) | (joints[:, :, 1] < 0) | (joints[:, :, 0] >= image.shape[1]) | (joints[:, :, 1] >= image.shape[0])
        joints[outside_image_mask, 2] = 0

        # Filter instances with all invisible keypoints
        instances_with_visible_joints = np.count_nonzero(joints[:, :, 2], axis=-1) > 0
        joints = joints[instances_with_visible_joints]

        # Remove instances with too small area
        areas = self.compute_area(joints)
        joints = joints[areas > self.min_instance_area]

        return joints

    def get_dataset_preprocessing_params(self):
        """

        :return:
        """
        pipeline = self.transforms.get_equivalent_preprocessing()
        params = dict(
            conf=0.05,
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            edge_links=self.edge_links,
            edge_colors=self.edge_colors,
            keypoint_colors=self.keypoint_colors,
        )
        return params

__init__(target_generator, transforms, min_instance_area, num_joints, edge_links, edge_colors, keypoint_colors)

Parameters:

Name Type Description Default
target_generator KeypointsTargetsGenerator

Target generator that will be used to generate the targets for the model. See DEKRTargetsGenerator for an example.

required
transforms List[KeypointTransform]

Transforms to be applied to the image & keypoints

required
min_instance_area float

Minimum area of an instance to be included in the dataset

required
num_joints int

Number of joints to be predicted

required
edge_links Union[ListConfig, List[Tuple[int, int]], np.ndarray]

Edge links between joints

required
edge_colors Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None]

Color of the edge links. If None, the color will be generated randomly.

required
keypoint_colors Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None]

Color of the keypoints. If None, the color will be generated randomly.

required
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 26-66
def __init__(
    self,
    target_generator: KeypointsTargetsGenerator,
    transforms: List[KeypointTransform],
    min_instance_area: float,
    num_joints: int,
    edge_links: Union[ListConfig, List[Tuple[int, int]], np.ndarray],
    edge_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
    keypoint_colors: Union[ListConfig, List[Tuple[int, int, int]], np.ndarray, None],
):
    """

    :param target_generator: Target generator that will be used to generate the targets for the model.
        See DEKRTargetsGenerator for an example.
    :param transforms: Transforms to be applied to the image & keypoints
    :param min_instance_area: Minimum area of an instance to be included in the dataset
    :param num_joints: Number of joints to be predicted
    :param edge_links: Edge links between joints
    :param edge_colors: Color of the edge links. If None, the color will be generated randomly.
    :param keypoint_colors: Color of the keypoints. If None, the color will be generated randomly.
    """
    super().__init__()
    self.target_generator = target_generator
    self.transforms = KeypointsCompose(transforms)
    self.min_instance_area = min_instance_area
    self.num_joints = num_joints

    # Explicitly convert edge_links, keypoint_colors and edge_colors to lists of tuples
    # This is necessary to ensure ListConfig objects do not leak to these properties
    # and from there - to checkpoint's state_dict.
    # Otherwise, through ListConfig instances a whole configuration file will leak to state_dict
    # and torch.load will attempt to unpickle lot of unnecessary classes.
    edge_links = [(int(from_idx), int(to_idx)) for from_idx, to_idx in edge_links]
    if edge_colors is not None:
        edge_colors = [(int(r), int(g), int(b)) for r, g, b in edge_colors]
    if keypoint_colors is not None:
        keypoint_colors = [(int(r), int(g), int(b)) for r, g, b in keypoint_colors]

    self.edge_links = edge_links
    self.edge_colors = edge_colors or generate_color_mapping(len(edge_links))
    self.keypoint_colors = keypoint_colors or generate_color_mapping(num_joints)

compute_area(joints)

Compute area of a bounding box for each instance.

Parameters:

Name Type Description Default
joints np.ndarray

[Num Instances, Num Joints, 3]

required

Returns:

Type Description
np.ndarray

[Num Instances]

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 95-103
def compute_area(self, joints: np.ndarray) -> np.ndarray:
    """
    Compute area of a bounding box for each instance.
    :param joints:  [Num Instances, Num Joints, 3]
    :return: [Num Instances]
    """
    w = np.max(joints[:, :, 0], axis=-1) - np.min(joints[:, :, 0], axis=-1)
    h = np.max(joints[:, :, 1], axis=-1) - np.min(joints[:, :, 1], axis=-1)
    return w * h

filter_joints(joints, image)

Filter instances that are either too small or do not have visible keypoints

Parameters:

Name Type Description Default
joints np.ndarray

Array of shape [Num Instances, Num Joints, 3]

required
image np.ndarray required

Returns:

Type Description
np.ndarray

[New Num Instances, Num Joints, 3], New Num Instances <= Num Instances

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 105-124
def filter_joints(self, joints: np.ndarray, image: np.ndarray) -> np.ndarray:
    """
    Filter instances that are either too small or do not have visible keypoints
    :param joints: Array of shape [Num Instances, Num Joints, 3]
    :param image:
    :return: [New Num Instances, Num Joints, 3], New Num Instances <= Num Instances
    """
    # Update visibility of joints for those that are outside the image
    outside_image_mask = (joints[:, :, 0] < 0) | (joints[:, :, 1] < 0) | (joints[:, :, 0] >= image.shape[1]) | (joints[:, :, 1] >= image.shape[0])
    joints[outside_image_mask, 2] = 0

    # Filter instances with all invisible keypoints
    instances_with_visible_joints = np.count_nonzero(joints[:, :, 2], axis=-1) > 0
    joints = joints[instances_with_visible_joints]

    # Remove instances with too small area
    areas = self.compute_area(joints)
    joints = joints[areas > self.min_instance_area]

    return joints
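
To make the visibility rule concrete, here is a toy numpy-only example of the masking step for a single 2-joint instance in a 100x100 image (standalone; it does not call the method itself):

import numpy as np

joints = np.array([[[10.0, 20.0, 2.0],      # joint inside the image, stays visible
                    [150.0, 20.0, 2.0]]])   # joint outside a 100x100 image, visibility forced to 0

image_h, image_w = 100, 100
outside = (joints[:, :, 0] < 0) | (joints[:, :, 1] < 0) | (joints[:, :, 0] >= image_w) | (joints[:, :, 1] >= image_h)
joints[outside, 2] = 0
print(joints[0, :, 2])  # [2. 0.]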

get_dataset_preprocessing_params()

Returns:

Type Description
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 126-139
def get_dataset_preprocessing_params(self):
    """

    :return:
    """
    pipeline = self.transforms.get_equivalent_preprocessing()
    params = dict(
        conf=0.05,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        edge_links=self.edge_links,
        edge_colors=self.edge_colors,
        keypoint_colors=self.keypoint_colors,
    )
    return params

load_sample(index) abstractmethod

Read a sample from the disk and return (image, mask, joints, extras) tuple

Parameters:

Name Type Description Default
index

Sample index

required

Returns:

Type Description
Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, Any]]

Tuple of (image, mask, joints, extras):
image - Numpy array of [H,W,3] shape, which represents the input RGB image
mask - Numpy array of [H,W] shape, which represents a binary mask with zero values corresponding to an ignored region which should not be used for training (contribute to loss)
joints - Numpy array of [Num Instances, Num Joints, 3] shape, which represents the skeletons of the instances
extras - Dictionary of extra information about the sample that should be included in the extras dictionary.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 72-84
@abc.abstractmethod
def load_sample(self, index) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, Any]]:
    """
    Read a sample from the disk and return (image, mask, joints, extras) tuple
    :param index: Sample index
    :return: Tuple of (image, mask, joints, extras)
        image - Numpy array of [H,W,3] shape, which represents input RGB image
        mask - Numpy array of [H,W] shape, which represents a binary mask with zero values corresponding to an
                ignored region which should not be used for training (contribute to loss)
        joints - Numpy array of [Num Instances, Num Joints, 3] shape, which represents the skeletons of the instances
        extras - Dictionary of extra information about the sample that should be included in `extras` dictionary.
    """
    raise NotImplementedError()

KeypointsCollate

Collate image & targets, return extras as is.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/base_keypoints.py, lines 142-161
@register_collate_function()
class KeypointsCollate:
    """
    Collate image & targets, return extras as is.
    """

    def __call__(self, batch):
        images = []
        targets = []
        extras = []
        for image, target, extra in batch:
            images.append(image)
            targets.append(target)
            extras.append(extra)

        extras = {k: [dic[k] for dic in extras] for k in extras[0]}  # Convert list of dicts to dict of lists

        images = default_collate(images)
        targets = default_collate(targets)
        return images, targets, extras
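
A usage sketch wiring the collate function into a DataLoader (`keypoints_dataset` is a placeholder for any dataset yielding `(image, targets, extras)` items, such as the keypoint datasets above; the import path is taken from the source location above):

from torch.utils.data import DataLoader
from super_gradients.training.datasets.pose_estimation_datasets.base_keypoints import KeypointsCollate

loader = DataLoader(
    keypoints_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=KeypointsCollate(),
)
images, targets, extras = next(iter(loader))
# extras is a dict of lists, e.g. extras["gt_joints"] holds one entry per sample in the batch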

COCOKeypointsDataset

Bases: BaseKeypointsDataset

Dataset class for training pose estimation models on the COCO Keypoints dataset. Users should pass a target generator class that is model-specific and generates the targets for the model.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_keypoints.py, lines 25-205
@register_dataset(Datasets.COCO_KEY_POINTS_DATASET)
class COCOKeypointsDataset(BaseKeypointsDataset):
    """
    Dataset class for training pose estimation models on COCO Keypoints dataset.
    Users should pass a target generator class that is model-specific and generates the targets for the model.
    """

    @resolve_param("transforms", TransformsFactory())
    @resolve_param("target_generator", TargetGeneratorsFactory())
    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        json_file: str,
        include_empty_samples: bool,
        target_generator,
        transforms: List[KeypointTransform],
        min_instance_area: float,
        edge_links: Union[List[Tuple[int, int]], np.ndarray],
        edge_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
        keypoint_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
    ):
        """

        :param data_dir: Root directory of the COCO dataset
        :param images_dir: path suffix to the images directory inside the dataset_root
        :param json_file: path suffix to the json file inside the dataset_root
        :param include_empty_samples: if True, images without any annotations will be included in the dataset.
            Otherwise, they will be filtered out.
        :param target_generator: Target generator that will be used to generate the targets for the model.
            See DEKRTargetsGenerator for an example.
        :param transforms: Transforms to be applied to the image & keypoints
        :param min_instance_area: Minimum area of an instance to be included in the dataset
        :param edge_links: Edge links between joints
        :param edge_colors: Color of the edge links. If None, the color will be generated randomly.
        :param keypoint_colors: Color of the keypoints. If None, the color will be generated randomly.
        """

        json_file = os.path.join(data_dir, json_file)
        self.category_name, self.joints, self.annotations = parse_coco_into_keypoints_annotations(
            json_file,
            image_path_prefix=os.path.join(data_dir, images_dir),
            remove_duplicate_annotations=False,
            crowd_annotations_action=CrowdAnnotationActionEnum.NO_ACTION,
        )
        num_joints = len(self.joints)
        super().__init__(
            transforms=transforms,
            target_generator=target_generator,
            min_instance_area=min_instance_area,
            num_joints=num_joints,
            edge_links=edge_links,
            edge_colors=edge_colors,
            keypoint_colors=keypoint_colors,
        )

        self.non_empty_annotation_indexes = np.argwhere([len(ann.ann_keypoints) > 0 for ann in self.annotations]).flatten()
        self.include_empty_samples = include_empty_samples

    def __len__(self):
        if self.include_empty_samples:
            return len(self.annotations)
        else:
            return len(self.non_empty_annotation_indexes)

    def __getitem__(self, index: int) -> Tuple[Tensor, Any, Mapping[str, Any]]:
        img, mask, gt_joints, gt_areas, gt_bboxes, gt_iscrowd = self.load_sample(index)
        img, mask, gt_joints, gt_areas, gt_bboxes = self.transforms(img, mask, gt_joints, areas=gt_areas, bboxes=gt_bboxes)

        image_shape = img.size(1), img.size(2)
        gt_joints, gt_areas, gt_bboxes, gt_iscrowd = self.filter_joints(image_shape, gt_joints, gt_areas, gt_bboxes, gt_iscrowd)

        targets = self.target_generator(img, gt_joints, mask)
        return img, targets, {"gt_joints": gt_joints, "gt_bboxes": gt_bboxes, "gt_iscrowd": gt_iscrowd, "gt_areas": gt_areas}

    def load_sample(self, index):
        if not self.include_empty_samples:
            index = self.non_empty_annotation_indexes[index]
        ann = self.annotations[index]

        image_shape = (ann.image_height, ann.image_width)

        gt_iscrowd = ann.ann_is_crowd.copy()
        gt_joints = ann.ann_keypoints.copy()
        gt_bboxes = ann.ann_boxes_xyxy.copy()
        gt_segmentations = ann.ann_segmentations
        gt_areas = ann.ann_areas.copy()

        orig_image = cv2.imread(ann.image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        if orig_image is None:
            # This is a fallback/hack to handle the case when OpenCV cannot read some images.
            # It happens with some OpenCV versions on COCO datasets (there are 1-2 corrupted images).
            # But we generally want to read with OpenCV since it's much faster than PIL.
            from PIL import Image

            orig_image = Image.open(ann.image_path).convert("BGR")

        if orig_image.shape[0] != ann.image_height or orig_image.shape[1] != ann.image_width:
            raise RuntimeError(f"Annotated image size ({ann.image_height,ann.image_width}) does not match image size in file {orig_image.shape[:2]}")

        # Clip bboxes to image boundaries (Some annotations extend 1-2px outside of image boundaries)
        image_height, image_width = orig_image.shape[:2]
        gt_bboxes[:, 0] = np.clip(gt_bboxes[:, 0], 0, image_width)
        gt_bboxes[:, 1] = np.clip(gt_bboxes[:, 1], 0, image_height)
        gt_bboxes[:, 2] = np.clip(gt_bboxes[:, 2], 0, image_width)
        gt_bboxes[:, 3] = np.clip(gt_bboxes[:, 3], 0, image_height)
        gt_bboxes_xywh = xyxy_to_xywh(gt_bboxes, image_shape=(image_height, image_width))

        mask: np.ndarray = self._get_crowd_mask(gt_segmentations[gt_iscrowd], image_shape)

        return orig_image, mask, gt_joints, gt_areas, gt_bboxes_xywh, gt_iscrowd

    def filter_joints(
        self,
        image_shape,
        joints: np.ndarray,
        areas: np.ndarray,
        bboxes: np.ndarray,
        is_crowd: np.ndarray,
    ):
        """
        Filter instances that are either too small or do not have visible keypoints.

        :param image_shape: Image shape (rows, cols). Used to infer image boundaries
        :param joints: Array of shape [Num Instances, Num Joints, 3]
        :param areas: Array of shape [Num Instances] with area of each instance.
                      Instance area comes from the segmentation mask in the COCO annotation file.
        :param bboxes: Array of shape [Num Instances, 4] for bounding boxes in XYWH format.
                       Bounding boxes come from the COCO annotation file.
        :param is_crowd: Array of shape [Num Instances] indicating whether an instance is a crowd target.
        :return: [New Num Instances, Num Joints, 3], New Num Instances <= Num Instances
        """

        # Update visibility of joints for those that are outside the image
        outside_image_mask = (joints[:, :, 0] < 0) | (joints[:, :, 1] < 0) | (joints[:, :, 0] >= image_shape[1]) | (joints[:, :, 1] >= image_shape[0])
        joints[outside_image_mask, 2] = 0

        # Filter instances with all invisible keypoints
        instances_with_visible_joints = np.count_nonzero(joints[:, :, 2], axis=-1) > 0
        instances_with_good_area = areas > self.min_instance_area

        keep_mask = instances_with_visible_joints & instances_with_good_area

        joints = joints[keep_mask]
        areas = areas[keep_mask]
        bboxes = bboxes[keep_mask]
        is_crowd = is_crowd[keep_mask]

        return joints, areas, bboxes, is_crowd

    def _get_crowd_mask(self, segmentations: List[str], image_shape: Tuple[int, int]) -> np.ndarray:
        """
        This method computes the ignore mask, which marks crowd objects / objects w/o keypoints so that predictions in these regions do not contribute to the loss
        :return: Float mask of [H,W] shape (same as image dimensions),
            where 1.0 values correspond to pixels that should contribute to the loss, and 0.0 values indicate areas that should be excluded.
        """
        m = np.zeros(image_shape, dtype=bool)

        for segmentation in segmentations:
            mask = segmentation2mask(segmentation, image_shape)
            m[mask] = True

        return (m < 0.5).astype(np.float32)

    def get_dataset_preprocessing_params(self):
        """

        :return:
        """
        # Since we are using cv2.imread to read images, our model in fact is trained on BGR images.
        # In our pipelines the convention that input images are RGB, so we need to reverse the channels to get BGR
        # to match with the expected input of the model.
        pipeline = [Processings.ReverseImageChannels] + self.transforms.get_equivalent_preprocessing()
        params = dict(
            conf=0.05,
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            edge_links=self.edge_links,
            edge_colors=self.edge_colors,
            keypoint_colors=self.keypoint_colors,
        )
        return params

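For orientation, here is a minimal construction sketch. The paths, skeleton, transform list and DEKRTargetsGenerator settings below are illustrative assumptions rather than recommended values, and the import paths simply mirror the source file locations listed above:

from super_gradients.training.datasets.pose_estimation_datasets.coco_keypoints import COCOKeypointsDataset
from super_gradients.training.datasets.pose_estimation_datasets.target_generators import DEKRTargetsGenerator

dataset = COCOKeypointsDataset(
    data_dir="/data/coco",                                       # hypothetical dataset root
    images_dir="images/train2017",
    json_file="annotations/person_keypoints_train2017.json",
    include_empty_samples=False,
    target_generator=DEKRTargetsGenerator(output_stride=4, sigma=2.0, center_sigma=4.0, bg_weight=0.1, offset_radius=4),
    transforms=[],                                               # KeypointTransform instances; should end with a to-tensor transform for __getitem__
    min_instance_area=64.0,
    edge_links=[(0, 1), (1, 2)],                                 # toy skeleton for illustration only
    edge_colors=None,                                            # None -> colors generated randomly
    keypoint_colors=None,
)
print(len(dataset))                                              # number of non-empty samples
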
__init__(data_dir, images_dir, json_file, include_empty_samples, target_generator, transforms, min_instance_area, edge_links, edge_colors, keypoint_colors)

Parameters:

Name Type Description Default
data_dir str

Root directory of the COCO dataset

required
images_dir str

path suffix to the images directory inside the dataset_root

required
json_file str

path suffix to the json file inside the dataset_root

required
include_empty_samples bool

if True, images without any annotations will be included in the dataset. Otherwise, they will be filtered out.

required
target_generator

Target generator that will be used to generate the targets for the model. See DEKRTargetsGenerator for an example.

required
transforms List[KeypointTransform]

Transforms to be applied to the image & keypoints

required
min_instance_area float

Minimum area of an instance to be included in the dataset

required
edge_links Union[List[Tuple[int, int]], np.ndarray]

Edge links between joints

required
edge_colors Union[List[Tuple[int, int, int]], np.ndarray, None]

Color of the edge links. If None, the color will be generated randomly.

required
keypoint_colors Union[List[Tuple[int, int, int]], np.ndarray, None]

Color of the keypoints. If None, the color will be generated randomly.

required
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_keypoints.py
@resolve_param("transforms", TransformsFactory())
@resolve_param("target_generator", TargetGeneratorsFactory())
def __init__(
    self,
    data_dir: str,
    images_dir: str,
    json_file: str,
    include_empty_samples: bool,
    target_generator,
    transforms: List[KeypointTransform],
    min_instance_area: float,
    edge_links: Union[List[Tuple[int, int]], np.ndarray],
    edge_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
    keypoint_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
):
    """

    :param data_dir: Root directory of the COCO dataset
    :param images_dir: path suffix to the images directory inside the dataset_root
    :param json_file: path suffix to the json file inside the dataset_root
    :param include_empty_samples: if True, images without any annotations will be included in the dataset.
        Otherwise, they will be filtered out.
    :param target_generator: Target generator that will be used to generate the targets for the model.
        See DEKRTargetsGenerator for an example.
    :param transforms: Transforms to be applied to the image & keypoints
    :param min_instance_area: Minimum area of an instance to be included in the dataset
    :param edge_links: Edge links between joints
    :param edge_colors: Color of the edge links. If None, the color will be generated randomly.
    :param keypoint_colors: Color of the keypoints. If None, the color will be generated randomly.
    """

    json_file = os.path.join(data_dir, json_file)
    self.category_name, self.joints, self.annotations = parse_coco_into_keypoints_annotations(
        json_file,
        image_path_prefix=os.path.join(data_dir, images_dir),
        remove_duplicate_annotations=False,
        crowd_annotations_action=CrowdAnnotationActionEnum.NO_ACTION,
    )
    num_joints = len(self.joints)
    super().__init__(
        transforms=transforms,
        target_generator=target_generator,
        min_instance_area=min_instance_area,
        num_joints=num_joints,
        edge_links=edge_links,
        edge_colors=edge_colors,
        keypoint_colors=keypoint_colors,
    )

    self.non_empty_annotation_indexes = np.argwhere([len(ann.ann_keypoints) > 0 for ann in self.annotations]).flatten()
    self.include_empty_samples = include_empty_samples

filter_joints(image_shape, joints, areas, bboxes, is_crowd)

Filter instances that are either too small or do not have visible keypoints.

Parameters:

Name Type Description Default
image_shape

Image shape (rows, cols). Used to infer image boundaries

required
joints np.ndarray

Array of shape [Num Instances, Num Joints, 3]

required
areas np.ndarray

Array of shape [Num Instances] with area of each instance. Instance area comes from the segmentation mask in the COCO annotation file.

required
bboxes np.ndarray

Array of shape [Num Instances, 4] for bounding boxes in XYWH format. Bounding boxes come from the COCO annotation file.

required
is_crowd np.ndarray

Array of shape [Num Instances] indicating whether an instance is a crowd target.

required

Returns:

Type Description

[New Num Instances, Num Joints, 3], New Num Instances <= Num Instances

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_keypoints.py
def filter_joints(
    self,
    image_shape,
    joints: np.ndarray,
    areas: np.ndarray,
    bboxes: np.ndarray,
    is_crowd: np.ndarray,
):
    """
    Filter instances that are either too small or do not have visible keypoints.

    :param image_shape: Image shape (rows, cols). Used to infer image boundaries
    :param joints: Array of shape [Num Instances, Num Joints, 3]
    :param areas: Array of shape [Num Instances] with area of each instance.
                  Instance area comes from the segmentation mask in the COCO annotation file.
    :param bboxes: Array of shape [Num Instances, 4] for bounding boxes in XYWH format.
                   Bounding boxes come from the COCO annotation file.
    :param is_crowd: Array of shape [Num Instances] indicating whether an instance is a crowd target.
    :return: [New Num Instances, Num Joints, 3], New Num Instances <= Num Instances
    """

    # Update visibility of joints for those that are outside the image
    outside_image_mask = (joints[:, :, 0] < 0) | (joints[:, :, 1] < 0) | (joints[:, :, 0] >= image_shape[1]) | (joints[:, :, 1] >= image_shape[0])
    joints[outside_image_mask, 2] = 0

    # Filter instances with all invisible keypoints
    instances_with_visible_joints = np.count_nonzero(joints[:, :, 2], axis=-1) > 0
    instances_with_good_area = areas > self.min_instance_area

    keep_mask = instances_with_visible_joints & instances_with_good_area

    joints = joints[keep_mask]
    areas = areas[keep_mask]
    bboxes = bboxes[keep_mask]
    is_crowd = is_crowd[keep_mask]

    return joints, areas, bboxes, is_crowd

get_dataset_preprocessing_params()

This method returns a dictionary of parameters describing preprocessing steps to be applied to the dataset.

Returns:

Type Description
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_keypoints.py
def get_dataset_preprocessing_params(self):
    """

    :return:
    """
    # Since we are using cv2.imread to read images, our model in fact is trained on BGR images.
    # In our pipelines the convention that input images are RGB, so we need to reverse the channels to get BGR
    # to match with the expected input of the model.
    pipeline = [Processings.ReverseImageChannels] + self.transforms.get_equivalent_preprocessing()
    params = dict(
        conf=0.05,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        edge_links=self.edge_links,
        edge_colors=self.edge_colors,
        keypoint_colors=self.keypoint_colors,
    )
    return params

COCOPoseEstimationDataset

Bases: AbstractPoseEstimationDataset

Dataset class for training pose estimation models using COCO format dataset. Please note that COCO annotations must have exactly one category (e.g. "person") and keypoints must be defined for this category.

Compatible datasets are:

- COCO2017 dataset
- CrowdPose dataset
- Any other dataset that is compatible with COCO format

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_pose_estimation_dataset.py
@register_dataset(Datasets.COCO_POSE_ESTIMATION_DATASET)
class COCOPoseEstimationDataset(AbstractPoseEstimationDataset):
    """
    Dataset class for training pose estimation models using COCO format dataset.
    Please note that COCO annotations must have exactly one category (e.g. "person") and
    keypoints must be defined for this category.

    Compatible datasets are
    - COCO2017 dataset
    - CrowdPose dataset
    - Any other dataset that is compatible with COCO format

    """

    @resolve_param("transforms", TransformsFactory())
    @resolve_param("crowd_annotations_action", TypeFactory.from_enum_cls(CrowdAnnotationActionEnum))
    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        json_file: str,
        include_empty_samples: bool,
        transforms: List[AbstractKeypointTransform],
        edge_links: Union[List[Tuple[int, int]], np.ndarray],
        edge_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
        keypoint_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
        remove_duplicate_annotations: bool = False,
        crowd_annotations_action: CrowdAnnotationActionEnum = CrowdAnnotationActionEnum.NO_ACTION,
    ):
        """

        :param data_dir:                     Root directory of the COCO dataset
        :param images_dir:                   Path suffix to the images directory inside the dataset_root
        :param json_file:                    Path suffix to the json file inside the dataset_root
        :param include_empty_samples:        If True, images without any annotations will be included in the dataset.
                                             Otherwise, they will be filtered out.
        :param transforms:                   Transforms to be applied to the image & keypoints
        :param edge_links:                   Edge links between joints
        :param edge_colors:                  Color of the edge links. If None, the color will be generated randomly.
        :param keypoint_colors:              Color of the keypoints. If None, the color will be generated randomly.
        :param remove_duplicate_annotations: If True, duplicate instances will be removed from the dataset.
                                             It is a known issue of the COCO dataset - it has some duplicate annotations that greatly
                                             affect the AP metric on validation. This option allows removing these duplicates.
                                             However, it is disabled by default to preserve backward compatibility with COCO evaluation.
                                             When remove_duplicate_annotations is False, no action will be taken and these duplicate
                                             instances will be left unchanged. Default value is False.
        :param crowd_annotations_action:     Action to take for annotations with iscrowd=1. Can be one of the following:
                                             "drop_sample" - Samples with crowd annotations will be dropped from the dataset.
                                             "drop_annotation" - Crowd annotations will be dropped from the dataset.
                                             "mask_as_normal" - These annotations will be treated as normal (non-crowd) annotations.
                                             "no_action" - No action will be taken for crowd annotations.
        """
        json_file = os.path.join(data_dir, json_file)
        if not os.path.exists(json_file) or not os.path.isfile(json_file):
            raise FileNotFoundError(f"Annotation file {json_file} does not exist")

        self.category_name, self.joints, self.annotations = parse_coco_into_keypoints_annotations(
            json_file,
            image_path_prefix=os.path.join(data_dir, images_dir),
            remove_duplicate_annotations=remove_duplicate_annotations,
            crowd_annotations_action=crowd_annotations_action,
        )

        num_joints = len(self.joints)

        super().__init__(
            transforms=transforms,
            num_joints=num_joints,
            edge_links=edge_links,
            edge_colors=edge_colors,
            keypoint_colors=keypoint_colors,
        )
        self.non_empty_annotation_indexes = np.argwhere([len(ann.ann_keypoints) > 0 for ann in self.annotations]).flatten()
        self.include_empty_samples = include_empty_samples

    def __len__(self):
        if self.include_empty_samples:
            return len(self.annotations)
        else:
            return len(self.non_empty_annotation_indexes)

    def load_sample(self, index: int) -> PoseEstimationSample:
        """
        Read a sample from the disk and return a PoseEstimationSample
        :param index: Sample index
        :return:      Returns an instance of PoseEstimationSample that holds complete sample (image and annotations)
        """
        if not self.include_empty_samples:
            index = self.non_empty_annotation_indexes[index]
        ann = self.annotations[index]

        image_shape = (ann.image_height, ann.image_width)

        gt_iscrowd = ann.ann_is_crowd.copy()
        gt_joints = ann.ann_keypoints.copy()
        gt_bboxes = ann.ann_boxes_xyxy.copy()
        gt_segmentations = ann.ann_segmentations
        gt_areas = ann.ann_areas.copy()

        orig_image = cv2.imread(ann.image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        if orig_image is None:
            # This is a fallback/hack to handle the case when OpenCV cannot read some images.
            # It happens with some OpenCV versions on COCO datasets (there are 1-2 corrupted images).
            # But we generally want to read with OpenCV since it's much faster than PIL.
            from PIL import Image

            orig_image = Image.open(ann.image_path).convert("BGR")

        if orig_image.shape[0] != ann.image_height or orig_image.shape[1] != ann.image_width:
            raise RuntimeError(f"Annotated image size ({ann.image_height,ann.image_width}) does not match image size in file {orig_image.shape[:2]}")

        # Clip bboxes to image boundaries (Some annotations extend 1-2px outside of image boundaries)
        image_height, image_width = orig_image.shape[:2]
        gt_bboxes[:, 0] = np.clip(gt_bboxes[:, 0], 0, image_width)
        gt_bboxes[:, 1] = np.clip(gt_bboxes[:, 1], 0, image_height)
        gt_bboxes[:, 2] = np.clip(gt_bboxes[:, 2], 0, image_width)
        gt_bboxes[:, 3] = np.clip(gt_bboxes[:, 3], 0, image_height)
        gt_bboxes_xywh = xyxy_to_xywh(gt_bboxes, image_shape=(image_height, image_width))

        mask: np.ndarray = self._get_crowd_mask(gt_segmentations[gt_iscrowd], image_shape)

        return PoseEstimationSample(
            image=orig_image, mask=mask, joints=gt_joints, areas=gt_areas, bboxes_xywh=gt_bboxes_xywh, is_crowd=gt_iscrowd, additional_samples=None
        )

    def _get_crowd_mask(self, segmentations: List[str], image_shape: Tuple[int, int]) -> np.ndarray:
        """
        This method computes the ignore mask, which marks crowd objects / objects w/o keypoints so that predictions in these regions do not contribute to the loss
        :return: Float mask of [H,W] shape (same as image dimensions),
            where 1.0 values correspond to pixels that should contribute to the loss, and 0.0 values indicate areas that should be excluded.
        """
        m = np.zeros(image_shape, dtype=bool)

        for segmentation in segmentations:
            mask = segmentation2mask(segmentation, image_shape)
            m[mask] = True

        return (m < 0.5).astype(np.float32)

    def get_dataset_preprocessing_params(self) -> dict:
        """
        This method returns a dictionary of parameters describing preprocessing steps to be applied to the dataset.
        :return:
        """
        rgb_to_bgr = {Processings.ReverseImageChannels: {}}
        image_to_tensor = {Processings.ImagePermute: {"permutation": (2, 0, 1)}}
        pipeline = [rgb_to_bgr] + self.transforms.get_equivalent_preprocessing() + [image_to_tensor]
        params = dict(
            conf=0.05,
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            edge_links=self.edge_links,
            edge_colors=self.edge_colors,
            keypoint_colors=self.keypoint_colors,
        )
        return params

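Analogous to the previous class, a minimal construction sketch; the paths, skeleton and option values are illustrative assumptions, and the import paths mirror the source file locations listed above:

from super_gradients.training.datasets.pose_estimation_datasets.coco_pose_estimation_dataset import COCOPoseEstimationDataset
from super_gradients.training.datasets.pose_estimation_datasets.coco_utils import CrowdAnnotationActionEnum

dataset = COCOPoseEstimationDataset(
    data_dir="/data/coco",                                       # hypothetical dataset root
    images_dir="images/val2017",
    json_file="annotations/person_keypoints_val2017.json",
    include_empty_samples=False,
    transforms=[],                                               # AbstractKeypointTransform instances
    edge_links=[(0, 1), (1, 2)],                                 # toy skeleton for illustration only
    edge_colors=None,
    keypoint_colors=None,
    remove_duplicate_annotations=True,                           # drop near-duplicate pose annotations
    crowd_annotations_action=CrowdAnnotationActionEnum.MASK_AS_NORMAL,
)
sample = dataset.load_sample(0)                                  # PoseEstimationSample with image, mask, joints, areas, bboxes_xywh, is_crowd
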
__init__(data_dir, images_dir, json_file, include_empty_samples, transforms, edge_links, edge_colors, keypoint_colors, remove_duplicate_annotations=False, crowd_annotations_action=CrowdAnnotationActionEnum.NO_ACTION)

Parameters:

Name Type Description Default
data_dir str

Root directory of the COCO dataset

required
images_dir str

Path suffix to the images directory inside the dataset_root

required
json_file str

Path suffix to the json file inside the dataset_root

required
include_empty_samples bool

If True, images without any annotations will be included in the dataset. Otherwise, they will be filtered out.

required
transforms List[AbstractKeypointTransform]

Transforms to be applied to the image & keypoints

required
edge_links Union[List[Tuple[int, int]], np.ndarray]

Edge links between joints

required
edge_colors Union[List[Tuple[int, int, int]], np.ndarray, None]

Color of the edge links. If None, the color will be generated randomly.

required
keypoint_colors Union[List[Tuple[int, int, int]], np.ndarray, None]

Color of the keypoints. If None, the color will be generated randomly.

required
remove_duplicate_annotations bool

If True, duplicate instances will be removed from the dataset. It is a known issue of the COCO dataset - it has some duplicate annotations that greatly affect the AP metric on validation. This option allows removing these duplicates. However, it is disabled by default to preserve backward compatibility with COCO evaluation. When remove_duplicate_annotations is False, no action will be taken and these duplicate instances will be left unchanged. Default value is False.

False
crowd_annotations_action CrowdAnnotationActionEnum

Action to take for annotations with iscrowd=1. Can be one of the following: "drop_sample" - Samples with crowd annotations will be dropped from the dataset. "drop_annotation" - Crowd annotations will be dropped from the dataset. "mask_as_normal" - These annotations will be treated as normal (non-crowd) annotations. "no_action" - No action will be taken for crowd annotations.

CrowdAnnotationActionEnum.NO_ACTION
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_pose_estimation_dataset.py
@resolve_param("transforms", TransformsFactory())
@resolve_param("crowd_annotations_action", TypeFactory.from_enum_cls(CrowdAnnotationActionEnum))
def __init__(
    self,
    data_dir: str,
    images_dir: str,
    json_file: str,
    include_empty_samples: bool,
    transforms: List[AbstractKeypointTransform],
    edge_links: Union[List[Tuple[int, int]], np.ndarray],
    edge_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
    keypoint_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
    remove_duplicate_annotations: bool = False,
    crowd_annotations_action: CrowdAnnotationActionEnum = CrowdAnnotationActionEnum.NO_ACTION,
):
    """

    :param data_dir:                     Root directory of the COCO dataset
    :param images_dir:                   Path suffix to the images directory inside the dataset_root
    :param json_file:                    Path suffix to the json file inside the dataset_root
    :param include_empty_samples:        If True, images without any annotations will be included in the dataset.
                                         Otherwise, they will be filtered out.
    :param transforms:                   Transforms to be applied to the image & keypoints
    :param edge_links:                   Edge links between joints
    :param edge_colors:                  Color of the edge links. If None, the color will be generated randomly.
    :param keypoint_colors:              Color of the keypoints. If None, the color will be generated randomly.
    :param remove_duplicate_annotations: If True, duplicate instances will be removed from the dataset.
                                         It is a known issue of the COCO dataset - it has some duplicate annotations that greatly
                                         affect the AP metric on validation. This option allows removing these duplicates.
                                         However, it is disabled by default to preserve backward compatibility with COCO evaluation.
                                         When remove_duplicate_annotations is False, no action will be taken and these duplicate
                                         instances will be left unchanged. Default value is False.
    :param crowd_annotations_action:     Action to take for annotations with iscrowd=1. Can be one of the following:
                                         "drop_sample" - Samples with crowd annotations will be dropped from the dataset.
                                         "drop_annotation" - Crowd annotations will be dropped from the dataset.
                                         "mask_as_normal" - These annotations will be treated as normal (non-crowd) annotations.
                                         "no_action" - No action will be taken for crowd annotations.
    """
    json_file = os.path.join(data_dir, json_file)
    if not os.path.exists(json_file) or not os.path.isfile(json_file):
        raise FileNotFoundError(f"Annotation file {json_file} does not exist")

    self.category_name, self.joints, self.annotations = parse_coco_into_keypoints_annotations(
        json_file,
        image_path_prefix=os.path.join(data_dir, images_dir),
        remove_duplicate_annotations=remove_duplicate_annotations,
        crowd_annotations_action=crowd_annotations_action,
    )

    num_joints = len(self.joints)

    super().__init__(
        transforms=transforms,
        num_joints=num_joints,
        edge_links=edge_links,
        edge_colors=edge_colors,
        keypoint_colors=keypoint_colors,
    )
    self.non_empty_annotation_indexes = np.argwhere([len(ann.ann_keypoints) > 0 for ann in self.annotations]).flatten()
    self.include_empty_samples = include_empty_samples

get_dataset_preprocessing_params()

This method returns a dictionary of parameters describing preprocessing steps to be applied to the dataset.

Returns:

Type Description
dict
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_pose_estimation_dataset.py
def get_dataset_preprocessing_params(self) -> dict:
    """
    This method returns a dictionary of parameters describing preprocessing steps to be applied to the dataset.
    :return:
    """
    rgb_to_bgr = {Processings.ReverseImageChannels: {}}
    image_to_tensor = {Processings.ImagePermute: {"permutation": (2, 0, 1)}}
    pipeline = [rgb_to_bgr] + self.transforms.get_equivalent_preprocessing() + [image_to_tensor]
    params = dict(
        conf=0.05,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        edge_links=self.edge_links,
        edge_colors=self.edge_colors,
        keypoint_colors=self.keypoint_colors,
    )
    return params

load_sample(index)

Read a sample from the disk and return a PoseEstimationSample

Parameters:

Name Type Description Default
index int

Sample index

required

Returns:

Type Description
PoseEstimationSample

Returns an instance of PoseEstimationSample that holds complete sample (image and annotations)

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_pose_estimation_dataset.py
def load_sample(self, index: int) -> PoseEstimationSample:
    """
    Read a sample from the disk and return a PoseEstimationSample
    :param index: Sample index
    :return:      Returns an instance of PoseEstimationSample that holds complete sample (image and annotations)
    """
    if not self.include_empty_samples:
        index = self.non_empty_annotation_indexes[index]
    ann = self.annotations[index]

    image_shape = (ann.image_height, ann.image_width)

    gt_iscrowd = ann.ann_is_crowd.copy()
    gt_joints = ann.ann_keypoints.copy()
    gt_bboxes = ann.ann_boxes_xyxy.copy()
    gt_segmentations = ann.ann_segmentations
    gt_areas = ann.ann_areas.copy()

    orig_image = cv2.imread(ann.image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if orig_image is None:
        # This is a fallback/hack to handle the case when OpenCV cannot read some images.
        # It happens with some OpenCV versions on COCO datasets (there are 1-2 corrupted images).
        # But we generally want to read with OpenCV since it's much faster than PIL.
        from PIL import Image

        orig_image = Image.open(ann.image_path).convert("BGR")

    if orig_image.shape[0] != ann.image_height or orig_image.shape[1] != ann.image_width:
        raise RuntimeError(f"Annotated image size ({ann.image_height,ann.image_width}) does not match image size in file {orig_image.shape[:2]}")

    # Clip bboxes to image boundaries (Some annotations extend 1-2px outside of image boundaries)
    image_height, image_width = orig_image.shape[:2]
    gt_bboxes[:, 0] = np.clip(gt_bboxes[:, 0], 0, image_width)
    gt_bboxes[:, 1] = np.clip(gt_bboxes[:, 1], 0, image_height)
    gt_bboxes[:, 2] = np.clip(gt_bboxes[:, 2], 0, image_width)
    gt_bboxes[:, 3] = np.clip(gt_bboxes[:, 3], 0, image_height)
    gt_bboxes_xywh = xyxy_to_xywh(gt_bboxes, image_shape=(image_height, image_width))

    mask: np.ndarray = self._get_crowd_mask(gt_segmentations[gt_iscrowd], image_shape)

    return PoseEstimationSample(
        image=orig_image, mask=mask, joints=gt_joints, areas=gt_areas, bboxes_xywh=gt_bboxes_xywh, is_crowd=gt_iscrowd, additional_samples=None
    )

CrowdAnnotationActionEnum

Bases: str, Enum

Enum that contains possible actions to take for crowd annotations.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_utils.py
class CrowdAnnotationActionEnum(str, Enum):
    """
    Enum that contains possible actions to take for crowd annotations.
    """

    DROP_SAMPLE = "drop_sample"
    DROP_ANNOTATION = "drop_annotation"
    MASK_AS_NORMAL = "mask_as_normal"
    NO_ACTION = "no_action"

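Because the enum subclasses str, its members can be constructed from, and compared against, plain configuration strings; a small illustration:

from super_gradients.training.datasets.pose_estimation_datasets.coco_utils import CrowdAnnotationActionEnum

action = CrowdAnnotationActionEnum("mask_as_normal")   # lookup by value, e.g. from a YAML config
assert action is CrowdAnnotationActionEnum.MASK_AS_NORMAL
assert action == "mask_as_normal"                      # str subclass, so string comparison also works
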
parse_coco_into_keypoints_annotations(ann, image_path_prefix=None, crowd_annotations_action=CrowdAnnotationActionEnum.NO_ACTION, remove_duplicate_annotations=False)

Load COCO keypoints dataset from annotation file.

Parameters:

Name Type Description Default
ann str

A path to the JSON annotation file in COCO format.

required
image_path_prefix

A prefix to add to the image paths in the annotation file.

None

Returns:

Type Description
Tuple[str, Dict, List[KeypointsAnnotation]]

Tuple (category_name, keypoints, annotations) where category_name is the name of the single COCO category, keypoints is the list of keypoint names for that category, and annotations is a list of KeypointsAnnotation objects (one per image).

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_utils.py
def parse_coco_into_keypoints_annotations(
    ann: str,
    image_path_prefix=None,
    crowd_annotations_action=CrowdAnnotationActionEnum.NO_ACTION,
    remove_duplicate_annotations: bool = False,
) -> Tuple[str, Dict, List[KeypointsAnnotation]]:
    """
    Load COCO keypoints dataset from annotation file.
    :param ann: A path to the JSON annotation file in COCO format.
    :param image_path_prefix:   A prefix to add to the image paths in the annotation file.
    :return:                    Tuple (category_name, keypoints, annotations) where category_name is the name of the single
                                COCO category, keypoints is the list of keypoint names for that category, and
                                annotations is a list of KeypointsAnnotation objects (one per image).
    """
    with open(ann, "r") as f:
        coco = json.load(f)

    if len(coco["categories"]) != 1:
        raise ValueError("Dataset must contain exactly one category")

    # Extract class names and class ids
    category_name = coco["categories"][0]["name"]
    keypoints = coco["categories"][0]["keypoints"]
    num_keypoints = len(keypoints)

    # Extract box annotations
    ann_box_xyxy = xywh_to_xyxy_inplace(np.array([annotation["bbox"] for annotation in coco["annotations"]], dtype=np.float32), image_shape=None)
    ann_keypoints = np.stack([np.array(annotation["keypoints"], dtype=np.float32).reshape(num_keypoints, 3) for annotation in coco["annotations"]])
    ann_iscrowd = np.array([annotation["iscrowd"] for annotation in coco["annotations"]], dtype=bool)
    ann_image_ids = np.array([annotation["image_id"] for annotation in coco["annotations"]], dtype=int)
    ann_segmentations = np.array([annotation["segmentation"] for annotation in coco["annotations"]], dtype=np.object_)

    # We check whether the area is present in the annotations. If it does we use it, otherwise we compute it from the bbox.
    if "area" in coco["annotations"][0]:
        ann_areas = np.array([annotation["area"] for annotation in coco["annotations"]], dtype=np.float32)
    else:
        # Compute area from box
        # A multiplier of 0.53 is a heuristic from pycocotools to approximate the area of the pose instance
        # from the area of the bounding box.
        ann_areas = np.prod(ann_box_xyxy[:, 2:] - ann_box_xyxy[:, :2], axis=-1) * 0.53

    # Extract image stuff
    img_ids = np.array([img["id"] for img in coco["images"]], dtype=int)
    img_paths = np.array([img["file_name"] if "file_name" in img else "{:012}".format(img["id"]) + ".jpg" for img in coco["images"]], dtype=str)
    img_width_height = np.array([(img["width"], img["height"]) for img in coco["images"]], dtype=int)

    annotations = []

    if crowd_annotations_action == CrowdAnnotationActionEnum.MASK_AS_NORMAL:
        ann_iscrowd = np.zeros_like(ann_iscrowd, dtype=bool)
    elif crowd_annotations_action == CrowdAnnotationActionEnum.DROP_ANNOTATION:
        ann_box_xyxy = ann_box_xyxy[~ann_iscrowd]
        ann_keypoints = ann_keypoints[~ann_iscrowd]
        ann_areas = ann_areas[~ann_iscrowd]
        ann_segmentations = ann_segmentations[~ann_iscrowd]
        ann_image_ids = ann_image_ids[~ann_iscrowd]
        ann_iscrowd = ann_iscrowd[~ann_iscrowd]

    for img_id, image_path, (image_width, image_height) in zip(img_ids, img_paths, img_width_height):
        mask = ann_image_ids == img_id

        if image_path_prefix is not None:
            image_path = os.path.join(image_path_prefix, image_path)

        ann = KeypointsAnnotation(
            image_id=img_id,
            image_path=image_path,
            image_width=image_width,
            image_height=image_height,
            ann_boxes_xyxy=ann_box_xyxy[mask],
            ann_is_crowd=ann_iscrowd[mask],
            ann_areas=ann_areas[mask],
            ann_keypoints=ann_keypoints[mask],
            ann_segmentations=ann_segmentations[mask],
        )

        if remove_duplicate_annotations:
            joints = ann.ann_keypoints[:, :, :2]
            gt_joints1 = np.expand_dims(joints, axis=0)  # [1, Num_people, Num_joints, 2]
            gt_joints2 = np.expand_dims(joints, axis=1)  # [Num_people, 1, Num_joints, 2]
            diff = np.sqrt(np.sum((gt_joints1 - gt_joints2) ** 2, axis=-1))  # [Num_people, Num_people, Num_joints]
            diffmean = np.mean(diff, axis=-1)

            duplicate_mask = np.triu(diffmean < 2, k=1)
            duplicate_indexes_i, duplicate_indexes_j = np.nonzero(duplicate_mask)
            keep_mask = np.ones(len(ann.ann_boxes_xyxy), dtype=bool)
            for i, j in zip(duplicate_indexes_i, duplicate_indexes_j):
                keep_mask[j] = False

            ann.ann_boxes_xyxy = ann.ann_boxes_xyxy[keep_mask]
            ann.ann_keypoints = ann.ann_keypoints[keep_mask]
            ann.ann_areas = ann.ann_areas[keep_mask]
            ann.ann_segmentations = ann.ann_segmentations[keep_mask]
            ann.ann_is_crowd = ann.ann_is_crowd[keep_mask]

        if crowd_annotations_action == CrowdAnnotationActionEnum.DROP_SAMPLE:
            if ann.ann_is_crowd.any():
                continue

        annotations.append(ann)

    return category_name, keypoints, annotations

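A minimal call sketch for the parser; the annotation and image paths are hypothetical:

from super_gradients.training.datasets.pose_estimation_datasets.coco_utils import (
    CrowdAnnotationActionEnum,
    parse_coco_into_keypoints_annotations,
)

category_name, keypoint_names, annotations = parse_coco_into_keypoints_annotations(
    "/data/coco/annotations/person_keypoints_val2017.json",
    image_path_prefix="/data/coco/images/val2017",
    crowd_annotations_action=CrowdAnnotationActionEnum.NO_ACTION,
    remove_duplicate_annotations=False,
)
print(category_name, len(keypoint_names), len(annotations))      # e.g. "person", 17 keypoints, one annotation per image
first = annotations[0]
print(first.image_path, first.ann_keypoints.shape)               # ann_keypoints: [Num Instances, Num Keypoints, 3]
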
rle2mask(rle, image_shape)

Convert RLE to binary mask

Parameters:

Name Type Description Default
rle np.ndarray

A list or array of run-length counts (uncompressed COCO RLE)

required
image_shape Tuple[int, int]

Output image shape (rows, cols)

required

Returns:

Type Description

A decoded binary mask

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_utils.py
def rle2mask(rle: np.ndarray, image_shape: Tuple[int, int]):
    """
    Convert RLE to binary mask
    :param rle: A list or array of run-length counts (uncompressed COCO RLE)
    :param image_shape: Output image shape (rows, cols)
    :return: A decoded binary mask
    """
    rle = np.array(rle, dtype=int)

    value = 0
    start = 0
    img = np.zeros(image_shape[0] * image_shape[1], dtype=np.uint8)
    for offset in rle:
        img[start : start + offset] = value
        start += offset
        value = 1 - value

    return img.reshape(*reversed(image_shape)).T

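A tiny worked example of this decoding on a 3x3 mask (purely illustrative values); the counts alternate background/foreground runs and enumerate pixels in column-major order, which is why the function transposes at the end:

from super_gradients.training.datasets.pose_estimation_datasets.coco_utils import rle2mask

mask = rle2mask([2, 3, 4], image_shape=(3, 3))   # 2 background, 3 foreground, 4 background pixels
print(mask)
# [[0 1 0]
#  [0 1 0]
#  [1 0 0]]
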
segmentation2mask(segmentation, image_shape)

Decode segmentation annotation into binary mask

Parameters:

Name Type Description Default
segmentation

Input segmentation annotation. Can come in many forms: a flat list of polygon coordinates, a list of polygons, or an uncompressed RLE dict with "counts" and "size" keys.

required
image_shape Tuple[int, int]

Output mask shape (rows, cols)

required

Returns:

Type Description
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/coco_utils.py
def segmentation2mask(segmentation, image_shape: Tuple[int, int]):
    """
    Decode segmentation annotation into binary mask
    :param segmentation: Input segmentation annotation. Can come in many forms:
                         a flat list of polygon coordinates, a list of polygons,
                         or an uncompressed RLE dict with "counts" and "size" keys.
    :param image_shape: Output mask shape (rows, cols)
    :return: Binary mask of [rows, cols] shape
    """
    m = np.zeros(image_shape, dtype=np.uint8)

    if isinstance(segmentation, list) and len(segmentation):
        if isinstance(segmentation[0], numbers.Number):
            if len(segmentation) == 4:
                # box?
                unsupported_input_repr = pprint.pformat(segmentation)
                raise ValueError(
                    "Box encoding is not supported yet.\n"
                    "Please open an issue on GitHub (https://github.com/Deci-AI/super-gradients/issues) and attach the following information:\n"
                    "```python\n"
                    f"image_shape = {image_shape}\n"
                    f"segmentation = {unsupported_input_repr}\n"
                    "```python\n"
                )
            else:
                poly2mask(segmentation, m)
        else:
            for seg_i in segmentation:
                poly2mask(seg_i, m)
    elif isinstance(segmentation, dict) and "counts" in segmentation and "size" in segmentation:
        rle = segmentation["counts"]
        m = rle2mask(rle, image_shape)
    else:
        unsupported_input_repr = pprint.pformat(segmentation)
        raise ValueError(
            "Unknown segmentation format\n"
            "Please open an issue on GitHub (https://github.com/Deci-AI/super-gradients/issues) and attach the following information:\n"
            "```python\n"
            f"image_shape = {image_shape}\n"
            f"segmentation = {unsupported_input_repr}\n"
            "```python\n"
        )
    return m

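For the uncompressed RLE branch handled above, a short sketch (values are illustrative only):

from super_gradients.training.datasets.pose_estimation_datasets.coco_utils import segmentation2mask

segmentation = {"counts": [2, 3, 4], "size": [3, 3]}             # uncompressed RLE dict form
mask = segmentation2mask(segmentation, image_shape=(3, 3))
print(int(mask.sum()))                                           # 3 foreground pixels
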
TrainRescoringDataset

Bases: RescoringDataset

Implementation of the dataset for training the rescoring network. In this implementation, the dataset is a list of individual poses and DataLoader randomly samples them to form a batch during training.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/rescoring_dataset.py
class TrainRescoringDataset(RescoringDataset):
    """
    Implementation of the dataset for training the rescoring network.
    In this implementation, the dataset is a list of individual poses and DataLoader randomly samples
    them to form a batch during training.
    """

    def __init__(self, pkl_file: str):
        super().__init__(pkl_file)
        self.pred_poses = []
        self.pred_scores = []
        self.iou = []

        for sample in self.parse_pkl_file(pkl_file):
            pred_poses = sample["pred_poses"]
            pred_scores = sample["pred_scores"]
            iou = sample["iou"]

            self.pred_poses.extend(pred_poses)
            self.pred_scores.extend(pred_scores)
            self.iou.extend(iou)

        self.num_samples = len(self.pred_poses)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        inputs = torch.tensor(self.pred_poses[index])
        targets = torch.tensor([self.iou[index]])
        return inputs, targets

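A usage sketch, assuming the pickle file path is hypothetical and each stored pose is a [NumJoints, 3] array:

from torch.utils.data import DataLoader

from super_gradients.training.datasets.pose_estimation_datasets.rescoring_dataset import TrainRescoringDataset

train_dataset = TrainRescoringDataset("rescoring_train_data.pkl")        # hypothetical path
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

poses, target_iou = next(iter(train_loader))                             # poses: [B, NumJoints, 3], target_iou: [B, 1]
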
ValTrainRescoringDataset

Bases: RescoringDataset

Implementation of the dataset for validating the rescoring model. It differs from the training dataset implementation: each sample represents a single image with all the poses on it, which enables us to compute pose estimation metrics after rescoring.

This dataset is intended to be used with a DataLoader with batch_size=1. In this case we don't need to pad poses in collate_fn.

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/rescoring_dataset.py
class ValTrainRescoringDataset(RescoringDataset):
    """
    Implementation of the dataset for validating the rescoring model.
    It differs from the training dataset implementation: each sample represents a single image with all the poses
    on it, which enables us to compute pose estimation metrics after rescoring.

    This dataset is intended to be used with a DataLoader with batch_size=1.
    In this case we don't need to pad poses in collate_fn.
    """

    def __init__(self, pkl_file: str):
        super().__init__(pkl_file)

        self.pred_poses = []
        self.pred_scores = []
        self.extras = []
        self.gt_joints = []
        self.gt_is_crowd = []
        self.gt_area = []
        self.iou = []

        for sample in self.parse_pkl_file(pkl_file):
            pred_poses = sample["pred_poses"]
            pred_scores = sample["pred_scores"]
            extras = dict(gt_joints=sample["gt_joints"], gt_iscrowd=sample["gt_iscrowd"], gt_bboxes=sample["gt_bboxes"], gt_areas=sample["gt_areas"])
            iou = sample["iou"]

            self.pred_poses.append(pred_poses)
            self.pred_scores.append(pred_scores)
            self.extras.append(extras)
            self.iou.append(iou)

        self.num_joints = next(p.shape[1] for p in self.pred_poses if len(p))
        self.num_samples = len(self.pred_poses)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        inputs = torch.tensor(self.pred_poses[index]).reshape(-1, self.num_joints, 3)
        targets = torch.tensor(self.iou[index]).reshape(-1, 1)
        extras = self.extras[index]
        return inputs, targets, extras

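As noted above, this dataset is meant to be iterated with batch_size=1; a usage sketch with a hypothetical pickle path:

from torch.utils.data import DataLoader

from super_gradients.training.datasets.pose_estimation_datasets.rescoring_dataset import ValTrainRescoringDataset

val_dataset = ValTrainRescoringDataset("rescoring_val_data.pkl")         # hypothetical path
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

for poses, target_iou, extras in val_loader:
    # poses: [1, NumPoses, NumJoints, 3]; extras carries gt_joints / gt_iscrowd / gt_bboxes / gt_areas
    break
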
DEKRTargetsGenerator

Bases: KeypointsTargetsGenerator

Target generator for pose estimation task tailored for the DEKR paper (https://arxiv.org/abs/2104.02300)

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
@register_target_generator()
class DEKRTargetsGenerator(KeypointsTargetsGenerator):
    """
    Target generator for pose estimation task tailored for the DEKR paper (https://arxiv.org/abs/2104.02300)
    """

    def __init__(self, output_stride: int, sigma: float, center_sigma: float, bg_weight: float, offset_radius: float):
        """

        :param output_stride: Downsampling factor for target maps (w.r.t to input image resolution)
        :param sigma: Sigma of the gaussian kernel used to generate the heatmap (Effective radius of the heatmap would be 3*sigma)
        :param center_sigma: Sigma of the gaussian kernel used to generate the instance "center" heatmap (Effective radius of the heatmap would be 3*sigma)
        :param bg_weight: Weight assigned to all background pixels (used to re-weight the heatmap loss)
        :param offset_radius: Radius for the offset encoding (in pixels)
        """
        self.output_stride = output_stride
        self.sigma = sigma
        self.center_sigma = center_sigma
        self.bg_weight = bg_weight
        self.offset_radius = offset_radius

    def get_heat_val(self, sigma: float, x, y, x0, y0) -> float:
        g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma**2))
        return g

    def compute_area(self, joints: np.ndarray) -> np.ndarray:
        """
        Compute area of a bounding box for each instance
        :param joints:  [Num Instances, Num Joints, 3]
        :return: [Num Instances]
        """
        w = np.max(joints[:, :, 0], axis=-1) - np.min(joints[:, :, 0], axis=-1)
        h = np.max(joints[:, :, 1], axis=-1) - np.min(joints[:, :, 1], axis=-1)
        return w * h

    def sort_joints_by_area(self, joints: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Rearrange joints in descending order of area of bounding box around them
        """
        area = self.compute_area(joints)
        order = np.argsort(-area)
        joints = joints[order]
        area = area[order]
        return joints, area

    def augment_with_center_joint(self, joints: np.ndarray) -> np.ndarray:
        """
        Augment set of joints with additional center joint.
        Returns a new array with shape [Instances, Joints+1, 3] where the last joint is the center joint.
        Only instances with at least one visible joint are returned.

        :param joints: [Num Instances, Num Joints, 3] Last channel represents (x, y, visibility)
        :return: [Num Instances, Num Joints + 1, 3]
        """
        augmented_joints = []
        num_joints = joints.shape[1]
        num_joints_with_center = num_joints + 1

        for keypoints in joints:
            # Computing a center point for each person
            visible_keypoints = keypoints[:, 2] > 0
            joints_sum = np.sum(keypoints[:, :2] * np.expand_dims(visible_keypoints, -1), axis=0)
            num_vis_joints = np.count_nonzero(visible_keypoints)
            if num_vis_joints == 0:
                raise ValueError("No visible joints found in instance. ")

            keypoints_with_center = np.zeros((num_joints_with_center, 3))
            keypoints_with_center[0:num_joints] = keypoints
            keypoints_with_center[-1, :2] = joints_sum / num_vis_joints
            keypoints_with_center[-1, 2] = 1

            augmented_joints.append(keypoints_with_center)

        joints = np.array(augmented_joints, dtype=np.float32).reshape((-1, num_joints_with_center, 3))
        return joints

    def __call__(self, image: Tensor, joints: np.ndarray, mask: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Encode the keypoints into dense targets that participate in loss computation.
        :param image: Image tensor [3, H, W]
        :param joints: [Instances, NumJoints, 3]
        :param mask: [H,W] A mask that indicates which pixels should be included (1) or which one should be excluded (0) from loss computation.
        :return: Tuple of (heatmap, mask, offset, offset_weight)
            heatmap    - [NumJoints+1, H // Output Stride, W // Output Stride]
            mask       - [NumJoints+1, H // Output Stride, W // Output Stride]
            offset     - [NumJoints*2, H // Output Stride, W // Output Stride]
            offset_weight - [NumJoints*2, H // Output Stride, W // Output Stride]
        """
        if image.shape[1:3] != mask.shape[:2]:
            raise ValueError(f"Image and mask should have the same shape {image.shape[1:3]} != {mask.shape[:2]}")

        if image.shape[1] % self.output_stride != 0 or image.shape[2] % self.output_stride != 0:
            raise ValueError("Image shape should be divisible by output stride")

        num_instances, num_joints, _ = joints.shape
        num_joints_with_center = num_joints + 1

        joints, area = self.sort_joints_by_area(joints)
        joints = self.augment_with_center_joint(joints)

        # Compute the size of the target maps
        rows, cols = mask.shape
        output_rows, output_cols = rows // self.output_stride, cols // self.output_stride

        heatmaps = np.zeros(
            shape=(num_joints_with_center, output_rows, output_cols),
            dtype=np.float32,
        )

        ignored_hms = 2 * np.ones(
            shape=(num_joints_with_center, output_rows, output_cols),
            dtype=np.float32,
        )  # Start with 2 in all places

        offset_map = np.zeros(
            (num_joints * 2, output_rows, output_cols),
            dtype=np.float32,
        )
        offset_weight = np.zeros(
            (num_joints * 2, output_rows, output_cols),
            dtype=np.float32,
        )

        sx = output_cols / cols
        sy = output_rows / rows
        joints = joints.copy()
        joints[:, :, 0] *= sx
        joints[:, :, 1] *= sy

        for person_id, p in enumerate(joints):
            for idx, pt in enumerate(p):
                if idx < num_joints:  # Last joint index is object center
                    sigma = self.sigma
                else:
                    sigma = self.center_sigma

                if pt[2] > 0:
                    x, y = pt[0], pt[1]
                    if x < 0 or y < 0 or x >= output_cols or y >= output_rows:
                        continue

                    ul = int(np.floor(x - 3 * sigma - 1)), int(np.floor(y - 3 * sigma - 1))
                    br = int(np.ceil(x + 3 * sigma + 1)), int(np.ceil(y + 3 * sigma + 1))

                    aa, bb = max(0, ul[1]), min(br[1], output_rows)
                    cc, dd = max(0, ul[0]), min(br[0], output_cols)

                    joint_rg = np.zeros((bb - aa, dd - cc), dtype=np.float32)
                    for sy in range(aa, bb):
                        for sx in range(cc, dd):
                            # EK: Note we round x/y values here to obtain clear peak in the center of odd-sized heatmap
                            # joint_rg[sy - aa, sx - cc] = self.get_heat_val(sigma, sx, sy, x, y)
                            joint_rg[sy - aa, sx - cc] = self.get_heat_val(sigma, sx, sy, int(x), int(y))

                    # It is important for RFL loss to have 1.0 in the heatmap, since 0.9999 would be interpreted as a negative pixel
                    joint_rg[joint_rg.shape[0] // 2, joint_rg.shape[1] // 2] = 1

                    heatmaps[idx, aa:bb, cc:dd] = np.maximum(heatmaps[idx, aa:bb, cc:dd], joint_rg)
                    # print(heatmaps[-1, 0, 0])
                    ignored_hms[idx, aa:bb, cc:dd] = 1.0

        for person_id, p in enumerate(joints):
            person_area = area[person_id]
            offset_weight_factor = 1.0 / np.clip(np.sqrt(person_area), a_min=1, a_max=None)
            ct_x = int(p[-1, 0])
            ct_y = int(p[-1, 1])
            ct_v = int(p[-1, 2])
            if ct_v < 1 or ct_x < 0 or ct_y < 0 or ct_x >= output_cols or ct_y >= output_rows:
                continue

            for idx, pt in enumerate(p[:-1]):
                if pt[2] > 0:
                    x, y = pt[0], pt[1]
                    if x < 0 or y < 0 or x >= output_cols or y >= output_rows:
                        continue

                    start_x = max(int(ct_x - self.offset_radius), 0)
                    start_y = max(int(ct_y - self.offset_radius), 0)
                    end_x = min(int(ct_x + self.offset_radius), output_cols)
                    end_y = min(int(ct_y + self.offset_radius), output_rows)

                    for pos_x in range(start_x, end_x):
                        for pos_y in range(start_y, end_y):
                            offset_x = pos_x - x
                            offset_y = pos_y - y

                            offset_map[idx * 2, pos_y, pos_x] = offset_x
                            offset_map[idx * 2 + 1, pos_y, pos_x] = offset_y
                            offset_weight[idx * 2, pos_y, pos_x] = offset_weight_factor
                            offset_weight[idx * 2 + 1, pos_y, pos_x] = offset_weight_factor

        ignored_hms[ignored_hms == 2] = self.bg_weight

        mask = cv2.resize(mask, dsize=(output_cols, output_rows), interpolation=cv2.INTER_LINEAR)
        mask = (mask > 0).astype(np.float32)
        mask = mask * ignored_hms

        return heatmaps, mask, offset_map, offset_weight

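A shape-oriented sketch of driving the generator on synthetic inputs; the parameter values and keypoints below are illustrative assumptions, not library defaults:

import numpy as np
import torch

from super_gradients.training.datasets.pose_estimation_datasets.target_generators import DEKRTargetsGenerator

target_generator = DEKRTargetsGenerator(output_stride=4, sigma=2.0, center_sigma=4.0, bg_weight=0.1, offset_radius=4)

image = torch.zeros(3, 256, 256)                       # [3, H, W]; H and W must be divisible by output_stride
joints = np.zeros((1, 17, 3), dtype=np.float32)        # one instance with 17 joints
joints[0, :, 0] = np.linspace(40, 80, 17)              # x coordinates
joints[0, :, 1] = np.linspace(60, 160, 17)             # y coordinates
joints[0, :, 2] = 1                                    # all joints visible
mask = np.ones((256, 256), dtype=np.float32)

heatmaps, loss_mask, offset_map, offset_weight = target_generator(image, joints, mask)
print(heatmaps.shape, loss_mask.shape, offset_map.shape)   # (18, 64, 64) (18, 64, 64) (34, 64, 64)
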
__call__(image, joints, mask)

Encode the keypoints into dense targets that participate in loss computation.

Parameters:

Name Type Description Default
image Tensor

Image tensor [3, H, W]

required
joints np.ndarray

[Instances, NumJoints, 3]

required
mask np.ndarray

[H,W] A mask that indicates which pixels should be included (1) or which one should be excluded (0) from loss computation.

required

Returns:

Type Description
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]

Tuple of (heatmap, mask, offset, offset_weight): heatmap - [NumJoints+1, H // Output Stride, W // Output Stride]; mask - [NumJoints+1, H // Output Stride, W // Output Stride]; offset - [NumJoints*2, H // Output Stride, W // Output Stride]; offset_weight - [NumJoints*2, H // Output Stride, W // Output Stride]

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
def __call__(self, image: Tensor, joints: np.ndarray, mask: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Encode the keypoints into dense targets that participate in loss computation.
    :param image: Image tensor [3, H, W]
    :param joints: [Instances, NumJoints, 3]
    :param mask: [H,W] A mask that indicates which pixels should be included (1) or which one should be excluded (0) from loss computation.
    :return: Tuple of (heatmap, mask, offset, offset_weight)
        heatmap    - [NumJoints+1, H // Output Stride, W // Output Stride]
        mask       - [NumJoints+1, H // Output Stride, W // Output Stride]
        offset     - [NumJoints*2, H // Output Stride, W // Output Stride]
        offset_weight - [NumJoints*2, H // Output Stride, W // Output Stride]
    """
    if image.shape[1:3] != mask.shape[:2]:
        raise ValueError(f"Image and mask should have the same shape {image.shape[1:3]} != {mask.shape[:2]}")

    if image.shape[1] % self.output_stride != 0 or image.shape[2] % self.output_stride != 0:
        raise ValueError("Image shape should be divisible by output stride")

    num_instances, num_joints, _ = joints.shape
    num_joints_with_center = num_joints + 1

    joints, area = self.sort_joints_by_area(joints)
    joints = self.augment_with_center_joint(joints)

    # Compute the size of the target maps
    rows, cols = mask.shape
    output_rows, output_cols = rows // self.output_stride, cols // self.output_stride

    heatmaps = np.zeros(
        shape=(num_joints_with_center, output_rows, output_cols),
        dtype=np.float32,
    )

    ignored_hms = 2 * np.ones(
        shape=(num_joints_with_center, output_rows, output_cols),
        dtype=np.float32,
    )  # Start with 2 in all places

    offset_map = np.zeros(
        (num_joints * 2, output_rows, output_cols),
        dtype=np.float32,
    )
    offset_weight = np.zeros(
        (num_joints * 2, output_rows, output_cols),
        dtype=np.float32,
    )

    sx = output_cols / cols
    sy = output_rows / rows
    joints = joints.copy()
    joints[:, :, 0] *= sx
    joints[:, :, 1] *= sy

    for person_id, p in enumerate(joints):
        for idx, pt in enumerate(p):
            if idx < num_joints:  # Last joint index is object center
                sigma = self.sigma
            else:
                sigma = self.center_sigma

            if pt[2] > 0:
                x, y = pt[0], pt[1]
                if x < 0 or y < 0 or x >= output_cols or y >= output_rows:
                    continue

                ul = int(np.floor(x - 3 * sigma - 1)), int(np.floor(y - 3 * sigma - 1))
                br = int(np.ceil(x + 3 * sigma + 1)), int(np.ceil(y + 3 * sigma + 1))

                aa, bb = max(0, ul[1]), min(br[1], output_rows)
                cc, dd = max(0, ul[0]), min(br[0], output_cols)

                joint_rg = np.zeros((bb - aa, dd - cc), dtype=np.float32)
                for sy in range(aa, bb):
                    for sx in range(cc, dd):
                        # EK: Note we round x/y values here to obtain clear peak in the center of odd-sized heatmap
                        # joint_rg[sy - aa, sx - cc] = self.get_heat_val(sigma, sx, sy, x, y)
                        joint_rg[sy - aa, sx - cc] = self.get_heat_val(sigma, sx, sy, int(x), int(y))

                # It is important for the RFL loss to have an exact 1.0 in the heatmap, since 0.9999 would be interpreted as a negative pixel
                joint_rg[joint_rg.shape[0] // 2, joint_rg.shape[1] // 2] = 1

                heatmaps[idx, aa:bb, cc:dd] = np.maximum(heatmaps[idx, aa:bb, cc:dd], joint_rg)
                # print(heatmaps[-1, 0, 0])
                ignored_hms[idx, aa:bb, cc:dd] = 1.0

    for person_id, p in enumerate(joints):
        person_area = area[person_id]
        offset_weight_factor = 1.0 / np.clip(np.sqrt(person_area), a_min=1, a_max=None)
        ct_x = int(p[-1, 0])
        ct_y = int(p[-1, 1])
        ct_v = int(p[-1, 2])
        if ct_v < 1 or ct_x < 0 or ct_y < 0 or ct_x >= output_cols or ct_y >= output_rows:
            continue

        for idx, pt in enumerate(p[:-1]):
            if pt[2] > 0:
                x, y = pt[0], pt[1]
                if x < 0 or y < 0 or x >= output_cols or y >= output_rows:
                    continue

                start_x = max(int(ct_x - self.offset_radius), 0)
                start_y = max(int(ct_y - self.offset_radius), 0)
                end_x = min(int(ct_x + self.offset_radius), output_cols)
                end_y = min(int(ct_y + self.offset_radius), output_rows)

                for pos_x in range(start_x, end_x):
                    for pos_y in range(start_y, end_y):
                        offset_x = pos_x - x
                        offset_y = pos_y - y

                        offset_map[idx * 2, pos_y, pos_x] = offset_x
                        offset_map[idx * 2 + 1, pos_y, pos_x] = offset_y
                        offset_weight[idx * 2, pos_y, pos_x] = offset_weight_factor
                        offset_weight[idx * 2 + 1, pos_y, pos_x] = offset_weight_factor

    ignored_hms[ignored_hms == 2] = self.bg_weight

    mask = cv2.resize(mask, dsize=(output_cols, output_rows), interpolation=cv2.INTER_LINEAR)
    mask = (mask > 0).astype(np.float32)
    mask = mask * ignored_hms

    return heatmaps, mask, offset_map, offset_weight
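For orientation, the snippet below is a minimal, self-contained sketch of the per-joint Gaussian window built above: a 3*sigma neighbourhood around the keypoint, clipped to the target map, with the peak forced to exactly 1.0. get_heat_val is not shown on this page and is assumed here to be the unnormalized Gaussian exp(-d^2 / (2*sigma^2)).

import numpy as np

def gaussian_peak(output_rows: int, output_cols: int, x: float, y: float, sigma: float) -> np.ndarray:
    """Toy re-implementation of the per-joint heatmap window (assumes get_heat_val is
    the unnormalized Gaussian exp(-d^2 / (2 * sigma^2)))."""
    heatmap = np.zeros((output_rows, output_cols), dtype=np.float32)

    # 3*sigma window around the keypoint, clipped to the map borders
    ul = int(np.floor(x - 3 * sigma - 1)), int(np.floor(y - 3 * sigma - 1))
    br = int(np.ceil(x + 3 * sigma + 1)), int(np.ceil(y + 3 * sigma + 1))
    aa, bb = max(0, ul[1]), min(br[1], output_rows)
    cc, dd = max(0, ul[0]), min(br[0], output_cols)

    for py in range(aa, bb):
        for px in range(cc, dd):
            d2 = (px - int(x)) ** 2 + (py - int(y)) ** 2
            heatmap[py, px] = np.exp(-d2 / (2 * sigma**2))

    # Exact 1.0 at the peak, mirroring the generator above (important for the RFL loss)
    heatmap[int(y), int(x)] = 1.0
    return heatmap

peak = gaussian_peak(output_rows=64, output_cols=64, x=20.3, y=31.7, sigma=2.0)
print(peak.shape, peak.max())  # (64, 64) 1.0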

__init__(output_stride, sigma, center_sigma, bg_weight, offset_radius)

Parameters:

Name Type Description Default
output_stride int

Downsampling factor for target maps (w.r.t. the input image resolution)

required
sigma float

Sigma of the gaussian kernel used to generate the heatmap (Effective radius of the heatmap would be 3*sigma)

required
center_sigma float

Sigma of the gaussian kernel used to generate the instance "center" heatmap (Effective radius of the heatmap would be 3*sigma)

required
bg_weight float

Weight assigned to all background pixels (used to re-weight the heatmap loss)

required
offset_radius float

Radius for the offset encoding (in pixels)

required
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
def __init__(self, output_stride: int, sigma: float, center_sigma: float, bg_weight: float, offset_radius: float):
    """

    :param output_stride: Downsampling factor for target maps (w.r.t to input image resolution)
    :param sigma: Sigma of the gaussian kernel used to generate the heatmap (Effective radius of the heatmap would be 3*sigma)
    :param center_sigma: Sigma of the gaussian kernel used to generate the instance "center" heatmap (Effective radius of the heatmap would be 3*sigma)
    :param bg_weight: Weight assigned to all background pixels (used to re-weight the heatmap loss)
    :param offset_radius: Radius for the offset encoding (in pixels)
    """
    self.output_stride = output_stride
    self.sigma = sigma
    self.center_sigma = center_sigma
    self.bg_weight = bg_weight
    self.offset_radius = offset_radius

augment_with_center_joint(joints)

Augment set of joints with additional center joint. Returns a new array with shape [Instances, Joints+1, 3] where the last joint is the center joint. Only instances with at least one visible joint are returned.

Parameters:

Name Type Description Default
joints np.ndarray

[Num Instances, Num Joints, 3] Last channel represents (x, y, visibility)

required

Returns:

Type Description
np.ndarray

[Num Instances, Num Joints + 1, 3]

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
def augment_with_center_joint(self, joints: np.ndarray) -> np.ndarray:
    """
    Augment set of joints with additional center joint.
    Returns a new array with shape [Instances, Joints+1, 3] where the last joint is the center joint.
    Only instances with at least one visible joint are returned.

    :param joints: [Num Instances, Num Joints, 3] Last channel represents (x, y, visibility)
    :return: [Num Instances, Num Joints + 1, 3]
    """
    augmented_joints = []
    num_joints = joints.shape[1]
    num_joints_with_center = num_joints + 1

    for keypoints in joints:
        # Computing a center point for each person
        visible_keypoints = keypoints[:, 2] > 0
        joints_sum = np.sum(keypoints[:, :2] * np.expand_dims(visible_keypoints, -1), axis=0)
        num_vis_joints = np.count_nonzero(visible_keypoints)
        if num_vis_joints == 0:
            raise ValueError("No visible joints found in instance. ")

        keypoints_with_center = np.zeros((num_joints_with_center, 3))
        keypoints_with_center[0:num_joints] = keypoints
        keypoints_with_center[-1, :2] = joints_sum / num_vis_joints
        keypoints_with_center[-1, 2] = 1

        augmented_joints.append(keypoints_with_center)

    joints = np.array(augmented_joints, dtype=np.float32).reshape((-1, num_joints_with_center, 3))
    return joints
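A tiny numeric illustration of the center-joint computation above: the appended center is the mean of the visible keypoints and gets visibility 1.

import numpy as np

# One instance with 3 joints; last channel is (x, y, visibility)
keypoints = np.array([[10.0, 20.0, 2.0],
                      [30.0, 40.0, 1.0],
                      [ 0.0,  0.0, 0.0]])  # invisible joint, ignored

visible = keypoints[:, 2] > 0
center_xy = (keypoints[:, :2] * visible[:, None]).sum(axis=0) / np.count_nonzero(visible)
print(center_xy)  # [20. 30.] - mean of the two visible joints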

compute_area(joints)

Compute area of a bounding box for each instance

Parameters:

Name Type Description Default
joints np.ndarray

[Num Instances, Num Joints, 3]

required

Returns:

Type Description
np.ndarray

[Num Instances]

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
def compute_area(self, joints: np.ndarray) -> np.ndarray:
    """
    Compute area of a bounding box for each instance
    :param joints:  [Num Instances, Num Joints, 3]
    :return: [Num Instances]
    """
    w = np.max(joints[:, :, 0], axis=-1) - np.min(joints[:, :, 0], axis=-1)
    h = np.max(joints[:, :, 1], axis=-1) - np.min(joints[:, :, 1], axis=-1)
    return w * h

sort_joints_by_area(joints)

Rearrange joints in descending order of area of bounding box around them

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
def sort_joints_by_area(self, joints: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Rearrange joints in descending order of area of bounding box around them
    """
    area = self.compute_area(joints)
    order = np.argsort(-area)
    joints = joints[order]
    area = area[order]
    return joints, area

KeypointsTargetsGenerator

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
class KeypointsTargetsGenerator:
    @abc.abstractmethod
    def __call__(self, image: Tensor, joints: np.ndarray, mask: np.ndarray) -> Union[Tensor, Tuple[Tensor, ...], Dict[str, Tensor]]:
        """
        Encode input joints into target tensors

        :param image: [C,H,W] Input image tensor
        :param joints: [Num Instances, Num Joints, 3] Last channel represents (x, y, visibility)
        :param mask: [H,W] Mask representing valid image areas. For instance, in COCO dataset crowd targets
                           are not used during training and corresponding instances will be zero-masked.
                           Your implementation may use this mask when generating targets.
        :return: Encoded targets
        """
        raise NotImplementedError()

__call__(image, joints, mask) abstractmethod

Encode input joints into target tensors

Parameters:

Name Type Description Default
image Tensor

[C,H,W] Input image tensor

required
joints np.ndarray

[Num Instances, Num Joints, 3] Last channel represents (x, y, visibility)

required
mask np.ndarray

[H,W] Mask representing valid image areas. For instance, in COCO dataset crowd targets are not used during training and corresponding instances will be zero-masked. Your implementation may use this mask when generating targets.

required

Returns:

Type Description
Union[Tensor, Tuple[Tensor, ...], Dict[str, Tensor]]

Encoded targets

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py
@abc.abstractmethod
def __call__(self, image: Tensor, joints: np.ndarray, mask: np.ndarray) -> Union[Tensor, Tuple[Tensor, ...], Dict[str, Tensor]]:
    """
    Encode input joints into target tensors

    :param image: [C,H,W] Input image tensor
    :param joints: [Num Instances, Num Joints, 3] Last channel represents (x, y, visibility)
    :param mask: [H,W] Mask representing valid image areas. For instance, in COCO dataset crowd targets
                       are not used during training and corresponding instances will be zero-masked.
                       Your implementation may use this mask when generating targets.
    :return: Encoded targets
    """
    raise NotImplementedError()
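To implement this interface, a subclass only needs to override __call__. Below is a minimal, hypothetical example that marks one pixel per visible joint; the import path is inferred from the "Source code in" header above, and the target format is illustrative only (it is not one of the library's built-in encodings).

import numpy as np
from torch import Tensor

# Import path inferred from the source file path shown above
from super_gradients.training.datasets.pose_estimation_datasets.target_generators import KeypointsTargetsGenerator


class BinaryPresenceTargetsGenerator(KeypointsTargetsGenerator):
    """Hypothetical generator: a [NumJoints, H/stride, W/stride] map with 1.0 at each visible joint."""

    def __init__(self, output_stride: int):
        self.output_stride = output_stride

    def __call__(self, image: Tensor, joints: np.ndarray, mask: np.ndarray) -> np.ndarray:
        rows, cols = mask.shape
        out_rows, out_cols = rows // self.output_stride, cols // self.output_stride
        targets = np.zeros((joints.shape[1], out_rows, out_cols), dtype=np.float32)
        for instance in joints:
            for joint_index, (x, y, visibility) in enumerate(instance):
                xs, ys = int(x) // self.output_stride, int(y) // self.output_stride
                if visibility > 0 and 0 <= xs < out_cols and 0 <= ys < out_rows:
                    targets[joint_index, ys, xs] = 1.0
        return targets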

YoloNASPoseCollateFN

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py
@register_collate_function()
class YoloNASPoseCollateFN:
    def __init__(self, set_image_to_none: bool = True):
        """

        :param set_image_to_none: If True, image and mask properties for each sample will be set to None after collation.
                                  After we collate images from samples into batch individual images are not needed anymore.
                                  Keeping them in sample slows down data transfer time and slows training 2X.
                                  If True, image and mask properties will be set to None after collation.
                                  If False, image and mask properties will be converted to torch tensors and kept in the sample.
        """
        self.set_image_to_none = set_image_to_none

    def __call__(self, batch: List[PoseEstimationSample]) -> Tuple[Tensor, Tuple[Tensor, Tensor, Tensor], Dict]:
        """
        Collate samples into a batch.
        This collate function is compatible with YoloNASPose model

        :param batch: A list of samples from the dataset. Each sample is a tuple of (image, (boxes, joints), extras)
        :return: Tuple of (images, (boxes, joints), extras)
        - images: [Batch, 3, H, W]
        - boxes: [NumInstances, 5], last dimension represents (batch_index, x1, y1, x2, y2) of all boxes in a batch
        - joints: [NumInstances, NumJoints, 4] of all poses in a batch. Last dimension represents (batch_index, x, y, visibility)
        - extras: A dict of extra information per image need for metric computation
        """
        all_images = []
        all_boxes = []
        all_joints = []
        all_crowd_masks = []

        for sample in batch:
            # Generate targets
            boxes, joints, is_crowd = self._get_targets(sample)
            all_boxes.append(boxes)
            all_joints.append(joints)
            all_crowd_masks.append(is_crowd)

            # Convert image & mask to tensors
            # Change image layout from HWC to CHW
            sample.image = torch.from_numpy(np.transpose(sample.image, [2, 0, 1]))
            sample.mask = torch.from_numpy(sample.mask)
            all_images.append(sample.image)

            # Remove image and mask from sample because at this point we don't need them anymore
            if self.set_image_to_none:
                sample.image = None
                sample.mask = None

            # Make sure additional samples are None, so they don't get collated as it causes collate to slow down
            sample.additional_samples = None

        all_images = default_collate(all_images)
        boxes = flat_collate_tensors_with_batch_index(all_boxes)
        joints = flat_collate_tensors_with_batch_index(all_joints)
        is_crowd = flat_collate_tensors_with_batch_index(all_crowd_masks)
        extras = {"gt_samples": batch}
        return all_images, (boxes, joints, is_crowd), extras

    def _get_targets(self, sample: PoseEstimationSample) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Generate targets for training YoloNASPose from a single PoseEstimationSample
        :param sample: Input PoseEstimationSample
        :return:       Tuple of (boxes, joints, is_crowd)
                       - boxes - [NumInstances, 4] - torch tensor of bounding boxes (XYXY) for each pose instance in a sample
                       - joints - [NumInstances, NumJoints, 3] - torch tensor of pose joints for each pose instance in a sample
                       - is_crowd - [NumInstances, 1] - torch tensor of boolean flags indicating if a pose instance is crowd
        """
        if sample.image.shape[:2] != sample.mask.shape[:2]:
            raise ValueError(f"Image and mask should have the same shape {sample.image.shape[:2]} != {sample.mask.shape[:2]}")

        boxes_xyxy = xywh_to_xyxy(sample.bboxes_xywh, image_shape=None)
        is_crowd = sample.is_crowd
        if is_crowd is None:
            is_crowd = np.zeros(len(boxes_xyxy))

        return torch.from_numpy(boxes_xyxy), torch.from_numpy(sample.joints), torch.from_numpy(is_crowd.astype(int).reshape((-1, 1)))

__call__(batch)

Collate samples into a batch. This collate function is compatible with the YoloNASPose model.

Parameters:

Name Type Description Default
batch List[PoseEstimationSample]

A list of samples from the dataset. Each sample is a tuple of (image, (boxes, joints), extras)

required

Returns:

Type Description
Tuple[Tensor, Tuple[Tensor, Tensor, Tensor], Dict]

Tuple of (images, (boxes, joints), extras):
- images: [Batch, 3, H, W]
- boxes: [NumInstances, 5], last dimension represents (batch_index, x1, y1, x2, y2) of all boxes in a batch
- joints: [NumInstances, NumJoints, 4] of all poses in a batch. Last dimension represents (batch_index, x, y, visibility)
- extras: A dict of extra information per image needed for metric computation

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py
def __call__(self, batch: List[PoseEstimationSample]) -> Tuple[Tensor, Tuple[Tensor, Tensor, Tensor], Dict]:
    """
    Collate samples into a batch.
    This collate function is compatible with YoloNASPose model

    :param batch: A list of samples from the dataset. Each sample is a tuple of (image, (boxes, joints), extras)
    :return: Tuple of (images, (boxes, joints), extras)
    - images: [Batch, 3, H, W]
    - boxes: [NumInstances, 5], last dimension represents (batch_index, x1, y1, x2, y2) of all boxes in a batch
    - joints: [NumInstances, NumJoints, 4] of all poses in a batch. Last dimension represents (batch_index, x, y, visibility)
    - extras: A dict of extra information per image need for metric computation
    """
    all_images = []
    all_boxes = []
    all_joints = []
    all_crowd_masks = []

    for sample in batch:
        # Generate targets
        boxes, joints, is_crowd = self._get_targets(sample)
        all_boxes.append(boxes)
        all_joints.append(joints)
        all_crowd_masks.append(is_crowd)

        # Convert image & mask to tensors
        # Change image layout from HWC to CHW
        sample.image = torch.from_numpy(np.transpose(sample.image, [2, 0, 1]))
        sample.mask = torch.from_numpy(sample.mask)
        all_images.append(sample.image)

        # Remove image and mask from sample because at this point we don't need them anymore
        if self.set_image_to_none:
            sample.image = None
            sample.mask = None

        # Make sure additional samples are None, so they don't get collated as it causes collate to slow down
        sample.additional_samples = None

    all_images = default_collate(all_images)
    boxes = flat_collate_tensors_with_batch_index(all_boxes)
    joints = flat_collate_tensors_with_batch_index(all_joints)
    is_crowd = flat_collate_tensors_with_batch_index(all_crowd_masks)
    extras = {"gt_samples": batch}
    return all_images, (boxes, joints, is_crowd), extras

__init__(set_image_to_none=True)

Parameters:

Name Type Description Default
set_image_to_none bool

If True, the image and mask properties of each sample will be set to None after collation. Once the images are collated into a batch, the individual per-sample images are no longer needed; keeping them in the sample slows down data transfer and can slow training by 2x. If False, the image and mask properties will be converted to torch tensors and kept in the sample.

True
Source code in src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py
def __init__(self, set_image_to_none: bool = True):
    """

    :param set_image_to_none: If True, image and mask properties for each sample will be set to None after collation.
                              After we collate images from samples into batch individual images are not needed anymore.
                              Keeping them in sample slows down data transfer time and slows training 2X.
                              If True, image and mask properties will be set to None after collation.
                              If False, image and mask properties will be converted to torch tensors and kept in the sample.
    """
    self.set_image_to_none = set_image_to_none
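A usage sketch for this collate function; train_dataset is an assumed dataset whose __getitem__ returns PoseEstimationSample objects.

from torch.utils.data import DataLoader

# train_dataset (assumed) yields PoseEstimationSample objects
loader = DataLoader(train_dataset, batch_size=8, collate_fn=YoloNASPoseCollateFN(set_image_to_none=True))

images, (boxes, joints, is_crowd), extras = next(iter(loader))
# images: [8, 3, H, W]
# boxes:  [NumInstances, 5]  (batch_index, x1, y1, x2, y2)
# joints: [NumInstances, NumJoints, 4]  (batch_index, x, y, visibility)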

flat_collate_tensors_with_batch_index(labels_batch)

Concatenate tensors along the first dimension and add a sample index as the first element in the last dimension.

Parameters:

Name Type Description Default
labels_batch List[Tensor]

A list of targets per image (each of arbitrary length: [N1, ..., C], [N2, ..., C], [N3, ..., C],...)

required

Returns:

Type Description
Tensor

A single tensor of shape [N1+N2+N3+..., ..., C+1].

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py
def flat_collate_tensors_with_batch_index(labels_batch: List[Tensor]) -> Tensor:
    """
    Concatenate tensors along the first dimension and add a sample index as the first element in the last dimension.

    :param labels_batch: A list of targets per image (each of arbitrary length: [N1, ..., C], [N2, ..., C], [N3, ..., C],...)
    :return:             A single tensor of shape [N1+N2+N3+..., ..., C+1].
    """
    labels_batch_indexed = []
    for i, labels in enumerate(labels_batch):
        batch_column = labels.new_ones(labels.shape[:-1] + (1,)) * i
        labels = torch.cat((batch_column, labels), dim=-1)
        labels_batch_indexed.append(labels)
    return torch.cat(labels_batch_indexed, 0)

undo_flat_collate_tensors_with_batch_index(flat_tensor, batch_size)

Unrolls the flat tensor into a list of tensors per batch item. As the name suggests, it undoes what flat_collate_tensors_with_batch_index does.

Parameters:

Name Type Description Default
flat_tensor Tensor

Tensor of shape [N1+N2+N3+..., ..., C+1].

required
batch_size int

The batch size (Number of items in the batch)

required

Returns:

Type Description
List[Tensor]

List of tensors [N1, ..., C], [N2, ..., C], [N3, ..., C],...

Source code in src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py
def undo_flat_collate_tensors_with_batch_index(flat_tensor: Tensor, batch_size: int) -> List[Tensor]:
    """
    Unrolls the flat tensor into list of tensors per batch item.
    As name suggest it undoes what flat_collate_tensors_with_batch_index does.

    :param flat_tensor: Tensor of shape [N1+N2+N3+..., ..., C+1].
    :param batch_size:  The batch size (Number of items in the batch)
    :return:            List of tensors [N1, ..., C], [N2, ..., C], [N3, ..., C],...
    """
    items = []
    batch_index_roi = [slice(None)] + [0] * (flat_tensor.ndim - 1)
    batch_index = flat_tensor[batch_index_roi]
    for i in range(batch_size):
        mask = batch_index == i
        items.append(flat_tensor[mask][..., 1:])
    return items
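A quick round trip with the two helpers above (shapes are arbitrary; import path inferred from the source file path shown above):

import torch
# Import path inferred from the source file path shown above
from super_gradients.training.datasets.pose_estimation_datasets.yolo_nas_pose_collate_fn import (
    flat_collate_tensors_with_batch_index,
    undo_flat_collate_tensors_with_batch_index,
)

per_image_boxes = [torch.rand(3, 4), torch.rand(0, 4), torch.rand(5, 4)]  # N_i boxes of 4 coords each
flat = flat_collate_tensors_with_batch_index(per_image_boxes)
print(flat.shape)  # torch.Size([8, 5]) - batch index prepended as the first column

restored = undo_flat_collate_tensors_with_batch_index(flat, batch_size=3)
print([t.shape for t in restored])  # [torch.Size([3, 4]), torch.Size([0, 4]), torch.Size([5, 4])]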

ClassBalancedSampler

Bases: WeightedRandomSampler

Source code in src/super_gradients/training/datasets/samplers/class_balanced_sampler.py
@register_sampler(Samplers.CLASS_BALANCED)
class ClassBalancedSampler(WeightedRandomSampler):
    def __init__(
        self,
        dataset: Optional[HasClassesInformation] = None,
        precomputed_factors_file: Optional[str] = None,
        oversample_threshold: Optional[float] = None,
        oversample_aggressiveness: float = 0.5,
        num_samples: Optional[int] = None,
        generator=None,
    ) -> None:
        """
        Wrap WeightedRandomSampler with weights that are computed from the class frequencies of the dataset.
        """

        if dataset is None and precomputed_factors_file is None:
            raise ValueError("`dataset` and `precomputed_factors` cannot be both None.")

        if dataset is not None and precomputed_factors_file is not None:
            # this logic is to simplify `_instantiate_sampler` method.
            warnings.warn("Both `dataset` and `precomputed_factors_file` are passed. `dataset` WILL BE IGNORED!")

        if precomputed_factors_file is not None:
            repeat_factors = ClassBalancer.from_precomputed_sample_repeat_factors(precomputed_factors_file)
        else:
            if not isinstance(dataset, HasClassesInformation):
                raise ValueError(f"`dataset` must be an instance of `{HasClassesInformation.__name__}`.")

            repeat_factors = ClassBalancer.get_sample_repeat_factors(dataset, oversample_threshold, oversample_aggressiveness)

        weights = np.array(repeat_factors) / sum(repeat_factors)

        super().__init__(weights=weights, num_samples=num_samples or len(weights), replacement=True, generator=generator)

__init__(dataset=None, precomputed_factors_file=None, oversample_threshold=None, oversample_aggressiveness=0.5, num_samples=None, generator=None)

Wrap WeightedRandomSampler with weights that are computed from the class frequencies of the dataset.

Source code in src/super_gradients/training/datasets/samplers/class_balanced_sampler.py
def __init__(
    self,
    dataset: Optional[HasClassesInformation] = None,
    precomputed_factors_file: Optional[str] = None,
    oversample_threshold: Optional[float] = None,
    oversample_aggressiveness: float = 0.5,
    num_samples: Optional[int] = None,
    generator=None,
) -> None:
    """
    Wrap WeightedRandomSampler with weights that are computed from the class frequencies of the dataset.
    """

    if dataset is None and precomputed_factors_file is None:
        raise ValueError("`dataset` and `precomputed_factors` cannot be both None.")

    if dataset is not None and precomputed_factors_file is not None:
        # this logic is to simplify `_instantiate_sampler` method.
        warnings.warn("Both `dataset` and `precomputed_factors_file` are passed. `dataset` WILL BE IGNORED!")

    if precomputed_factors_file is not None:
        repeat_factors = ClassBalancer.from_precomputed_sample_repeat_factors(precomputed_factors_file)
    else:
        if not isinstance(dataset, HasClassesInformation):
            raise ValueError(f"`dataset` must be an instance of `{HasClassesInformation.__name__}`.")

        repeat_factors = ClassBalancer.get_sample_repeat_factors(dataset, oversample_threshold, oversample_aggressiveness)

    weights = np.array(repeat_factors) / sum(repeat_factors)

    super().__init__(weights=weights, num_samples=num_samples or len(weights), replacement=True, generator=generator)
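A usage sketch; detection_dataset is an assumed dataset implementing HasClassesInformation so the repeat factors can be derived from it.

from torch.utils.data import DataLoader

# detection_dataset (assumed) implements HasClassesInformation
sampler = ClassBalancedSampler(dataset=detection_dataset, oversample_threshold=0.01)
loader = DataLoader(detection_dataset, batch_size=16, sampler=sampler)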

ClassBalancer

Source code in src/super_gradients/training/datasets/samplers/class_balanced_sampler.py
class ClassBalancer:
    @staticmethod
    def get_sample_repeat_factors(
        class_information_provider: HasClassesInformation,
        oversample_threshold: Optional[float] = None,
        oversample_aggressiveness: float = 0.5,
    ) -> List[float]:
        """
        Oversampling scarce classes from detection dataset, following sampling strategy described in https://arxiv.org/pdf/1908.03195.pdf.

        :param class_information_provider:      An object (probably a dataset) that provides the class information.
        :param oversample_threshold:            A frequency threshold (fraction, 0-1). Classes that are *less frequent* than this threshold will be oversampled.
                                                The default value is None. If None, the median of the class frequencies will be used.
        :param oversample_aggressiveness:       How aggressive the oversampling is. The higher the value, the more aggressive the oversampling is.
                                                The default value is 0.5, and corresponds to the implementation in the paper.
                                                A value of 0.0 corresponds to no oversampling.


        The repeat factor is computed as followed:
        1. For each class c, compute the fraction # of images that contain it (its frequency): :math:`f(c)`
        2. For each class c, compute the category-level repeat factor: :math:`r(c) = max(1, aggressiveness(threshold/f(c)))`
        3. For each image I, compute the image-level repeat factor: :math:`r(I) = max_{c in I} r(c)`

        Returns a list of repeat factors (length = dataset_length). How to read: result[i] is a float, indicates the repeat factor of image i.
        """

        class_information = class_information_provider.get_dataset_classes_information()  # shape = (dataset_length, num_classes)

        # 1. For each category c, compute the fraction # of images that contain it: f(c)
        class_frequencies = np.sum(class_information, axis=0)
        class_frequencies = class_frequencies / len(class_information)

        # 2. For each class c, compute the class-level repeat factor: r(c) = max(1, sqrt(t/f(c)))
        category_repeat = {
            cat_id: cat_repeat
            for cat_id, cat_repeat in enumerate(_default_oversample_heuristic(class_frequencies, oversample_threshold, oversample_aggressiveness))
        }  # dict for ease of debugging

        # 3. For each image I, compute the image-level repeat factor: r(I) = max_{c in I} r(c)
        repeat_factors = list()
        categories = np.arange(class_information.shape[1])
        for sample_cat_freq in class_information:
            cat_ids = categories[sample_cat_freq != 0]
            if len(cat_ids) == 0:  # in case image doesn't have annotations, we will not over-sample nor ignore it
                repeat_factors.append(1.0)
            else:
                repeat_factors.append(max({category_repeat[cat_id] for cat_id in cat_ids}))

        return repeat_factors  # len = dataset_length

    @staticmethod
    def precompute_sample_repeat_factors(
        output_path: str,
        class_information_provider: HasClassesInformation,
        oversample_threshold: Optional[float] = None,
    ):
        repeat_factors: List[float] = ClassBalancer.get_sample_repeat_factors(
            class_information_provider=class_information_provider,
            oversample_threshold=oversample_threshold,
        )

        str_repeat_factors = [np.format_float_positional(rf, trim="0", precision=4) for rf in repeat_factors]

        with open(output_path, "w") as f:
            json.dump(str_repeat_factors, f)

    @staticmethod
    def from_precomputed_sample_repeat_factors(precomputed_path: str) -> List[float]:
        """
        Loads the repeat factors from a precomputed file.
        """
        if not os.path.exists(precomputed_path):
            raise FileNotFoundError(f"`{precomputed_path}` does not exist.")

        with open(precomputed_path, "r") as f:
            loaded = json.load(f)

        return list(map(lambda x: float(x), loaded))

from_precomputed_sample_repeat_factors(precomputed_path) staticmethod

Loads the repeat factors from a precomputed file.

Source code in src/super_gradients/training/datasets/samplers/class_balanced_sampler.py
@staticmethod
def from_precomputed_sample_repeat_factors(precomputed_path: str) -> List[float]:
    """
    Loads the repeat factors from a precomputed file.
    """
    if not os.path.exists(precomputed_path):
        raise FileNotFoundError(f"`{precomputed_path}` does not exist.")

    with open(precomputed_path, "r") as f:
        loaded = json.load(f)

    return list(map(lambda x: float(x), loaded))
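Precomputing the factors once and re-using them later might look like the sketch below; the file name is arbitrary and detection_dataset is again an assumed HasClassesInformation dataset.

# Compute once and persist to JSON (file name is arbitrary)
ClassBalancer.precompute_sample_repeat_factors(
    output_path="repeat_factors.json",
    class_information_provider=detection_dataset,
)

# Later: load the factors directly, or hand the file to the sampler
repeat_factors = ClassBalancer.from_precomputed_sample_repeat_factors("repeat_factors.json")
sampler = ClassBalancedSampler(precomputed_factors_file="repeat_factors.json")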

get_sample_repeat_factors(class_information_provider, oversample_threshold=None, oversample_aggressiveness=0.5) staticmethod

Oversampling scarce classes from detection dataset, following sampling strategy described in https://arxiv.org/pdf/1908.03195.pdf.

Parameters:

Name Type Description Default
class_information_provider HasClassesInformation

An object (probably a dataset) that provides the class information.

required
oversample_threshold Optional[float]

A frequency threshold (fraction, 0-1). Classes that are less frequent than this threshold will be oversampled. The default value is None. If None, the median of the class frequencies will be used.

None
oversample_aggressiveness float

How aggressive the oversampling is. The higher the value, the more aggressive the oversampling is. The default value is 0.5 and corresponds to the implementation in the paper. A value of 0.0 corresponds to no oversampling.

0.5

The repeat factor is computed as follows:
1. For each class c, compute the fraction of images that contain it (its frequency): f(c)
2. For each class c, compute the category-level repeat factor: r(c) = max(1, (threshold / f(c))^aggressiveness)
3. For each image I, compute the image-level repeat factor: r(I) = max_{c in I} r(c)

Returns:

Type Description
List[float]

A list of repeat factors (length = dataset_length). How to read: result[i] is a float indicating the repeat factor of image i.
Source code in src/super_gradients/training/datasets/samplers/class_balanced_sampler.py
@staticmethod
def get_sample_repeat_factors(
    class_information_provider: HasClassesInformation,
    oversample_threshold: Optional[float] = None,
    oversample_aggressiveness: float = 0.5,
) -> List[float]:
    """
    Oversampling scarce classes from detection dataset, following sampling strategy described in https://arxiv.org/pdf/1908.03195.pdf.

    :param class_information_provider:      An object (probably a dataset) that provides the class information.
    :param oversample_threshold:            A frequency threshold (fraction, 0-1). Classes that are *less frequent* than this threshold will be oversampled.
                                            The default value is None. If None, the median of the class frequencies will be used.
    :param oversample_aggressiveness:       How aggressive the oversampling is. The higher the value, the more aggressive the oversampling is.
                                            The default value is 0.5, and corresponds to the implementation in the paper.
                                            A value of 0.0 corresponds to no oversampling.


    The repeat factor is computed as followed:
    1. For each class c, compute the fraction # of images that contain it (its frequency): :math:`f(c)`
    2. For each class c, compute the category-level repeat factor: :math:`r(c) = max(1, aggressiveness(threshold/f(c)))`
    3. For each image I, compute the image-level repeat factor: :math:`r(I) = max_{c in I} r(c)`

    Returns a list of repeat factors (length = dataset_length). How to read: result[i] is a float, indicates the repeat factor of image i.
    """

    class_information = class_information_provider.get_dataset_classes_information()  # shape = (dataset_length, num_classes)

    # 1. For each category c, compute the fraction # of images that contain it: f(c)
    class_frequencies = np.sum(class_information, axis=0)
    class_frequencies = class_frequencies / len(class_information)

    # 2. For each class c, compute the class-level repeat factor: r(c) = max(1, sqrt(t/f(c)))
    category_repeat = {
        cat_id: cat_repeat
        for cat_id, cat_repeat in enumerate(_default_oversample_heuristic(class_frequencies, oversample_threshold, oversample_aggressiveness))
    }  # dict for ease of debugging

    # 3. For each image I, compute the image-level repeat factor: r(I) = max_{c in I} r(c)
    repeat_factors = list()
    categories = np.arange(class_information.shape[1])
    for sample_cat_freq in class_information:
        cat_ids = categories[sample_cat_freq != 0]
        if len(cat_ids) == 0:  # in case image doesn't have annotations, we will not over-sample nor ignore it
            repeat_factors.append(1.0)
        else:
            repeat_factors.append(max({category_repeat[cat_id] for cat_id in cat_ids}))

    return repeat_factors  # len = dataset_length
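A toy numeric walk-through of the three steps above. The exact oversampling heuristic (_default_oversample_heuristic) is not shown on this page; the sketch assumes r(c) = max(1, (threshold / f(c)) ** aggressiveness), which reduces to the paper's sqrt rule at the default aggressiveness of 0.5.

import numpy as np

# 4 images x 3 classes: entry [i, c] > 0 means image i contains class c
class_information = np.array([[1, 0, 0],
                              [1, 0, 0],
                              [1, 1, 0],
                              [1, 0, 1]])

# Step 1: class frequencies f(c) = fraction of images containing class c
f = class_information.sum(axis=0) / len(class_information)   # [1.0, 0.25, 0.25]

# Step 2: class-level repeat factors (assumed heuristic, see lead-in)
threshold, aggressiveness = 0.5, 0.5
r_c = np.maximum(1.0, (threshold / f) ** aggressiveness)      # [1.0, ~1.41, ~1.41]

# Step 3: image-level repeat factor = max over the classes present in each image
r_i = [max(r_c[row > 0]) for row in class_information]
print(np.round(r_i, 2))  # [1.   1.   1.41 1.41] - images with rare classes are repeated more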

DatasetFromSampler

Bases: Dataset

Source code in src/super_gradients/training/datasets/samplers/distributed_sampler_wrapper.py
class DatasetFromSampler(Dataset):
    def __init__(self, sampler: Sampler):
        self.sampler = sampler
        self.sampler_list = None

    def __getitem__(self, index: int):
        if self.sampler_list is None:  # we don't instantiate the list in __init__ because want to shuffle first (happens in DistributedSamplerWrapper.__iter__)
            self.sampler_list = list(self.sampler)
        return self.sampler_list[index]

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.sampler)

__len__()

Returns: int: length of the dataset

Source code in src/super_gradients/training/datasets/samplers/distributed_sampler_wrapper.py
def __len__(self) -> int:
    """
    Returns:
        int: length of the dataset
    """
    return len(self.sampler)

DistributedSamplerWrapper

Bases: DistributedSampler

Wrapper over Sampler for distributed training. Allows you to use any sampler in distributed mode.

It is especially useful in conjunction with torch.nn.parallel.DistributedDataParallel. In such case, each process can pass a DistributedSamplerWrapper instance as a DataLoader sampler, and load a subset of subsampled data of the original dataset that is exclusive to it.

Note: Sampler is assumed to be of constant size.

Source code in src/super_gradients/training/datasets/samplers/distributed_sampler_wrapper.py
class DistributedSamplerWrapper(DistributedSampler):
    """
    Wrapper over `Sampler` for distributed training.
    Allows you to use any sampler in distributed mode.

    It is especially useful in conjunction with
    `torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSamplerWrapper instance as a DataLoader
    sampler, and load a subset of subsampled data of the original dataset
    that is exclusive to it.

    .. note::
        Sampler is assumed to be of constant size.
    """

    def __init__(
        self,
        sampler,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
    ):
        """

        Args:
            sampler: Sampler used for subsampling
            num_replicas (int, optional): Number of processes participating in
              distributed training
            rank (int, optional): Rank of the current process
              within ``num_replicas``
            shuffle (bool, optional): If true (default),
              sampler will shuffle the indices
        """
        super(DistributedSamplerWrapper, self).__init__(
            DatasetFromSampler(sampler),
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
        )
        self.sampler = sampler

    def __iter__(self):

        self.dataset = DatasetFromSampler(self.sampler)
        indexes_of_indexes = super().__iter__()
        subsampler_indexes = self.dataset
        return iter(itemgetter(*indexes_of_indexes)(subsampler_indexes))

__init__(sampler, num_replicas=None, rank=None, shuffle=True)

Args:
sampler: Sampler used for subsampling
num_replicas (int, optional): Number of processes participating in distributed training
rank (int, optional): Rank of the current process within num_replicas
shuffle (bool, optional): If true (default), sampler will shuffle the indices

Source code in src/super_gradients/training/datasets/samplers/distributed_sampler_wrapper.py
def __init__(
    self,
    sampler,
    num_replicas: Optional[int] = None,
    rank: Optional[int] = None,
    shuffle: bool = True,
):
    """

    Args:
        sampler: Sampler used for subsampling
        num_replicas (int, optional): Number of processes participating in
          distributed training
        rank (int, optional): Rank of the current process
          within ``num_replicas``
        shuffle (bool, optional): If true (default),
          sampler will shuffle the indices
    """
    super(DistributedSamplerWrapper, self).__init__(
        DatasetFromSampler(sampler),
        num_replicas=num_replicas,
        rank=rank,
        shuffle=shuffle,
    )
    self.sampler = sampler
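A usage sketch: wrapping a non-distributed sampler so each DDP rank gets its own shard. my_dataset is assumed, the weights are illustrative, and a torch.distributed process group is assumed to be initialized (otherwise pass num_replicas and rank explicitly).

import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

weights = torch.rand(1000)  # illustrative per-sample weights
base_sampler = WeightedRandomSampler(weights=weights, num_samples=1000, replacement=True)

sampler = DistributedSamplerWrapper(base_sampler)  # num_replicas/rank taken from the process group
loader = DataLoader(my_dataset, batch_size=32, sampler=sampler)

for epoch in range(10):
    sampler.set_epoch(epoch)  # inherited from DistributedSampler; reshuffles deterministically per epoch
    for batch in loader:
        ...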

RepeatAugSampler

Bases: Sampler

Sampler that restricts data loading to a subset of the dataset for distributed training, with repeated augmentation. It ensures that each augmented version of a sample will be visible to a different process (GPU). Heavily based on torch.utils.data.DistributedSampler. This sampler was taken from https://github.com/facebookresearch/deit/blob/0c4b8f60/samplers.py (Copyright (c) 2015-present, Facebook, Inc.).

Below code is modified from: https://github.com/rwightman/pytorch-image-models/blame/master/timm/data/distributed_sampler.py

Note this sampler is currently supported only for DDP training.

Arguments:
dataset (torch.utils.data.Dataset): dataset to sample from.
num_replicas (int): Number of dataset replicas, equal to world_size when set to 0 (default=0).
shuffle (bool): whether to shuffle the dataset indices (default=True).
num_repeats (int): amount of repetitions for each example.
selected_round (int): When > 0, the number of samples to select per epoch for each rank is determined by
    int(math.floor(len(self.dataset) // selected_round * selected_round / selected_ratio))
    (default=256).
selected_ratio (int): ratio to reduce selected samples by; num_replicas if 0.
Source code in src/super_gradients/training/datasets/samplers/repeated_augmentation_sampler.py
@register_sampler(Samplers.REPEAT_AUG)
class RepeatAugSampler(Sampler):
    """
    Sampler that restricts data loading to a subset of the dataset for distributed,
    with repeated augmentation.
    It ensures that each augmented version of a sample will be visible to a
    different process (GPU). Heavily based on torch.utils.data.DistributedSampler
    This sampler was taken from https://github.com/facebookresearch/deit/blob/0c4b8f60/samplers.py
    Copyright (c) 2015-present, Facebook, Inc.

    Below code is modified from:
     https://github.com/rwightman/pytorch-image-models/blame/master/timm/data/distributed_sampler.py

    Note this sampler is currently supported only for DDP training.

    Arguments:
        dataset (torch.utils.data.Dataset): dataset to sample from.
        num_replicas (int): Number of dataset replicas, equals to world_size when set to 0 (default=0).
        shuffle (bool): whether to shuffle the dataset indices (default=True).
        num_repeats (int): amount of repetitions for each example.
        selected_round (int): When > 0, the number of samples to select per epoch for each rank is determined by

            int(math.floor(len(self.dataset) // selected_round * selected_round / selected_ratio))

            (default=256)

        selected_ratio (int): ratio to reduce selected samples by, num_replicas if 0.

    """

    def __init__(
        self,
        dataset: torch.utils.data.Dataset,
        num_replicas: int = None,
        rank: int = None,
        shuffle: bool = True,
        num_repeats: int = 3,
        selected_round: int = 256,
        selected_ratio: int = 0,
    ):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.shuffle = shuffle
        self.num_repeats = num_repeats
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * num_repeats / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
        # Determine the number of samples to select per epoch for each rank.
        # num_selected logic defaults to be the same as original RASampler impl, but this one can be tweaked
        # via selected_ratio and selected_round args.
        selected_ratio = selected_ratio or num_replicas  # ratio to reduce selected samples by, num_replicas if 0

        if selected_round:
            self.num_selected_samples = int(math.floor(len(self.dataset) // selected_round * selected_round / selected_ratio))
        else:
            self.num_selected_samples = int(math.ceil(len(self.dataset) / selected_ratio))

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)
        if self.shuffle:
            indices = torch.randperm(len(self.dataset), generator=g)
        else:
            indices = torch.arange(start=0, end=len(self.dataset))

        # produce repeats e.g. [0, 0, 0, 1, 1, 1, 2, 2, 2....]
        if isinstance(self.num_repeats, float) and not self.num_repeats.is_integer():
            # resample for repeats w/ non-integer ratio
            repeat_size = math.ceil(self.num_repeats * len(self.dataset))
            indices = indices[torch.tensor([int(i // self.num_repeats) for i in range(repeat_size)])]
        else:
            indices = torch.repeat_interleave(indices, repeats=int(self.num_repeats), dim=0)
        indices = indices.tolist()  # leaving as tensor thrashes dataloader memory
        # add extra samples to make it evenly divisible
        padding_size = self.total_size - len(indices)
        if padding_size > 0:
            indices += indices[:padding_size]
        assert len(indices) == self.total_size

        # subsample per rank
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        # return up to num selected samples
        return iter(indices[: self.num_selected_samples])

    def __len__(self):
        return self.num_selected_samples

    def set_epoch(self, epoch):
        self.epoch = epoch
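A usage sketch under DDP; my_dataset is assumed and torch.distributed is assumed to be initialized. The comment at the end works through num_selected_samples for a concrete dataset size.

from torch.utils.data import DataLoader

sampler = RepeatAugSampler(my_dataset, num_repeats=3, selected_round=256)
loader = DataLoader(my_dataset, batch_size=64, sampler=sampler, drop_last=True)

for epoch in range(300):
    sampler.set_epoch(epoch)  # re-seeds the per-epoch shuffle
    for batch in loader:
        ...

# With len(my_dataset) = 10000, num_replicas = 4 and the defaults above:
#   num_samples          = ceil(10000 * 3 / 4)           = 7500 (per-rank pool)
#   num_selected_samples = floor(10000 // 256 * 256 / 4) = 2496 indices yielded per rank per epoch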

CityscapesConcatDataset

Bases: ConcatDataset

Support building a Cityscapes dataset which includes multiple groups of samples from several list files, e.g. to initiate a trainval dataset:

trainval_set = CityscapesConcatDataset(
    root_dir='/data', list_files=['lists/train.lst', 'lists/val.lst'], labels_csv_path='lists/labels.csv', ...
)

or to combine the train set with the AutoLabelling set:

train_al_set = CityscapesConcatDataset(
    root_dir='/data', list_files=['lists/train.lst', 'lists/auto_labelling.lst'], labels_csv_path='lists/labels.csv', ...
)

Source code in src/super_gradients/training/datasets/segmentation_datasets/cityscape_segmentation.py
@register_dataset(Datasets.CITYSCAPES_CONCAT_DATASET)
class CityscapesConcatDataset(ConcatDataset):
    """
    Support building a Cityscapes dataset which includes multiple group of samples from several list files.
    i.e to initiate a trainval dataset:
    >>> trainval_set = CityscapesConcatDataset(
    >>>    root_dir='/data', list_files=['lists/train.lst', 'lists/val.lst'], labels_csv_path='lists/labels.csv', ...
    >>> )

    i.e to initiate a combination of the train-set with AutoLabelling-set:
    >>> train_al_set = CityscapesConcatDataset(
    >>>    root_dir='/data', list_files=['lists/train.lst', 'lists/auto_labelling.lst'], labels_csv_path='lists/labels.csv', ...
    >>> )
    """

    def __init__(self, root_dir: str, list_files: List[str], labels_csv_path: str, **kwargs):
        """
        :param root_dir:        Absolute path to root directory of the dataset.
        :param list_files:      List of list files that contains names of images to load,
                                line format: <image_path> <label_path>. The path is relative to root.
        :param labels_csv_path: Path to csv file, with labels metadata and mapping. The path is relative to root.
        :param kwargs:          Any hyper params required for the dataset, i.e img_size, crop_size, cache_images
        """
        super().__init__(
            datasets=[
                CityscapesDataset(
                    root_dir=root_dir,
                    list_file=list_file,
                    labels_csv_path=labels_csv_path,
                    **kwargs,
                )
                for list_file in list_files
            ]
        )

__init__(root_dir, list_files, labels_csv_path, **kwargs)

Parameters:

Name Type Description Default
root_dir str

Absolute path to root directory of the dataset.

required
list_files List[str]

List of list files that contain names of images to load, line format: <image_path> <label_path>. The path is relative to root.

required
labels_csv_path str

Path to csv file, with labels metadata and mapping. The path is relative to root.

required
kwargs

Any hyper params required for the dataset, i.e img_size, crop_size, cache_images

{}
Source code in src/super_gradients/training/datasets/segmentation_datasets/cityscape_segmentation.py
def __init__(self, root_dir: str, list_files: List[str], labels_csv_path: str, **kwargs):
    """
    :param root_dir:        Absolute path to root directory of the dataset.
    :param list_files:      List of list files that contains names of images to load,
                            line format: <image_path> <label_path>. The path is relative to root.
    :param labels_csv_path: Path to csv file, with labels metadata and mapping. The path is relative to root.
    :param kwargs:          Any hyper params required for the dataset, i.e img_size, crop_size, cache_images
    """
    super().__init__(
        datasets=[
            CityscapesDataset(
                root_dir=root_dir,
                list_file=list_file,
                labels_csv_path=labels_csv_path,
                **kwargs,
            )
            for list_file in list_files
        ]
    )

CityscapesDataset

Bases: SegmentationDataSet

CityscapesDataset - Segmentation Data Set Class for Cityscapes Segmentation Data Set, main resolution of dataset: (2048 x 1024). Not all the original labels are used for training and evaluation, according to cityscape paper: "Classes that are too rare are excluded from our benchmark, leaving 19 classes for evaluation". For more details about the dataset labels format see: https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py

To use this Dataset you need to:

  • Download cityscape dataset (https://www.cityscapes-dataset.com/downloads/)

root_dir (in recipe default to /data/cityscapes)
    ├─── gtFine
    │       ├── test
    │       │     ├── berlin
    │       │     │   ├── berlin_000000_000019_gtFine_color.png
    │       │     │   ├── berlin_000000_000019_gtFine_instanceIds.png
    │       │     │   └── ...
    │       │     ├── bielefeld
    │       │     │   └── ...
    │       │     └── ...
    │       ├─── train
    │       │     └── ...
    │       └─── val
    │             └── ...
    └─── leftImg8bit
            ├── test
            │     └── ...
            ├─── train
            │     └── ...
            └─── val
                  └── ...

  • Download metadata folder (https://deci-pretrained-models.s3.amazonaws.com/cityscape_lists.zip)

lists
    ├── labels.csv
    ├── test.lst
    ├── train.lst
    ├── trainval.lst
    └── val.lst

  • Move Metadata folder to the Cityscape folder

root_dir (in recipe default to /data/cityscapes)
    ├─── gtFine
    │      └── ...
    ├─── leftImg8bit
    │      └── ...
    └─── lists
           └── ...

Example: >> CityscapesDataset(root_dir='.../root_dir', list_file='lists/train.lst', labels_csv_path='lists/labels.csv', ...)

Source code in src/super_gradients/training/datasets/segmentation_datasets/cityscape_segmentation.py
@register_dataset(Datasets.CITYSCAPES_DATASET)
class CityscapesDataset(SegmentationDataSet):
    """
    CityscapesDataset - Segmentation Data Set Class for Cityscapes Segmentation Data Set,
    main resolution of dataset: (2048 x 1024).
    Not all the original labels are used for training and evaluation, according to cityscape paper:
    "Classes that are too rare are excluded from our benchmark, leaving 19 classes for evaluation".
    For more details about the dataset labels format see:
    https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py

    To use this Dataset you need to:

    - Download cityscape dataset (https://www.cityscapes-dataset.com/downloads/)

        root_dir (in recipe default to /data/cityscapes)
            ├─── gtFine
            │       ├── test
            │       │     ├── berlin
            │       │     │   ├── berlin_000000_000019_gtFine_color.png
            │       │     │   ├── berlin_000000_000019_gtFine_instanceIds.png
            │       │     │   └── ...
            │       │     ├── bielefeld
            │       │     │   └── ...
            │       │     └── ...
            │       ├─── train
            │       │     └── ...
            │       └─── val
            │             └── ...
            └─── leftImg8bit
                    ├── test
                    │     └── ...
                    ├─── train
                    │     └── ...
                    └─── val
                          └── ...

    - Download metadata folder (https://deci-pretrained-models.s3.amazonaws.com/cityscape_lists.zip)

        lists
            ├── labels.csv
            ├── test.lst
            ├── train.lst
            ├── trainval.lst
            └── val.lst

    - Move Metadata folder to the Cityscape folder

        root_dir (in recipe default to /data/cityscapes)
            ├─── gtFine
            │      └── ...
            ├─── leftImg8bit
            │      └── ...
            └─── lists
                   └── ...

    Example:
        >> CityscapesDataset(root_dir='.../root_dir', list_file='lists/train.lst', labels_csv_path='lists/labels.csv', ...)
    """

    def __init__(self, root_dir: str, list_file: str, labels_csv_path: str, **kwargs):
        """
        :param root_dir:        Absolute path to root directory of the dataset.
        :param list_file:       List file that contains names of images to load, line format: <image_path> <label_path>. The path is relative to root.
        :param labels_csv_path: Path to csv file, with labels metadata and mapping. The path is relative to root.
        :param kwargs:          Any hyper params required for the dataset, i.e img_size, crop_size, cache_images
        """

        self.root_dir = root_dir
        super().__init__(root_dir, list_file=list_file, **kwargs)
        # labels dataframe for labels metadata.
        self.labels_data = np.recfromcsv(os.path.join(self.root_dir, labels_csv_path), dtype="<i8,U20,<i8,<i8,U12,<i8,?,?,U7", comments="&")
        # map vector to map ground-truth labels to train labels
        self.labels_map = self.labels_data.field("trainid")
        # class names
        self.classes = self.labels_data.field("name")[np.logical_not(self.labels_data.field("ignoreineval"))].tolist()
        # color palette for visualization
        self.train_id_color_palette = self._create_color_palette()

    def _generate_samples_and_targets(self):
        """
        override _generate_samples_and_targets function, to parse list file.
        line format of list file: <image_path> <label_path>
        """
        with open(os.path.join(self.root_dir, self.list_file_path)) as f:
            img_list = [line.strip().split() for line in f]
        for image_path, label_path in img_list:
            self.samples_targets_tuples_list.append((os.path.join(self.root, image_path), os.path.join(self.root, label_path)))
        super(CityscapesDataset, self)._generate_samples_and_targets()

    def target_loader(self, label_path: str) -> Image:
        """
        Override target_loader function, load the labels mask image.
            :param label_path:  Path to the label image.
            :return:                     The mask image created from the array, with converted class labels.
        """
        # assert that is a png file, other file types might alter the class labels value.
        assert os.path.splitext(label_path)[-1].lower() == ".png"

        label = cv2.imread(label_path, cv2.IMREAD_GRAYSCALE)
        # map ground-truth ids to train ids
        label = self.labels_map[label].astype(np.uint8)
        return Image.fromarray(label, "L")

    def _create_color_palette(self):
        """
        Create color pallete for visualizing the segmentation masks
        :return: list of rgb color values
        """
        palette = []
        hex_colors = self.labels_data.field("color")[np.logical_not(self.labels_data.field("ignoreineval"))].tolist()

        for hex_color in hex_colors:
            rgb_color = ImageColor.getcolor(hex_color, "RGB")
            palette += [x for x in rgb_color]

        return palette

    def get_train_ids_color_palette(self):
        return self.train_id_color_palette

    def __getitem__(self, index):
        sample, target = super(CityscapesDataset, self).__getitem__(index)
        target[target == 255] = CITYSCAPES_IGNORE_LABEL
        return sample, target

    @property
    def _original_dataset_image_shape(self) -> Tuple[int, int]:
        """
        returns image shape when data set contains images of uniform shape.
        """
        return 1024, 2048

__init__(root_dir, list_file, labels_csv_path, **kwargs)

Parameters:

Name Type Description Default
root_dir str

Absolute path to root directory of the dataset.

required
list_file str

List file that contains names of images to load, line format: <image_path> <label_path>. The path is relative to root.

required
labels_csv_path str

Path to csv file, with labels metadata and mapping. The path is relative to root.

required
kwargs

Any hyper params required for the dataset, i.e img_size, crop_size, cache_images

{}
Source code in src/super_gradients/training/datasets/segmentation_datasets/cityscape_segmentation.py
def __init__(self, root_dir: str, list_file: str, labels_csv_path: str, **kwargs):
    """
    :param root_dir:        Absolute path to root directory of the dataset.
    :param list_file:       List file that contains names of images to load, line format: <image_path> <label_path>. The path is relative to root.
    :param labels_csv_path: Path to csv file, with labels metadata and mapping. The path is relative to root.
    :param kwargs:          Any hyper params required for the dataset, i.e img_size, crop_size, cache_images
    """

    self.root_dir = root_dir
    super().__init__(root_dir, list_file=list_file, **kwargs)
    # labels dataframe for labels metadata.
    self.labels_data = np.recfromcsv(os.path.join(self.root_dir, labels_csv_path), dtype="<i8,U20,<i8,<i8,U12,<i8,?,?,U7", comments="&")
    # map vector to map ground-truth labels to train labels
    self.labels_map = self.labels_data.field("trainid")
    # class names
    self.classes = self.labels_data.field("name")[np.logical_not(self.labels_data.field("ignoreineval"))].tolist()
    # color palette for visualization
    self.train_id_color_palette = self._create_color_palette()

target_loader(label_path)

Override of the target_loader function: loads the labels mask image and maps the ground-truth ids to train ids. :param label_path: Path to the label image. :return: The mask image created from the array, with converted class labels.

Source code in src/super_gradients/training/datasets/segmentation_datasets/cityscape_segmentation.py
def target_loader(self, label_path: str) -> Image:
    """
    Override target_loader function, load the labels mask image.
        :param label_path:  Path to the label image.
        :return:                     The mask image created from the array, with converted class labels.
    """
    # assert that is a png file, other file types might alter the class labels value.
    assert os.path.splitext(label_path)[-1].lower() == ".png"

    label = cv2.imread(label_path, cv2.IMREAD_GRAYSCALE)
    # map ground-truth ids to train ids
    label = self.labels_map[label].astype(np.uint8)
    return Image.fromarray(label, "L")
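The id remapping inside target_loader is a single NumPy fancy-indexing step: every raw pixel value is used as an index into the trainId vector loaded from labels.csv. A standalone sketch of the same idea, with made-up ids and mappings:

import numpy as np

# Made-up mapping vector: position = raw label id, value = train id (255 marks ignored classes).
labels_map = np.full(34, 255, dtype=np.int64)
labels_map[7] = 0    # e.g. "road"
labels_map[26] = 13  # e.g. "car"

raw_label = np.array([[7, 26], [0, 7]], dtype=np.uint8)  # toy 2x2 ground-truth mask
train_label = labels_map[raw_label].astype(np.uint8)     # per-pixel remap via fancy indexing
print(train_label)
# [[  0  13]
#  [255   0]]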

CoCoSegmentationDataSet

Bases: SegmentationDataSet

Segmentation Data Set Class for COCO 2017 Segmentation Data Set

To use this Dataset you need to:

- Download coco dataset:
    annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
    train2017: http://images.cocodataset.org/zips/train2017.zip
    val2017: http://images.cocodataset.org/zips/val2017.zip

- Unzip and organize it as below:
    coco
    ├── annotations
    │      ├─ instances_train2017.json
    │      ├─ instances_val2017.json
    │      └─ ...
    └── images
        ├── train2017
        │   ├─ 000000000001.jpg
        │   └─ ...
        └── val2017
            └─ ...

- Instantiate the dataset:
    >> train_set = CoCoSegmentationDataSet(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
    >> valid_set = CoCoSegmentationDataSet(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
Source code in src/super_gradients/training/datasets/segmentation_datasets/coco_segmentation.py
@register_dataset(Datasets.COCO_SEGMENTATION_DATASET)
class CoCoSegmentationDataSet(SegmentationDataSet):
    """
    Segmentation Data Set Class for COCO 2017 Segmentation Data Set

    To use this Dataset you need to:

        - Download coco dataset:
            annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
            train2017: http://images.cocodataset.org/zips/train2017.zip
            val2017: http://images.cocodataset.org/zips/val2017.zip

        - Unzip and organize it as below:
            coco
            ├── annotations
            │      ├─ instances_train2017.json
            │      ├─ instances_val2017.json
            │      └─ ...
            └── images
                ├── train2017
                │   ├─ 000000000001.jpg
                │   └─ ...
                └── val2017
                    └─ ...

        - Instantiate the dataset:
            >> train_set = CoCoSegmentationDataSet(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
            >> valid_set = CoCoSegmentationDataSet(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
    """

    def __init__(self, root_dir: str, dataset_classes_inclusion_tuples_list: list = None, *args, **kwargs):
        # THERE ARE 91 CLASSES, INCLUDING BACKGROUND - BUT WE ENABLE THE USAGE OF SUBCLASSES, TO PARTIALLY USE THE DATA
        self.dataset_classes_inclusion_tuples_list = dataset_classes_inclusion_tuples_list or COCO_DEFAULT_CLASSES_TUPLES_LIST

        self.root_dir = root_dir
        super().__init__(root_dir, *args, **kwargs)

        _, class_names = zip(*self.dataset_classes_inclusion_tuples_list)
        self.classes = class_names

    def _generate_samples_and_targets(self):
        """
        _generate_samples_and_targets
        """
        # FIRST OF ALL LOAD ALL OF THE ANNOTATIONS, AND CREATE THE PATH FOR THE PRE-PROCESSED MASKS
        self.annotations_file_path = os.path.join(self.root, self.targets_sub_directory, self.list_file_path)
        self.coco = COCO(self.annotations_file_path)

        # USE SUB-CLASSES OF THE ENTIRE COCO DATA SET, INSTEAD ALL OF THE DATA -> HIGHLY RELEVANT FOR TRANSFER LEARNING
        sub_dataset_image_ids_file_path = self.annotations_file_path.replace("json", "pth")

        if os.path.exists(sub_dataset_image_ids_file_path):
            self.relevant_image_ids = torch.load(sub_dataset_image_ids_file_path)
        else:
            self.relevant_image_ids = self._sub_dataset_creation(sub_dataset_image_ids_file_path)

        for relevant_image_id in self.relevant_image_ids:
            img_metadata = self.coco.loadImgs(relevant_image_id)[0]
            image_path = os.path.join(self.root, self.samples_sub_directory, img_metadata["file_name"])
            mask_metadata_tuple = (relevant_image_id, img_metadata["height"], img_metadata["width"])
            self.samples_targets_tuples_list.append((image_path, mask_metadata_tuple))

        super(CoCoSegmentationDataSet, self)._generate_samples_and_targets()

    def target_loader(self, mask_metadata_tuple) -> Image:
        """
        target_loader
            :param mask_metadata_tuple:  A tuple of (coco_image_id, original_image_height, original_image_width)
            :return:                     The mask image created from the array
        """
        coco_image_id, original_image_h, original_image_w = mask_metadata_tuple
        coco_annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=coco_image_id))

        mask = self._generate_coco_segmentation_mask(coco_annotations, original_image_h, original_image_w)
        return Image.fromarray(mask)

    def _generate_coco_segmentation_mask(self, target_coco_annotations, h, w):
        """
        _generate_segmentation_mask - Extracts a segmentation mask
            :param target_coco_annotations:
            :param h:
            :param w:
            :return:
        """
        mask = np.zeros((h, w), dtype=np.uint8)

        for i, instance in enumerate(target_coco_annotations):
            rle = pycocotools_mask.frPyObjects(instance["segmentation"], h, w)
            coco_segementation_mask = pycocotools_mask.decode(rle)

            if not self.dataset_classes_inclusion_tuples_list:
                # NO CLASSES WERE SELECTED FROM COCO'S 91 CLASSES - ERROR
                raise EmptyCoCoClassesSelectionException
            else:
                # FILTER OUT ALL OF THE MASKS OF INSTANCES THAT ARE NOT IN THE SUB-DATASET CLASSES
                class_category = instance["category_id"]

                sub_classes_category_ids, _ = map(list, zip(*self.dataset_classes_inclusion_tuples_list))
                if class_category not in sub_classes_category_ids:
                    continue

                class_index = sub_classes_category_ids.index(class_category)
                if len(coco_segementation_mask.shape) < 3:
                    mask[:, :] += (mask == 0) * (coco_segementation_mask * class_index)
                else:
                    mask[:, :] += (mask == 0) * (((np.sum(coco_segementation_mask, axis=2)) > 0) * class_index).astype(np.uint8)

        return mask

    def _sub_dataset_creation(self, sub_dataset_image_ids_file_path) -> list:
        """
        _sub_dataset_creation - This method creates the segmentation annotations for coco using
                                self._generate_segmentation_mask that uses the sub-classes inclusion tuple to keep only
                                the annotations that are relevant to the sub-classes selected when instantiating the class
            :param  sub_dataset_image_ids_file_path: The path to save the sub-dataset in for future loading
            :return:            All of the ids with enough pixel data after the sub-classing
        """
        print("Creating sub-dataset , this will take a while but don't worry, it only runs once and caches the results")
        all_coco_image_ids = list(self.coco.imgs.keys())
        sub_dataset_image_ids = []

        with tqdm(all_coco_image_ids, desc="Generating sub-dataset image ids") as tbar:
            for i, img_id in enumerate(tbar):
                coco_target_annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
                img_metadata = self.coco.loadImgs(img_id)[0]

                mask = self._generate_coco_segmentation_mask(coco_target_annotations, img_metadata["height"], img_metadata["width"])

                # MAKE SURE THERE IS ENOUGH INPUT IN THE IMAGE (MORE THAN 1K PIXELS) AFTER SUB-CLASSES FILTRATION
                if (mask > 0).sum() > 1000:
                    sub_dataset_image_ids.append(img_id)

                tbar.set_description("Processed images: {}/{}, generated {} qualified images".format(i, len(all_coco_image_ids), len(sub_dataset_image_ids)))
        print("Number of images in sub-dataset: ", len(sub_dataset_image_ids))
        torch.save(sub_dataset_image_ids, sub_dataset_image_ids_file_path)
        return sub_dataset_image_ids

    @property
    def _original_dataset_image_shape(self) -> Tuple[int, int]:
        """
        returns image shape when data set contains images of uniform shape.
        """
        return 512, 512

target_loader(mask_metadata_tuple)

target_loader :param mask_metadata_tuple: A tuple of (coco_image_id, original_image_height, original_image_width) :return: The mask image created from the array

Source code in src/super_gradients/training/datasets/segmentation_datasets/coco_segmentation.py
def target_loader(self, mask_metadata_tuple) -> Image:
    """
    target_loader
        :param mask_metadata_tuple:  A tuple of (coco_image_id, original_image_height, original_image_width)
        :return:                     The mask image created from the array
    """
    coco_image_id, original_image_h, original_image_w = mask_metadata_tuple
    coco_annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=coco_image_id))

    mask = self._generate_coco_segmentation_mask(coco_annotations, original_image_h, original_image_w)
    return Image.fromarray(mask)
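The heavy lifting happens in _generate_coco_segmentation_mask (shown in the class source above): each annotation is decoded with pycocotools, and the category's index in the inclusion-tuples list is written into pixels that are still background. A standalone sketch of that step, with an illustrative polygon and class index:

import numpy as np
from pycocotools import mask as pycocotools_mask

h, w = 64, 64
# Illustrative polygon annotation (flat [x1, y1, x2, y2, ...] list) covering a small square.
segmentation = [[10.0, 10.0, 30.0, 10.0, 30.0, 30.0, 10.0, 30.0]]

rle = pycocotools_mask.frPyObjects(segmentation, h, w)  # polygon(s) -> run-length encoding(s)
binary = pycocotools_mask.decode(rle)                   # (h, w, n_polygons) uint8 array
binary = (np.sum(binary, axis=2) > 0).astype(np.uint8)  # collapse into a single binary mask

class_index = 2                                         # illustrative position in the inclusion list
mask = np.zeros((h, w), dtype=np.uint8)
mask += (mask == 0) * (binary * class_index)            # write the class index only where still background
print(int(mask.sum()), int(mask.max()))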

MapillaryDataset

Bases: SegmentationDataSet

Mapillary Vistas is a large-scale urban street-view dataset. It contains 18k, 2k, and 5k images for training, validation and testing, with a variety of image resolutions ranging from 1024 × 768 to 4000 × 6000.

Paper: "Gerhard Neuhold, Tobias Ollmann, Samuel Rota Bulò, and Peter Kontschieder. The mapillary vistas dataset for semantic understanding of street scenes. In CVPR, 2017." https://openaccess.thecvf.com/content_ICCV_2017/papers/Neuhold_The_Mapillary_Vistas_ICCV_2017_paper.pdf

Official site: https://www.mapillary.com/ (register for free, then download the Vistas dataset)
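A minimal construction sketch, assuming the Vistas layout documented in the source below and the v1.2 labels (paths are placeholders):

from super_gradients.training.datasets.segmentation_datasets.mapillary_dataset import MapillaryDataset

train_set = MapillaryDataset(
    root_dir="/data/mapillary_vistas",
    config_file="config_v1.2.json",
    samples_sub_directory="training/images",
    targets_sub_directory="training/v1.2/labels",
)
print(len(train_set.classes))  # 65 categorical class names for v1.2 (the ignore label is excluded)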

Source code in src/super_gradients/training/datasets/segmentation_datasets/mapillary_dataset.py
@register_dataset(Datasets.MAPILLARY_DATASET)
class MapillaryDataset(SegmentationDataSet):
    """
    Mapillary Vistas is a large-scale urban street-view dataset.
    This dataset contains 18k, 2k, and 5k images for training, validation and testing with a variety of image
    resolutions, ranging from 1024 × 768 to 4000 × 6000.
    Paper:
        "Gerhard Neuhold, Tobias Ollmann, Samuel Rota Bulò, and Peter Kontschieder. The mapillary vistas dataset for
         semantic understanding of street scenes. In CVPR, 2017."
         https://openaccess.thecvf.com/content_ICCV_2017/papers/Neuhold_The_Mapillary_Vistas_ICCV_2017_paper.pdf
    Official site:
        https://www.mapillary.com/ (register for free, then download Vistas dataset)
    """

    """
        Dataset layout:
            root_dir
            ├── config_v1.2.json
            ├── config_v2.0.json
            ├── training
                ├── images
                    ├── {image_name}.jpg            # RGB images
                ├── v1.2
                    ├── labels
                        ├── {image_name}.jpg        # Target masks
                ├── v2.0
                    ├── labels
                        ├── {image_name}.jpg        # Target masks
            ├── validation
            ├── testing
        Note that there are two versions currently available for this dataset, `v1.2` and `v2.0`, the difference according
        to the change log is as follows:
            - Expanded the set of labels to 124 classes (70 instance-specific, 46 stuff, 8 void or crowd).
            - Added raw polygonal annotations as json files. These reflect the ordering in which the segments where
                annotated by the original annotators, i.e. approximately from the background towards the camera.
        The common practice is to use the 65 categorical labels from v1.2 and older.
    """

    IGNORE_LABEL_V1_2 = 65
    IGNORE_LABEL_V2_0 = 123

    def __init__(
        self,
        root_dir: str,
        config_file: str,
        samples_sub_directory: str,
        targets_sub_directory: str,
        sample_extension: str = ".jpg",
        target_extension: str = ".png",
        **kwargs,
    ):
        self.samples_sub_directory = samples_sub_directory
        self.targets_sub_directory = targets_sub_directory
        self.target_extension = target_extension
        self.sample_extension = sample_extension
        # FIXME - Must pass list_file, due to double inheritance error when using DirectoryDataset. See the bug report
        super().__init__(
            root=root_dir,
            samples_sub_directory=samples_sub_directory,
            targets_sub_directory=targets_sub_directory,
            list_file="",
            target_extension=target_extension,
            **kwargs,
        )

        # read in config file
        with open(os.path.join(self.root, config_file), "r") as f:
            config = json.load(f)
        self.labels = config["labels"]
        self.label_colors = [label["color"] for label in self.labels]
        self.label_names = [label["readable"].replace(" ", "_") for label in self.labels]
        # Ignore labels is called `Unlabeled` in config files
        self.ignore_label = self.label_names.index("Unlabeled")
        # SG format requires returning classes as label names without ignore labels, it is also often used to calculate
        # the num of classes.
        self.classes = self.label_names[:-1]

    def _generate_samples_and_targets(self):
        samples_dir = os.path.join(self.root, self.samples_sub_directory)
        labels_dir = os.path.join(self.root, self.targets_sub_directory)

        sample_names = [n for n in sorted(os.listdir(samples_dir)) if n.endswith(self.sample_extension)]
        label_names = [n for n in sorted(os.listdir(labels_dir)) if n.endswith(self.target_extension)]

        assert len(sample_names) == len(label_names), f"Number of samples: {len(sample_names)}," f" doesn't match the number of labels {len(label_names)}"

        for sample_name in sample_names:
            label_path = os.path.join(labels_dir, sample_name.replace(self.sample_extension, self.target_extension))
            sample_path = os.path.join(samples_dir, sample_name)

            if os.path.exists(sample_path) and os.path.exists(label_path):
                self.samples_targets_tuples_list.append((sample_path, label_path))
            else:
                raise AssertionError(f"Sample and/or target file(s) not found or in illegal format " f"(sample path: {sample_path}, target path: {label_path})")

    def apply_color_map(self, target: Image) -> np.ndarray:
        """
        Convert a greyscale target PIL image to an RGB numpy array according to the official Mapillary color map.
        """
        target_array = np.array(target)
        rgb_array = np.zeros((target_array.shape[0], target_array.shape[1], 3), dtype=np.uint8)

        for label_id, color in enumerate(self.label_colors):
            # set all pixels with the current label to the color of the current label
            rgb_array[target_array == label_id] = color

        return rgb_array

apply_color_map(target)

Convert a greyscale target PIL image to an RGB numpy array according to the official Mapillary color map.

Source code in src/super_gradients/training/datasets/segmentation_datasets/mapillary_dataset.py
def apply_color_map(self, target: Image) -> np.ndarray:
    """
    Convert a greyscale target PIL image to an RGB numpy array according to the official Mapillary color map.
    """
    target_array = np.array(target)
    rgb_array = np.zeros((target_array.shape[0], target_array.shape[1], 3), dtype=np.uint8)

    for label_id, color in enumerate(self.label_colors):
        # set all pixels with the current label to the color of the current label
        rgb_array[target_array == label_id] = color

    return rgb_array
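A short usage sketch for apply_color_map, assuming train_set is a MapillaryDataset constructed as above with no transforms, so the target comes back as a greyscale PIL image of label ids:

from PIL import Image

_, target = train_set[0]
rgb = train_set.apply_color_map(target)  # (H, W, 3) uint8 numpy array
Image.fromarray(rgb).save("mapillary_colored_mask.png")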

PascalAUG2012SegmentationDataSet

Bases: PascalVOC2012SegmentationDataSet

Segmentation Data Set Class for Pascal AUG 2012 Data Set

- Download pascal AUG 2012 dataset:
    https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz

- Unzip and organize it as below:
    pascal_voc_2012
        └──VOCaug
            ├── aug.txt
            └── dataset
                  ├──inst
                  ├──img
                  └──cls

- Instantiate the dataset:
    >> train_set = PascalAUG2012SegmentationDataSet(
            root='.../pascal_voc_2012',
            list_file='VOCaug/dataset/aug.txt',
            samples_sub_directory='VOCaug/dataset/img',
            targets_sub_directory='VOCaug/dataset/cls',
            ...
        )

NOTE: this dataset is only available for training. To test, please use PascalVOC2012SegmentationDataSet.

Source code in src/super_gradients/training/datasets/segmentation_datasets/pascal_voc_segmentation.py
@register_dataset(Datasets.PASCAL_AUG_2012_SEGMENTATION_DATASET)
class PascalAUG2012SegmentationDataSet(PascalVOC2012SegmentationDataSet):
    """
    Segmentation Data Set Class for Pascal AUG 2012 Data Set

        - Download pascal AUG 2012 dataset:
            https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz

        - Unzip and organize it as below:
            pascal_voc_2012
                └──VOCaug
                    ├── aug.txt
                    └── dataset
                          ├──inst
                          ├──img
                          └──cls

        - Instantiate the dataset:
            >> train_set = PascalAUG2012SegmentationDataSet(
                    root='.../pascal_voc_2012',
                    list_file='VOCaug/dataset/aug.txt',
                    samples_sub_directory='VOCaug/dataset/img',
                    targets_sub_directory='VOCaug/dataset/cls',
                    ...
                )

    NOTE: this dataset is only available for training. To test, please use PascalVOC2012SegmentationDataSet.
    """

    def __init__(self, *args, **kwargs):
        self.sample_suffix = ".jpg"
        self.target_suffix = ".mat"
        super().__init__(sample_suffix=self.sample_suffix, target_suffix=self.target_suffix, *args, **kwargs)

    @staticmethod
    def target_loader(target_path: str) -> Image:
        """
        target_loader
            :param target_path: The path to the target data
            :return:            The loaded target
        """
        mat = scipy.io.loadmat(target_path, mat_dtype=True, squeeze_me=True, struct_as_record=False)
        mask = mat["GTcls"].Segmentation
        return Image.fromarray(mask)

target_loader(target_path) staticmethod

target_loader :param target_path: The path to the target data :return: The loaded target

Source code in src/super_gradients/training/datasets/segmentation_datasets/pascal_voc_segmentation.py
@staticmethod
def target_loader(target_path: str) -> Image:
    """
    target_loader
        :param target_path: The path to the target data
        :return:            The loaded target
    """
    mat = scipy.io.loadmat(target_path, mat_dtype=True, squeeze_me=True, struct_as_record=False)
    mask = mat["GTcls"].Segmentation
    return Image.fromarray(mask)

PascalVOC2012SegmentationDataSet

Bases: SegmentationDataSet

Segmentation Data Set Class for Pascal VOC 2012 Data Set.

To use this Dataset you need to:

- Download pascal VOC 2012 dataset:
    http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar

- Unzip and organize it as below:
    pascal_voc_2012
        └──VOCdevkit
              └──VOC2012
                 ├──JPEGImages
                 ├──SegmentationClass
                 ├──ImageSets
                 │    ├──Segmentation
                 │    │   └── train.txt
                 │    ├──Main
                 │    ├──Action
                 │    └──Layout
                 ├──Annotations
                 └──SegmentationObject

- Instantiate the dataset:
    >> train_set = PascalVOC2012SegmentationDataSet(
            root='.../pascal_voc_2012',
            list_file='VOCdevkit/VOC2012/ImageSets/Segmentation/train.txt',
            samples_sub_directory='VOCdevkit/VOC2012/JPEGImages',
            targets_sub_directory='VOCdevkit/VOC2012/SegmentationClass',
            ...
        )
    >> valid_set = PascalVOC2012SegmentationDataSet(
            root='.../pascal_voc_2012',
            list_file='VOCdevkit/VOC2012/ImageSets/Segmentation/val.txt',
            samples_sub_directory='VOCdevkit/VOC2012/JPEGImages',
            targets_sub_directory='VOCdevkit/VOC2012/SegmentationClass',
            ...
        )
Source code in src/super_gradients/training/datasets/segmentation_datasets/pascal_voc_segmentation.py
@register_dataset(Datasets.PASCAL_VOC_2012_SEGMENTATION_DATASET)
class PascalVOC2012SegmentationDataSet(SegmentationDataSet):
    """
    Segmentation Data Set Class for Pascal VOC 2012 Data Set.

    To use this Dataset you need to:

        - Download pascal VOC 2012 dataset:
            http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar

        - Unzip and organize it as below:
            pascal_voc_2012
                └──VOCdevkit
                      └──VOC2012
                         ├──JPEGImages
                         ├──SegmentationClass
                         ├──ImageSets
                         │    ├──Segmentation
                         │    │   └── train.txt
                         │    ├──Main
                         │    ├──Action
                         │    └──Layout
                         ├──Annotations
                         └──SegmentationObject

        - Instantiate the dataset:
            >> train_set = PascalVOC2012SegmentationDataSet(
                    root='.../pascal_voc_2012',
                    list_file='VOCdevkit/VOC2012/ImageSets/Segmentation/train.txt',
                    samples_sub_directory='VOCdevkit/VOC2012/JPEGImages',
                    targets_sub_directory='VOCdevkit/VOC2012/SegmentationClass',
                    ...
                )
            >> valid_set = PascalVOC2012SegmentationDataSet(
                    root='.../pascal_voc_2012',
                    list_file='VOCdevkit/VOC2012/ImageSets/Segmentation/val.txt',
                    samples_sub_directory='VOCdevkit/VOC2012/JPEGImages',
                    targets_sub_directory='VOCdevkit/VOC2012/SegmentationClass',
                    ...
                )
    """

    IGNORE_LABEL = 21
    _ORIGINAL_IGNORE_LABEL = 255

    def __init__(self, sample_suffix=None, target_suffix=None, *args, **kwargs):
        self.sample_suffix = ".jpg" if sample_suffix is None else sample_suffix
        self.target_suffix = ".png" if target_suffix is None else target_suffix
        super().__init__(*args, **kwargs)

        self.classes = PASCAL_VOC_2012_CLASSES

    def __getitem__(self, index):
        sample, target = super(PascalVOC2012SegmentationDataSet, self).__getitem__(index)
        target[target == PascalVOC2012SegmentationDataSet._ORIGINAL_IGNORE_LABEL] = PascalVOC2012SegmentationDataSet.IGNORE_LABEL
        return sample, target

    def decode_segmentation_mask(self, label_mask: np.ndarray):
        """
        decode_segmentation_mask - Decodes the colors for the Segmentation Mask
            :param: label_mask:  an (M,N) array of integer values denoting
                                the class label at each spatial location.
        :return:
        """
        label_colours = self._get_pascal_labels()
        r = label_mask.copy()
        g = label_mask.copy()
        b = label_mask.copy()

        num_classes_to_plot = len(self.classes)
        for ll in range(0, num_classes_to_plot):
            r[label_mask == ll] = label_colours[ll, 0]
            g[label_mask == ll] = label_colours[ll, 1]
            b[label_mask == ll] = label_colours[ll, 2]
        rgb = np.zeros((label_mask.shape[0], label_mask.shape[1], 3))
        rgb[:, :, 0] = r / 255.0
        rgb[:, :, 1] = g / 255.0
        rgb[:, :, 2] = b / 255.0

        return rgb

    def _generate_samples_and_targets(self):
        """
        _generate_samples_and_targets
        """
        # GENERATE SAMPLES AND TARGETS HERE SPECIFICALLY FOR PASCAL VOC 2012
        with open(self.root + os.path.sep + self.list_file_path, "r", encoding="utf-8") as lines:
            for line in lines:
                image_path = os.path.join(self.root, self.samples_sub_directory, line.rstrip("\n") + self.sample_suffix)
                mask_path = os.path.join(self.root, self.targets_sub_directory, line.rstrip("\n") + self.target_suffix)

                if os.path.exists(mask_path) and os.path.exists(image_path):
                    self.samples_targets_tuples_list.append((image_path, mask_path))

        # GENERATE SAMPLES AND TARGETS OF THE SEGMENTATION DATA SET CLASS
        super()._generate_samples_and_targets()

    def _get_pascal_labels(self) -> np.ndarray:
        """Load the mapping that associates pascal classes with label colors
        :return: np.ndarray with dimensions (21, 3)
        """
        return np.asarray(
            [
                [0, 0, 0],
                [128, 0, 0],
                [0, 128, 0],
                [128, 128, 0],
                [0, 0, 128],
                [128, 0, 128],
                [0, 128, 128],
                [128, 128, 128],
                [64, 0, 0],
                [192, 0, 0],
                [64, 128, 0],
                [192, 128, 0],
                [64, 0, 128],
                [192, 0, 128],
                [64, 128, 128],
                [192, 128, 128],
                [0, 64, 0],
                [128, 64, 0],
                [0, 192, 0],
                [128, 192, 0],
                [0, 64, 128],
            ]
        )

    @property
    def _original_dataset_image_shape(self) -> Tuple[int, int]:
        """
        returns image shape when data set contains images of uniform shape.
        """
        return 512, 512

decode_segmentation_mask(label_mask)

decode_segmentation_mask - Decodes the colors for the Segmentation Mask :param: label_mask: an (M,N) array of integer values denoting the class label at each spatial location.

Source code in src/super_gradients/training/datasets/segmentation_datasets/pascal_voc_segmentation.py
def decode_segmentation_mask(self, label_mask: np.ndarray):
    """
    decode_segmentation_mask - Decodes the colors for the Segmentation Mask
        :param: label_mask:  an (M,N) array of integer values denoting
                            the class label at each spatial location.
    :return:
    """
    label_colours = self._get_pascal_labels()
    r = label_mask.copy()
    g = label_mask.copy()
    b = label_mask.copy()

    num_classes_to_plot = len(self.classes)
    for ll in range(0, num_classes_to_plot):
        r[label_mask == ll] = label_colours[ll, 0]
        g[label_mask == ll] = label_colours[ll, 1]
        b[label_mask == ll] = label_colours[ll, 2]
    rgb = np.zeros((label_mask.shape[0], label_mask.shape[1], 3))
    rgb[:, :, 0] = r / 255.0
    rgb[:, :, 1] = g / 255.0
    rgb[:, :, 2] = b / 255.0

    return rgb
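A usage sketch for decode_segmentation_mask, run on a synthetic mask so it does not depend on how the dataset transforms are configured (dataset is assumed to be a PascalVOC2012SegmentationDataSet instance, and the class index is illustrative):

import numpy as np
from PIL import Image

label_mask = np.zeros((100, 100), dtype=np.uint8)
label_mask[20:60, 20:60] = 15  # illustrative class index

rgb = dataset.decode_segmentation_mask(label_mask)  # (H, W, 3) floats in [0, 1]
Image.fromarray((rgb * 255).astype(np.uint8)).save("decoded_mask.png")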

PascalVOCAndAUGUnifiedDataset

Bases: ConcatDataset

Pascal VOC + AUG train dataset, aka the SBD dataset contributed in "Semantic contours from inverse detectors". This class implements the common usage of the SBD and PascalVOC datasets as a unified, augmented training set. The unified dataset includes a total of 10,582 samples and contains no duplicates of samples from the PascalVOC validation set.

To use this Dataset you need to:

- Download pascal datasets:
    VOC 2012: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
    AUG 2012: https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz

- Unzip and organize it as below:
    pascal_voc_2012
        ├─VOCdevkit
        │ └──VOC2012
        │    ├──JPEGImages
        │    ├──SegmentationClass
        │    ├──ImageSets
        │    │    ├──Segmentation
        │    │    │   └── train.txt
        │    │    ├──Main
        │    │    ├──Action
        │    │    └──Layout
        │    ├──Annotations
        │    └──SegmentationObject
        └──VOCaug
            ├── aug.txt
            └── dataset
                  ├──inst
                  ├──img
                  └──cls

- Instantiate the dataset:
    >> train_set = PascalVOCAndAUGUnifiedDataset(root='.../pascal_voc_2012', ...)

NOTE: this dataset is only available for training. To test, please use PascalVOC2012SegmentationDataSet.
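A minimal construction sketch. Per the source below, list_file, samples_sub_directory and targets_sub_directory are popped from the kwargs (their values are predefined internally), so they are passed explicitly as None here; the remaining kwargs, such as root, are forwarded to the two underlying datasets:

from super_gradients.training.datasets.segmentation_datasets.pascal_voc_segmentation import PascalVOCAndAUGUnifiedDataset

train_set = PascalVOCAndAUGUnifiedDataset(
    root="/data/pascal_voc_2012",
    list_file=None,
    samples_sub_directory=None,
    targets_sub_directory=None,
)
print(len(train_set))  # 10,582 unified training samples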

Source code in src/super_gradients/training/datasets/segmentation_datasets/pascal_voc_segmentation.py
@register_dataset(Datasets.PASCAL_VOC_AND_AUG_UNIFIED_DATASET)
class PascalVOCAndAUGUnifiedDataset(ConcatDataset):
    """
    Pascal VOC + AUG train dataset, aka `SBD` dataset contributed in "Semantic contours from inverse detectors".
    This is class implement the common usage of the SBD and PascalVOC datasets as a unified augmented trainset.
    The unified dataset includes a total of 10,582 samples and don't contains duplicate samples from the PascalVOC
    validation set.

    To use this Dataset you need to:

        - Download pascal datasets:
            VOC 2012: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
            AUG 2012: https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz

        - Unzip and organize it as below:
            pascal_voc_2012
                ├─VOCdevkit
                │ └──VOC2012
                │    ├──JPEGImages
                │    ├──SegmentationClass
                │    ├──ImageSets
                │    │    ├──Segmentation
                │    │    │   └── train.txt
                │    │    ├──Main
                │    │    ├──Action
                │    │    └──Layout
                │    ├──Annotations
                │    └──SegmentationObject
                └──VOCaug
                    ├── aug.txt
                    └── dataset
                          ├──inst
                          ├──img
                          └──cls

        - Instantiate the dataset:
            >> train_set = PascalVOCAndAUGUnifiedDataset(root='.../pascal_voc_2012', ...)

    NOTE: this dataset is only available for training. To test, please use PascalVOC2012SegmentationDataSet.
    """

    def __init__(self, **kwargs):
        print(kwargs)
        if any([kwargs.pop("list_file"), kwargs.pop("samples_sub_directory"), kwargs.pop("targets_sub_directory")]):
            logger.warning(
                "[list_file, samples_sub_directory, targets_sub_directory] arguments passed will not be used"
                " when passed to `PascalVOCAndAUGUnifiedDataset`. Those values are predefined for initiating"
                " the Pascal VOC + AUG training set."
            )
        super().__init__(
            datasets=[
                PascalVOC2012SegmentationDataSet(
                    list_file="VOCdevkit/VOC2012/ImageSets/Segmentation/train.txt",
                    samples_sub_directory="VOCdevkit/VOC2012/JPEGImages",
                    targets_sub_directory="VOCdevkit/VOC2012/SegmentationClass",
                    **kwargs,
                ),
                PascalAUG2012SegmentationDataSet(
                    list_file="VOCaug/dataset/aug.txt", samples_sub_directory="VOCaug/dataset/img", targets_sub_directory="VOCaug/dataset/cls", **kwargs
                ),
            ]
        )

SegmentationDataSet

Bases: DirectoryDataSet, ListDataset, HasPreprocessingParams

Source code in src/super_gradients/training/datasets/segmentation_datasets/segmentation_dataset.py
@register_dataset(Datasets.SEGMENTATION_DATASET)
class SegmentationDataSet(DirectoryDataSet, ListDataset, HasPreprocessingParams):
    @resolve_param("transforms", factory=TransformsFactory())
    def __init__(
        self,
        root: str,
        list_file: str = None,
        samples_sub_directory: str = None,
        targets_sub_directory: str = None,
        cache_labels: bool = False,
        cache_images: bool = False,
        collate_fn: Callable = None,
        target_extension: str = ".png",
        transforms: Iterable = None,
    ):
        """
        SegmentationDataSet
            :param root:                        Root folder of the Data Set
            :param list_file:                   Path to the file with the samples list
            :param samples_sub_directory:       name of the samples sub-directory
            :param targets_sub_directory:       name of the targets sub-directory
            :param cache_labels:                "Caches" the labels -> Pre-Loads to memory as a list
            :param cache_images:                "Caches" the images -> Pre-Loads to memory as a list
            :param collate_fn:                  collate_fn func to process batches for the Data Loader
            :param target_extension:            file extension of the targets (default is .png for PASCAL VOC 2012)
            :param transforms:                  transforms to be applied on image and mask

        """
        self.samples_sub_directory = samples_sub_directory
        self.targets_sub_directory = targets_sub_directory
        self.cache_labels = cache_labels
        self.cache_images = cache_images

        # CREATE A DIRECTORY DATASET OR A LIST DATASET BASED ON THE list_file INPUT VARIABLE
        if list_file is not None:
            ListDataset.__init__(
                self,
                root=root,
                file=list_file,
                target_extension=target_extension,
                sample_loader=self.sample_loader,
                target_loader=self.target_loader,
                collate_fn=collate_fn,
            )
        else:
            DirectoryDataSet.__init__(
                self,
                root=root,
                samples_sub_directory=samples_sub_directory,
                targets_sub_directory=targets_sub_directory,
                target_extension=target_extension,
                sample_loader=self.sample_loader,
                target_loader=self.target_loader,
                collate_fn=collate_fn,
            )

        self.transforms = transforms if transforms else []

    def __getitem__(self, index):
        sample_path, target_path = self.samples_targets_tuples_list[index]

        # TRY TO LOAD THE CACHED IMAGE FIRST
        if self.cache_images:
            sample = self.imgs[index]
        else:
            sample = self.sample_loader(sample_path)

        # TRY TO LOAD THE CACHED LABEL FIRST
        if self.cache_labels:
            target = self.labels[index]
        else:
            target = self.target_loader(target_path)

        # MAKE SURE THE TRANSFORM WORKS ON BOTH IMAGE AND MASK TO ALIGN THE AUGMENTATIONS
        sample, target = self._transform_image_and_mask(sample, target)
        return sample, target

    @staticmethod
    def sample_loader(sample_path: str) -> Image:
        """
        sample_loader - Loads a dataset image from path using PIL
            :param sample_path: The path to the sample image
            :return:            The loaded Image
        """
        image = Image.open(sample_path).convert("RGB")
        return image

    @staticmethod
    def target_loader(target_path: str) -> Image:
        """
        target_loader
            :param target_path: The path to the sample image
            :return:            The loaded Image
        """
        target = Image.open(target_path)
        return target

    def _generate_samples_and_targets(self):
        """
        _generate_samples_and_targets
        """
        # IF THE DERIVED CLASS DID NOT IMPLEMENT AN EXPLICIT _generate_samples_and_targets CHILD METHOD
        if not self.samples_targets_tuples_list:
            super()._generate_samples_and_targets()

        # EXTRACT THE LABELS FROM THE TUPLES LIST
        image_files, label_files = map(list, zip(*self.samples_targets_tuples_list))
        image_indices_to_remove = []

        # CACHE IMAGES INTO MEMORY FOR FASTER TRAINING (WARNING: LARGE DATASETS MAY EXCEED SYSTEM RAM)
        if self.cache_images:
            # CREATE AN EMPTY LIST FOR THE LABELS
            self.imgs = len(self) * [None]
            cached_images_mem_in_gb = 0.0
            with tqdm(image_files, desc="Caching images") as pbar:
                for i, img_path in enumerate(pbar):
                    img = self.sample_loader(img_path)
                    if img is None:
                        image_indices_to_remove.append(i)

                    cached_images_mem_in_gb += os.path.getsize(image_files[i]) / 1024.0**3.0

                    self.imgs[i] = img
                    pbar.desc = "Caching images (%.1fGB)" % (cached_images_mem_in_gb)
            self.img_files = [e for i, e in enumerate(image_files) if i not in image_indices_to_remove]
            self.imgs = [e for i, e in enumerate(self.imgs) if i not in image_indices_to_remove]

        # CACHE LABELS INTO MEMORY FOR FASTER TRAINING - RELEVANT FOR EFFICIENT VALIDATION RUNS DURING TRAINING
        if self.cache_labels:
            # CREATE AN EMPTY LIST FOR THE LABELS
            self.labels = len(self) * [None]
            with tqdm(label_files, desc="Caching labels") as pbar:
                missing_labels, found_labels, duplicate_labels = 0, 0, 0

                for i, file in enumerate(pbar):
                    labels = self.target_loader(file)

                    if labels is None:
                        missing_labels += 1
                        image_indices_to_remove.append(i)
                        continue

                    self.labels[i] = labels
                    found_labels += 1

                    pbar.desc = "Caching labels (%g found, %g missing, %g duplicate, for %g images)" % (
                        found_labels,
                        missing_labels,
                        duplicate_labels,
                        len(image_files),
                    )
            assert found_labels > 0, "No labels found."

            #  REMOVE THE IRRELEVANT ENTRIES FROM THE DATA
            self.label_files = [e for i, e in enumerate(label_files) if i not in image_indices_to_remove]
            self.labels = [e for i, e in enumerate(self.labels) if i not in image_indices_to_remove]

    def _transform_image_and_mask(self, image, mask) -> tuple:
        """
        :param image:           The input image
        :param mask:            The input mask
        :return:                The transformed image, mask
        """
        sample = SegmentationSample(image=image, mask=mask)
        for t in self.transforms:
            sample = t.apply_to_sample(sample)
        return sample.image, sample.mask

    @property
    def _original_dataset_image_shape(self) -> Optional[Tuple[int, int]]:
        """
        Image default shape - (H,W)
        Default shape (model's input) should be defined for additional processing that might be needed
        when using "predict" any input-image/s can be used, the images should be rescaled to match the model's training-data shape
        """
        return None

    def get_dataset_preprocessing_params(self):
        """
        Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB).
         image_processor as returned as a list of dicts to be resolved by processing factory.
        :return:
        """
        pipeline = []

        if self._original_dataset_image_shape:
            pipeline += [{Processings.SegmentationResizeWithPadding: {"output_shape": self._original_dataset_image_shape, "pad_value": 0}}]
            # Resize image to same image-shape as model input. default shape should be defined in dataset class under "output_image_shape"

        for t in self.transforms:
            pipeline += t.get_equivalent_preprocessing()
        params = dict(class_names=self.classes, image_processor={Processings.ComposeProcessing: {"processings": pipeline}})
        return params

__init__(root, list_file=None, samples_sub_directory=None, targets_sub_directory=None, cache_labels=False, cache_images=False, collate_fn=None, target_extension='.png', transforms=None)

SegmentationDataSet :param root: Root folder of the Data Set :param list_file: Path to the file with the samples list :param samples_sub_directory: name of the samples sub-directory :param targets_sub_directory: name of the targets sub-directory :param cache_labels: "Caches" the labels -> Pre-Loads to memory as a list :param cache_images: "Caches" the images -> Pre-Loads to memory as a list :param collate_fn: collate_fn func to process batches for the Data Loader :param target_extension: file extension of the targets (default is .png for PASCAL VOC 2012) :param transforms: transforms to be applied on image and mask
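The constructor dispatches between two modes, depending on whether list_file is given. A construction sketch for both, with placeholder paths and sub-directory names:

from super_gradients.training.datasets.segmentation_datasets.segmentation_dataset import SegmentationDataSet

# List mode: a list file under root enumerates the sample/target pairs.
list_ds = SegmentationDataSet(root="/data/my_dataset", list_file="lists/train.lst")

# Directory mode: samples and targets are matched across two sub-directories under root.
dir_ds = SegmentationDataSet(
    root="/data/my_dataset",
    samples_sub_directory="images",
    targets_sub_directory="masks",
    target_extension=".png",
)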

Source code in src/super_gradients/training/datasets/segmentation_datasets/segmentation_dataset.py
@resolve_param("transforms", factory=TransformsFactory())
def __init__(
    self,
    root: str,
    list_file: str = None,
    samples_sub_directory: str = None,
    targets_sub_directory: str = None,
    cache_labels: bool = False,
    cache_images: bool = False,
    collate_fn: Callable = None,
    target_extension: str = ".png",
    transforms: Iterable = None,
):
    """
    SegmentationDataSet
        :param root:                        Root folder of the Data Set
        :param list_file:                   Path to the file with the samples list
        :param samples_sub_directory:       name of the samples sub-directory
        :param targets_sub_directory:       name of the targets sub-directory
        :param cache_labels:                "Caches" the labels -> Pre-Loads to memory as a list
        :param cache_images:                "Caches" the images -> Pre-Loads to memory as a list
        :param collate_fn:                  collate_fn func to process batches for the Data Loader
        :param target_extension:            file extension of the targets (default is .png for PASCAL VOC 2012)
        :param transforms:                  transforms to be applied on image and mask

    """
    self.samples_sub_directory = samples_sub_directory
    self.targets_sub_directory = targets_sub_directory
    self.cache_labels = cache_labels
    self.cache_images = cache_images

    # CREATE A DIRECTORY DATASET OR A LIST DATASET BASED ON THE list_file INPUT VARIABLE
    if list_file is not None:
        ListDataset.__init__(
            self,
            root=root,
            file=list_file,
            target_extension=target_extension,
            sample_loader=self.sample_loader,
            target_loader=self.target_loader,
            collate_fn=collate_fn,
        )
    else:
        DirectoryDataSet.__init__(
            self,
            root=root,
            samples_sub_directory=samples_sub_directory,
            targets_sub_directory=targets_sub_directory,
            target_extension=target_extension,
            sample_loader=self.sample_loader,
            target_loader=self.target_loader,
            collate_fn=collate_fn,
        )

    self.transforms = transforms if transforms else []

get_dataset_preprocessing_params()

Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB). The image_processor is returned as a list of dicts to be resolved by the processing factory.
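The return value is a plain dict; a sketch of its expected shape for a dataset with a fixed original image shape (the class names and shape are illustrative, and the keys shown as strings here are Processings enum members in the source):

params = {
    "class_names": ["road", "sidewalk", "building"],  # taken from dataset.classes
    "image_processor": {
        "ComposeProcessing": {
            "processings": [
                {"SegmentationResizeWithPadding": {"output_shape": (1024, 2048), "pad_value": 0}},
                # ...followed by the preprocessing equivalents of the dataset transforms
            ]
        }
    },
}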

Source code in src/super_gradients/training/datasets/segmentation_datasets/segmentation_dataset.py
def get_dataset_preprocessing_params(self):
    """
    Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB).
     image_processor as returned as a list of dicts to be resolved by processing factory.
    :return:
    """
    pipeline = []

    if self._original_dataset_image_shape:
        pipeline += [{Processings.SegmentationResizeWithPadding: {"output_shape": self._original_dataset_image_shape, "pad_value": 0}}]
        # Resize image to same image-shape as model input. default shape should be defined in dataset class under "output_image_shape"

    for t in self.transforms:
        pipeline += t.get_equivalent_preprocessing()
    params = dict(class_names=self.classes, image_processor={Processings.ComposeProcessing: {"processings": pipeline}})
    return params

sample_loader(sample_path) staticmethod

sample_loader - Loads a dataset image from path using PIL :param sample_path: The path to the sample image :return: The loaded Image

Source code in src/super_gradients/training/datasets/segmentation_datasets/segmentation_dataset.py
@staticmethod
def sample_loader(sample_path: str) -> Image:
    """
    sample_loader - Loads a dataset image from path using PIL
        :param sample_path: The path to the sample image
        :return:            The loaded Image
    """
    image = Image.open(sample_path).convert("RGB")
    return image

target_loader(target_path) staticmethod

target_loader :param target_path: The path to the sample image :return: The loaded Image

Source code in src/super_gradients/training/datasets/segmentation_datasets/segmentation_dataset.py
@staticmethod
def target_loader(target_path: str) -> Image:
    """
    target_loader
        :param target_path: The path to the sample image
        :return:            The loaded Image
    """
    target = Image.open(target_path)
    return target

SuperviselyPersonsDataset

Bases: SegmentationDataSet

SuperviselyPersonsDataset - Segmentation Data Set Class for the Supervisely Persons Segmentation Data Set, main resolution of dataset: (600 x 800). This dataset is a subset of the original dataset (see below) and contains filtered samples.

For more details about the ORIGINAL dataset see: https://app.supervise.ly/ecosystem/projects/persons
For more details about the FILTERED dataset see: https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.3/contrib/PP-HumanSeg

To use this Dataset you need to:

- Download supervisely dataset:
    https://deci-pretrained-models.s3.amazonaws.com/supervisely-persons.zip

- Unzip:
    supervisely-persons
     ├──images
     │    ├──image-name.png
     │    └──...
     ├──images_600x800
     │    ├──image-name.png
     │    └──...
     ├──masks
     └──masks_600x800

- Instantiate the dataset:
    >> train_set = SuperviselyPersonsDataset(root_dir='.../supervisely-persons', list_file='train.csv', ...)
    >> valid_set = SuperviselyPersonsDataset(root_dir='.../supervisely-persons', list_file='val.csv', ...)
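The list file is a plain CSV whose rows hold <image_path>,<mask_path> relative to the dataset root, exactly as _generate_samples_and_targets parses it below. A small sketch of reading it directly (root is a placeholder):

import csv
import os

root = "/data/supervisely-persons"
with open(os.path.join(root, "train.csv"), newline="") as f:
    for image_rel, mask_rel in csv.reader(f):
        print(os.path.join(root, image_rel), os.path.join(root, mask_rel))
        break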
Source code in src/super_gradients/training/datasets/segmentation_datasets/supervisely_persons_segmentation.py
@register_dataset(Datasets.SUPERVISELY_PERSONS_DATASET)
class SuperviselyPersonsDataset(SegmentationDataSet):
    """
    SuperviselyPersonsDataset - Segmentation Data Set Class for Supervisely Persons Segmentation Data Set,
    main resolution of dataset: (600 x 800).
    This dataset is a subset of the original dataset (see below) and contains filtered samples
    For more details about the ORIGINAL dataset see:
        https://app.supervise.ly/ecosystem/projects/persons
    For more details about the FILTERED dataset see:
        https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.3/contrib/PP-HumanSeg

    To use this Dataset you need to:

        - Download supervisely dataset:
            https://deci-pretrained-models.s3.amazonaws.com/supervisely-persons.zip

        - Unzip:
            supervisely-persons
             ├──images
             │    ├──image-name.png
             │    └──...
             ├──images_600x800
             │    ├──image-name.png
             │    └──...
             ├──masks
             └──masks_600x800

        - Instantiate the dataset:
            >> train_set = SuperviselyPersonsDataset(root_dir='.../supervisely-persons', list_file='train.csv', ...)
            >> valid_set = SuperviselyPersonsDataset(root_dir='.../supervisely-persons', list_file='val.csv', ...)
    """

    CLASS_LABELS = {0: "background", 1: "person"}

    def __init__(self, root_dir: str, list_file: str, **kwargs):
        """
        :param root_dir:    root directory to dataset.
        :param list_file:   list file that contains names of images to load, line format: <image_path>,<mask_path>
        :param kwargs:      Any hyper params required for the dataset, i.e img_size, crop_size, etc...
        """

        super().__init__(root=root_dir, list_file=list_file, **kwargs)
        self.classes = ["person"]

    def _generate_samples_and_targets(self):
        with open(os.path.join(self.root, self.list_file_path), "r", encoding="utf-8") as file:
            reader = csv.reader(file)
            for row in reader:
                sample_path = os.path.join(self.root, row[0])
                target_path = os.path.join(self.root, row[1])
                if self._validate_file(sample_path) and self._validate_file(target_path) and os.path.exists(sample_path) and os.path.exists(target_path):
                    self.samples_targets_tuples_list.append((sample_path, target_path))
                else:
                    raise AssertionError(
                        f"Sample and/or target file(s) not found or in illegal format " f"(sample path: {sample_path}, target path: {target_path})"
                    )
        super(SuperviselyPersonsDataset, self)._generate_samples_and_targets()

__init__(root_dir, list_file, **kwargs)

Parameters:

Name Type Description Default
root_dir str

root directory to dataset.

required
list_file str

list file that contains names of images to load, line format: <image_path>,<mask_path>

required
kwargs

Any hyper params required for the dataset, i.e img_size, crop_size, etc...

{}
Source code in src/super_gradients/training/datasets/segmentation_datasets/supervisely_persons_segmentation.py, lines 43-51
def __init__(self, root_dir: str, list_file: str, **kwargs):
    """
    :param root_dir:    root directory to dataset.
    :param list_file:   list file that contains names of images to load, line format: <image_path>,<mask_path>
    :param kwargs:      Any hyper params required for the dataset, i.e img_size, crop_size, etc...
    """

    super().__init__(root=root_dir, list_file=list_file, **kwargs)
    self.classes = ["person"]

BaseSgVisionDataset

Bases: VisionDataset

BaseSgVisionDataset

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 13-107
class BaseSgVisionDataset(VisionDataset):
    """
    BaseSgVisionDataset
    """

    def __init__(
        self,
        root: str,
        sample_loader: Callable = default_loader,
        target_loader: Callable = None,
        collate_fn: Callable = None,
        valid_sample_extensions: tuple = IMG_EXTENSIONS,
        sample_transform: Callable = None,
        target_transform: Callable = None,
    ):
        """
        Ctor
            :param root:
            :param sample_loader:
            :param target_loader:
            :param collate_fn:
            :param valid_sample_extensions:
            :param sample_transform:
            :param target_transform:
        """
        super().__init__(root=root, transform=sample_transform, target_transform=target_transform)
        self.samples_targets_tuples_list = list(tuple())
        self.classes = []
        self.valid_sample_extensions = valid_sample_extensions
        self.sample_loader = sample_loader
        self.target_loader = target_loader
        self._generate_samples_and_targets()

        # IF collate_fn IS PROVIDED IN CTOR WE ASSUME THERE IS A BASE-CLASS INHERITANCE W/O collate_fn IMPLEMENTATION
        if collate_fn is not None:
            self.collate_fn = collate_fn

    def __getitem__(self, item):
        """

        :param item:
        :return:
        """
        raise NotImplementedError

    def __len__(self):
        """

        :return:
        """
        return len(self.samples_targets_tuples_list)

    def _generate_samples_and_targets(self):
        """
        _generate_samples_and_targets - An abstract method that fills the samples and targets members of the class
        """
        raise NotImplementedError

    def _validate_file(self, filename: str) -> bool:
        """
        validate_file
            :param filename:
            :return:
        """
        for valid_extension in self.valid_sample_extensions:
            if filename.lower().endswith(valid_extension):
                return True

        return False

    @staticmethod
    def numpy_loader_func(path):
        """
        _numpy_loader_func - Uses numpy load func
            :param path:
            :return:
        """
        return np.load(path)

    @staticmethod
    def text_file_loader_func(text_file_path: str, inline_splitter: str = " ") -> list:
        """
        text_file_loader_func - Uses a line by line based code to get vectorized data from a text-based file
            :param text_file_path:  Input text file
            :param inline_splitter: The char to use in order to separate between different VALUES of the SAME vector
                                    please notice that DIFFERENT VECTORS SHOULD BE IN SEPARATE LINES ('\n') SEPARATED
            :return: a list of tuples, where each tuple is a vector of target values
        """
        if not os.path.isfile(text_file_path):
            raise ValueError(" Error in text file path")

        with open(text_file_path, "r", encoding="utf-8") as text_file:
            targets_list = [tuple(map(float, line.split(inline_splitter))) for line in text_file]

        return targets_list
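
Because `_generate_samples_and_targets` and `__getitem__` are left abstract, subclasses must supply both. A minimal, hypothetical subclass sketch follows (the `FlatFolderDataset` name and its pairing convention are illustrative assumptions, and the import path simply mirrors the source location shown above):

import os

from super_gradients.training.datasets.sg_dataset import BaseSgVisionDataset


class FlatFolderDataset(BaseSgVisionDataset):
    """Hypothetical dataset: every image under `root` is paired with a same-named .npy target."""

    def _generate_samples_and_targets(self):
        # Pair each valid image with a sibling .npy file carrying its target.
        for name in os.listdir(self.root):
            sample_path = os.path.join(self.root, name)
            target_path = os.path.splitext(sample_path)[0] + ".npy"
            if self._validate_file(sample_path) and os.path.exists(target_path):
                self.samples_targets_tuples_list.append((sample_path, target_path))

    def __getitem__(self, item):
        sample_path, target_path = self.samples_targets_tuples_list[item]
        sample = self.sample_loader(sample_path)      # default_loader -> PIL image
        target = self.numpy_loader_func(target_path)  # np.load on the paired file
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target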

__getitem__(item)

Parameters:

Name Type Description Default
item required

Returns:

Type Description
Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 50-56
def __getitem__(self, item):
    """

    :param item:
    :return:
    """
    raise NotImplementedError

__init__(root, sample_loader=default_loader, target_loader=None, collate_fn=None, valid_sample_extensions=IMG_EXTENSIONS, sample_transform=None, target_transform=None)

Ctor :param root: :param sample_loader: :param target_loader: :param collate_fn: :param valid_sample_extensions: :param sample_transform: :param target_transform:

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 18-48
def __init__(
    self,
    root: str,
    sample_loader: Callable = default_loader,
    target_loader: Callable = None,
    collate_fn: Callable = None,
    valid_sample_extensions: tuple = IMG_EXTENSIONS,
    sample_transform: Callable = None,
    target_transform: Callable = None,
):
    """
    Ctor
        :param root:
        :param sample_loader:
        :param target_loader:
        :param collate_fn:
        :param valid_sample_extensions:
        :param sample_transform:
        :param target_transform:
    """
    super().__init__(root=root, transform=sample_transform, target_transform=target_transform)
    self.samples_targets_tuples_list = list(tuple())
    self.classes = []
    self.valid_sample_extensions = valid_sample_extensions
    self.sample_loader = sample_loader
    self.target_loader = target_loader
    self._generate_samples_and_targets()

    # IF collate_fn IS PROVIDED IN CTOR WE ASSUME THERE IS A BASE-CLASS INHERITANCE W/O collate_fn IMPLEMENTATION
    if collate_fn is not None:
        self.collate_fn = collate_fn

__len__()

Returns:

Type Description
Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 58-63
def __len__(self):
    """

    :return:
    """
    return len(self.samples_targets_tuples_list)

numpy_loader_func(path) staticmethod

_numpy_loader_func - Uses numpy load func :param path: :return:

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 83-90
@staticmethod
def numpy_loader_func(path):
    """
    _numpy_loader_func - Uses numpy load func
        :param path:
        :return:
    """
    return np.load(path)

text_file_loader_func(text_file_path, inline_splitter=' ') staticmethod

text_file_loader_func - Uses a line-by-line approach to get vectorized data from a text-based file
    :param text_file_path:  Input text file
    :param inline_splitter: The char used to separate between different VALUES of the SAME vector;
                            note that DIFFERENT VECTORS should be placed on SEPARATE LINES (separated by '\n')
    :return: a list of tuples, where each tuple is a vector of target values

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 92-107
@staticmethod
def text_file_loader_func(text_file_path: str, inline_splitter: str = " ") -> list:
    """
    text_file_loader_func - Uses a line by line based code to get vectorized data from a text-based file
        :param text_file_path:  Input text file
        :param inline_splitter: The char to use in order to separate between different VALUES of the SAME vector
                                please notice that DIFFERENT VECTORS SHOULD BE IN SEPARATE LINES ('\n') SEPARATED
        :return: a list of tuples, where each tuple is a vector of target values
    """
    if not os.path.isfile(text_file_path):
        raise ValueError(" Error in text file path")

    with open(text_file_path, "r", encoding="utf-8") as text_file:
        targets_list = [tuple(map(float, line.split(inline_splitter))) for line in text_file]

    return targets_list
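
As a quick illustration (the file contents below are hypothetical), a whitespace-separated target file parses into one tuple of floats per line:

# targets.txt (hypothetical contents):
#   0.5 0.25 1.0
#   0.1 0.75 0.0
targets = BaseSgVisionDataset.text_file_loader_func("targets.txt")
print(targets)  # [(0.5, 0.25, 1.0), (0.1, 0.75, 0.0)]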

DirectoryDataSet

Bases: BaseSgVisionDataset

DirectoryDataSet - A PyTorch Vision Data Set extension that receives a root dir and two separate sub-directories:

- a sub-directory for samples
- a sub-directory for targets
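
A hedged sketch of the expected layout and a matching instantiation follows; the folder names, file names, and the .txt target extension are illustrative assumptions only.

# Hypothetical layout:
#   /data/my_dataset/
#       images/          <- samples_sub_directory
#           0001.jpg
#           0002.jpg
#       labels/          <- targets_sub_directory: same base names + target_extension
#           0001.txt
#           0002.txt
from super_gradients.training.datasets.sg_dataset import DirectoryDataSet

dataset = DirectoryDataSet(
    root="/data/my_dataset",
    samples_sub_directory="images",
    targets_sub_directory="labels",
    target_extension=".txt",  # with no target_loader given, targets are parsed by text_file_loader_func
)
sample, target = dataset[0]  # PIL image (default_loader) and a list of target tuples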

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 110-212
class DirectoryDataSet(BaseSgVisionDataset):
    """
    DirectoryDataSet - A PyTorch Vision Data Set extension that receives a root Dir and two separate sub directories:
                        - Sub-Directory for Samples
                        - Sub-Directory for Targets

    """

    def __init__(
        self,
        root: str,
        samples_sub_directory: str,
        targets_sub_directory: str,
        target_extension: str,
        sample_loader: Callable = default_loader,
        target_loader: Callable = None,
        collate_fn: Callable = None,
        sample_extensions: tuple = IMG_EXTENSIONS,
        sample_transform: Callable = None,
        target_transform: Callable = None,
    ):
        """
        CTOR
            :param root:                    root directory that contains all of the Data Set
            :param samples_sub_directory:   name of the samples sub-directory
            :param targets_sub_directory:   name of the targets sub-directory
            :param sample_extensions:       file extensions for samples
            :param target_extension:        file extension of the targets
            :param sample_loader:           Func to load samples
            :param target_loader:           Func to load targets
            :param collate_fn:              collate_fn func to process batches for the Data Loader
            :param sample_transform:        Func to pre-process samples for data loading
            :param target_transform:        Func to pre-process targets for data loading
        """

        # INITIALIZING THE TARGETS LOADER TO USE THE TEXT FILE LOADER FUNC
        if target_loader is None:
            target_loader = self.text_file_loader_func

        self.target_extension = target_extension
        self.samples_dir_suffix = samples_sub_directory
        self.targets_dir_suffix = targets_sub_directory

        super().__init__(
            root=root,
            sample_loader=sample_loader,
            target_loader=target_loader,
            collate_fn=collate_fn,
            valid_sample_extensions=sample_extensions,
            sample_transform=sample_transform,
            target_transform=target_transform,
        )

    def __getitem__(self, item):
        """
        getter method for iteration
            :param item:
            :return:
        """
        sample_path, target_path = self.samples_targets_tuples_list[item]
        sample = self.sample_loader(sample_path)
        target = self.target_loader(target_path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return sample, target

    def _generate_samples_and_targets(self):
        """
        _generate_samples_and_targets - Uses class built in members to generate the list of (SAMPLE, TARGET/S)
                                        that is saved in self.samples_targets_tuples_list
        """
        missing_sample_files, missing_target_files = 0, 0

        # VALIDATE DATA PATH
        samples_dir_path = self.root + os.path.sep + self.samples_dir_suffix
        targets_dir_path = self.root + os.path.sep + self.targets_dir_suffix

        if not os.path.exists(samples_dir_path) or not os.path.exists(targets_dir_path):
            raise ValueError(" Error in data path")

        # ITERATE OVER SAMPLES AND MAKE SURE THERE ARE MATCHING LABELS
        for sample_file_name in os.listdir(samples_dir_path):
            sample_file_path = samples_dir_path + os.path.sep + sample_file_name
            if os.path.isfile(sample_file_path) and self._validate_file(sample_file_path):
                sample_file_prefix = str(sample_file_name.split(".")[:-1][0])

                # TRY TO GET THE MATCHING LABEL
                matching_target_file_name = sample_file_prefix + self.target_extension
                target_file_path = targets_dir_path + os.path.sep + matching_target_file_name
                if os.path.isfile(target_file_path):
                    self.samples_targets_tuples_list.append((sample_file_path, target_file_path))

                else:
                    missing_target_files += 1
            else:
                missing_sample_files += 1

        for counter_name, missing_files_counter in [("samples", missing_sample_files), ("targets", missing_target_files)]:
            if missing_files_counter > 0:
                print(__name__ + " There are " + str(missing_files_counter) + " missing  " + counter_name)

__getitem__(item)

getter method for iteration :param item: :return:

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 163-177
def __getitem__(self, item):
    """
    getter method for iteration
        :param item:
        :return:
    """
    sample_path, target_path = self.samples_targets_tuples_list[item]
    sample = self.sample_loader(sample_path)
    target = self.target_loader(target_path)
    if self.transform is not None:
        sample = self.transform(sample)
    if self.target_transform is not None:
        target = self.target_transform(target)

    return sample, target

__init__(root, samples_sub_directory, targets_sub_directory, target_extension, sample_loader=default_loader, target_loader=None, collate_fn=None, sample_extensions=IMG_EXTENSIONS, sample_transform=None, target_transform=None)

CTOR
    :param root:                    root directory that contains all of the Data Set
    :param samples_sub_directory:   name of the samples sub-directory
    :param targets_sub_directory:   name of the targets sub-directory
    :param sample_extensions:       file extensions for samples
    :param target_extension:        file extension of the targets
    :param sample_loader:           Func to load samples
    :param target_loader:           Func to load targets
    :param collate_fn:              collate_fn func to process batches for the Data Loader
    :param sample_transform:        Func to pre-process samples for data loading
    :param target_transform:        Func to pre-process targets for data loading

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 118-161
def __init__(
    self,
    root: str,
    samples_sub_directory: str,
    targets_sub_directory: str,
    target_extension: str,
    sample_loader: Callable = default_loader,
    target_loader: Callable = None,
    collate_fn: Callable = None,
    sample_extensions: tuple = IMG_EXTENSIONS,
    sample_transform: Callable = None,
    target_transform: Callable = None,
):
    """
    CTOR
        :param root:                    root directory that contains all of the Data Set
        :param samples_sub_directory:   name of the samples sub-directory
        :param targets_sub_directory:   name of the targets sub-directory
        :param sample_extensions:       file extensions for samples
        :param target_extension:        file extension of the targets
        :param sample_loader:           Func to load samples
        :param target_loader:           Func to load targets
        :param collate_fn:              collate_fn func to process batches for the Data Loader
        :param sample_transform:        Func to pre-process samples for data loading
        :param target_transform:        Func to pre-process targets for data loading
    """

    # INITIALIZING THE TARGETS LOADER TO USE THE TEXT FILE LOADER FUNC
    if target_loader is None:
        target_loader = self.text_file_loader_func

    self.target_extension = target_extension
    self.samples_dir_suffix = samples_sub_directory
    self.targets_dir_suffix = targets_sub_directory

    super().__init__(
        root=root,
        sample_loader=sample_loader,
        target_loader=target_loader,
        collate_fn=collate_fn,
        valid_sample_extensions=sample_extensions,
        sample_transform=sample_transform,
        target_transform=target_transform,
    )

ListDataset

Bases: BaseSgVisionDataset

ListDataset - A PyTorch Vision Data Set extension that receives a file with the FULL PATH to each of the samples. The assumption is that for every sample there is a *matching target* in the same path but with a different extension, i.e.:

    for the sample paths (that appear in the list file):
        /root/dataset/class_x/sample1.png
        /root/dataset/class_y/sample123.png

    the matching label paths (that DO NOT appear in the list file):
        /root/dataset/class_x/sample1.ext
        /root/dataset/class_y/sample123.ext
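
A hedged sketch follows, assuming a one-column CSV list file whose entries are joined onto `root` by the implementation below, with a sibling `.npy` target next to every sample; all paths and file names are placeholders.

# /data/my_dataset/list.csv (hypothetical contents):
#   class_x/sample1.png
#   class_y/sample123.png
#
# expected sibling targets (NOT listed in the file):
#   /data/my_dataset/class_x/sample1.npy
#   /data/my_dataset/class_y/sample123.npy
from super_gradients.training.datasets.sg_dataset import ListDataset

dataset = ListDataset(root="/data/my_dataset", file="list.csv")
sample, target = dataset[0]  # target is the first element of the loaded .npy array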
Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 215-301
class ListDataset(BaseSgVisionDataset):
    """
    ListDataset - A PyTorch Vision Data Set extension that receives a file with FULL PATH to each of the samples.
                  Then, the assumption is that for every sample, there is a * matching target * in the same
                  path but with a different extension, i.e:
                        for the samples paths:  (That appear in the list file)
                                                    /root/dataset/class_x/sample1.png
                                                    /root/dataset/class_y/sample123.png

                        the matching labels paths:  (That DO NOT appear in the list file)
                                                    /root/dataset/class_x/sample1.ext
                                                    /root/dataset/class_y/sample123.ext
    """

    def __init__(
        self,
        root,
        file,
        sample_loader: Callable = default_loader,
        target_loader: Callable = None,
        collate_fn: Callable = None,
        sample_extensions: tuple = IMG_EXTENSIONS,
        sample_transform: Callable = None,
        target_transform: Callable = None,
        target_extension=".npy",
    ):
        """
        CTOR
            :param root:                    root directory that contains all of the Data Set
            :param file:                    Path to the file with the samples list
            :param sample_extensions:       file extension for samples
            :param target_extension:        file extension of the targets
            :param sample_loader:           Func to load samples
            :param target_loader:           Func to load targets
            :param collate_fn:              collate_fn func to process batches for the Data Loader
            :param sample_transform:        Func to pre-process samples for data loading
            :param target_transform:        Func to pre-process targets for data loading
        """

        if target_loader is None:
            target_loader = self.numpy_loader_func

        self.list_file_path = file
        self.loader = sample_loader
        self.target_loader = target_loader
        self.extensions = sample_extensions
        self.target_extension = target_extension

        super().__init__(
            root,
            sample_loader=sample_loader,
            target_loader=target_loader,
            collate_fn=collate_fn,
            sample_transform=sample_transform,
            valid_sample_extensions=sample_extensions,
            target_transform=target_transform,
        )

    def __getitem__(self, item: int) -> Tuple[Any, Any]:
        """
        :param item: Index
        :return: Tuple (sample, target) where target is class_index of the target class.
        """
        sample_path, target_path = self.samples_targets_tuples_list[item]
        sample = self.loader(sample_path)
        target = self.target_loader(target_path)[0]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return sample, target

    def _generate_samples_and_targets(self):
        """
        _generate_samples_and_targets
        """
        file = open(self.root + os.path.sep + self.list_file_path, "r", encoding="utf-8")

        reader = csv.reader(file)
        data = [row[0] for row in reader]

        for f in data:
            path = self.root + os.path.sep + f
            target_path = path[:-4] + self.target_extension
            if self._validate_file(path) and os.path.exists(target_path):
                self.samples_targets_tuples_list.append((path, target_path))

__getitem__(item)

Parameters:

Name Type Description Default
item int

Index

required

Returns:

Type Description
Tuple[Any, Any]

Tuple (sample, target) where target is class_index of the target class.

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 273-286
def __getitem__(self, item: int) -> Tuple[Any, Any]:
    """
    :param item: Index
    :return: Tuple (sample, target) where target is class_index of the target class.
    """
    sample_path, target_path = self.samples_targets_tuples_list[item]
    sample = self.loader(sample_path)
    target = self.target_loader(target_path)[0]
    if self.transform is not None:
        sample = self.transform(sample)
    if self.target_transform is not None:
        target = self.target_transform(target)

    return sample, target

__init__(root, file, sample_loader=default_loader, target_loader=None, collate_fn=None, sample_extensions=IMG_EXTENSIONS, sample_transform=None, target_transform=None, target_extension='.npy')

CTOR
    :param root:                    root directory that contains all of the Data Set
    :param file:                    Path to the file with the samples list
    :param sample_extensions:       file extension for samples
    :param target_extension:        file extension of the targets
    :param sample_loader:           Func to load samples
    :param target_loader:           Func to load targets
    :param collate_fn:              collate_fn func to process batches for the Data Loader
    :param sample_transform:        Func to pre-process samples for data loading
    :param target_transform:        Func to pre-process targets for data loading

Source code in src/super_gradients/training/datasets/sg_dataset.py, lines 229-271
def __init__(
    self,
    root,
    file,
    sample_loader: Callable = default_loader,
    target_loader: Callable = None,
    collate_fn: Callable = None,
    sample_extensions: tuple = IMG_EXTENSIONS,
    sample_transform: Callable = None,
    target_transform: Callable = None,
    target_extension=".npy",
):
    """
    CTOR
        :param root:                    root directory that contains all of the Data Set
        :param file:                    Path to the file with the samples list
        :param sample_extensions:       file extension for samples
        :param target_extension:        file extension of the targets
        :param sample_loader:           Func to load samples
        :param target_loader:           Func to load targets
        :param collate_fn:              collate_fn func to process batches for the Data Loader
        :param sample_transform:        Func to pre-process samples for data loading
        :param target_transform:        Func to pre-process targets for data loading
    """

    if target_loader is None:
        target_loader = self.numpy_loader_func

    self.list_file_path = file
    self.loader = sample_loader
    self.target_loader = target_loader
    self.extensions = sample_extensions
    self.target_extension = target_extension

    super().__init__(
        root,
        sample_loader=sample_loader,
        target_loader=target_loader,
        collate_fn=collate_fn,
        sample_transform=sample_transform,
        valid_sample_extensions=sample_extensions,
        target_transform=target_transform,
    )