Skip to content

Datasets

RandAugment RandAugment is a variant of AutoAugment which randomly selects transformations from AutoAugment to be applied on an image.

RandomAugmentation Implementation adapted from: https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py

Papers: RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719

AugmentOp

single auto augment operations

Source code in src/super_gradients/training/datasets/auto_augment.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
class AugmentOp:
    """
    A single auto-augment operation: applies one named transform to an image
    with probability `prob` and a (possibly noise-perturbed) magnitude.
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        hparams = hparams or _HPARAMS_DEFAULT
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        self.hparams = hparams.copy()
        # Per-op PIL kwargs: fill color and resample mode fall back to module defaults.
        self.kwargs = dict(
            fillcolor=hparams.get("img_mean", _FILL),
            resample=hparams.get("interpolation", _RANDOM_INTERPOLATION),
        )

        # A positive magnitude_std randomizes the magnitude on every call.
        self.magnitude_std = self.hparams.get("magnitude_std", 0)

    def __call__(self, img):
        # Probability gate: skip the op entirely with probability (1 - prob).
        if self.prob < 1.0 and random.random() > self.prob:
            return img
        level = self.magnitude
        if self.magnitude_std:
            if self.magnitude_std == float("inf"):
                # "infinite" std means: sample uniformly in [0, magnitude]
                level = random.uniform(0, level)
            elif self.magnitude_std > 0:
                level = random.gauss(level, self.magnitude_std)
        # clip to valid range
        level = min(_MAX_MAGNITUDE, max(0, level))
        args = () if self.level_fn is None else self.level_fn(level, self.hparams)
        return self.aug_fn(img, *args, **self.kwargs)

RandAugment

Random auto augment class, will select auto augment transforms according to probability weights for each op

Source code in src/super_gradients/training/datasets/auto_augment.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
class RandAugment:
    """
    Random auto-augment: on each call, samples `num_layers` ops from `ops`
    (optionally weighted by `choice_weights`) and applies them in sequence.
    """

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops = ops
        self.num_layers = num_layers
        self.choice_weights = choice_weights

    def __call__(self, img):
        # Sampling is WITHOUT replacement whenever explicit weights are given.
        with_replacement = self.choice_weights is None
        chosen = np.random.choice(self.ops, self.num_layers, replace=with_replacement, p=self.choice_weights)
        for transform in chosen:
            img = transform(img)
        return img

rand_augment_transform(config_str, crop_size, img_mean)

Create a RandAugment transform

Parameters:

Name Type Description Default
config_str

String defining configuration of random augmentation. Consists of multiple sections separated by dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining sections, not order specific, determine 'm' - integer magnitude of rand augment 'n' - integer num layers (number of transform ops selected per image) 'w' - integer probability weight index (index of a set of weights to influence choice of op) 'mstd' - float std deviation of magnitude noise applied 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

required
crop_size int

The size of crop image

required
img_mean List[float]

Average per channel

required

Returns:

Type Description

A PyTorch compatible Transform

Source code in src/super_gradients/training/datasets/auto_augment.py
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
@register_transform(Transforms.RandAugmentTransform)
def rand_augment_transform(config_str, crop_size: int, img_mean: List[float]):
    """
    Create a RandAugment transform

    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

    :param crop_size: The size of crop image
    :param img_mean:  Average per channel

    :return: A PyTorch compatible Transform
    """
    hparams = dict(translate_const=int(crop_size * 0.45), img_mean=tuple([min(255, round(255 * channel_mean)) for channel_mean in img_mean]))

    magnitude = _MAX_MAGNITUDE  # default to _MAX_MAGNITUDE for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS
    config = config_str.split("-")
    for c in config:
        # Split each section into a key prefix and its numeric tail, e.g. 'mstd0.5' -> ('mstd', '0.5').
        cs = re.split(r"(\d.*)", c)
        if len(cs) < 2:
            # section with no numeric value (e.g. the leading 'rand') - nothing to parse
            continue
        key, val = cs[:2]
        if key == "mstd":
            # noise param injected via hparams for now
            hparams.setdefault("magnitude_std", float(val))
        elif key == "inc":
            # BUGFIX: was `bool(val)`, which is True for the non-empty string "0",
            # so 'inc0' wrongly enabled increasing transforms. Parse as int instead.
            if int(val):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == "m":
            magnitude = int(val)
        elif key == "n":
            num_layers = int(val)
        elif key == "w":
            weight_idx = int(val)
        else:
            # Raise instead of `assert False` so the check survives `python -O`.
            raise ValueError(f"Unknown RandAugment config section: {key}")
    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms)
    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)

Cifar10

Bases: CIFAR10, HasPreprocessingParams

CIFAR10 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
@register_dataset(Datasets.CIFAR_10)
class Cifar10(CIFAR10, HasPreprocessingParams):
    """
    CIFAR10 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """

    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        # Backward compatibility: a bare list of transforms is wrapped with torchvision's
        # Compose. To be removed once torchvision/native transform handling is aligned
        # in the factories (i.e. stating Compose in configs).
        wrapped = Compose(transforms) if isinstance(transforms, list) else transforms

        super().__init__(
            root=root,
            train=train,
            transform=wrapped,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """
        processings = get_torchvision_transforms_equivalent_processing(self.transforms)
        return dict(
            image_processor={Processings.ComposeProcessing: {"processings": processings}},
            class_names=self.classes,
        )

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
48
49
50
51
52
53
54
55
56
57
58
59
60
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    # Translate the torchvision transforms attached to this dataset into the
    # equivalent list of SG processing steps.
    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

Cifar100

Bases: CIFAR100, HasPreprocessingParams

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@register_dataset(Datasets.CIFAR_100)
class Cifar100(CIFAR100, HasPreprocessingParams):
    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        root: str,
        train: bool = True,
        transforms: Union[list, dict] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:
        """
        CIFAR100 Dataset

        :param root:                    Path for the data to be extracted
        :param train:                   Bool to load training (True) or validation (False) part of the dataset
        :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
        :param target_transform:        Transform to apply to target output
        :param download:                Download (True) the dataset from source
        """
        # Backward compatibility: a bare list of transforms is wrapped with torchvision's
        # Compose. To be removed once torchvision/native transform handling is aligned
        # in the factories (i.e. stating Compose in configs).
        composed = Compose(transforms) if isinstance(transforms, list) else transforms

        super().__init__(
            root=root,
            train=train,
            transform=composed,
            target_transform=target_transform,
            download=download,
        )

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """
        processings = get_torchvision_transforms_equivalent_processing(self.transforms)
        return dict(
            image_processor={Processings.ComposeProcessing: {"processings": processings}},
            class_names=self.classes,
        )

__init__(root, train=True, transforms=None, target_transform=None, download=False)

CIFAR100 Dataset

Parameters:

Name Type Description Default
root str

Path for the data to be extracted

required
train bool

Bool to load training (True) or validation (False) part of the dataset

True
transforms Union[list, dict]

List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose

None
target_transform Optional[Callable]

Transform to apply to target output

None
download bool

Download (True) the dataset from source

False
Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
@resolve_param("transforms", TransformsFactory())
def __init__(
    self,
    root: str,
    train: bool = True,
    transforms: Union[list, dict] = None,
    target_transform: Optional[Callable] = None,
    download: bool = False,
) -> None:
    """
    CIFAR100 Dataset

    :param root:                    Path for the data to be extracted
    :param train:                   Bool to load training (True) or validation (False) part of the dataset
    :param transforms:              List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
    :param target_transform:        Transform to apply to target output
    :param download:                Download (True) the dataset from source
    """
    # Backward compatibility: a bare list of transforms is wrapped with torchvision's
    # Compose. To be removed once torchvision/native transform handling is aligned
    # in the factories (i.e. stating Compose in configs).
    if isinstance(transforms, list):
        transforms = Compose(transforms)

    super(Cifar100, self).__init__(
        root=root,
        train=train,
        transform=transforms,
        target_transform=target_transform,
        download=download,
    )

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/cifar.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    # Translate the torchvision transforms attached to this dataset into the
    # equivalent list of SG processing steps.
    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

ImageNetDataset

Bases: torch_datasets.ImageFolder, HasPreprocessingParams

ImageNetDataset dataset.

To use this Dataset you need to:

  • Download imagenet dataset (https://image-net.org/download.php) Imagenet ├──train │ ├──n02093991 │ │ ├──n02093991_1001.JPEG │ │ ├──n02093991_1004.JPEG │ │ └──... │ ├──n02093992 │ └──... └──val ├──n02093991 ├──n02093992 └──...

  • Instantiate the dataset: >> train_set = ImageNetDataset(root='.../Imagenet/train', ...) >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)

Source code in src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@register_dataset(Datasets.IMAGENET_DATASET)
class ImageNetDataset(torch_datasets.ImageFolder, HasPreprocessingParams):
    """ImageNetDataset dataset.

    To use this Dataset you need to:

    - Download imagenet dataset (https://image-net.org/download.php)
        Imagenet
         ├──train
         │  ├──n02093991
         │  │   ├──n02093991_1001.JPEG
         │  │   ├──n02093991_1004.JPEG
         │  │   └──...
         │  ├──n02093992
         │  └──...
         └──val
            ├──n02093991
            ├──n02093992
            └──...

    - Instantiate the dataset:
        >> train_set = ImageNetDataset(root='.../Imagenet/train', ...)
        >> valid_set = ImageNetDataset(root='.../Imagenet/val', ...)
    """

    @resolve_param("transforms", factory=TransformsFactory())
    def __init__(self, root: str, transforms: Union[list, dict] = None, *args, **kwargs):
        """
        :param root:        Root directory of the split (e.g. '.../Imagenet/train')
        :param transforms:  List of transforms to apply sequentially on sample. Wrapped internally with torchvision.Compose
        """
        # BUGFIX: the default used to be the mutable `[]`; use a None sentinel and
        # reproduce the historical default explicitly (an empty list is wrapped below).
        if transforms is None:
            transforms = []
        # TO KEEP BACKWARD COMPATABILITY, WILL BE REMOVED IN THE FUTURE ONCE WE ALLIGN TORCHVISION/NATIVE TRANSFORMS
        # TREATMENT IN FACTORIES (I.E STATING COMPOSE IN CONFIGS)
        if isinstance(transforms, list):
            transforms = Compose(transforms)
        super(ImageNetDataset, self).__init__(root, transform=transforms, *args, **kwargs)

    def get_dataset_preprocessing_params(self) -> Dict:
        """
        Get the preprocessing params for the dataset.
        It infers preprocessing params from transforms used in the dataset & class names
        :return: (dict) Preprocessing params
        """

        pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
        params = dict(
            image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
            class_names=self.classes,
        )
        return params

get_dataset_preprocessing_params()

Get the preprocessing params for the dataset. It infers preprocessing params from transforms used in the dataset & class names

Returns:

Type Description
Dict

(dict) Preprocessing params

Source code in src/super_gradients/training/datasets/classification_datasets/imagenet_dataset.py
47
48
49
50
51
52
53
54
55
56
57
58
59
def get_dataset_preprocessing_params(self) -> Dict:
    """
    Get the preprocessing params for the dataset.
    It infers preprocessing params from transforms used in the dataset & class names
    :return: (dict) Preprocessing params
    """

    # Translate the torchvision transforms attached to this dataset into the
    # equivalent list of SG processing steps.
    pipeline = get_torchvision_transforms_equivalent_processing(self.transforms)
    params = dict(
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        class_names=self.classes,
    )
    return params

get_torchvision_transforms_equivalent_processing(transforms)

Get the equivalent processing pipeline for torchvision transforms.

Returns:

Type Description
List[Dict[str, Any]]

List of Processings operations

Source code in src/super_gradients/training/datasets/classification_datasets/torchvision_utils.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def get_torchvision_transforms_equivalent_processing(transforms: List[Any]) -> List[Dict[str, Any]]:
    """
    Get the equivalent processing pipeline for torchvision transforms.

    Supported transforms: ToTensor, Normalize, Resize, CenterCrop. Any other
    transform raises ValueError.

    :param transforms: Transforms attached to the dataset (a raw list, a Compose,
                       or a torchvision StandardTransform wrapper).
    :return: List of Processings operations
    """
    # Since we are using cv2.imread to read images, our model in fact is trained on BGR images.
    # In our pipelines the convention that input images are RGB, so we need to reverse the channels to get BGR
    # to match with the expected input of the model.
    pipeline = []

    # Unwrap torchvision's StandardTransform / Compose containers to get the plain list.
    if isinstance(transforms, StandardTransform):
        transforms = transforms.transform

    if isinstance(transforms, Compose):
        transforms = transforms.transforms

    for transform in transforms:
        if isinstance(transform, ToTensor):
            # ToTensor scales uint8 [0, 255] to float [0, 1]; StandardizeImage mirrors that.
            pipeline.append({Processings.StandardizeImage: {"max_value": 255}})
        elif isinstance(transform, Normalize):
            pipeline.append({Processings.NormalizeImage: {"mean": tuple(map(float, transform.mean)), "std": tuple(map(float, transform.std))}})
        elif isinstance(transform, Resize):
            # NOTE(review): int(transform.size) assumes a scalar size; a tuple-valued
            # Resize size would raise TypeError here rather than the ValueError below — confirm intended.
            pipeline.append({Processings.Resize: {"size": int(transform.size)}})
        elif isinstance(transform, CenterCrop):
            pipeline.append({Processings.CenterCrop: {"size": int(transform.size)}})
        else:
            raise ValueError(f"Unsupported transform: {transform}")

    # Model input is channels-first: HWC -> CHW.
    pipeline.append({Processings.ImagePermute: {"permutation": (2, 0, 1)}})
    return pipeline

Lighting

Bases: object

Lighting noise (AlexNet-style PCA-based noise) Taken from fastai Imagenet training - https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103 To use: - training_params = {"imagenet_pca_aug": 0.1} - Default training_params arg is 0.0 ("don't use") - 0.1 is the default in the original paper

Source code in src/super_gradients/training/datasets/data_augmentation.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
@register_transform(Transforms.Lighting)
class Lighting(object):
    """
    Lighting noise (AlexNet-style, PCA-based noise).
    Taken from fastai Imagenet training -
    https://github.com/fastai/imagenet-fast/blob/faa0f9dfc9e8e058ffd07a248724bf384f526fae/imagenet_nv/fastai_imagenet.py#L103
    To use:
        - training_params = {"imagenet_pca_aug": 0.1}
        - the default training_params value is 0.0 ("don't use")
        - 0.1 is the default used in the original paper
    """

    def __init__(self, alphastd, eigval=IMAGENET_PCA["eigval"], eigvec=IMAGENET_PCA["eigvec"]):
        self.alphastd = alphastd
        self.eigval = eigval
        self.eigvec = eigvec

    def __call__(self, img):
        # alphastd == 0 disables the augmentation entirely.
        if self.alphastd == 0:
            return img
        # Random per-call weights for the three principal components.
        alpha = img.new().resize_(3).normal_(0, self.alphastd)
        scaled_vecs = self.eigvec.type_as(img).clone().mul(alpha.view(1, 3).expand(3, 3))
        rgb = scaled_vecs.mul(self.eigval.view(1, 3).expand(3, 3)).sum(1).squeeze()
        # Add the same per-channel offset to every pixel.
        return img.add(rgb.view(3, 1, 1).expand_as(img))

RandomErase

Bases: RandomErasing

A simple class that translates the parameters supported in SuperGradient's code base

Source code in src/super_gradients/training/datasets/data_augmentation.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
@register_transform(Transforms.RandomErase)
class RandomErase(RandomErasing):
    """
    A simple class that translates the parameters supported in SuperGradient's code base
    """

    def __init__(self, probability: float, value: str):
        # `value` may be a string encoding a float; try the numeric interpretation
        # first, and on failure hand the raw string through to the parent class.
        try:
            parsed = float(value)
        except ValueError:
            parsed = value
        super().__init__(p=probability, value=parsed)

BoundingBoxFormat

Abstract class for describing a bounding boxes format. It exposes two methods: to_xyxy and from_xyxy to convert whatever format of boxes we are dealing with to the internal xyxy format and vice versa. This conversion from and to the intermediate xyxy format has a subtle performance impact, but greatly reduces the amount of boilerplate code needed to support all combinations of conversion xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh, cxcywh, yxyx.

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class BoundingBoxFormat:
    """
    Abstract class for describing a bounding boxes format. Concrete formats supply
    get_to_xyxy / get_from_xyxy, which return callables converting to and from the
    internal XYXY representation. Routing every conversion through the intermediate
    XYXY format has a subtle performance impact, but greatly reduces the boilerplate
    needed to support all conversion pairs xyxy, xywh, cxcywh, yxyx <-> xyxy, xywh,
    cxcywh, yxyx.
    """

    def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert input boxes to XYXY format
        :param bboxes: Input bounding boxes [..., 4]
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :return: Converted bounding boxes [..., 4] in XYXY format
        """
        converter = self.get_to_xyxy(inplace)
        return converter(bboxes, image_shape)

    def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
        """
        Convert XYXY boxes to target bboxes format
        :param bboxes: Input bounding boxes [..., 4] in XYXY format
        :param image_shape: Dimensions (rows, cols) of the original image to support
                            normalized boxes or non top-left origin coordinate system.
        :return: Converted bounding boxes [..., 4] in target format
        """
        converter = self.get_from_xyxy(inplace)
        return converter(bboxes, image_shape)

    @abstractmethod
    def get_to_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        raise NotImplementedError()

    @abstractmethod
    def get_from_xyxy(self, inplace: bool) -> Callable[[Tensor, Tuple[int, int]], Tensor]:
        raise NotImplementedError()

    def get_num_parameters(self) -> int:
        # Every supported format encodes a box with exactly 4 numbers.
        return 4

from_xyxy(bboxes, image_shape, inplace)

Convert XYXY boxes to target bboxes format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4] in XYXY format

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in target format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
27
28
29
30
31
32
33
34
35
def from_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert XYXY boxes to target bboxes format
    :param bboxes: Input bounding boxes [..., 4] in XYXY format
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :param inplace: When True, request the in-place converter variant from the subclass.
    :return: Converted bounding boxes [..., 4] in target format
    """
    # Delegate to the converter callable supplied by the concrete format subclass.
    return self.get_from_xyxy(inplace)(bboxes, image_shape)

to_xyxy(bboxes, image_shape, inplace)

Convert input boxes to XYXY format

Parameters:

Name Type Description Default
bboxes

Input bounding boxes [..., 4]

required
image_shape Tuple[int, int]

Dimensions (rows, cols) of the original image to support normalized boxes or non top-left origin coordinate system.

required

Returns:

Type Description

Converted bounding boxes [..., 4] in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
17
18
19
20
21
22
23
24
25
def to_xyxy(self, bboxes, image_shape: Tuple[int, int], inplace: bool):
    """
    Convert input boxes to XYXY format
    :param bboxes: Input bounding boxes [..., 4]
    :param image_shape: Dimensions (rows, cols) of the original image to support
                        normalized boxes or non top-left origin coordinate system.
    :param inplace: When True, request the in-place converter variant from the subclass.
    :return: Converted bounding boxes [..., 4] in XYXY format
    """
    # Delegate to the converter callable supplied by the concrete format subclass.
    return self.get_to_xyxy(inplace)(bboxes, image_shape)

convert_bboxes(bboxes, image_shape, source_format, target_format, inplace)

Convert bboxes from source to target format

Parameters:

Name Type Description Default
bboxes

Tensor of shape (..., 4) with input bounding boxes

required
image_shape Tuple[int, int]

Tuple of (rows, cols) corresponding to image shape

required
source_format BoundingBoxFormat

Format of the source bounding boxes

required
target_format BoundingBoxFormat

Format of the output bounding boxes

required

Returns:

Type Description

Tensor of shape (..., 4) with resulting bounding boxes

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/bbox_format.py
49
50
51
52
53
54
55
56
57
58
59
def convert_bboxes(bboxes, image_shape: Tuple[int, int], source_format: BoundingBoxFormat, target_format: BoundingBoxFormat, inplace: bool):
    """
    Convert bboxes from source to target format
    :param bboxes: Tensor of shape (..., 4) with input bounding boxes
    :param image_shape: Tuple of (rows, cols) corresponding to image shape
    :param source_format: Format of the source bounding boxes
    :param target_format: Format of the output bounding boxes
    :return: Tensor of shape (..., 4) with resulting bounding boxes
    """
    # Route the conversion through the intermediate XYXY representation.
    as_xyxy = source_format.to_xyxy(bboxes, image_shape, inplace)
    return target_format.from_xyxy(as_xyxy, image_shape, inplace)

cxcywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from CX-CY-W-H format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def cxcywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    cx = bboxes[..., 0]
    cy = bboxes[..., 1]
    w = bboxes[..., 2]
    h = bboxes[..., 3]
    left = cx - 0.5 * w
    top = cy - 0.5 * h
    right = left + w
    bottom = top + h

    # Under TorchScript only the tensor path is compiled.
    if torch.jit.is_scripting():
        return torch.stack([left, top, right, bottom], dim=-1)
    if torch.is_tensor(bboxes):
        return torch.stack([left, top, right, bottom], dim=-1)
    if isinstance(bboxes, np.ndarray):
        return np.stack([left, top, right, bottom], axis=-1)
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

cxcywh_to_xyxy_inplace(bboxes, image_shape)

Note that bboxes dtype is preserved, which may lead to unwanted rounding errors when computing the center of a bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in CX-CY-W-H format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def cxcywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from CX-CY-W-H format to XYXY format, operating in-place.
    Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

    :param bboxes: BBoxes of shape (..., 4) in CX-CY-W-H format
    :return: BBoxes of shape (..., 4) in XYXY format (the same object as the input, modified in-place)
    """
    if not torch.jit.is_scripting():
        # Warn on integer dtypes: the 0.5 * wh computation below truncates.
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
        # NOTE(review): this second check also runs for torch tensors; presumably
        # is_floating_point_array handles both numpy arrays and tensors — confirm,
        # otherwise a floating-point tensor could trigger a spurious warning here.
        if not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to cxcywh_to_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    bboxes[..., 0:2] -= bboxes[..., 2:4] * 0.5  # cxcy -> x1y1
    bboxes[..., 2:4] += bboxes[..., 0:2]  # x1y1 + wh -> x2y2
    return bboxes

xyxy_to_cxcywh(bboxes, image_shape)

Transforms bboxes from xyxy format to CX-CY-W-H format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def xyxy_to_cxcywh(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from xyxy format to CX-CY-W-H format
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    right, bottom = bboxes[..., 2], bboxes[..., 3]
    width = right - left
    height = bottom - top
    center_x = left + width * 0.5
    center_y = top + height * 0.5

    if torch.jit.is_scripting():
        return torch.stack([center_x, center_y, width, height], dim=-1)
    if torch.is_tensor(bboxes):
        return torch.stack([center_x, center_y, width, height], dim=-1)
    if isinstance(bboxes, np.ndarray):
        return np.stack([center_x, center_y, width, height], axis=-1)
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_cxcywh_inplace(bboxes, image_shape)

Transforms bboxes from xyxy format to CX-CY-W-H format. This function operates in-place. Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in CX-CY-W-H format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/cxcywh.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def xyxy_to_cxcywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from xyxy format to CX-CY-W-H format. This function operates in-place.
    Note that bboxes dtype is preserved, and it may lead to unwanted rounding errors when computing a center of bbox.

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in CX-CY-W-H format (same object as the input)
    """
    if not torch.jit.is_scripting():
        if torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes):
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
        elif isinstance(bboxes, np.ndarray) and not is_floating_point_array(bboxes):
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_cxcywh_inplace function. This may cause rounding errors and lose of precision. "
                "You may want to convert your array to floating-point precision first."
            )
    # First turn (x2, y2) into (w, h), then shift (x1, y1) to the box center.
    bboxes[..., 2] -= bboxes[..., 0]  # x2 - x1 -> w
    bboxes[..., 3] -= bboxes[..., 1]  # y2 - y1 -> h
    bboxes[..., 0] += bboxes[..., 2] * 0.5  # x1 + w/2 -> cx
    bboxes[..., 1] += bboxes[..., 3] * 0.5  # y1 + h/2 -> cy
    return bboxes

NormalizedXYXYCoordinateFormat

Bases: BoundingBoxFormat

Normalized X1,Y1,X2,Y2 bounding boxes format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class NormalizedXYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes in unit-normalized X1, Y1, X2, Y2 format.
    """

    def __init__(self):
        super().__init__()
        self.format = "normalized_xyxy"
        self.normalized = True

    def get_to_xyxy(self, inplace: bool):
        # Scale normalized coordinates up to pixel units.
        return normalized_xyxy_to_xyxy_inplace if inplace else normalized_xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        # Scale pixel coordinates down to the unit-normalized range.
        return xyxy_to_normalized_xyxy_inplace if inplace else xyxy_to_normalized_xyxy

normalized_xyxy_to_xyxy(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def normalized_xyxy_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format
    """
    rows, cols = image_shape
    # x channels scale by image width, y channels by image height.
    if torch.jit.is_scripting():
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        return bboxes * scale
    if torch.is_tensor(bboxes):
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        return bboxes * scale
    if isinstance(bboxes, np.ndarray):
        scale = np.array([cols, rows, cols, rows], dtype=bboxes.dtype)
        scale = scale.reshape([1] * (bboxes.ndim - 1) + [4])
        return bboxes * scale
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

normalized_xyxy_to_xyxy_inplace(bboxes, image_shape)

Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (pixels) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
67
68
69
70
71
72
73
74
75
76
77
def normalized_xyxy_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert unit-normalized XYXY bboxes to XYXY bboxes in pixel units. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (pixels) format (same object as the input)
    """
    rows, cols = image_shape
    # Channels 0 and 2 hold x coordinates (scale by width), 1 and 3 hold y (scale by height).
    bboxes[..., 0] *= cols
    bboxes[..., 1] *= rows
    bboxes[..., 2] *= cols
    bboxes[..., 3] *= rows
    return bboxes

xyxy_to_normalized_xyxy(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format

Parameters:

Name Type Description Default
bboxes Tensor

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description
Tensor

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def xyxy_to_normalized_xyxy(bboxes: Tensor, image_shape: Tuple[int, int]) -> Tensor:
    """
    Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format
    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format
    """
    rows, cols = image_shape
    if torch.jit.is_scripting():
        scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
        scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
    else:
        if torch.is_tensor(bboxes):
            scale = torch.tensor([cols, rows, cols, rows], dtype=bboxes.dtype, device=bboxes.device)
            scale = scale.reshape([1] * (len(bboxes.size()) - 1) + [4])
        elif isinstance(bboxes, np.ndarray):
            scale = np.array([cols, rows, cols, rows], dtype=bboxes.dtype)
            # CONSISTENCY FIX: reshape the scale to broadcast explicitly against (..., 4),
            # matching the torch branch above and the numpy branch of normalized_xyxy_to_xyxy.
            # (Previously this relied on implicit broadcasting of the flat (4,) array.)
            scale = scale.reshape([1] * (len(bboxes.shape) - 1) + [4])
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")
    return bboxes / scale

xyxy_to_normalized_xyxy_inplace(bboxes, image_shape)

Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY (pixels) format

required
image_shape Tuple[int, int]

Image shape (rows,cols)

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY (unit-normalized) format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/normalized_xyxy.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def xyxy_to_normalized_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Convert bboxes from XYXY (pixels) format to XYXY (unit-normalized) format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY (pixels) format
    :param image_shape: Image shape (rows,cols)
    :return: BBoxes of shape (..., 4) in XYXY (unit-normalized) format (same object as the input)
    """

    # Integer inputs would silently truncate under in-place division, so warn about them.
    if not torch.jit.is_scripting():
        non_float_tensor = torch.is_tensor(bboxes) and not torch.is_floating_point(bboxes)
        if non_float_tensor:
            warnings.warn(
                f"Detected non floating-point ({bboxes.dtype}) input to xyxy_to_normalized_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )
        non_float_array = isinstance(bboxes, np.ndarray) and not np.issubdtype(bboxes.dtype, np.floating)
        if non_float_array:
            warnings.warn(
                f"Detected non floating-point input ({bboxes.dtype}) to xyxy_to_normalized_xyxy_inplace function. "
                f"This may cause rounding errors and lose of precision. You may want to convert your array to floating-point precision first."
            )

    rows, cols = image_shape
    # Channels 0 and 2 hold x coordinates (divide by width), 1 and 3 hold y (divide by height).
    bboxes[..., 0] /= cols
    bboxes[..., 1] /= rows
    bboxes[..., 2] /= cols
    bboxes[..., 3] /= rows
    return bboxes

xywh_to_xyxy(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def xywh_to_xyxy(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYWH format to XYXY format
    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :return: BBoxes of shape (..., 4) in XYXY format
    """
    left, top = bboxes[..., 0], bboxes[..., 1]
    right = left + bboxes[..., 2]  # x1 + w
    bottom = top + bboxes[..., 3]  # y1 + h

    if torch.jit.is_scripting():
        return torch.stack([left, top, right, bottom], dim=-1)
    if torch.is_tensor(bboxes):
        return torch.stack([left, top, right, bottom], dim=-1)
    if isinstance(bboxes, np.ndarray):
        return np.stack([left, top, right, bottom], axis=-1)
    raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xywh_to_xyxy_inplace(bboxes, image_shape)

Transforms bboxes from XYWH format to XYXY format

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYWH format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYXY format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
65
66
67
68
69
70
71
72
def xywh_to_xyxy_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYWH format to XYXY format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYWH format
    :return: BBoxes of shape (..., 4) in XYXY format (same object as the input)
    """
    # Add the top-left corner to width/height channels: (w, h) -> (x2, y2).
    bboxes[..., 2] += bboxes[..., 0]
    bboxes[..., 3] += bboxes[..., 1]
    return bboxes

xyxy_to_xywh(bboxes, image_shape)

Transforms bboxes from XYXY format to XYWH format (returns a new tensor; the input is not modified)

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def xyxy_to_xywh(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYXY format to XYWH format.

    DOCFIX: the previous docstring claimed this function works "inplace"; it does not --
    it builds and returns a new stacked tensor/array and leaves the input untouched
    (the in-place variant is xyxy_to_xywh_inplace).

    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: New BBoxes of shape (..., 4) in XYWH format
    """
    x1, y1, x2, y2 = bboxes[..., 0], bboxes[..., 1], bboxes[..., 2], bboxes[..., 3]
    w = x2 - x1
    h = y2 - y1

    if torch.jit.is_scripting():
        return torch.stack([x1, y1, w, h], dim=-1)
    else:
        if torch.is_tensor(bboxes):
            return torch.stack([x1, y1, w, h], dim=-1)
        elif isinstance(bboxes, np.ndarray):
            return np.stack([x1, y1, w, h], axis=-1)
        else:
            raise RuntimeError(f"Only Torch tensor or Numpy array is supported. Received bboxes of type {str(type(bboxes))}")

xyxy_to_xywh_inplace(bboxes, image_shape)

Transforms bboxes inplace from XYXY format to XYWH format. This function operates in-place.

Parameters:

Name Type Description Default
bboxes

BBoxes of shape (..., 4) in XYXY format

required

Returns:

Type Description

BBoxes of shape (..., 4) in XYWH format

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xywh.py
55
56
57
58
59
60
61
62
def xyxy_to_xywh_inplace(bboxes, image_shape: Tuple[int, int]):
    """
    Transforms bboxes from XYXY format to XYWH format. This function operates in-place.
    :param bboxes: BBoxes of shape (..., 4) in XYXY format
    :return: BBoxes of shape (..., 4) in XYWH format (same object as the input)
    """
    # Subtract the top-left corner from the bottom-right: (x2, y2) -> (w, h).
    bboxes[..., 2] -= bboxes[..., 0]
    bboxes[..., 3] -= bboxes[..., 1]
    return bboxes

XYXYCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format X1, Y1, X2, Y2

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/xyxy.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
class XYXYCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format X1, Y1, X2, Y2
    """

    def __init__(self):
        # CONSISTENCY FIX: invoke the base-class initializer, as every sibling format
        # class (e.g. NormalizedXYXYCoordinateFormat, YXYXCoordinateFormat) does.
        super().__init__()
        self.format = "xyxy"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        # XYXY is the canonical format, so conversion is an identity mapping
        # regardless of the `inplace` flag.
        return xyxy_to_xyxy

    def get_from_xyxy(self, inplace: bool):
        return xyxy_to_xyxy

YXYXCoordinateFormat

Bases: BoundingBoxFormat

Bounding boxes format Y1, X1, Y2, X2

Source code in src/super_gradients/training/datasets/data_formats/bbox_formats/yxyx.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class YXYXCoordinateFormat(BoundingBoxFormat):
    """
    Bounding boxes format Y1, X1, Y2, X2
    """

    def __init__(self):
        super().__init__()
        self.format = "yxyx"
        self.normalized = False

    def get_to_xyxy(self, inplace: bool):
        # Swapping the X/Y pairs is its own inverse, so the xyxy_to_yxyx routines
        # convert in this direction as well.
        if inplace:
            return xyxy_to_yxyx_inplace
        else:
            return xyxy_to_yxyx

    def get_from_xyxy(self, inplace: bool):
        # XYXY <-> YXYX is interchangable operation, so we may reuse same routine here
        if inplace:
            return xyxy_to_yxyx_inplace
        else:
            return xyxy_to_yxyx

ConcatenatedTensorFormatConverter

Source code in src/super_gradients/training/datasets/data_formats/format_converter.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class ConcatenatedTensorFormatConverter:
    def __init__(
        self,
        input_format: ConcatenatedTensorFormat,
        output_format: ConcatenatedTensorFormat,
        image_shape: Union[Tuple[int, int], None],
    ):
        """
        Converts concatenated tensors from input format to output format.

        Example:
            >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
            >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
            >>> h, w = 100, 200
            >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
            >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
            >>>
            >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
            >>>
            >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
            >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        # Channel permutation that reorders the input layout into the output layout.
        self.permutation_indexes = get_permutation_indexes(input_format, output_format)
        self.input_format = input_format
        self.output_format = output_format
        self.image_shape = image_shape
        # Expected channel count in the last dimension of incoming tensors.
        self.input_length = input_format.num_channels

    def __call__(self, tensor: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        if tensor.shape[-1] != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({tensor.shape[-1]}) must be "
                f"equal to {self.input_length} as defined by input format."
            )
        # First reorder the channels, then convert only the bbox slice.
        permuted = tensor[:, self.permutation_indexes]
        return apply_on_bboxes(fn=self._convert_bbox, tensor=permuted, tensor_format=self.output_format)

    def _convert_bbox(self, bboxes: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
        # Non-bbox channels pass through unchanged; only the bbox coordinates are converted.
        return convert_bboxes(
            bboxes=bboxes,
            source_format=self.input_format.bboxes_format.format,
            target_format=self.output_format.bboxes_format.format,
            inplace=False,
            image_shape=self.image_shape,
        )

__init__(input_format, output_format, image_shape)

Converts concatenated tensors from input format to output format.

Example: >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY >>> h, w = 100, 200 >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32) >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32) >>> >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w)) >>> >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in src/super_gradients/training/datasets/data_formats/format_converter.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(
    self,
    input_format: ConcatenatedTensorFormat,
    output_format: ConcatenatedTensorFormat,
    image_shape: Union[Tuple[int, int], None],
):
    """
    Converts concatenated tensors from input format to output format.

    Example:
        >>> from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter
        >>> from super_gradients.training.datasets.data_formats.default_formats import LABEL_CXCYWH, LABEL_NORMALIZED_XYXY
        >>> h, w = 100, 200
        >>> input_target = np.array([[10, 20 / w, 30 / h, 40 / w, 50 / h]], dtype=np.float32)
        >>> expected_output_target = np.array([[10, 30, 40, 20, 20]], dtype=np.float32)
        >>>
        >>> transform = ConcatenatedTensorFormatConverter(input_format=LABEL_NORMALIZED_XYXY, output_format=LABEL_CXCYWH, image_shape=(h, w))
        >>>
        >>> # np.float32 approximation of multiplication/division can lead to uncertainty of up to 1e-7 in precision
        >>> assert np.allclose(transform(input_target), expected_output_target, atol=1e-6)

    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    self.permutation_indexes = get_permutation_indexes(input_format, output_format)

    self.input_format = input_format
    self.output_format = output_format
    self.image_shape = image_shape
    self.input_length = input_format.num_channels

ConcatenatedTensorFormat

Bases: DetectionOutputFormat

Define the output format that return a single tensor of shape [N,M] (N - number of detections, M - sum of bbox attributes) that is a concatenated from bbox coordinates and other fields. A layout defines the order of concatenated tensors. For instance: - layout: (bboxes, scores, labels) gives a Tensor that is product of torch.cat([bboxes, scores, labels], dim=1) - layout: (labels, bboxes) produce a Tensor from torch.cat([labels, bboxes], dim=1)

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

custom_format = ConcatenatedTensorFormat( layout=( BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()), TensorSliceItem(name="label", length=1), TensorSliceItem(name="distance", length=1), TensorSliceItem(name="attributes", length=4), ) )

Source code in src/super_gradients/training/datasets/data_formats/formats.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class ConcatenatedTensorFormat(DetectionOutputFormat):
    """
    Define the output format that return a single tensor of shape [N,M] (N - number of detections,
    M - sum of bbox attributes) that is a concatenated from bbox coordinates and other fields.
    A layout defines the order of concatenated tensors. For instance:
    - layout: (bboxes, scores, labels) gives a Tensor that is product of torch.cat([bboxes, scores, labels], dim=1)
    - layout: (labels, bboxes) produce a Tensor from torch.cat([labels, bboxes], dim=1)


    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> custom_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>         TensorSliceItem(name="label", length=1),
    >>>         TensorSliceItem(name="distance", length=1),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>     )
    >>> )

    """

    layout: Mapping[str, TensorSliceItem]
    locations: Mapping[str, Tuple[int, int]]
    indexes: Mapping[str, List[int]]
    num_channels: int

    @property
    def bboxes_format(self) -> BoundingBoxesTensorSliceItem:
        # Exactly one bbox item exists in the layout; this is enforced by __init__.
        bbox_items = [item for item in self.layout.values() if isinstance(item, BoundingBoxesTensorSliceItem)]
        return bbox_items[0]

    def __init__(self, layout: Union[List[TensorSliceItem], Tuple[TensorSliceItem, ...]]):
        bbox_items = [item for item in layout if isinstance(item, BoundingBoxesTensorSliceItem)]
        if len(bbox_items) != 1:
            raise RuntimeError("Number of bounding box items must be strictly equal to 1")

        layout_by_name = collections.OrderedDict()
        locations_by_name = collections.OrderedDict()
        indexes_by_name = collections.OrderedDict()

        # Walk the layout once, assigning each item a contiguous channel range.
        offset = 0
        for item in layout:
            end = offset + item.length
            layout_by_name[item.name] = item
            locations_by_name[item.name] = (offset, end)
            indexes_by_name[item.name] = list(range(offset, end))
            offset = end

        self.layout = layout_by_name
        self.locations = locations_by_name
        self.indexes = indexes_by_name
        self.num_channels = offset

    def __repr__(self):
        return str(self.layout)

apply_on_bboxes(fn, tensor, tensor_format)

Apply inplace a function only on the bboxes of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the bboxes

Source code in src/super_gradients/training/datasets/data_formats/formats.py
105
106
107
108
109
110
111
112
113
114
115
116
117
def apply_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Apply inplace a function only on the bboxes of a concatenated tensor.

    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that include - among other - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    # The bbox slice is identified by name; delegate the actual slicing to apply_on_layout.
    bboxes_layout_name = tensor_format.bboxes_format.name
    return apply_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=bboxes_layout_name)

apply_on_layout(fn, tensor, tensor_format, layout_name)

Apply inplace a function only on a specific layout of a concatenated tensor.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to apply on the bboxes.

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the layout

Source code in src/super_gradients/training/datasets/data_formats/formats.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def apply_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Apply inplace a function only on a specific layout of a concatenated tensor.
    :param fn:              Function to apply on the bboxes.
    :param tensor:          Concatenated tensor that include - among other - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after applying INPLACE the fn on the layout
    """
    # locations[layout_name] is a (start, stop) pair; unpack it directly into slice().
    # (Previously wrapped in a redundant iter() call -- filter_on_layout already uses the plain form.)
    location = slice(*tensor_format.locations[layout_name])
    result = fn(tensor[..., location])
    tensor[..., location] = result
    return tensor

filter_on_bboxes(fn, tensor, tensor_format)

Filter the tensor according to a condition on the bboxes.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the bboxes.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the bboxes.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after applying INPLACE the fn on the bboxes

Source code in src/super_gradients/training/datasets/data_formats/formats.py
139
140
141
142
143
144
145
146
147
148
149
150
151
def filter_on_bboxes(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
) -> Union[np.ndarray, Tensor]:
    """Filter the tensor according to a condition on the bboxes.

    :param fn:              Function to filter the bboxes (keep only True elements).
    :param tensor:          Concatenated tensor that include - among other - the bboxes.
    :param tensor_format:   Format of the tensor, required to know the indexes of the bboxes.
    :return:                Tensor, after applying INPLACE the fn on the bboxes
    """
    # Resolve which named slice carries the bboxes, then delegate to the generic filter.
    bboxes_layout_name = tensor_format.bboxes_format.name
    return filter_on_layout(fn=fn, tensor=tensor, tensor_format=tensor_format, layout_name=bboxes_layout_name)

filter_on_layout(fn, tensor, tensor_format, layout_name)

Filter the tensor according to a condition on a specific layout.

Parameters:

Name Type Description Default
fn Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]]

Function to filter the bboxes (keep only True elements).

required
tensor Union[np.ndarray, Tensor]

Concatenated tensor that include - among other - the layout of interest.

required
tensor_format ConcatenatedTensorFormat

Format of the tensor, required to know the indexes of the layout.

required
layout_name str

Name of the layout of interest. It has to be defined in the tensor_format.

required

Returns:

Type Description
Union[np.ndarray, Tensor]

Tensor, after filtering the bboxes according to fn.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def filter_on_layout(
    fn: Callable[[Union[np.ndarray, Tensor]], Union[np.ndarray, Tensor]],
    tensor: Union[np.ndarray, Tensor],
    tensor_format: ConcatenatedTensorFormat,
    layout_name: str,
) -> Union[np.ndarray, Tensor]:
    """Keep only the rows of the tensor for which ``fn`` returns True on a given layout item.

    :param fn:              Function computing a boolean keep-mask from the layout slice.
    :param tensor:          Concatenated tensor that includes - among others - the layout of interest.
    :param tensor_format:   Format of the tensor, required to know the indexes of the layout.
    :param layout_name:     Name of the layout of interest. It has to be defined in the tensor_format.
    :return:                Tensor, after filtering the rows according to fn.
    """
    # locations[layout_name] is a (start, stop) pair of channel indexes in the last dimension.
    start, stop = tensor_format.locations[layout_name]
    keep_mask = fn(tensor[..., start:stop])
    return tensor[keep_mask]

get_permutation_indexes(input_format, output_format)

Compute the permutations required to change the format layout order.

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Input format to transform from

required
output_format ConcatenatedTensorFormat

Output format to transform to

required

Returns:

Type Description
List[int]

Permutation indexes to go from input to output format.

Source code in src/super_gradients/training/datasets/data_formats/formats.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def get_permutation_indexes(input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat) -> List[int]:
    """Compute the permutations required to change the format layout order.

    :param input_format:    Input format to transform from
    :param output_format:   Output format to transform to
    :return: Permutation indexes to go from input to output format.
    :raises KeyError:     If an output item is missing from the input format.
    :raises RuntimeError: If an item has different lengths in input and output formats.
    """
    permutation: List[int] = []
    for name, out_spec in output_format.layout.items():
        if name not in input_format.layout:
            raise KeyError(f"Requested item '{name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

        in_spec = input_format.layout[name]
        if in_spec.length != out_spec.length:
            raise RuntimeError(
                f"Length of the output must match in input and output format. "
                f"Input spec size is {in_spec.length} for key '{name}' and output spec size is {out_spec.length}."
            )
        # Channel indexes of this item inside the input tensor, appended in output-layout order.
        permutation.extend(input_format.indexes[name])
    return permutation

ConvertBoundingBoxes

Bases: nn.Module

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class ConvertBoundingBoxes(nn.Module):
    """
    Module that rewrites, in-place, the bbox slice of a tensor's last dimension:
    the slice is first mapped with ``to_xyxy`` and then with ``from_xyxy``.
    """

    to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]
    from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor]

    def __init__(
        self,
        location: Tuple[int, int],
        to_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        from_xyxy: Callable[[Tensor, Tuple[int, int]], Tensor],
        image_shape: Tuple[int, int],
    ):
        super().__init__()
        self.location = location
        self.image_shape = image_shape
        # torch.jit.annotate keeps the callables typed for TorchScript compilation.
        self.to_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], to_xyxy)
        self.from_xyxy = torch.jit.annotate(Callable[[Tensor, Tuple[int, int]], Tensor], from_xyxy)

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply ``to_xyxy`` then ``from_xyxy`` to the slice ``x[..., location[0]:location[1]]`` (in-place).

        :param x: Tensor whose last dimension contains - among others - the bbox coordinates.
        :return: The same tensor, with the bbox slice rewritten.
        """
        start, stop = self.location
        boxes_xyxy = self.to_xyxy(x[..., start:stop], self.image_shape)
        x[..., start:stop] = self.from_xyxy(boxes_xyxy, self.image_shape)
        return x

forward(x)

Parameters:

Name Type Description Default
x Tensor required
image_shape required

Returns:

Type Description
Tensor
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
56
57
58
59
60
61
62
63
64
65
66
67
def forward(self, x: Tensor) -> Tensor:
    """
    Rewrite the bbox slice of ``x``'s last dimension in-place by applying
    ``self.to_xyxy`` and then ``self.from_xyxy``.

    :param x: Tensor whose last dimension contains - among others - the bbox coordinates
              at ``self.location``.
    :return: The same tensor, with the bbox slice rewritten.
    """
    start, stop = self.location
    boxes = x[..., start:stop]
    as_xyxy = self.to_xyxy(boxes, self.image_shape)
    x[..., start:stop] = self.from_xyxy(as_xyxy, self.image_shape)
    return x

DetectionOutputAdapter

Bases: nn.Module

Adapter class for converting model's predictions for object detection to a desired format. This adapter supports torch.jit tracing & scripting & onnx conversion.

from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat

class CustomDetectionHead(nn.Module): num_classes: int = 123

@property def format(self): ''' Describe the semantics of the model's output. In this example model's output consists of - Bounding boxes in XYXY format [4] - Predicted probas of N classes [N] - A distance predictions [1] - K additional labels [K] ''' return ConcatenatedTensorFormat( layout=( BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()), TensorSliceItem(name="label", length=1), TensorSliceItem(name="distance", length=1), TensorSliceItem(name="attributes", length=4), ) )

yolox = YoloX(head=CustomDetectionHead)

Suppose we want to return predictions in another format.

Let it be:

- Bounding boxes in normalized XYWH [4]

- Predicted attributes [4]

- Predicted label [1]

output_format = ConcatenatedTensorFormat( layout=( # Note: For output format it is not required to specify location attribute as it will be # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()), TensorSliceItem(name="attributes", length=4), TensorSliceItem(name="label", length=1), ) )

Now we can construct output adapter and attach it to the model

output_adapter = DetectionOutputAdapter( input_format=yolox.head.format, output_format=output_format, image_shape=(640, 640) )

yolox = nn.Sequential(yolox, output_adapter)

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
class DetectionOutputAdapter(nn.Module):
    """
    Adapter class for converting model's predictions for object detection to a desired format.
    This adapter supports torch.jit tracing & scripting & onnx conversion.

    >>> from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat, BoundingBoxesTensorSliceItem, TensorSliceItem
    >>> from super_gradients.training.datasets.data_formats.bbox_formats import XYXYCoordinateFormat, NormalizedXYWHCoordinateFormat
    >>>
    >>> class CustomDetectionHead(nn.Module):
    >>>    num_classes: int = 123
    >>>
    >>>    @property
    >>>    def format(self):
    >>>        '''
    >>>        Describe the semantics of the model's output. In this example model's output consists of
    >>>         - Bounding boxes in XYXY format [4]
    >>>         - Predicted probas of N classes [N]
    >>>         - A distance predictions [1]
    >>>         - K additional labels [K]
    >>>        '''
    >>>        return ConcatenatedTensorFormat(
    >>>            layout=(
    >>>                BoundingBoxesTensorSliceItem(name="bboxes", format=XYXYCoordinateFormat()),
    >>>                TensorSliceItem(name="label", length=1),
    >>>                TensorSliceItem(name="distance", length=1),
    >>>                TensorSliceItem(name="attributes", length=4),
    >>>            )
    >>>        )
    >>>
    >>> yolox = YoloX(head=CustomDetectionHead)
    >>>
    >>> # Suppose we want to return predictions in another format.
    >>> # Let it be:
    >>> # - Bounding boxes in normalized XYWH [4]
    >>> # - Predicted attributes [4]
    >>> # - Predicted label [1]
    >>> output_format = ConcatenatedTensorFormat(
    >>>     layout=(
    >>>         # Note: For output format it is not required to specify location attribute as it will be
    >>>         # computed with respect to size of "source name" and order of items in layout describe their order in the output tensor
    >>>         BoundingBoxesTensorSliceItem(name="bboxes", format=NormalizedXYWHCoordinateFormat()),
    >>>         TensorSliceItem(name="attributes", length=4),
    >>>         TensorSliceItem(name="label", length=1),
    >>>     )
    >>> )
    >>>
    >>> # Now we can construct output adapter and attach it to the model
    >>> output_adapter = DetectionOutputAdapter(
    >>>     input_format=yolox.head.format,
    >>>     output_format=output_format,
    >>>     image_shape=(640, 640)
    >>> )
    >>>
    >>> yolox = nn.Sequential(yolox, output_adapter)
    >>>
    """

    def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
        """
        :param input_format: Format definition of the inputs
        :param output_format: Format definition of the outputs
        :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                            If you're not using normalized coordinates you can set this to None
        """
        super().__init__()

        # Converts the bbox slice of the tensor from the input bbox format to the output bbox format.
        self.format_conversion: nn.Module = self.get_format_conversion_module(
            location=input_format.locations[input_format.bboxes_format.name],
            input_bbox_format=input_format.bboxes_format.format,
            output_bbox_format=output_format.bboxes_format.format,
            image_shape=image_shape,
        )

        # Permutes channels of the last dimension into the output layout order.
        self.rearrange_outputs, rearranged_format = self.get_rearrange_outputs_module(input_format, output_format)

        self.input_format = input_format
        self.output_format = output_format
        self.input_length = input_format.num_channels

    def forward(self, predictions: Tensor) -> Tensor:
        """
        Convert output detections to the user-specified format

        :param predictions: Predictions tensor whose last dimension follows the input format.
        :return: Predictions converted and rearranged to the output format.
        :raises RuntimeError: If the last dimension size does not match the input format.
        """
        if predictions.size(-1) != self.input_length:
            raise RuntimeError(
                f"Number of channels in last dimension of input tensor ({predictions.size(-1)}) must be "
                f"equal to {self.input_length} as defined by input format."
            )

        # Clone first: the bbox format conversion writes its slice in-place.
        predictions = self.format_conversion(predictions.clone())
        predictions = self.rearrange_outputs(predictions)
        return predictions

    @classmethod
    def get_rearrange_outputs_module(
        cls, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat
    ) -> Tuple[RearrangeOutput, ConcatenatedTensorFormat]:
        """
        Build the module that permutes input channels into the output layout order.

        :param input_format:  Format definition of the inputs.
        :param output_format: Format definition of the outputs; every item must exist in the input format with the same length.
        :return: Tuple of (RearrangeOutput module, format describing the rearranged layout).
        :raises KeyError:     If an output item is missing from the input format.
        :raises RuntimeError: If an item has different lengths in input and output formats.
        """
        output_indexes = []
        rearranged_layout = []

        for output_name, output_spec in output_format.layout.items():
            if output_name not in input_format.layout:
                raise KeyError(f"Requested item '{output_name}' was not found among input format spec. Present items are: {tuple(input_format.layout.keys())}")

            input_spec = input_format.layout[output_name]

            if input_spec.length != output_spec.length:
                # Bugfix: these were plain strings, so "{input_spec.length}" etc. were emitted literally.
                # Now f-strings, consistent with the identical check in get_permutation_indexes.
                raise RuntimeError(
                    f"Length of the output must match in input and output format. "
                    f"Input spec size is {input_spec.length} for key '{output_name}' and output spec size is {output_spec.length}."
                )
            output_indexes.extend(input_format.indexes[output_name])

            # Deep-copy so the rearranged format does not share mutable items with output_format.
            rearranged_layout.append(copy.deepcopy(output_spec))
        rearranged_format = ConcatenatedTensorFormat(rearranged_layout)
        return RearrangeOutput(torch.tensor(output_indexes).long()), rearranged_format

    @classmethod
    def get_format_conversion_module(
        cls, location: Tuple[int, int], input_bbox_format: BoundingBoxFormat, output_bbox_format: BoundingBoxFormat, image_shape: Union[Tuple[int, int], None]
    ) -> ConvertBoundingBoxes:
        """
        Build the module that converts the bbox slice at ``location`` from ``input_bbox_format`` to ``output_bbox_format``.

        :param location:           (start, stop) channel range of the bboxes in the last dimension.
        :param input_bbox_format:  Bbox coordinate format of the input tensor.
        :param output_bbox_format: Desired bbox coordinate format of the output tensor.
        :param image_shape:        (rows, cols) of the input image, or None when no normalization is involved.
        """
        return ConvertBoundingBoxes(
            location=location,
            to_xyxy=input_bbox_format.get_to_xyxy(False),
            from_xyxy=output_bbox_format.get_from_xyxy(True),
            image_shape=image_shape,
        )

__init__(input_format, output_format, image_shape)

Parameters:

Name Type Description Default
input_format ConcatenatedTensorFormat

Format definition of the inputs

required
output_format ConcatenatedTensorFormat

Format definition of the outputs

required
image_shape Union[Tuple[int, int], None]

Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format. If you're not using normalized coordinates you can set this to None

required
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def __init__(self, input_format: ConcatenatedTensorFormat, output_format: ConcatenatedTensorFormat, image_shape: Union[Tuple[int, int], None]):
    """
    :param input_format: Format definition of the inputs
    :param output_format: Format definition of the outputs
    :param image_shape: Shape of the input image (rows, cols), used for converting bbox coordinates from/to normalized format.
                        If you're not using normalized coordinates you can set this to None
    """
    super().__init__()

    # Module that rewrites the bbox slice from the input bbox format to the output bbox format.
    bboxes_name = input_format.bboxes_format.name
    self.format_conversion: nn.Module = self.get_format_conversion_module(
        location=input_format.locations[bboxes_name],
        input_bbox_format=input_format.bboxes_format.format,
        output_bbox_format=output_format.bboxes_format.format,
        image_shape=image_shape,
    )

    # Module that permutes channels of the last dimension into the output layout order.
    self.rearrange_outputs, rearranged_format = self.get_rearrange_outputs_module(input_format, output_format)

    self.input_format = input_format
    self.output_format = output_format
    self.input_length = input_format.num_channels

forward(predictions)

Convert output detections to the user-specified format

Parameters:

Name Type Description Default
predictions Tensor required

Returns:

Type Description
Tensor
Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def forward(self, predictions: Tensor) -> Tensor:
    """
    Convert output detections to the user-specified format.

    :param predictions: Predictions tensor whose last dimension follows the input format.
    :return: Predictions converted and rearranged to the output format.
    :raises RuntimeError: If the last dimension size does not match the input format.
    """
    num_channels = predictions.size(-1)
    if num_channels != self.input_length:
        raise RuntimeError(
            f"Number of channels in last dimension of input tensor ({num_channels}) must be "
            f"equal to {self.input_length} as defined by input format."
        )

    # Clone first: the bbox format conversion writes its slice in-place.
    converted = self.format_conversion(predictions.clone())
    return self.rearrange_outputs(converted)

RearrangeOutput

Bases: nn.Module

Rearrange elements in last dimension of input tensor with respect to index argument

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class RearrangeOutput(nn.Module):
    """
    Permute the last dimension of the input tensor according to the given index tensor.
    """

    def __init__(self, indexes: Tensor):
        super().__init__()
        self.indexes = indexes

    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: Input tensor of  [..., N] shape
        :return: Output tensor of [..., N[index]] shape
        """
        if not torch.jit.is_scripting():
            return x[..., self.indexes]
        # Workaround "Ellipses followed by tensor indexing is currently not supported"
        # https://github.com/pytorch/pytorch/issues/34837
        permuted = torch.moveaxis(x, -1, 0)[self.indexes]
        return torch.moveaxis(permuted, 0, -1)

forward(x)

Parameters:

Name Type Description Default
x Tensor

Input tensor of [..., N] shape

required

Returns:

Type Description
Tensor

Output tensor of [..., N[index]] shape

Source code in src/super_gradients/training/datasets/data_formats/output_adapters/detection_adapter.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def forward(self, x: Tensor) -> Tensor:
    """
    Rearrange the last dimension of ``x`` according to ``self.indexes``.

    :param x: Input tensor of  [..., N] shape
    :return: Output tensor of [..., N[index]] shape
    """
    if not torch.jit.is_scripting():
        return x[..., self.indexes]
    # Workaround "Ellipses followed by tensor indexing is currently not supported"
    # https://github.com/pytorch/pytorch/issues/34837
    rolled = torch.moveaxis(x, -1, 0)
    picked = rolled[self.indexes]
    return torch.moveaxis(picked, 0, -1)

AbstractCollateFunction

Bases: ABC

A collate function (for torch DataLoader)

Source code in src/super_gradients/training/datasets/datasets_utils.py
76
77
78
79
80
81
82
83
class AbstractCollateFunction(ABC):
    """
    Interface for collate functions passed to a torch DataLoader.

    Subclasses must implement ``__call__`` on a batch.
    """

    @abstractmethod
    def __call__(self, batch):
        ...

AbstractPrePredictionCallback

Bases: ABC

Abstract class for forward pass preprocessing function, to be used by passing its inheritors through training_params pre_prediction_callback keyword arg.

Should implement call and return images, targets after applying the desired preprocessing.

Source code in src/super_gradients/training/datasets/datasets_utils.py
175
176
177
178
179
180
181
182
183
184
185
class AbstractPrePredictionCallback(ABC):
    """
    Interface for forward pass preprocessing functions, to be used by passing inheritors through
    the training_params ``pre_prediction_callback`` keyword arg.

    Implementations of ``__call__`` should return images, targets after applying the desired preprocessing.
    """

    @abstractmethod
    def __call__(self, inputs, targets, batch_idx):
        ...

ComposedCollateFunction

Bases: AbstractCollateFunction

A function (for torch DataLoader) which executes a sequence of sub collate functions

Source code in src/super_gradients/training/datasets/datasets_utils.py
86
87
88
89
90
91
92
93
94
95
96
97
98
@register_collate_function()
class ComposedCollateFunction(AbstractCollateFunction):
    """
    Collate function (for torch DataLoader) that pipes the batch through a sequence
    of sub collate functions, in order.
    """

    def __init__(self, functions: list):
        self.functions = functions

    def __call__(self, batch):
        result = batch
        for collate_fn in self.functions:
            result = collate_fn(result)
        return result

DatasetStatisticsTensorboardLogger

Source code in src/super_gradients/training/datasets/datasets_utils.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
class DatasetStatisticsTensorboardLogger:
    """
    Compute dataset statistics (sample image grid, class distribution, box size distribution,
    anchors coverage) and log them through the given ``sg_logger``.
    """

    logger = get_logger(__name__)
    # Defaults; user-provided summary_params override these keys in __init__.
    DEFAULT_SUMMARY_PARAMS = {
        "sample_images": 32,  # by default, 32 images will be sampled from each dataset
        "plot_class_distribution": True,
        "plot_box_size_distribution": True,
        "plot_anchors_coverage": True,
        "max_batches": 30,  # analyze at most this many batches; a value <= 0 disables the cap (see _analyze_detection)
    }

    def __init__(self, sg_logger, summary_params: dict = DEFAULT_SUMMARY_PARAMS):
        """
        :param sg_logger:      Logger object used for output (must expose add_images/add_figure/add_text/flush).
        :param summary_params: Optional overrides for DEFAULT_SUMMARY_PARAMS keys.
        """
        # NOTE: mutable default argument is safe here — the dict is only read and merged, never mutated.
        self.sg_logger = sg_logger
        self.summary_params = {**DatasetStatisticsTensorboardLogger.DEFAULT_SUMMARY_PARAMS, **summary_params}

    def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
        """
        Analyze a dataset and log its statistics (currently a no-op stub — see FIXME below).

        :param data_loader: the dataset data loader
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param anchors: the list of anchors used by the model. applicable only for detection datasets
        :param all_classes: the list of all classes names
        """
        # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
        # if isinstance(data_loader.dataset, DetectionDataSet):
        #     self._analyze_detection(data_loader=data_loader, title=title,
        #                             all_classes=all_classes, anchors=anchors)
        # else:
        #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
        DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

    def _analyze_detection(self, data_loader, title, all_classes, anchors=None):
        """
        Analyze a detection dataset

        :param data_loader: the dataset data loader
        :param all_classes: the list of all classes names
        :param title: the title for this dataset (i.e. Coco 2017 test set)
        :param anchors: the list of anchors used by the model. if not provided, anchors coverage will not be analyzed
        """
        try:
            color_mean = AverageMeter()
            color_std = AverageMeter()
            all_labels = []
            image_size = 0
            for i, (images, labels) in enumerate(tqdm(data_loader)):

                # Chained comparison: stop once max_batches is reached, unless max_batches <= 0 (no limit).
                if i >= self.summary_params["max_batches"] > 0:
                    break

                if i == 0:
                    # Image size and the sample-images grid are taken from the first batch only.
                    image_size = max(images[0].shape[1], images[0].shape[2])
                    if images.shape[0] > self.summary_params["sample_images"]:
                        samples = images[: self.summary_params["sample_images"]]
                    else:
                        samples = images

                    # Empty predictions (shape [0, 6]) so the visualization draws ground-truth boxes only.
                    pred = [torch.zeros(size=(0, 6)) for _ in range(len(samples))]
                    try:
                        result_images = DetectionVisualization.visualize_batch(
                            image_tensor=samples,
                            pred_boxes=pred,
                            target_boxes=copy.deepcopy(labels),
                            batch_name=title,
                            class_names=all_classes,
                            box_thickness=1,
                            gt_alpha=1.0,
                        )

                        # NHWC -> NCHW, then reverse the channel axis (presumably BGR -> RGB — confirm against visualize_batch).
                        self.sg_logger.add_images(tag=f"{title} sample images", images=np.stack(result_images).transpose([0, 3, 1, 2])[:, ::-1, :, :])
                    except Exception as e:
                        DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at adding an example batch:\n{e}")
                        return

                all_labels.append(labels)
                # Running per-channel color statistics across batches.
                color_mean.update(torch.mean(images, dim=[0, 2, 3]), 1)
                color_std.update(torch.std(images, dim=[0, 2, 3]), 1)

            # NOTE(review): [1:] drops the first concatenated label row — unclear why; confirm it is intentional.
            all_labels = torch.cat(all_labels, dim=0)[1:].numpy()

            try:
                if self.summary_params["plot_class_distribution"]:
                    self._analyze_class_distribution(labels=all_labels, num_classes=len(all_classes), title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing class distributions.\n{e}")
                return

            try:
                if self.summary_params["plot_box_size_distribution"]:
                    self._analyze_object_size_distribution(labels=all_labels, title=title)
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing object size " f"distributions.\n{e}")
                return

            summary = ""
            summary += f"dataset size: {len(data_loader)}  \n"
            summary += f"color mean: {color_mean.average}  \n"
            summary += f"color std: {color_std.average}  \n"

            try:
                # Anchors coverage needs the image size captured from the first batch.
                if anchors is not None and image_size > 0:
                    coverage = self._analyze_anchors_coverage(anchors=anchors, image_size=image_size, title=title, labels=all_labels)
                    summary += f"anchors: {anchors}  \n"
                    summary += f"anchors coverage: {coverage}  \n"
            except Exception as e:
                DatasetStatisticsTensorboardLogger.logger.error(f"Dataset Statistics failed at analyzing anchors " f"coverage.\n{e}")
                return

            self.sg_logger.add_text(tag=f"{title} Statistics", text_string=summary)
            self.sg_logger.flush()

        except Exception as e:
            # Dataset analysis is best-effort: never propagate failures to the caller.
            DatasetStatisticsTensorboardLogger.logger.error(f"dataset analysis failed!\n{e}")

    def _analyze_class_distribution(self, labels: list, num_classes: int, title: str):
        """
        Plot and log a histogram of the class labels (column 0 of the labels array), both as a
        figure and as a text summary.

        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param num_classes: the number of classes in the dataset
        :param title: the dataset title
        """
        hist, edges = np.histogram(labels[:, 0], num_classes)

        f = plt.figure(figsize=[10, 8])

        plt.bar(range(num_classes), hist, width=0.5, color="#0504aa", alpha=0.7)
        plt.xlim(-1, num_classes)
        plt.grid(axis="y", alpha=0.75)
        plt.xlabel("Value", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.ylabel("Frequency", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.xticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.yticks(fontsize=STAT_LOGGER_FONT_SIZE)
        plt.title(f"{title} class distribution", fontsize=STAT_LOGGER_FONT_SIZE)

        self.sg_logger.add_figure(f"{title} class distribution", figure=f)
        # Also log the per-class counts as plain text for easy copy/paste.
        text_dist = ""
        for i, val in enumerate(hist):
            text_dist += f"[{i}]: {val}, "

        self.sg_logger.add_text(tag=f"{title} class distribution", text_string=text_dist)

    def _analyze_object_size_distribution(self, labels: list, title: str):
        """
        This function will add two plots to the tensorboard.
        one is a 2D histogram and the other is a scatter plot. in both cases the X axis is the object width and Y axis
        is the object width (both normalized by image size)
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        """

        # histogram plot
        hist, xedges, yedges = np.histogram2d(labels[:, 4], labels[:, 3], 50)  # x and y are deliberately switched

        fig = plt.figure(figsize=(10, 6))
        fig.suptitle(f"{title} boxes w/h distribution")
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        # log(hist + 1) compresses the dynamic range so rare sizes remain visible.
        plt.imshow(np.log(hist + 1), interpolation="nearest", origin="lower", extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

        # scatter plot
        if len(labels) > 10000:
            # we randomly sample just 10000 objects so that the scatter plot will not get too dense
            labels = labels[np.random.randint(0, len(labels) - 1, 10000)]
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        plt.scatter(labels[:, 3], labels[:, 4], marker=".")

        self.sg_logger.add_figure(tag=f"{title} boxes w/h distribution", figure=fig)

    @staticmethod
    def _get_rect(w, h):
        """Return a Rectangle spanning [w/4, w*4] x [h/4, h*4] — the coverage region of an anchor of size (w, h)."""
        min_w = w / 4.0
        min_h = h / 4.0
        return Rectangle((min_w, min_h), w * 4 - min_w, h * 4 - min_h, linewidth=1, edgecolor="b", facecolor="none")

    @staticmethod
    def _get_score(anchors: np.ndarray, points: np.ndarray, image_size: int):
        """
        Calculate the ratio (and 1/ratio) between each anchor width and height and each point (representing a possible
        object width and height).
        i.e. for an anchor with w=10,h=20 the point w=11,h=25 will have the ratios 11/10=1.1 and 25/20=1.25
        or 10/11=0.91 and 20/25=0.8 respectively

        :param anchors: array of anchors of the shape [2,N]
        :param points: array of points of the shape [2,M]
        :param image_size: the size of the input image

        :returns: an array of size [image_size - 1, image_size - 1] where each cell i,j represent the minimum ratio
        for that cell (point) from all anchors
        """

        # Broadcast anchors [N, 2, 1] against points [2, M] -> ratio of shape [N, 2, M].
        ratio = (
            anchors[:, :, None]
            / points[
                :,
            ]
        )
        inv_ratio = 1 / ratio
        # 1 - min(ratio, 1/ratio): 0 for a perfect match, approaching 1 as the mismatch grows.
        min_ratio = 1 - np.minimum(ratio, inv_ratio)
        # Worst (max) mismatch over the two dimensions (w, h) per anchor...
        min_ratio = np.max(min_ratio, axis=1)
        # ...then the best (min) anchor for each point.
        to_closest_anchor = np.min(min_ratio, axis=0)
        # Points farther than the 0.75 threshold from every anchor are marked with 2 (out of coverage).
        to_closest_anchor[to_closest_anchor > 0.75] = 2
        return to_closest_anchor.reshape(image_size - 1, -1)

    def _analyze_anchors_coverage(self, anchors: Anchors, image_size: int, labels: list, title: str):
        """
        This function will add anchors coverage plots to the tensorboard.
        :param anchors: a list of anchors
        :param image_size: the input image size for this training
        :param labels: all the labels of the dataset of the shape [class_label, x_center, y_center, w, h]
        :param title: the dataset title
        :return: fraction of labels covered by at least one anchor (w and h within [anchor/4, anchor*4])
        """

        fig = plt.figure(figsize=(12, 5))
        fig.suptitle(f"{title} anchors coverage")

        # box style plot
        ax = fig.add_subplot(121)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_xlim([0, image_size])
        ax.set_ylim([0, image_size])

        anchors_boxes = anchors.anchors.cpu().numpy()
        anchors_len = anchors.num_anchors

        # Flatten to one (w, h) pair per anchor.
        anchors_boxes = anchors_boxes.reshape(-1, 2)

        for i in range(anchors_len):
            # One translucent, randomly-colored rectangle per anchor coverage region.
            rect = self._get_rect(anchors_boxes[i][0], anchors_boxes[i][1])
            rect.set_alpha(0.3)
            rect.set_facecolor([random.random(), random.random(), random.random(), 0.3])
            ax.add_patch(rect)

        # distance from anchor plot
        ax = fig.add_subplot(122)
        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)

        # Evaluate the anchor-distance score on every integer (w, h) point of the image plane.
        x = np.arange(1, image_size, 1)
        y = np.arange(1, image_size, 1)

        xx, yy = np.meshgrid(x, y, sparse=False, indexing="xy")
        points = np.concatenate([xx.reshape(1, -1), yy.reshape(1, -1)])

        color = self._get_score(anchors_boxes, points, image_size)

        ax.set_xlabel("W", fontsize=STAT_LOGGER_FONT_SIZE)
        ax.set_ylabel("H", fontsize=STAT_LOGGER_FONT_SIZE)
        plt.imshow(color, interpolation="nearest", origin="lower", extent=[0, image_size, 0, image_size])

        # calculate the coverage for the dataset labels
        cover_masks = []
        for i in range(anchors_len):
            # A label is covered by an anchor when its normalized w and h are within [anchor/4, anchor*4].
            w_max = (anchors_boxes[i][0] / image_size) * 4
            w_min = (anchors_boxes[i][0] / image_size) * 0.25
            h_max = (anchors_boxes[i][1] / image_size) * 4
            h_min = (anchors_boxes[i][1] / image_size) * 0.25
            cover_masks.append(
                np.logical_and(np.logical_and(np.logical_and(labels[:, 3] < w_max, labels[:, 3] > w_min), labels[:, 4] < h_max), labels[:, 4] > h_min)
            )
        cover_masks = np.stack(cover_masks)
        # Fraction of labels covered by at least one anchor.
        coverage = np.count_nonzero(np.any(cover_masks, axis=0)) / len(labels)

        self.sg_logger.add_figure(tag=f"{title} anchors coverage", figure=fig)
        return coverage

analyze(data_loader, title, all_classes, anchors=None)

Parameters:

Name Type Description Default
data_loader torch.utils.data.DataLoader

the dataset data loader

required
dataset_params

the dataset parameters

required
title str

the title for this dataset (i.e. Coco 2017 test set)

required
anchors list

the list of anchors used by the model. applicable only for detection datasets

None
all_classes List[str]

the list of all classes names

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
def analyze(self, data_loader: torch.utils.data.DataLoader, title: str, all_classes: List[str], anchors: list = None):
    """
    Analyze a dataset and log its statistics.

    Currently a stub: the analysis is only implemented for the legacy DetectionDataSet
    (see FIXME below), so this method only emits a warning.

    :param data_loader: the dataset data loader
    :param title: the title for this dataset (i.e. Coco 2017 test set)
    :param anchors: the list of anchors used by the model. applicable only for detection datasets
    :param all_classes: the list of all classes names
    """
    # FIXME: UNCOMMENT AND APPLY TO NEW DetectionDataSet ONCE ITS MERGED
    # if isinstance(data_loader.dataset, DetectionDataSet):
    #     self._analyze_detection(data_loader=data_loader, title=title,
    #                             all_classes=all_classes, anchors=anchors)
    # else:
    #     DatasetStatisticsTensorboardLogger.logger.warning('only DetectionDataSet are currently supported')
    DatasetStatisticsTensorboardLogger.logger.warning("only DetectionDataSet are currently supported")

DetectionMultiscalePrePredictionCallback

Bases: MultiscalePrePredictionCallback

Multiscale pre-prediction callback for object detection.

When passed through train_params images, targets will be applied by the below transform to support multi scaling on the fly.

After each self.frequency forward passes, change size randomly from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ... input_size + self.multiscale_range * self.image_size_steps) and apply the same rescaling to the box coordinates.

Parameters:

Name Type Description Default
multiscale_range

Range of values for resize sizes as discussed above (default=5)

required
image_size_steps

Image step sizes as discussed above (default=32)

required
change_frequency

The frequency to apply change in input size.

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
@register_callback(Callbacks.DETECTION_MULTISCALE_PREPREDICTION)
class DetectionMultiscalePrePredictionCallback(MultiscalePrePredictionCallback):
    """
    Multiscale pre-prediction callback for object detection.

    Extends MultiscalePrePredictionCallback: whenever the parent callback rescales the
    images, the same scale factors are applied to the target box coordinates so that
    boxes stay aligned with the resized inputs.

    After each self.frequency forward passes, the input size is changed randomly from
    (input_size - self.multiscale_range * self.image_size_steps,
     input_size - (self.multiscale_range - 1) * self.image_size_steps,
     ... input_size + self.multiscale_range * self.image_size_steps)
    and the same rescaling is applied to the box coordinates.

    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed above (default=32)
    :param change_frequency: The frequency to apply change in input size.
    """

    def __call__(self, inputs, targets, batch_idx):
        # Let the parent decide on (and apply) any rescale, then mirror it on the boxes.
        height_width_before = inputs.shape[2:]
        inputs, targets = super().__call__(inputs, targets, batch_idx)
        height_width_after = inputs.shape[2:]
        ratio_h = height_width_after[0] / height_width_before[0]
        ratio_w = height_width_after[1] / height_width_before[1]
        if ratio_w != 1 or ratio_h != 1:
            # Columns 2, 4, ... scale with width; columns 3, 5, ... scale with height.
            targets[..., 2::2] = targets[..., 2::2] * ratio_w
            targets[..., 3::2] = targets[..., 3::2] * ratio_h
        return inputs, targets

MultiScaleCollateFunction

Bases: AbstractCollateFunction

a collate function to implement multi-scale data augmentation according to https://arxiv.org/pdf/1612.08242.pdf

Source code in src/super_gradients/training/datasets/datasets_utils.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
@register_collate_function()
class MultiScaleCollateFunction(AbstractCollateFunction):
    """
    a collate function to implement multi-scale data augmentation
    according to https://arxiv.org/pdf/1612.08242.pdf
    """

    # Class-level shared state: one counter/size across all instances, and _lock
    # serializes __call__ (collate may run concurrently in DataLoader workers).
    # NOTE(review): __init__ rebinds self._current_size to a plain value, shadowing the
    # class-level AtomicInteger, and `self._counter += 1` likewise rebinds on the
    # instance — presumably intentional, but verify against AtomicInteger's semantics.
    _counter = AtomicInteger(0)
    _current_size = AtomicInteger(0)
    _lock = Lock()

    def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
        """
        set parameters for the multi-scale collate function
        the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps
        a new size will be randomly selected every change_frequency calls to the collate_fn()
            :param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
            :param min_image_size: the minimum size to scale down to (in pixels)
            :param max_image_size: the maximum size to scale up to (in pixels)
            :param image_size_steps: typically, the stride of the net, which defines the possible image
                    size multiplications
            :param change_frequency: number of collate calls between random size re-draws
        """
        assert target_size is not None or (
            max_image_size is not None and min_image_size is not None
        ), "either target_size or min_image_size and max_image_size has to be set"
        assert target_size is None or max_image_size is None, "target_size and max_image_size cannot be both defined"

        if target_size is not None:
            # Derive [min, max] from target_size, snapped to multiples of image_size_steps
            # (min is rounded up to the next multiple, max is rounded down).
            min_image_size = int(0.66 * target_size - ((0.66 * target_size) % image_size_steps) + image_size_steps)
            max_image_size = int(1.5 * target_size - ((1.5 * target_size) % image_size_steps))

        print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

        # All candidate sizes, inclusive of max_image_size.
        self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        self._current_size = random.choice(self.sizes)

    def __call__(self, batch):

        with self._lock:

            # Important: this implementation was tailored for a specific input. it assumes the batch is a tuple where
            # the images are the first item
            assert isinstance(batch, tuple), "this collate function expects the input to be a tuple (images, labels)"
            images = batch[0]
            # Re-draw the target size once every self.frequency calls.
            if self._counter % self.frequency == 0:
                self._current_size = random.choice(self.sizes)
            self._counter += 1

            assert images.shape[2] % self.image_size_steps == 0 and images.shape[3] % self.image_size_steps == 0, (
                "images sized not divisible by %d. (resize images before calling multi_scale)" % self.image_size_steps
            )

            # Rescale only when the chosen size differs from the batch's current longest
            # edge; the aspect ratio of the incoming batch is preserved.
            if self._current_size != max(images.shape[2:]):
                ratio = float(self._current_size) / max(images.shape[2:])
                new_size = (int(round(images.shape[2] * ratio)), int(round(images.shape[3] * ratio)))
                images = F.interpolate(images, size=new_size, mode="bilinear", align_corners=False)

            return images, batch[1]

__init__(target_size=None, min_image_size=None, max_image_size=None, image_size_steps=32, change_frequency=10)

set parameters for the multi-scale collate function the possible image sizes are in range [min_image_size, max_image_size] in steps of image_size_steps a new size will be randomly selected every change_frequency calls to the collate_fn() :param target_size: scales will be [0.66 * target_size, 1.5 * target_size] :param min_image_size: the minimum size to scale down to (in pixels) :param max_image_size: the maximum size to scale up to (in pixels) :param image_size_steps: typically, the stride of the net, which defines the possible image size multiplications :param change_frequency:

Source code in src/super_gradients/training/datasets/datasets_utils.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def __init__(self, target_size: int = None, min_image_size: int = None, max_image_size: int = None, image_size_steps: int = 32, change_frequency: int = 10):
    """
    Set parameters for the multi-scale collate function.

    The possible image sizes are in range [min_image_size, max_image_size] in steps of
    image_size_steps; a new size will be randomly selected every change_frequency calls
    to the collate_fn().

    :param target_size: scales will be [0.66 * target_size, 1.5 * target_size]
    :param min_image_size: the minimum size to scale down to (in pixels)
    :param max_image_size: the maximum size to scale up to (in pixels)
    :param image_size_steps: typically, the stride of the net, which defines the possible image
            size multiplications
    :param change_frequency: number of collate calls between random size re-draws
    :raises ValueError: if neither target_size nor the (min_image_size, max_image_size) pair
            is provided, or if target_size and max_image_size are both provided.
    """
    # Validate with explicit raises instead of `assert`: asserts are stripped under `python -O`,
    # which would silently disable this input validation.
    if target_size is None and (max_image_size is None or min_image_size is None):
        raise ValueError("either target_size or min_image_size and max_image_size has to be set")
    if target_size is not None and max_image_size is not None:
        raise ValueError("target_size and max_image_size cannot be both defined")

    if target_size is not None:
        # Derive [min, max] from target_size, snapped to multiples of image_size_steps
        # (min is rounded up to the next multiple, max is rounded down).
        min_image_size = int(0.66 * target_size - ((0.66 * target_size) % image_size_steps) + image_size_steps)
        max_image_size = int(1.5 * target_size - ((1.5 * target_size) % image_size_steps))

    print("Using multi-scale %g - %g" % (min_image_size, max_image_size))

    # All candidate sizes, inclusive of max_image_size.
    self.sizes = np.arange(min_image_size, max_image_size + image_size_steps, image_size_steps)
    self.image_size_steps = image_size_steps
    self.frequency = change_frequency
    self._current_size = random.choice(self.sizes)

MultiscalePrePredictionCallback

Bases: AbstractPrePredictionCallback

Multiscale pre-prediction callback pass function.

When passed through train_params images, targets will be applied by the below transform to support multi scaling on the fly.

After each self.frequency forward passes, change size randomly from (input_size - self.multiscale_range * self.image_size_steps, input_size - (self.multiscale_range - 1) * self.image_size_steps, ... input_size + self.multiscale_range * self.image_size_steps)

Parameters:

Name Type Description Default
multiscale_range int

Range of values for resize sizes as discussed above (default=5)

5
image_size_steps int

Image step sizes as discussed above (default=32)

32
change_frequency int

The frequency to apply change in input size.

10
Source code in src/super_gradients/training/datasets/datasets_utils.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
class MultiscalePrePredictionCallback(AbstractPrePredictionCallback):
    """
    Multiscale pre-prediction callback pass function.

    When passed through train_params images, targets will be applied by the below transform to support multi scaling
    on the fly.

    After each self.frequency forward passes, change size randomly from
     (input_size-self.multiscale_range*self.image_size_steps, input_size-(self.multiscale_range-1)*self.image_size_steps,
     ...input_size+self.multiscale_range*self.image_size_steps)


    :param multiscale_range: Range of values for resize sizes as discussed above (default=5)
    :param image_size_steps: Image step sizes as discussed above (default=32)
    :param change_frequency: The frequency to apply change in input size.
    """

    def __init__(self, multiscale_range: int = 5, image_size_steps: int = 32, change_frequency: int = 10):

        self.multiscale_range = multiscale_range
        self.image_size_steps = image_size_steps
        self.frequency = change_frequency
        # Rank / distributed-ness are resolved lazily on the first __call__
        # (the process group may not be initialized at construction time).
        self.rank = None
        self.is_distributed = None
        # Whether the largest resolution has already been sampled once (see __call__).
        self.sampled_imres_once = False
        # Last size chosen; reused for the batches between re-draws.
        self.new_input_size = None

    def __call__(self, inputs, targets, batch_idx):
        if self.rank is None:
            self.rank = get_local_rank()
        if self.is_distributed is None:
            self.is_distributed = get_world_size() > 1

        # GENERATE A NEW SIZE AND BROADCAST IT TO THE OTHER RANKS SO THEY HAVE THE SAME SCALE
        input_size = inputs.shape[2:]
        if batch_idx % self.frequency == 0:
            # Rank 0 fills this tensor; other ranks receive its value via broadcast below.
            tensor = torch.LongTensor(2).to(inputs.device)

            if self.rank == 0:
                size_factor = input_size[1] * 1.0 / input_size[0]
                min_size = int(input_size[0] / self.image_size_steps) - self.multiscale_range
                max_size = int(input_size[0] / self.image_size_steps) + self.multiscale_range
                random_size = (min_size, max_size)
                if self.sampled_imres_once:
                    size = random.randint(*random_size)
                else:
                    # sample the biggest resolution first to make sure the run fits into the GPU memory
                    size = max_size
                    self.sampled_imres_once = True
                # Snap height to a multiple of image_size_steps and derive width from the aspect ratio.
                size = (int(self.image_size_steps * size), self.image_size_steps * int(size * size_factor))
                tensor[0] = size[0]
                tensor[1] = size[1]

            if self.is_distributed:
                dist.barrier()
                dist.broadcast(tensor, 0)

            self.new_input_size = (tensor[0].item(), tensor[1].item())

        # batch_idx == 0 always enters the branch above, so new_input_size is set by here.
        scale_y = self.new_input_size[0] / input_size[0]
        scale_x = self.new_input_size[1] / input_size[1]
        if scale_x != 1 or scale_y != 1:
            inputs = torch.nn.functional.interpolate(inputs, size=self.new_input_size, mode="bilinear", align_corners=False)
        return inputs, targets

RandomResizedCropAndInterpolation

Bases: RandomResizedCrop

Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

A crop of random size (default: of 0.08 to 1.0) of the original size and a random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop is finally resized to given size. This is popularly used to train the Inception networks.

Parameters:

Name Type Description Default
size

Expected output size of each edge

required
scale

Range of size of the origin size cropped

(0.08, 1.0)
ratio

Range of aspect ratio of the origin aspect ratio cropped

(3.0 / 4.0, 4.0 / 3.0)
interpolation

Default: PIL.Image.BILINEAR

'default'
Source code in src/super_gradients/training/datasets/datasets_utils.py
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
@register_transform(Transforms.RandomResizedCropAndInterpolation)
class RandomResizedCropAndInterpolation(RandomResizedCrop):
    """
    Crop the given PIL Image to random size and aspect ratio with explicitly chosen or random interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made, then
    resized to the given size. This is popularly used to train the Inception networks.

    :param size: Expected output size of each edge
    :param scale: Range of size of the origin size cropped
    :param ratio: Range of aspect ratio of the origin aspect ratio cropped
    :param interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation="default"):
        super().__init__(size=size, scale=scale, ratio=ratio, interpolation=interpolation)
        # Resolve the interpolation argument: "random" keeps a pool that forward()
        # samples from on every call, "default" pins bilinear, anything else is
        # translated through _pil_interp.
        if interpolation == "random":
            resolved = _RANDOM_INTERPOLATION
        elif interpolation == "default":
            resolved = InterpolationMode.BILINEAR
        else:
            resolved = _pil_interp(interpolation)
        self.interpolation = resolved

    def forward(self, img: Image) -> Image:
        """
        :param img: Image to be cropped and resized.
        :return: Image: Randomly cropped and resized image.
        """
        top, left, height, width = self.get_params(img, self.scale, self.ratio)
        has_pool = isinstance(self.interpolation, (tuple, list))
        chosen = random.choice(self.interpolation) if has_pool else self.interpolation
        return torchvision.transforms.functional.resized_crop(img, top, left, height, width, self.size, chosen)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            interpolate_str = " ".join(_pil_interpolation_to_str[x] for x in self.interpolation)
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        return "{name}(size={size}, scale={scale}, ratio={ratio}, interpolation={interp})".format(
            name=self.__class__.__name__,
            size=self.size,
            scale=tuple(round(s, 4) for s in self.scale),
            ratio=tuple(round(r, 4) for r in self.ratio),
            interp=interpolate_str,
        )

forward(img)

Parameters:

Name Type Description Default
img Image

Image to be cropped and resized.

required

Returns:

Type Description
Image

Image: Randomly cropped and resized image.

Source code in src/super_gradients/training/datasets/datasets_utils.py
344
345
346
347
348
349
350
351
352
353
354
def forward(self, img: Image) -> Image:
    """
    Crop the input at a random location/aspect ratio and resize it to self.size.

    :param img: Image to be cropped and resized.
    :return: Image: Randomly cropped and resized image.
    """
    top, left, height, width = self.get_params(img, self.scale, self.ratio)
    if isinstance(self.interpolation, (tuple, list)):
        chosen = random.choice(self.interpolation)
    else:
        chosen = self.interpolation
    return torchvision.transforms.functional.resized_crop(img, top, left, height, width, self.size, chosen)

get_color_augmentation(rand_augment_config_string, color_jitter, crop_size=224, img_mean=[0.485, 0.456, 0.406])

Returns a color augmentation class. As these augmentations cannot be applied on top of one another, only one is returned according to rand_augment_config_string

Parameters:

Name Type Description Default
rand_augment_config_string str

string which defines the auto augment configurations. If none, color jitter will be returned. For possible values see auto_augment.py

required
color_jitter tuple

tuple for color jitter value.

required
crop_size

relevant only for auto augment

224
img_mean

relevant only for auto augment

[0.485, 0.456, 0.406]

Returns:

Type Description

RandAugment transform or ColorJitter

Source code in src/super_gradients/training/datasets/datasets_utils.py
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
def get_color_augmentation(rand_augment_config_string: str, color_jitter: tuple, crop_size=224, img_mean=None):
    """
    Returns color augmentation class. As these augmentation cannot work on top one another, only one is returned
    according to rand_augment_config_string

    :param rand_augment_config_string: string which defines the auto augment configurations.
                                       If none, color jitter will be returned. For possible values see auto_augment.py
    :param color_jitter: tuple for color jitter value.
    :param crop_size: relevant only for auto augment
    :param img_mean: relevant only for auto augment. Defaults to the ImageNet mean [0.485, 0.456, 0.406].
    :return: RandAugment transform or ColorJitter
    """
    # BUGFIX: the default used to be a mutable list literal in the signature, which is
    # shared across calls; a None sentinel avoids that while keeping the same default value.
    if img_mean is None:
        img_mean = [0.485, 0.456, 0.406]
    if rand_augment_config_string:
        color_augmentation = rand_augment_transform(rand_augment_config_string, crop_size, img_mean)

    else:  # RandAugment includes colorjitter like augmentations, both cannot be applied together.
        color_augmentation = transforms.ColorJitter(*color_jitter)
    return color_augmentation

get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224)

A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

Parameters:

Name Type Description Default
data_dir

String, path to none-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"

None
dataloader

a torch DataLoader, as it would feed the data into the trainer (including transforms etc).

None
RandomResizeSize

Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet, this value should be 224).

224

Returns:

Type Description

2 lists,mean and std, each one of len 3 (1 for each channel)

Source code in src/super_gradients/training/datasets/datasets_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_mean_and_std_torch(data_dir=None, dataloader=None, num_workers=4, RandomResizeSize=224):
    """
    A function for getting the mean and std of large datasets using pytorch dataloader and gpu functionality.

    :param data_dir: String, path to none-library dataset folder. For example "/data/Imagenette" or "/data/TinyImagenet"
    :param dataloader: a torch DataLoader, as it would feed the data into the trainer (including transforms etc).
    :param num_workers: number of workers for the internally-created DataLoader (used only when data_dir is given).
    :param RandomResizeSize: Int, the size of the RandomResizeCrop as it appears in the DataInterface (for example, for Imagenet,
    this value should be 224).
    :return: 2 lists, mean and std, each one of len 3 (1 for each channel)
    """
    assert data_dir is None or dataloader is None, "Please provide either path to data folder or DataLoader, not both."

    if dataloader is None:
        traindir = os.path.join(os.path.abspath(data_dir), "train")
        trainset = ImageFolder(
            traindir, transforms.Compose([transforms.RandomResizedCrop(RandomResizeSize), transforms.RandomHorizontalFlip(), transforms.ToTensor()])
        )
        dataloader = torch.utils.data.DataLoader(trainset, batch_size=1, num_workers=num_workers)

    # BUGFIX: the sample count was previously taken from `trainset`, which is undefined
    # (NameError) when a ready-made dataloader is passed in; the old progress print also
    # assumed the dataset exposes a `.targets` attribute. Use the loader's dataset instead.
    num_samples = len(dataloader.dataset)
    print(f"Calculating on {num_samples} Training Samples")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    h, w = 0, 0
    # First pass: accumulate the per-channel sum to compute the mean.
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        if batch_idx == 0:
            # Assumes all batches share the same spatial size (true after the resize transform).
            h, w = inputs.size(2), inputs.size(3)
            print(f"Min: {inputs.min()}, Max: {inputs.max()}")
            chsum = inputs.sum(dim=(0, 2, 3), keepdim=True)
        else:
            chsum += inputs.sum(dim=(0, 2, 3), keepdim=True)
    mean = chsum / num_samples / h / w
    print(f"mean: {mean.view(-1)}")

    # Second pass: accumulate squared deviations to compute the unbiased std.
    chsum = None
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        if batch_idx == 0:
            chsum = (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
        else:
            chsum += (inputs - mean).pow(2).sum(dim=(0, 2, 3), keepdim=True)
    std = torch.sqrt(chsum / (num_samples * h * w - 1))
    print(f"std: {std.view(-1)}")
    return mean.view(-1).cpu().numpy().tolist(), std.view(-1).cpu().numpy().tolist()

worker_init_reset_seed(worker_id)

Make sure each process has different random seed, especially for 'fork' method. Check https://github.com/pytorch/pytorch/issues/63311 for more details.

Parameters:

Name Type Description Default
worker_id

placeholder (needs to be passed to DataLoader init).

required
Source code in src/super_gradients/training/datasets/datasets_utils.py
657
658
659
660
661
662
663
664
665
666
667
def worker_init_reset_seed(worker_id):
    """
    Make sure each process has different random seed, especially for 'fork' method.
    Check https://github.com/pytorch/pytorch/issues/63311 for more details.

    :param worker_id: placeholder (needs to be passed to DataLoader init).
    """
    # uuid4 is backed by os.urandom, so every worker process draws an independent seed.
    seed = uuid.uuid4().int % 2**32
    random.seed(seed)
    # torch.manual_seed already installs the new state on the default generator;
    # the previous `torch.set_rng_state(torch.manual_seed(seed).get_state())` re-set
    # the exact same state and was therefore redundant.
    torch.manual_seed(seed)
    np.random.seed(seed)

AbstractDepthEstimationDataset

Bases: Dataset

Abstract class for datasets for depth estimation task.

Attempting to follow principles provided in pose_estimation_dataset.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
class AbstractDepthEstimationDataset(Dataset):
    """
    Abstract class for datasets for depth estimation task.

    Attempting to follow principles provided in pose_estimation_dataset.
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(self, transforms: List[AbstractDepthEstimationTransform] = None):
        super().__init__()
        # Default to an empty pipeline so __getitem__ can always iterate the list.
        self.transforms = transforms or []

    @abc.abstractmethod
    def load_sample(self, index: int) -> DepthEstimationSample:
        """
        Load a depth estimation sample from the dataset.

        :param index: Index of the sample to load.
        :return: Instance of DepthEstimationSample.

        If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas,
        ensure that the same value is used as the `ignore_val` argument in your metric and loss functions.
        Fill the entries in the depth map that are supposed to be ignored with the `ignore_val` after loading the sample.
        """
        raise NotImplementedError()

    def load_random_sample(self) -> DepthEstimationSample:
        """
        Return a random sample from the dataset

        :return: Instance of DepthEstimationSample
        """
        num_samples = len(self)
        random_index = random.randrange(0, num_samples)
        return self.load_sample(random_index)

    def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get a transformed depth estimation sample from the dataset.

        :param index: Index of the sample to retrieve.
        :return: Tuple containing the transformed image and depth map as np.ndarrays.

        After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be
        a 2D array (e.g., Height x Width).

        Before returning the image and depth map, the image's channels are moved to CHW format and additional
         dummy dimension is added to the depth map resulting 1HW shape.
        """
        sample = self.load_sample(index)
        # Transforms are applied sequentially; each receives and returns a sample object.
        for transform in self.transforms:
            sample = transform(sample)
        # HWC -> CHW for the image; depth map gains a leading channel dim (1, H, W).
        return np.transpose(sample.image, (2, 0, 1)).astype(np.float32), np.expand_dims(sample.depth_map, axis=0).astype(np.float32)

    def plot(
        self,
        max_samples_per_plot: int = 8,
        n_plots: int = 1,
        plot_transformed_data: bool = True,
        color_scheme: Optional[int] = None,
        drop_extreme_percentage: float = 0,
        inverse: bool = False,
    ):
        """
        Combine samples of images with depth maps into plots and display the result.

        :param max_samples_per_plot:    Maximum number of samples (image with depth map) to be displayed per plot.
        :param n_plots:                 Number of plots to display.
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e., on __getitem__).
                                        If False, the plot will be over the raw samples (i.e., on load_sample).
        :param color_scheme:            OpenCV color scheme for the depth map visualization. If not specified:
                                        - If `inverse=True`, the default is COLORMAP_VIRIDIS.
                                        - If `inverse=False`, the default is COLORMAP_MAGMA.


        :param drop_extreme_percentage: Percentage of extreme values to drop on both ends of the depth spectrum.
        :param inverse:                 Apply inversion (1 / depth) if True to the depth map.

        :return: None
        """
        plot_counter = 0

        for plot_i in range(n_plots):
            # Row 0 holds the images, row 1 the matching depth maps.
            fig, axes = plt.subplots(2, max_samples_per_plot, figsize=(15, 5))
            for img_i in range(max_samples_per_plot):
                # Consecutive plots show consecutive dataset indices.
                index = img_i + plot_i * max_samples_per_plot
                if plot_transformed_data:
                    image, depth_map = self[index]

                    # Transpose to HWC format for visualization
                    image = image.transpose(1, 2, 0)
                    depth_map = depth_map.squeeze()  # Remove dummy dimension
                else:
                    sample = self.load_sample(index)
                    image, depth_map = sample.image, sample.depth_map

                # Plot the image
                axes[0, img_i].imshow(image)
                axes[0, img_i].axis("off")
                axes[0, img_i].set_title(f"Sample {index}")

                # Plot the depth map side by side with the selected color scheme
                depth_map = DepthVisualization.process_depth_map_for_visualization(depth_map, color_scheme, drop_extreme_percentage, inverse)
                axes[1, img_i].imshow(depth_map)
                axes[1, img_i].axis("off")
                axes[1, img_i].set_title(f"Depth Map {index}")

            plt.show()
            plt.close()

            plot_counter += 1
            # NOTE(review): this early-return duplicates the loop bound — range(n_plots)
            # above already stops after n_plots iterations.
            if plot_counter == n_plots:
                return

__getitem__(index)

Get a transformed depth estimation sample from the dataset.

Parameters:

Name Type Description Default
index int

Index of the sample to retrieve.

required

Returns:

Type Description
Tuple[np.ndarray, np.ndarray]

Tuple containing the transformed image and depth map as np.ndarrays. After applying the transforms pipeline, the image is expected to be in HWC format, and the depth map should be a 2D array (e.g., Height x Width). Before returning the image and depth map, the image's channels are moved to CHW format and additional dummy dimension is added to the depth map resulting 1HW shape.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Get a transformed depth estimation sample from the dataset.

    :param index: Index of the sample to retrieve.
    :return: Tuple containing the transformed image and depth map as np.ndarrays.

    After applying the transforms pipeline, the image is expected to be in HWC format,
    and the depth map should be a 2D array (e.g., Height x Width).

    Before returning, the image's channels are moved to CHW format and a dummy leading
    dimension is added to the depth map, resulting in a 1HW shape.
    """
    transformed = self.load_sample(index)
    for transform_fn in self.transforms:
        transformed = transform_fn(transformed)
    image_chw = np.transpose(transformed.image, (2, 0, 1)).astype(np.float32)
    depth_1hw = np.expand_dims(transformed.depth_map, axis=0).astype(np.float32)
    return image_chw, depth_1hw

load_random_sample()

Return a random sample from the dataset

Returns:

Type Description
DepthEstimationSample

Instance of DepthEstimationSample

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
44
45
46
47
48
49
50
51
52
def load_random_sample(self) -> DepthEstimationSample:
    """
    Draw one sample from the dataset at a uniformly random index.

    :return: Instance of DepthEstimationSample
    """
    random_index = random.randrange(len(self))
    return self.load_sample(random_index)

load_sample(index) abstractmethod

Load a depth estimation sample from the dataset.

Parameters:

Name Type Description Default
index int

Index of the sample to load.

required

Returns:

Type Description
DepthEstimationSample

Instance of DepthEstimationSample. If your dataset contains non-labeled regions with a specific value (e.g., -100) representing ignored areas, ensure that the same value is used as the ignore_val argument in your metric and loss functions. Fill the entries in the depth map that are supposed to be ignored with the ignore_val after loading the sample.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
30
31
32
33
34
35
36
37
38
39
40
41
42
@abc.abstractmethod
def load_sample(self, index: int) -> DepthEstimationSample:
    """
    Load one depth estimation sample from the dataset (must be implemented by subclasses).

    :param index: Index of the sample to load.
    :return: Instance of DepthEstimationSample.

    Implementations that have non-labeled regions marked with a sentinel value
    (e.g., -100) must fill those depth-map entries with the same `ignore_val`
    that the metric and loss functions are configured with.
    """
    raise NotImplementedError()

plot(max_samples_per_plot=8, n_plots=1, plot_transformed_data=True, color_scheme=None, drop_extreme_percentage=0, inverse=False)

Combine samples of images with depth maps into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of samples (image with depth map) to be displayed per plot.

8
n_plots int

Number of plots to display.

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e., on getitem). If False, the plot will be over the raw samples (i.e., on load_sample).

True
color_scheme Optional[int]

OpenCV color scheme for the depth map visualization. If not specified: - If inverse=True, the default is COLORMAP_VIRIDIS. - If inverse=False, the default is COLORMAP_MAGMA.

None
drop_extreme_percentage float

Percentage of extreme values to drop on both ends of the depth spectrum.

0
inverse bool

Apply inversion (1 / depth) if True to the depth map.

False

Returns:

Type Description

None

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/abstract_depth_estimation_dataset.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def plot(
    self,
    max_samples_per_plot: int = 8,
    n_plots: int = 1,
    plot_transformed_data: bool = True,
    color_scheme: Optional[int] = None,
    drop_extreme_percentage: float = 0,
    inverse: bool = False,
):
    """
    Combine samples of images with depth maps into plots and display the result.

    :param max_samples_per_plot:    Maximum number of samples (image with depth map) to be displayed per plot.
    :param n_plots:                 Number of plots to display.
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e., on __getitem__).
                                    If False, the plot will be over the raw samples (i.e., on load_sample).
    :param color_scheme:            OpenCV color scheme for the depth map visualization. If not specified:
                                    - If `inverse=True`, the default is COLORMAP_VIRIDIS.
                                    - If `inverse=False`, the default is COLORMAP_MAGMA.
    :param drop_extreme_percentage: Percentage of extreme values to drop on both ends of the depth spectrum.
    :param inverse:                 Apply inversion (1 / depth) if True to the depth map.

    :return: None
    """
    # The range already bounds the number of plots; no extra counter/early-return needed.
    for plot_i in range(n_plots):
        fig, axes = plt.subplots(2, max_samples_per_plot, figsize=(15, 5))
        for img_i in range(max_samples_per_plot):
            # Samples are consumed sequentially across consecutive plots.
            index = img_i + plot_i * max_samples_per_plot
            if plot_transformed_data:
                image, depth_map = self[index]

                # __getitem__ yields a CHW image and a 1HW depth map; undo both for plotting.
                image = image.transpose(1, 2, 0)
                depth_map = depth_map.squeeze()  # Remove dummy dimension
            else:
                sample = self.load_sample(index)
                image, depth_map = sample.image, sample.depth_map

            # Plot the image
            axes[0, img_i].imshow(image)
            axes[0, img_i].axis("off")
            axes[0, img_i].set_title(f"Sample {index}")

            # Plot the depth map side by side with the selected color scheme
            depth_map = DepthVisualization.process_depth_map_for_visualization(depth_map, color_scheme, drop_extreme_percentage, inverse)
            axes[1, img_i].imshow(depth_map)
            axes[1, img_i].axis("off")
            axes[1, img_i].set_title(f"Depth Map {index}")

        plt.show()
        plt.close()

NYUv2DepthEstimationDataset

Bases: AbstractDepthEstimationDataset

Dataset class for NYU Depth V2 dataset for depth estimation.

Parameters:

Name Type Description Default
root str

Root directory containing the dataset.

required
df_path str

Path to the CSV file containing image and depth map file paths, relative to root.

required
transforms

Transforms to be applied to the samples. To use the NYUv2Dataset class, ensure that your dataset directory is organized as follows: - Root directory (specified as 'root' when initializing the dataset) - nyu2_train (or any other split) - scene_category_1 - image_1.jpg - image_2.png - ... - scene_category_2 - image_1.jpg - image_2.png - ... - ... - nyu2_test (or any other split) - 00000_colors.png - 00001_colors.png - 00002_colors.png ... The CSV file (specified as 'df_path' when initializing the dataset) should contain two columns: path to the color images, path to depth maps (both relative to the root). Example CSV content: data/nyu2_train/scene_category_1/image_1.jpg, data/nyu2_train/scene_category_1/image_1_depth.png data/nyu2_train/scene_category_1/image_2.jpg, data/nyu2_train/scene_category_1/image_2_depth.png data/nyu2_train/scene_category_2/image_1.jpg, data/nyu2_train/scene_category_2/image_1_depth.png Note: As of 14/12/2023 official download link is broken. Data can be obtained at https://www.kaggle.com/code/shreydan/monocular-depth-estimation-nyuv2/input ...

None
Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
@register_dataset(Datasets.NYUV2_DEPTH_ESTIMATION_DATASET)
class NYUv2DepthEstimationDataset(AbstractDepthEstimationDataset):
    """
    Dataset class for NYU Depth V2 dataset for depth estimation.

    :param root: Root directory containing the dataset.
    :param df_path: Path to the CSV file containing image and depth map file paths, relative to root.
    :param transforms: Transforms to be applied to the samples.

    Expected directory layout under `root`:

    - nyu2_train (or any other split)
      - scene_category_1
        - image_1.jpg
        - image_2.png
        - ...
      - scene_category_2
        - image_1.jpg
        - image_2.png
        - ...
      - ...
    - nyu2_test (or any other split)
      - 00000_colors.png
      - 00001_colors.png
      - 00002_colors.png
      ...

    The CSV file (specified as 'df_path' when initializing the dataset) has two columns,
    both relative to `root`: the color image path and the matching depth map path.

    Example CSV content:
    data/nyu2_train/scene_category_1/image_1.jpg,   data/nyu2_train/scene_category_1/image_1_depth.png
    data/nyu2_train/scene_category_1/image_2.jpg,   data/nyu2_train/scene_category_1/image_2_depth.png
    data/nyu2_train/scene_category_2/image_1.jpg,   data/nyu2_train/scene_category_2/image_1_depth.png

    Note: As of 14/12/2023 the official download link is broken.
     Data can be obtained at https://www.kaggle.com/code/shreydan/monocular-depth-estimation-nyuv2/input
    """

    def __init__(self, root: str, df_path: str, transforms=None):
        """
        Initialize NYUv2Dataset.

        :param root: Root directory containing the dataset.
        :param df_path: Path to the CSV file containing image and depth map file paths.
        :param transforms: Transforms to be applied to the samples.
        """
        super().__init__(transforms=transforms)
        # `root` must be assigned before reading the CSV: _read_df prefixes every path with it.
        self.root = root
        self.df = self._read_df(df_path)
        self._check_paths_exist()

    def _read_df(self, df_path: str) -> pd.DataFrame:
        """
        Read the two-column CSV of image/depth-map paths and prefix each entry with `self.root`.

        :param df_path: Path to the CSV file.
        :return: DataFrame whose columns 0 and 1 hold the full image and depth map paths.
        """
        df = pd.read_csv(df_path, header=None)
        for col in (0, 1):
            df[col] = df[col].map(lambda rel_path: os.path.join(self.root, rel_path))
        return df

    def load_sample(self, index: int) -> DepthEstimationSample:
        """
        Load a depth estimation sample at the specified index.

        :param index: Index of the sample.

        :return: Loaded depth estimation sample.
        """
        row = self.df.iloc[index]
        image = cv2.imread(row[0], cv2.IMREAD_COLOR)
        depth_map = cv2.imread(row[1], cv2.IMREAD_GRAYSCALE)
        return DepthEstimationSample(image=image, depth_map=depth_map)

    def __len__(self):
        """Return the number of samples, i.e. the number of rows in the paths DataFrame."""
        return self.df.shape[0]

    def _check_paths_exist(self):
        """
        Keep only rows of `self.df` whose paths all exist on disk; warn about every
        dropped row and raise FileNotFoundError if nothing remains.
        """
        kept_rows = []
        for _, row in self.df.iterrows():
            if all(os.path.exists(path) for path in row):
                kept_rows.append(row)
            else:
                warnings.warn(f"Warning: Removed the following line as one or more paths do not exist: {row}")

        if not kept_rows:
            raise FileNotFoundError("All lines in the dataset have been removed as some paths do not exist. Please check the paths and dataset structure.")

        self.df = pd.DataFrame(kept_rows, columns=[0, 1])

__init__(root, df_path, transforms=None)

Initialize NYUv2Dataset.

Parameters:

Name Type Description Default
root str

Root directory containing the dataset.

required
df_path str

Path to the CSV file containing image and depth map file paths.

required
transforms

Transforms to be applied to the samples.

None
Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(self, root: str, df_path: str, transforms=None):
    """
    Initialize NYUv2Dataset.

    :param root: Root directory containing the dataset.
    :param df_path: Path to the CSV file containing image and depth map file paths, relative to `root`.
    :param transforms: Transforms to be applied to the samples.
    """
    super(NYUv2DepthEstimationDataset, self).__init__(transforms=transforms)
    # `root` must be assigned before reading the CSV: _read_df joins it onto every path.
    self.root = root
    self.df = self._read_df(df_path)
    # Drops rows with missing image/depth files; raises FileNotFoundError if none remain.
    self._check_paths_exist()

__len__()

Get the number of samples in the dataset.

Returns:

Type Description

Number of samples in the dataset.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
 94
 95
 96
 97
 98
 99
100
def __len__(self):
    """
    Return the number of samples in the dataset (rows of the paths DataFrame).
    """
    return self.df.shape[0]

load_sample(index)

Load a depth estimation sample at the specified index.

Parameters:

Name Type Description Default
index int

Index of the sample.

required

Returns:

Type Description
DepthEstimationSample

Loaded depth estimation sample.

Source code in src/super_gradients/training/datasets/depth_estimation_datasets/nyuv2_dataset.py
80
81
82
83
84
85
86
87
88
89
90
91
92
def load_sample(self, index: int) -> DepthEstimationSample:
    """
    Read the image/depth-map pair at `index` from disk.

    :param index: Index of the sample.

    :return: Loaded depth estimation sample.
    """
    row = self.df.iloc[index]
    # Column 0 is the color image path, column 1 the depth map path.
    image = cv2.imread(row[0], cv2.IMREAD_COLOR)
    depth_map = cv2.imread(row[1], cv2.IMREAD_GRAYSCALE)
    return DepthEstimationSample(image=image, depth_map=depth_map)

COCODetectionDataset

Bases: COCOFormatDetectionDataset

Dataset for COCO object detection.

To use this Dataset you need to:

- Download coco dataset:
    annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
    train2017: http://images.cocodataset.org/zips/train2017.zip
    val2017: http://images.cocodataset.org/zips/val2017.zip

- Unzip and organize it as below:
    coco
    ├── annotations
    │      ├─ instances_train2017.json
    │      ├─ instances_val2017.json
    │      └─ ...
    └── images
        ├── train2017
        │   ├─ 000000000001.jpg
        │   └─ ...
        └── val2017
            └─ ...

- Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

- Instantiate the dataset:
    >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
    >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
Source code in src/super_gradients/training/datasets/detection_datasets/coco_detection.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
@register_dataset(Datasets.COCO_DETECTION_DATASET)
class COCODetectionDataset(COCOFormatDetectionDataset):
    """Dataset for COCO object detection.

    To use this Dataset you need to:

        - Download coco dataset:
            annotations: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
            train2017: http://images.cocodataset.org/zips/train2017.zip
            val2017: http://images.cocodataset.org/zips/val2017.zip

        - Unzip and organize it as below:
            coco
            ├── annotations
            │      ├─ instances_train2017.json
            │      ├─ instances_val2017.json
            │      └─ ...
            └── images
                ├── train2017
                │   ├─ 000000000001.jpg
                │   └─ ...
                └── val2017
                    └─ ...

        - Install CoCo API: https://github.com/pdollar/coco/tree/master/PythonAPI

        - Instantiate the dataset:
            >> train_set = COCODetectionDataset(data_dir='.../coco', subdir='images/train2017', json_file='instances_train2017.json', ...)
            >> valid_set = COCODetectionDataset(data_dir='.../coco', subdir='images/val2017', json_file='instances_val2017.json', ...)
    """

    def __init__(
        self,
        json_file: str = "instances_train2017.json",
        subdir: str = "images/train2017",
        *args,
        **kwargs,
    ):
        """
        :param json_file: Name of the coco json file, expected at data_dir/annotations/json_file.
        :param subdir:    Sub directory of data_dir containing the data.

        Remaining args/kwargs are forwarded to COCOFormatDetectionDataset, e.g.:
            with_crowd: add the crowd groundtruths to __getitem__
            all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
        """
        # COCO annotation files always live in the "annotations" folder under data_dir.
        annotation_file = os.path.join("annotations", json_file)
        super().__init__(json_annotation_file=annotation_file, images_dir=subdir, *args, **kwargs)

__init__(json_file='instances_train2017.json', subdir='images/train2017', *args, **kwargs)

Parameters:

Name Type Description Default
json_file str

Name of the coco json file, that resides in data_dir/annotations/json_file.

'instances_train2017.json'
subdir str

Sub directory of data_dir containing the data.

'images/train2017'
with_crowd

Add the crowd groundtruths to `__getitem__`. kwargs: all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.

required
Source code in src/super_gradients/training/datasets/detection_datasets/coco_detection.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __init__(
    self,
    json_file: str = "instances_train2017.json",
    subdir: str = "images/train2017",
    *args,
    **kwargs,
):
    """
    :param json_file: Name of the coco json file, expected at data_dir/annotations/json_file.
    :param subdir:    Sub directory of data_dir containing the data.

    Remaining args/kwargs are forwarded to the base class, e.g.:
        with_crowd: add the crowd groundtruths to __getitem__
        all_classes_list: all classes list, default is COCO_DETECTION_CLASSES_LIST.
    """
    # COCO annotation files always live in the "annotations" folder under data_dir.
    annotation_file = os.path.join("annotations", json_file)
    super().__init__(json_annotation_file=annotation_file, images_dir=subdir, *args, **kwargs)

COCOFormatDetectionDataset

Bases: DetectionDataset

Base dataset to load ANY dataset that is with a similar structure to the COCO dataset. - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh). - One folder with all the images.

Output format: (x, y, x, y, class_id)

Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@register_dataset("COCOFormatDetectionDataset")
class COCOFormatDetectionDataset(DetectionDataset):
    """Base dataset to load ANY dataset that is with a similar structure to the COCO dataset.
    - Annotation file (.json). It has to respect the exact same format as COCO, for both the json schema and the bbox format (xywh).
    - One folder with all the images.

    Output format: (x, y, x, y, class_id)
    """

    @deprecated_parameter(
        "tight_box_rotation",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `tight_box_rotation` has been removed. This parameter has no effect anymore.",
    )
    def __init__(
        self,
        data_dir: str,
        json_annotation_file: str,
        images_dir: str,
        with_crowd: bool = True,
        class_ids_to_ignore: Optional[List[int]] = None,
        tight_box_rotation=None,
        *args,
        **kwargs,
    ):
        """
        :param data_dir:                Where the data is stored.
        :param json_annotation_file:    Name of the coco json file. Path can be either absolute, or relative to data_dir.
        :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
        :param with_crowd:              Add the crowd groundtruths to __getitem__
        :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
        :param tight_box_rotation:      This parameter is deprecated and will be removed in a SuperGradients 3.8.
        """
        if tight_box_rotation is not None:
            logger.warning(
                "Parameter `tight_box_rotation` is deprecated and will be removed in a SuperGradients 3.8." "Please remove this parameter from your code."
            )
        self.images_dir = images_dir
        self.json_annotation_file = json_annotation_file
        self.with_crowd = with_crowd
        self.class_ids_to_ignore = class_ids_to_ignore or []

        target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
        kwargs["target_fields"] = target_fields
        kwargs["output_fields"] = ["image", *target_fields]
        kwargs["original_target_format"] = XYXY_LABEL
        super().__init__(data_dir=data_dir, *args, **kwargs)

        if len(self.original_classes) != len(self.all_classes_list):
            if set(self.all_classes_list).issubset(set(self.original_classes)):
                raise ParameterMismatchException(
                    "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                    "Please use `class_inclusion_list` to train with reduced number of classes",
                )
            else:
                raise DatasetValidationException(
                    "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                    "Most likely this indicates an error in your all_classes_list parameter"
                )

    def _setup_data_source(self) -> int:
        """
        Parse COCO annotation file
        :return: Number of images in annotation JSON
        """
        if os.path.isabs(self.json_annotation_file):
            annotation_file_path = self.json_annotation_file
        else:
            annotation_file_path = os.path.join(self.data_dir, self.json_annotation_file)
        if not os.path.exists(annotation_file_path):
            raise ValueError("Could not find annotation file under " + str(annotation_file_path))

        all_class_names, annotations = parse_coco_into_detection_annotations(
            annotation_file_path,
            exclude_classes=None,
            include_classes=None,
            # This parameter exists solely for the purpose of keeping the backward compatibility with the old code.
            # Once we refactor base dataset, we can remove this parameter and use only exclude_classes/include_classes
            # at parsing time instead.
            class_ids_to_ignore=self.class_ids_to_ignore,
            image_path_prefix=os.path.join(self.data_dir, self.images_dir),
        )

        self.original_classes = list(all_class_names)
        self.classes = copy.deepcopy(self.original_classes)
        self._annotations = annotations
        return len(annotations)

    @property
    def _all_classes(self) -> List[str]:
        return self.original_classes

    def _load_annotation(self, sample_id: int) -> dict:
        """
        Load relevant information of a specific image.

        :param sample_id:               Sample_id in the dataset
        :return target:                 Target Bboxes (detection) in XYXY_LABEL format
        :return crowd_target:           Crowd target Bboxes (detection) in XYXY_LABEL format
        :return target_segmentation:    Segmentation
        :return initial_img_shape:      Image (height, width)
        :return resized_img_shape:      Resides image (height, width)
        :return img_path:               Path to the associated image
        """

        annotation = self._annotations[sample_id]

        width = annotation.image_width
        height = annotation.image_height

        # Make a copy of the annotations, so that we can modify them
        boxes_xyxy = change_bbox_bounds_for_image_size(annotation.ann_boxes_xyxy, img_shape=(height, width), inplace=False)
        iscrowd = annotation.ann_is_crowd.copy()
        labels = annotation.ann_labels.copy()

        # Exclude boxes with invalid dimensions (x1 > x2 or y1 > y2)
        mask = np.logical_and(boxes_xyxy[:, 2] >= boxes_xyxy[:, 0], boxes_xyxy[:, 3] >= boxes_xyxy[:, 1])
        boxes_xyxy = boxes_xyxy[mask]
        iscrowd = iscrowd[mask]
        labels = labels[mask]

        # Currently, the base class includes a feature to resize the image, so we need to resize the target as well when self.input_dim is set.
        initial_img_shape = (height, width)
        if self.input_dim is not None:
            scale_factor = min(self.input_dim[0] / height, self.input_dim[1] / width)
            resized_img_shape = (int(height * scale_factor), int(width * scale_factor))
        else:
            resized_img_shape = initial_img_shape
            scale_factor = 1

        targets = np.concatenate([boxes_xyxy[~iscrowd] * scale_factor, labels[~iscrowd, None]], axis=1).astype(np.float32)
        crowd_targets = np.concatenate([boxes_xyxy[iscrowd] * scale_factor, labels[iscrowd, None]], axis=1).astype(np.float32)

        annotation = {
            "target": targets,
            "crowd_target": crowd_targets,
            "initial_img_shape": initial_img_shape,
            "resized_img_shape": resized_img_shape,
            "img_path": annotation.image_path,
        }
        return annotation

__init__(data_dir, json_annotation_file, images_dir, with_crowd=True, class_ids_to_ignore=None, tight_box_rotation=None, *args, **kwargs)

Parameters:

Name Type Description Default
data_dir str

Where the data is stored.

required
json_annotation_file str

Name of the coco json file. Path can be either absolute, or relative to data_dir.

required
images_dir str

Name of the directory that includes all the images. Path relative to data_dir.

required
with_crowd bool

Add the crowd groundtruths to getitem

True
class_ids_to_ignore Optional[List[int]]

List of class ids to ignore in the dataset. By default, doesn't ignore any class.

None
tight_box_rotation

This parameter is deprecated and will be removed in a SuperGradients 3.8.

None
Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
@deprecated_parameter(
    "tight_box_rotation",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `tight_box_rotation` has been removed. This parameter has no effect anymore.",
)
def __init__(
    self,
    data_dir: str,
    json_annotation_file: str,
    images_dir: str,
    with_crowd: bool = True,
    class_ids_to_ignore: Optional[List[int]] = None,
    tight_box_rotation=None,
    *args,
    **kwargs,
):
    """
    :param data_dir:                Where the data is stored.
    :param json_annotation_file:    Name of the coco json file. Path can be either absolute, or relative to data_dir.
    :param images_dir:              Name of the directory that includes all the images. Path relative to data_dir.
    :param with_crowd:              Add the crowd groundtruths to __getitem__
    :param class_ids_to_ignore:     List of class ids to ignore in the dataset. By default, doesnt ignore any class.
    :param tight_box_rotation:      This parameter is deprecated and will be removed in a SuperGradients 3.8.
    """
    if tight_box_rotation is not None:
        logger.warning(
            "Parameter `tight_box_rotation` is deprecated and will be removed in a SuperGradients 3.8." "Please remove this parameter from your code."
        )
    self.images_dir = images_dir
    self.json_annotation_file = json_annotation_file
    self.with_crowd = with_crowd
    self.class_ids_to_ignore = class_ids_to_ignore or []

    target_fields = ["target", "crowd_target"] if self.with_crowd else ["target"]
    kwargs["target_fields"] = target_fields
    kwargs["output_fields"] = ["image", *target_fields]
    kwargs["original_target_format"] = XYXY_LABEL
    super().__init__(data_dir=data_dir, *args, **kwargs)

    if len(self.original_classes) != len(self.all_classes_list):
        if set(self.all_classes_list).issubset(set(self.original_classes)):
            raise ParameterMismatchException(
                "Parameter `all_classes_list` contains a subset of classes from dataset JSON. "
                "Please use `class_inclusion_list` to train with reduced number of classes",
            )
        else:
            raise DatasetValidationException(
                "Number of classes in dataset JSON do not match with number of classes in all_classes_list parameter. "
                "Most likely this indicates an error in your all_classes_list parameter"
            )

parse_coco_into_detection_annotations(ann, exclude_classes=None, include_classes=None, class_ids_to_ignore=None, image_path_prefix=None)

Load COCO detection dataset from annotation file.

Parameters:

Name Type Description Default
ann str

A path to the JSON annotation file in COCO format.

required
exclude_classes Optional[List[str]]

List of classes to exclude from the dataset. All other classes will be included. This parameter is mutually exclusive with include_classes and class_ids_to_ignore.

None
include_classes Optional[List[str]]

List of classes to include in the dataset. All other classes will be excluded. This parameter is mutually exclusive with exclude_classes and class_ids_to_ignore.

None
class_ids_to_ignore Optional[List[int]]

List of category ids to ignore in the dataset. All other classes will be included. This parameter added for the purpose of backward compatibility with the class_ids_to_ignore argument of COCOFormatDetectionDataset but will be removed in future in favor of include_classes/exclude_classes. This parameter is mutually exclusive with exclude_classes and include_classes.

None
image_path_prefix

A prefix to add to the image paths in the annotation file.

None

Returns:

Type Description
Tuple[List[str], List[DetectionAnnotation]]

Tuple (class_names, annotations) where class_names is a list of class names (respecting include_classes/exclude_classes/class_ids_to_ignore) and annotations is a list of DetectionAnnotation objects.

Source code in src/super_gradients/training/datasets/detection_datasets/coco_format_detection.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
def parse_coco_into_detection_annotations(
    ann: str,
    exclude_classes: Optional[List[str]] = None,
    include_classes: Optional[List[str]] = None,
    class_ids_to_ignore: Optional[List[int]] = None,
    image_path_prefix=None,
) -> Tuple[List[str], List[DetectionAnnotation]]:
    """
    Load COCO detection dataset from annotation file.
    :param ann: A path to the JSON annotation file in COCO format.
    :param exclude_classes: List of classes to exclude from the dataset. All other classes will be included.
                                This parameter is mutually exclusive with include_classes and class_ids_to_ignore.

    :param include_classes:     List of classes to include in the dataset. All other classes will be excluded.
                                This parameter is mutually exclusive with exclude_classes and class_ids_to_ignore.
    :param class_ids_to_ignore: List of category ids to ignore in the dataset. All other classes will be included.
                                This parameter was added for backward compatibility with the class_ids_to_ignore
                                argument of COCOFormatDetectionDataset but will be
                                removed in future in favor of include_classes/exclude_classes.
                                This parameter is mutually exclusive with exclude_classes and include_classes.
    :param image_path_prefix:   A prefix to add to the image paths in the annotation file.
    :return:                    Tuple (class_names, annotations) where class_names is a list of class names
                                (respecting include_classes/exclude_classes/class_ids_to_ignore) and
                                annotations is a list of DetectionAnnotation objects.
    """
    with open(ann, "r") as f:
        coco = json.load(f)

    # Extract class names and class ids
    category_ids = np.array([category["id"] for category in coco["categories"]], dtype=int)
    category_names = np.array([category["name"] for category in coco["categories"]], dtype=str)

    # Extract box annotations and convert from COCO xywh to xyxy
    ann_box_xyxy = xywh_to_xyxy_inplace(np.array([annotation["bbox"] for annotation in coco["annotations"]], dtype=np.float32).reshape(-1, 4), image_shape=None)

    ann_category_id = np.array([annotation["category_id"] for annotation in coco["annotations"]], dtype=int).reshape(-1)
    ann_iscrowd = np.array([annotation["iscrowd"] for annotation in coco["annotations"]], dtype=bool).reshape(-1)
    ann_image_ids = np.array([annotation["image_id"] for annotation in coco["annotations"]], dtype=int).reshape(-1)

    # Extract image stuff
    img_ids = [img["id"] for img in coco["images"]]
    img_paths = [img["file_name"] if "file_name" in img else "{:012}".format(img["id"]) + ".jpg" for img in coco["images"]]
    img_width_height = [(img["width"], img["height"]) for img in coco["images"]]

    # Now, we can drop the annotations that belongs to the excluded classes
    if int(class_ids_to_ignore is not None) + int(exclude_classes is not None) + int(include_classes is not None) > 1:
        raise ValueError("Only one of exclude_classes, class_ids_to_ignore or include_classes can be specified")
    elif exclude_classes is not None:
        if len(exclude_classes) != len(set(exclude_classes)):
            raise ValueError("The excluded classes must be unique")
        classes_not_in_dataset = set(exclude_classes).difference(set(category_names))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the excluded classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_names, exclude_classes, invert=True)
    elif class_ids_to_ignore is not None:
        if len(class_ids_to_ignore) != len(set(class_ids_to_ignore)):
            raise ValueError("The ignored classes must be unique")
        classes_not_in_dataset = set(class_ids_to_ignore).difference(set(category_ids))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the ignored classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_ids, class_ids_to_ignore, invert=True)
    elif include_classes is not None:
        if len(include_classes) != len(set(include_classes)):
            raise ValueError("The included classes must be unique")
        classes_not_in_dataset = set(include_classes).difference(set(category_names))
        if len(classes_not_in_dataset) > 0:
            raise ValueError(f"One or more of the included classes does not exist in the dataset: {classes_not_in_dataset}")
        keep_classes_mask = np.isin(category_names, include_classes)
    else:
        keep_classes_mask = None

    if keep_classes_mask is not None:
        category_ids = category_ids[keep_classes_mask]
        category_names = category_names[keep_classes_mask]

        # BUGFIX: the mask must be applied to ALL per-annotation arrays, not only to
        # ann_category_id. Filtering just one of them would misalign the arrays zipped
        # below, pairing boxes/crowd-flags/image-ids with the wrong class labels.
        keep_anns_mask = np.isin(ann_category_id, category_ids)
        ann_category_id = ann_category_id[keep_anns_mask]
        ann_box_xyxy = ann_box_xyxy[keep_anns_mask]
        ann_iscrowd = ann_iscrowd[keep_anns_mask]
        ann_image_ids = ann_image_ids[keep_anns_mask]

    # category_ids can be non-sequential and not ordered
    num_categories = len(category_ids)

    # Make sequential
    order = np.argsort(category_ids, kind="stable")
    category_ids = category_ids[order]
    category_names = category_names[order]

    # Remap category ids to be in range [0, num_categories)
    class_label_table = np.zeros(np.max(category_ids) + 1, dtype=int) - 1
    new_class_ids = np.arange(num_categories, dtype=int)
    class_label_table[category_ids] = new_class_ids

    # Remap category ids in annotations
    ann_category_id = class_label_table[ann_category_id]
    if (ann_category_id < 0).any():
        raise ValueError("Some annotations have class ids that are not in the list of classes. This probably indicates a bug in the annotation file")

    annotations = []

    # Group the per-annotation arrays by image id so each image's targets can be gathered at once
    img_id2ann_box_xyxy = defaultdict(list)
    img_id2ann_iscrowd = defaultdict(list)
    img_id2ann_category_id = defaultdict(list)
    for ann_image_id, _ann_box_xyxy, _ann_iscrowd, _ann_category_id in zip(ann_image_ids, ann_box_xyxy, ann_iscrowd, ann_category_id):
        img_id2ann_box_xyxy[ann_image_id].append(_ann_box_xyxy)
        img_id2ann_iscrowd[ann_image_id].append(_ann_iscrowd)
        img_id2ann_category_id[ann_image_id].append(_ann_category_id)

    for img_id, image_path, (image_width, image_height) in zip(img_ids, img_paths, img_width_height):
        if image_path_prefix is not None:
            image_path = os.path.join(image_path_prefix, image_path)

        ann = DetectionAnnotation(
            image_id=img_id,
            image_path=image_path,
            image_width=image_width,
            image_height=image_height,
            ann_boxes_xyxy=np.asarray(img_id2ann_box_xyxy[img_id], dtype=np.float32).reshape(-1, 4),
            ann_is_crowd=np.asarray(img_id2ann_iscrowd[img_id], dtype=bool).reshape(-1),
            ann_labels=np.asarray(img_id2ann_category_id[img_id], dtype=int).reshape(-1),
        )
        annotations.append(ann)

    # Convert to a plain list of python strings to match the declared return type (List[str])
    return category_names.tolist(), annotations

DetectionDataset

Bases: Dataset, HasPreprocessingParams, HasClassesInformation

Detection dataset.

This is a boilerplate class to facilitate the implementation of datasets.

HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ? - Inherit from DetectionDataSet - implement the method self._load_annotation to return at least the fields "target" and "img_path" - Call super().__init__ with the required params. //!\ super().__init__ will call self._load_annotation, so make sure that every required attribute is set up before calling super().__init__ (ideally just call it last)

WORKFLOW: - On instantiation: - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.

- On call (__getitem__) for a specific image index:
    - The image and annotations are grouped together in a dict called SAMPLE
    - the sample is processed according to the transforms
    - Only the specified fields are returned by __getitem__

TERMINOLOGY - TARGET: Groundtruth, made of bboxes. The format can vary from one dataset to another - ANNOTATION: Combination of targets (groundtruth) and metadata of the image, but without the image itself. > Has to include the fields "target" and "img_path" > Can include other fields like "crowd_target", "image_info", "segmentation", ... - SAMPLE: Output of the dataset: > Has to include the fields "target" and "image" > Can include other fields like "crowd_target", "image_info", "segmentation", ... - Index: Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1 - Sample ID: Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
@register_dataset(Datasets.DETECTION_DATASET)
class DetectionDataset(Dataset, HasPreprocessingParams, HasClassesInformation):
    """Detection dataset.

    This is a boilerplate class to facilitate the implementation of datasets.

    HOW TO CREATE A DATASET THAT INHERITS FROM DetectionDataSet ?
        - Inherit from DetectionDataSet
        - implement the method self._load_annotation to return at least the fields "target" and "img_path"
        - Call super().__init__ with the required params.
                //!\\ super().__init__ will call self._load_annotation, so make sure that every required
                      attributes are set up before calling super().__init__ (ideally just call it last)

    WORKFLOW:
        - On instantiation:
            - All annotations are cached. If class_inclusion_list was specified, there is also subclassing at this step.

        - On call (__getitem__) for a specific image index:
            - The image and annotations are grouped together in a dict called SAMPLE
            - the sample is processed according to th transform
            - Only the specified fields are returned by __getitem__

    TERMINOLOGY
        - TARGET:       Groundtruth, made of bboxes. The format can vary from one dataset to another
        - ANNOTATION:   Combination of targets (groundtruth) and metadata of the image, but without the image itself.
                            > Has to include the fields "target" and "img_path"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - SAMPLE:       Outout of the dataset:
                            > Has to include the fields "target" and "image"
                            > Can include other fields like "crowd_target", "image_info", "segmentation", ...
        - Index:        Index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        - Sample ID:    Index of the sample in the dataset, WITHOUT considering any filtering. 0<=sample_id<=len(source)-1
    """

    @resolve_param("transforms", ListFactory(TransformsFactory()))
    def __init__(
        self,
        data_dir: str,
        original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
        max_num_samples: int = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: List[AbstractDetectionTransform] = [],  # NOTE(review): mutable default; never mutated here, but a None default would be safer
        all_classes_list: Optional[List[str]] = [],  # NOTE(review): mutable default; never mutated here
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        target_fields: List[str] = None,
        output_fields: List[str] = None,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """Detection dataset.

        :param data_dir:                Where the data is stored
        :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                        None means that the image will be loaded as is.
                                        Scalar (size) - Image will be resized to (size, size)
                                        Tuple (rows,cols) - Image will be resized to (rows, cols)
        :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                        differ based on transforms.
        :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
        :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                        but requires more RAM and more time to instantiate the dataset when working on very large datasets.
        :param transforms:              List of transforms to apply sequentially on sample.
        :param all_classes_list:        All the class names.
        :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                        Classes not in this list will excluded from training.
                                        Thus, number of classes in model must be adjusted accordingly.
        :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                                will be ignored.
        :param target_fields:                   List of the fields target fields. This has to include regular target,
                                                but can also include crowd target, segmentation target, ...
                                                It has to include at least "target" but can include other.
        :param output_fields:                   Fields that will be outputed by __getitem__.
                                                It has to include at least "image" and "target" but can include other.
        :param verbose:                 Whether to show additional information or not, such as loading progress. (doesnt include warnings)
        :param show_all_warnings:       Whether to show all warnings or not.
        :param cache:                   Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
        :param cache_dir:               Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
        """
        # Deprecated parameters are accepted but only trigger a warning; they have no effect.
        if cache is not None:
            warnings.warn(
                "cache parameter has been marked as deprecated and setting it has no effect. "
                "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
                DeprecationWarning,
            )
        if cache_dir is not None:
            warnings.warn(
                "cache_dir parameter has been marked as deprecated and setting it has no effect. "
                "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
                DeprecationWarning,
            )

        super().__init__()
        self.verbose = verbose
        self.show_all_warnings = show_all_warnings

        if isinstance(original_target_format, DetectionTargetsFormat):
            logger.warning(
                "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
                "Support for DetectionTargetsFormat will be removed in 3.1"
            )

        self.data_dir = data_dir
        if not Path(data_dir).exists():
            raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

        # Number of images that are available (regardless of ignored images)
        # _setup_data_source is implemented by subclasses; it must be callable at this point,
        # so subclasses have to set their attributes before calling super().__init__.
        n_dataset_samples = self._setup_data_source()
        if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
            raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
        n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

        self.input_dim = ensure_is_tuple_of_two(input_dim)
        self.original_target_format = original_target_format

        if len(all_classes_list) != len(set(all_classes_list)):
            raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

        if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
            raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

        # Fall back to the subclass-provided class list when all_classes_list is empty/None.
        self.all_classes_list = all_classes_list or self._all_classes
        self.all_classes_list = list(self.all_classes_list) if self.all_classes_list is not None else None
        self.class_inclusion_list = list(class_inclusion_list) if class_inclusion_list is not None else None
        self.classes = self.class_inclusion_list or self.all_classes_list
        if len(set(self.classes) - set(self.all_classes_list)) > 0:
            # NOTE(review): this uses the local `all_classes_list` (possibly empty) instead of
            # `self.all_classes_list`; when classes come from self._all_classes the reported
            # `wrong_classes` set may be misleading — verify intent.
            wrong_classes = set(self.classes) - set(all_classes_list)
            raise DatasetValidationException(
                f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
            )

        self.ignore_empty_annotations = ignore_empty_annotations
        self.target_fields = target_fields or ["target"]
        if "target" not in self.target_fields:
            raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

        # Fields every subclass' _load_annotation must return (validated in _load_sample_annotation).
        self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

        self.transforms = transforms

        self.output_fields = output_fields or ["image", "target"]
        if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
            raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

        self._cache_annotations = cache_annotations
        self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

        # Maps (dataset index) -> (non-empty sample ids)
        self._non_empty_sample_ids: Optional[List[int]] = None

        # Some transform may require non-empty annotations to be indexed.
        transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

        # Iterate over the whole dataset to index the images with/without annotations.
        if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:
            if self._cache_annotations:
                logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
            elif self.ignore_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
                )
            elif transform_require_non_empty_annotations:
                logger.info(
                    "Dataset Initialization in progress. "
                    "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
                )

            # Map indexes to sample annotations.
            non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
            if self._cache_annotations:
                if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                    self._cached_annotations = non_empty_annotations
                else:
                    # Non overlapping dicts. since they map unique sample_ids -> sample
                    self._cached_annotations = {**non_empty_annotations, **empty_annotations}

            if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
                raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

            self._non_empty_sample_ids = list(non_empty_annotations.keys())

        self._n_samples = n_samples  # Regardless of any filtering

    @property
    def _all_classes(self):
        """Hook for subclasses to expose the full list of class names.

        This is an alternative to passing "all_classes_list" to __init__, useful when
        the class names are only known once the dataset itself has been loaded."""
        raise NotImplementedError

    def _setup_data_source(self) -> int:
        """Prepare the underlying data source and attach the relevant objects to self.

        :return: Total number of available samples (i.e. how many images exist, before any filtering is applied)."""
        raise NotImplementedError

    def _load_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load the annotation of a single sample.
        Note that the targets should be resized according to self.input_dim!

        :param sample_id:   Position of the sample in the UNFILTERED dataset. 0<=sample_id<=len(source)-1
        :return:            Annotation dict; may contain any field but must include at least those in self._required_annotation_fields.
        """
        raise NotImplementedError

    def _get_sample_annotations(self, index: int, ignore_empty_annotations: bool) -> Dict[str, Union[np.ndarray, Any]]:
        """Fetch the annotation of a given sample, going through the cache when it is enabled.
        :param index:                       Position of the sample AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    Whether empty annotations are being skipped.
        :return:                            Annotation dict of the requested image
        """
        # When filtering is active, `index` addresses the filtered view and must be translated
        # back to the underlying (unfiltered) sample id.
        if ignore_empty_annotations:
            sample_id = self._non_empty_sample_ids[index]
        else:
            sample_id = index

        if not self._cache_annotations:
            return self._load_sample_annotation(sample_id=sample_id)
        return self._cached_annotations[sample_id]

    def _load_sample_annotation(self, sample_id: int) -> Dict[str, Union[np.ndarray, Any]]:
        """Load one sample's annotation and apply class subclassing when configured.
        :param sample_id:   Position of the sample in the UNFILTERED dataset. 0<=sample_id<=len(source)-1
        """
        sample_annotations = self._load_annotation(sample_id=sample_id)

        # Guard against subclasses that forgot to return the mandatory fields.
        if not self._required_annotation_fields.issubset(set(sample_annotations.keys())):
            raise KeyError(
                f"_load_annotation is expected to return at least the fields {self._required_annotation_fields}, but got {set(sample_annotations.keys())}"
            )

        # Restrict targets to self.class_inclusion_list when one was provided.
        if self.class_inclusion_list is not None:
            sample_annotations = self._sub_class_annotation(annotation=sample_annotations)

        return sample_annotations

    def _load_all_annotations(self, n_samples: int) -> Tuple[Dict[int, Dict[str, Any]], Dict[int, Dict[str, Any]]]:
        """Read every annotation in one pass and split sample ids into "has targets" / "no targets".

        This full pass is usually required when `ignore_empty_annotations=True`: we must know
        upfront which samples are empty. Why not check lazily, on the fly? Because with DDP the
        dataset is split into chunks that must all cover a similar subset of indexes; deciding
        "empty or not" lazily would make the dataset/chunk sizes unpredictable, leaving some
        chunks smaller than others.

        :param n_samples:   Number of samples in the dataset (including samples without annotations).
        :return:            A tuple of two dicts, one for non-empty annotations and one for empty annotations
                                - non_empty_annotations: Dict mapping dataset index -> non-empty annotations
                                - empty_annotations:     Dict mapping dataset index -> empty annotations
                            (values are None when annotation caching is disabled)
        """
        invalid_bbox_count = 0
        with_targets: Dict[int, Dict[str, Any]] = {}
        without_targets: Dict[int, Dict[str, Any]] = {}

        for sample_id in tqdm(range(n_samples), desc="Indexing dataset annotations", disable=not self.verbose):
            annotation = self._load_sample_annotation(sample_id=sample_id)
            invalid_bbox_count += annotation.get("n_invalid_labels", 0)

            has_targets = any(len(annotation[field]) > 0 for field in self.target_fields)
            destination = with_targets if has_targets else without_targets
            # Only keep the annotation payload when caching is enabled; otherwise just record the id.
            destination[sample_id] = annotation if self._cache_annotations else None

        if not with_targets and not without_targets:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        if invalid_bbox_count > 0:
            logger.warning(f"Found {invalid_bbox_count} invalid bbox that were ignored. For more information, please set `show_all_warnings=True`.")

        return with_targets, without_targets

    def _sub_class_annotation(self, annotation: dict) -> Union[dict, None]:
        """Apply class subclassing to every field listed in self.target_fields (targets, crowd_targets, ...).

        :param annotation: Annotation dict of a specific image
        :return:           Subclassed annotation if non-empty after subclassing, otherwise None
        """
        cls_pos = _get_class_index_in_target(target_format=self.original_target_format)
        annotation.update({field: self._sub_class_target(targets=annotation[field], class_index=cls_pos) for field in self.target_fields})
        return annotation

    def _sub_class_target(self, targets: np.ndarray, class_index: int) -> np.ndarray:
        """Filter one image's targets down to self.class_inclusion_list and remap their class ids.

        :param targets:     Targets to subclass, shape [n_targets, 5], each row representing a bbox
        :param class_index: Position of the class id within a bbox row
                                ex: 0 if bbox of format label_xyxy | -1 if bbox of format xyxy_label
        :return:            Subclassed targets
        """
        kept_rows = []
        for row in targets:
            name = self.all_classes_list[int(row[class_index])]
            if name not in self.class_inclusion_list:
                continue
            # Remap: position in all_classes_list -> position in class_inclusion_list
            row[class_index] = self.class_inclusion_list.index(name)
            kept_rows.append(row)

        if not kept_rows:
            return np.zeros((0, 5), dtype=np.float32)
        return np.array(kept_rows)

    def _load_resized_img(self, image_path: str) -> np.ndarray:
        """Load an image and, if self.input_dim is set, rescale it to fit within it while keeping aspect ratio.
        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        """
        img = self._load_image(image_path=image_path)

        if self.input_dim is None:
            return img

        # Single scale factor so the image fits inside input_dim with its aspect ratio preserved.
        scale = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
        new_size = (int(img.shape[1] * scale), int(img.shape[0] * scale))  # (width, height), as cv2 expects
        return cv2.resize(src=img, dsize=new_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)

    def _load_image(self, image_path: str) -> np.ndarray:
        """Load an image from disk.
        :param image_path:  Full path of the image
        :return:            Image in BGR format, and channel last (HWC).
        :raises FileNotFoundError: If the image could not be read.
        """
        # Note: os.path.join with a single argument was a no-op and has been removed.
        img = cv2.imread(image_path)

        # cv2.imread returns None instead of raising when the file is missing or unreadable.
        if img is None:
            # Fixed message: previously rendered as "was no found ... dataset wasdownloaded"
            # (typo + missing space in the implicit string concatenation).
            raise FileNotFoundError(f"{image_path} was not found. Please make sure that the dataset was downloaded and that the path is correct")
        return img

    def __len__(self) -> int:
        """Length of the dataset, i.e. the number of samples AFTER filtering (if relevant)."""
        if self.ignore_empty_annotations:
            return len(self._non_empty_sample_ids)
        return self._n_samples

    def __getitem__(self, index: int) -> Tuple:
        """Get the sample post transforms at a specific index of the dataset.
        The output of this function will be collated to form batches.

        :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :return:        Tuple with the values of the fields listed in self.output_fields, in order
                        (at least "image" and "target").
        :raises KeyError: If any of self.output_fields is missing from the transformed sample.
        """
        sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
        sample = self.apply_transforms(sample)
        for field in self.output_fields:
            if field not in sample:
                # Fixed message: the implicit string concatenation previously rendered
                # "...was not found.Please check..." without a space.
                raise KeyError(f"The field {field} must be present in the sample but was not found. Please check the output fields of your transforms.")
        return tuple(sample[field] for field in self.output_fields)

    def get_random_item(self):
        """Return a random sample of the dataset, post transforms (same structure as __getitem__ output).

        Fix: the previous implementation did `self[self.get_random_sample(...)]`, i.e. it passed the
        sample *dict* returned by get_random_sample() as an index to __getitem__, which expects an int.
        We now draw a random valid index and index the dataset with it.
        """
        n_relevant_samples = len(self._non_empty_sample_ids) if self.ignore_empty_annotations else self._n_samples
        random_index = random.randint(0, n_relevant_samples - 1)
        return self[random_index]

    def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Get raw sample, before any transform (beside subclassing).
        :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
        :param ignore_empty_annotations:    If True, empty annotations will be ignored
        :return:                            Sample, i.e. a dictionary including at least "image" and "target"
        """
        annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)
        # Deep-copy the annotations so downstream transforms cannot mutate any cached annotation dict.
        sample = {"image": self._load_resized_img(image_path=annotations["img_path"])}
        sample.update(deepcopy(annotations))
        return sample

    def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
        """
        Applies self.transforms sequentially to sample

        If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in
         sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load
         only additional samples with objects in them.

        :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
        :return: Transformed sample
        """

        # Remember whether the input sample carried a crowd target, so the output dict keeps the same keys.
        has_crowd_target = "crowd_target" in sample
        # Convert the legacy dict representation into a DetectionSample object and drop invalid boxes.
        detection_sample = LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(sample).sanitize_sample()
        target_format_transform: Optional[DetectionTargetsFormatTransform] = None

        for transform in self.transforms:
            # Transforms such as mixup/mosaic declare `additional_samples_count`; load those extra
            # samples and attach them before applying the transform.
            detection_sample.additional_samples = [
                LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(s) for s in self._get_additional_inputs_for_transform(transform=transform)
            ]
            detection_sample = transform.apply_to_sample(sample=detection_sample)

            # Release the extra samples so they are not kept alive (or reused) by the next transform.
            detection_sample.additional_samples = None
            # Keep a handle on the LAST format transform; it is re-applied on the dict below
            # so the returned targets are in that transform's output format.
            if isinstance(transform, DetectionTargetsFormatTransform):
                target_format_transform = transform

        transformed_dict = LegacyDetectionTransformMixin.convert_detection_sample_to_dict(detection_sample, include_crowd_target=has_crowd_target)
        if target_format_transform is not None:
            transformed_dict = target_format_transform(sample=transformed_dict)
        return transformed_dict

    def _get_additional_inputs_for_transform(self, transform: AbstractDetectionTransform) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Load the additional random samples a transform requires (e.g. mixup/mosaic).

        :param transform:   Transform that may declare `additional_samples_count` (how many extra
                            samples to load) and `non_empty_annotations` (restrict to annotated samples).
        :return:            List of extra samples (empty when the transform needs none).
        """
        # Idiom: getattr with a default replaces the previous `x if hasattr(...) else ...` ternaries.
        additional_samples_count = getattr(transform, "additional_samples_count", 0)
        non_empty_annotations = getattr(transform, "non_empty_annotations", False)
        return self.get_random_samples(count=additional_samples_count, ignore_empty_annotations=non_empty_annotations)

    def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
        """Load random samples.

        :param count: The number of samples wanted
        :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
        :return: A list of samples satisfying input params
        """
        samples = []
        for _ in range(count):
            samples.append(self.get_random_sample(ignore_empty_annotations))
        return samples

    def get_random_sample(self, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
        """Load one sample drawn uniformly at random from the (possibly filtered) dataset.

        :param ignore_empty_annotations: If true, draw only from samples with at least 1 annotation
        :return: A randomly chosen sample
        """
        if ignore_empty_annotations:
            pool_size = len(self._non_empty_sample_ids)
        else:
            pool_size = self._n_samples
        random_index = random.randint(0, pool_size - 1)
        return self.get_sample(index=random_index, ignore_empty_annotations=ignore_empty_annotations)

    @property
    def output_target_format(self):
        """Target format produced by __getitem__: the `output_format` of the LAST
        DetectionTargetsFormatTransform in self.transforms, falling back to the
        dataset's original_target_format when no such transform is present."""
        result = self.original_target_format
        for candidate in self.transforms:
            if isinstance(candidate, DetectionTargetsFormatTransform):
                result = candidate.output_format
        return result

    @staticmethod
    def _standardize_image(image):
        """Min-max normalize `image` to [0, 1], then rescale to uint8 in [0, 255].
        The 1e-8 epsilon guards against division by zero on constant images."""
        low, high = image.min(), image.max()
        scaled = (image - low) / (high - low + 1e-8)
        return (scaled * 255).astype(np.uint8)

    def plot(
        self,
        max_samples_per_plot: int = 16,
        n_plots: int = 1,
        plot_transformed_data: bool = True,
        box_thickness: int = 2,
    ):
        """Combine samples of images with bbox into plots and display the result.

        :param max_samples_per_plot:    Maximum number of images to be displayed per plot
        :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
        :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                        If False, the plot will be over the raw samples (i.e. on get_sample)
        :param box_thickness:           Thickness of the drawn bounding-box lines (was previously accepted but ignored).
        :return:
        """
        input_format = self.output_target_format if plot_transformed_data else self.original_target_format
        if isinstance(input_format, DetectionTargetsFormat):
            raise ValueError(
                "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenateTransform instead."
            )

        # Grid side length so that max_samples_per_plot images fit in a square grid.
        n_subplot = int(np.ceil(max_samples_per_plot**0.5))

        for plot_i in range(n_plots):
            fig = plt.figure(figsize=(10, 10))

            # Plot `max_samples_per_plot` images.
            for img_i in range(max_samples_per_plot):
                # Fixed: index previously used a hardcoded 16 instead of max_samples_per_plot,
                # so plots overlapped/skipped samples when max_samples_per_plot != 16.
                index = img_i + plot_i * max_samples_per_plot

                # LOAD IMAGE/TARGETS
                if plot_transformed_data:
                    # Access to the image and the target AFTER self.transform
                    image, targets, *_ = self[index]
                else:
                    # Access to the image and the target BEFORE self.transform
                    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                    image, targets = sample["image"], sample["target"]

                # FORMAT TARGETS
                if image.shape[0] in (1, 3):  # (C, H, W) -> (H, W, C)
                    image = image.transpose((1, 2, 0))

                image = self._standardize_image(image)
                image = image.astype(np.uint8)
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Detection dataset works with BGR images, so we have to convert to RGB

                # Convert to XYXY_LABEL format
                targets_format_converter = ConcatenatedTensorFormatConverter(input_format=input_format, output_format=LABEL_XYXY, image_shape=image.shape)
                targets_label_xyxy = targets_format_converter(targets)

                image = DetectionVisualization.visualize_image(
                    image_np=image, class_names=self.classes, target_boxes=targets_label_xyxy, box_thickness=box_thickness, gt_alpha=1
                )

                # Fixed: the image was previously drawn twice (once channel-reversed back to BGR,
                # then overwritten by a second imshow); draw it once, in RGB, on its subplot.
                plt.subplot(n_subplot, n_subplot, img_i + 1)
                plt.imshow(image)
                plt.axis("off")

            fig.tight_layout()
            plt.show()
            plt.close()

    def get_dataset_preprocessing_params(self):
        """
        Return the hardcoded preprocessing params, adapted for PIL.Image (RGB) image reading.
        The image_processor is returned as a list of dicts to be resolved by the processing factory.
        :return:
        """
        processings = [Processings.ReverseImageChannels]
        if self.input_dim is not None:
            processings.append({Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}})
        for transform in self.transforms:
            processings += transform.get_equivalent_preprocessing()
        return {
            "class_names": self.classes,
            "image_processor": {Processings.ComposeProcessing: {"processings": processings}},
            "iou": 0.65,
            "conf": 0.5,
        }

    def get_sample_classes_information(self, index) -> np.ndarray:
        """Per-class object counts for the sample at `index` (length == len(self.classes))."""
        target = self._get_sample_annotations(index=index, ignore_empty_annotations=self.ignore_empty_annotations)["target"]
        n_classes = len(self.classes)
        if len(target) == 0:  # no objects in this sample
            return np.zeros(n_classes)

        # NOTE: could be sped-up with a property rather than computing per index.
        class_column = _get_class_index_in_target(target_format=self.original_target_format)
        sample_classes = target[:, class_column].astype(int)
        return np.bincount(sample_classes, minlength=n_classes)

    def get_dataset_classes_information(self) -> np.ndarray:
        """Stack per-sample class counts into an (n_samples, n_classes) matrix.

        :return: Array of shape (len(self), len(self.classes)) with per-sample class counts.
        """
        # np.vstack replaces np.row_stack: the latter is a deprecated alias removed in NumPy 2.x.
        return np.vstack([self.get_sample_classes_information(index=index) for index in range(len(self))])

__getitem__(index)

Get the sample post transforms at a specific index of the dataset. The output of this function will be collated to form batches.

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required

Returns:

Type Description
Tuple

Sample, i.e. a dictionary including at least "image" and "target"

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
367
368
369
370
371
372
373
374
375
376
377
378
379
def __getitem__(self, index: int) -> Tuple:
    """Get the sample post transforms at a specific index of the dataset.
    The output of this function will be collated to form batches.

    :param index:   Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :return:        Sample, i.e. a dictionary including at least "image" and "target"
    """
    sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
    sample = self.apply_transforms(sample)
    for field in self.output_fields:
        if field not in sample.keys():
            raise KeyError(f"The field {field} must be present in the sample but was not found." "Please check the output fields of your transforms.")
    return tuple(sample[field] for field in self.output_fields)

__init__(data_dir, original_target_format, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, target_fields=None, output_fields=None, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Detection dataset.

Parameters:

Name Type Description Default
data_dir str

Where the data is stored

required
input_dim Union[int, Tuple[int, int], None]

Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols). None means that the image will be loaded as is. Scalar (size) - Image will be resized to (size, size) Tuple (rows,cols) - Image will be resized to (rows, cols)

None
original_target_format Union[ConcatenatedTensorFormat, DetectionTargetsFormat]

Format of targets stored on disk. raw data format, the output format might differ based on transforms.

required
max_num_samples int

If not None, set the maximum size of the dataset by only indexing the first n annotations/images.

None
cache_annotations bool

Whether to cache annotations or not. This reduces training time by pre-loading all the annotations, but requires more RAM and more time to instantiate the dataset when working on very large datasets.

True
transforms List[AbstractDetectionTransform]

List of transforms to apply sequentially on sample.

[]
all_classes_list Optional[List[str]]

All the class names.

[]
class_inclusion_list Optional[List[str]]

If not None, define the subset of classes to be included as targets. Classes not in this list will be excluded from training. Thus, number of classes in model must be adjusted accordingly.

None
ignore_empty_annotations bool

If True and class_inclusion_list not None, images without any target will be ignored.

True
target_fields List[str]

List of the fields target fields. This has to include regular target, but can also include crowd target, segmentation target, ... It has to include at least "target" but can include other.

None
output_fields List[str]

Fields that will be output by __getitem__. It has to include at least "image" and "target" but can include other.

None
verbose bool

Whether to show additional information or not, such as loading progress. (doesn't include warnings)

True
show_all_warnings bool

Whether to show all warnings or not.

False
cache

Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8

None
cache_dir

Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8

None
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
@resolve_param("transforms", ListFactory(TransformsFactory()))
def __init__(
    self,
    data_dir: str,
    original_target_format: Union[ConcatenatedTensorFormat, DetectionTargetsFormat],
    max_num_samples: int = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: List[AbstractDetectionTransform] = [],
    all_classes_list: Optional[List[str]] = [],
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    target_fields: List[str] = None,
    output_fields: List[str] = None,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """Detection dataset.

    :param data_dir:                Where the data is stored
    :param input_dim:               Image size (when loaded, before transforms). Can be None, scalar or tuple (rows, cols).
                                    None means that the image will be loaded as is.
                                    Scalar (size) - Image will be resized to (size, size)
                                    Tuple (rows,cols) - Image will be resized to (rows, cols)
    :param original_target_format:  Format of targets stored on disk. raw data format, the output format might
                                    differ based on transforms.
    :param max_num_samples:         If not None, set the maximum size of the dataset by only indexing the first n annotations/images.
    :param cache_annotations:       Whether to cache annotations or not. This reduces training time by pre-loading all the annotations,
                                    but requires more RAM and more time to instantiate the dataset when working on very large datasets.
    :param transforms:              List of transforms to apply sequentially on sample.
    :param all_classes_list:        All the class names.
    :param class_inclusion_list:    If not None, define the subset of classes to be included as targets.
                                    Classes not in this list will excluded from training.
                                    Thus, number of classes in model must be adjusted accordingly.
    :param ignore_empty_annotations:        If True and class_inclusion_list not None, images without any target
                                            will be ignored.
    :param target_fields:                   List of the fields target fields. This has to include regular target,
                                            but can also include crowd target, segmentation target, ...
                                            It has to include at least "target" but can include other.
    :param output_fields:                   Fields that will be outputed by __getitem__.
                                            It has to include at least "image" and "target" but can include other.
    :param verbose:                 Whether to show additional information or not, such as loading progress. (doesnt include warnings)
    :param show_all_warnings:       Whether to show all warnings or not.
    :param cache:                   Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
    :param cache_dir:               Deprecated. This parameter is not used and setting it has no effect. It will be removed in 3.8
    """
    if cache is not None:
        warnings.warn(
            "cache parameter has been marked as deprecated and setting it has no effect. "
            "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
            DeprecationWarning,
        )
    if cache_dir is not None:
        warnings.warn(
            "cache_dir parameter has been marked as deprecated and setting it has no effect. "
            "It will be removed in SuperGradients 3.8. Please remove this parameter when instantiating a dataset instance",
            DeprecationWarning,
        )

    super().__init__()
    self.verbose = verbose
    self.show_all_warnings = show_all_warnings

    if isinstance(original_target_format, DetectionTargetsFormat):
        logger.warning(
            "Deprecation: original_target_format should be of type ConcatenatedTensorFormat instead of DetectionTargetsFormat."
            "Support for DetectionTargetsFormat will be removed in 3.1"
        )

    self.data_dir = data_dir
    if not Path(data_dir).exists():
        raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")

    # Number of images that are available (regardless of ignored images)
    n_dataset_samples = self._setup_data_source()
    if not isinstance(n_dataset_samples, int) or n_dataset_samples < 1:
        raise ValueError(f"_setup_data_source() should return the number of available samples but got {n_dataset_samples}")
    n_samples = n_dataset_samples if max_num_samples is None else min(n_dataset_samples, max_num_samples)

    self.input_dim = ensure_is_tuple_of_two(input_dim)
    self.original_target_format = original_target_format

    if len(all_classes_list) != len(set(all_classes_list)):
        raise DatasetValidationException(f"all_classes_list contains duplicate class names: {collections.Counter(all_classes_list)}")

    if class_inclusion_list is not None and len(class_inclusion_list) != len(set(class_inclusion_list)):
        raise DatasetValidationException(f"class_inclusion_list contains duplicate class names: {collections.Counter(class_inclusion_list)}")

    self.all_classes_list = all_classes_list or self._all_classes
    self.all_classes_list = list(self.all_classes_list) if self.all_classes_list is not None else None
    self.class_inclusion_list = list(class_inclusion_list) if class_inclusion_list is not None else None
    self.classes = self.class_inclusion_list or self.all_classes_list
    if len(set(self.classes) - set(self.all_classes_list)) > 0:
        wrong_classes = set(self.classes) - set(all_classes_list)
        raise DatasetValidationException(
            f"{wrong_classes} defined in `class_inclusion_list` were not found among `all_classes_list={self.all_classes_list}`"
        )

    self.ignore_empty_annotations = ignore_empty_annotations
    self.target_fields = target_fields or ["target"]
    if "target" not in self.target_fields:
        raise KeyError('"target" is expected to be in the fields to subclass but it was not included')

    self._required_annotation_fields = {"target", "img_path", "resized_img_shape"}

    self.transforms = transforms

    self.output_fields = output_fields or ["image", "target"]
    if len(self.output_fields) < 2 or self.output_fields[0] != "image" or self.output_fields[1] != "target":
        raise ValueError('output_fields must start with "image" and then "target", followed by any other field')

    self._cache_annotations = cache_annotations
    self._cached_annotations: Dict[int, Dict] = {}  # We use a dict and not a list because when `ignore_empty_annotations=True` we may ignore some indexes.

    # Maps (dataset index) -> (non-empty sample ids)
    self._non_empty_sample_ids: Optional[List[int]] = None

    # Some transform may require non-empty annotations to be indexed.
    transform_require_non_empty_annotations = any(getattr(transform, "non_empty_annotations", False) for transform in self.transforms)

    # Iterate over the whole dataset to index the images with/without annotations.
    if self._cache_annotations or self.ignore_empty_annotations or transform_require_non_empty_annotations:
        if self._cache_annotations:
            logger.info("Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.")
        elif self.ignore_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. `ignore_empty_annotations=True` causes the process to take longer due to full dataset indexing."
            )
        elif transform_require_non_empty_annotations:
            logger.info(
                "Dataset Initialization in progress. "
                "Having a transform with `non_empty_annotations=True` set causes the process to take longer due to the need for a full dataset indexing."
            )

        # Map indexes to sample annotations.
        non_empty_annotations, empty_annotations = self._load_all_annotations(n_samples=n_samples)
        if self._cache_annotations:
            if self.ignore_empty_annotations and transform_require_non_empty_annotations:
                self._cached_annotations = non_empty_annotations
            else:
                # Non overlapping dicts. since they map unique sample_ids -> sample
                self._cached_annotations = {**non_empty_annotations, **empty_annotations}

        if self.ignore_empty_annotations and len(non_empty_annotations) == 0:
            raise EmptyDatasetException(f"Out of {n_samples} images, not a single one was found with any of these classes: {self.class_inclusion_list}")

        self._non_empty_sample_ids = list(non_empty_annotations.keys())

    self._n_samples = n_samples  # Regardless of any filtering

__len__()

Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant).

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
363
364
365
def __len__(self) -> int:
    """Get the length of the dataset. Note that this is the number of samples AFTER filtering (if relevant)."""
    return len(self._non_empty_sample_ids) if self.ignore_empty_annotations else self._n_samples

apply_transforms(sample)

Applies self.transforms sequentially to sample

If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load only additional samples with objects in them.

Parameters:

Name Type Description Default
sample Dict[str, Union[np.ndarray, Any]]

Sample to apply the transforms on to (loaded with self.get_sample)

required

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Transformed sample

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def apply_transforms(self, sample: Dict[str, Union[np.ndarray, Any]]) -> Dict[str, Union[np.ndarray, Any]]:
    """
    Applies self.transforms sequentially to sample

    If a transforms has the attribute 'additional_samples_count', additional samples will be loaded and stored in
     sample["additional_samples"] prior to applying it. Combining with the attribute "non_empty_annotations" will load
     only additional samples with objects in them.

    :param sample: Sample to apply the transforms on to (loaded with self.get_sample)
    :return: Transformed sample
    """

    has_crowd_target = "crowd_target" in sample
    detection_sample = LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(sample).sanitize_sample()
    target_format_transform: Optional[DetectionTargetsFormatTransform] = None

    for transform in self.transforms:
        detection_sample.additional_samples = [
            LegacyDetectionTransformMixin.convert_input_dict_to_detection_sample(s) for s in self._get_additional_inputs_for_transform(transform=transform)
        ]
        detection_sample = transform.apply_to_sample(sample=detection_sample)

        detection_sample.additional_samples = None
        if isinstance(transform, DetectionTargetsFormatTransform):
            target_format_transform = transform

    transformed_dict = LegacyDetectionTransformMixin.convert_detection_sample_to_dict(detection_sample, include_crowd_target=has_crowd_target)
    if target_format_transform is not None:
        transformed_dict = target_format_transform(sample=transformed_dict)
    return transformed_dict

get_dataset_preprocessing_params()

Return the hardcoded preprocessing params, adapted for PIL.Image (RGB) image reading. The image_processor is returned as a list of dicts to be resolved by the processing factory.

Returns:

Type Description
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
def get_dataset_preprocessing_params(self):
    """
    Return any hardcoded preprocessing + adaptation for PIL.Image image reading (RGB).
     image_processor as returned as as list of dicts to be resolved by processing factory.
    :return:
    """
    pipeline = [Processings.ReverseImageChannels]
    if self.input_dim is not None:
        pipeline += [{Processings.DetectionLongestMaxSizeRescale: {"output_shape": self.input_dim}}]
    for t in self.transforms:
        pipeline += t.get_equivalent_preprocessing()
    params = dict(
        class_names=self.classes,
        image_processor={Processings.ComposeProcessing: {"processings": pipeline}},
        iou=0.65,
        conf=0.5,
    )
    return params

get_random_samples(count, ignore_empty_annotations=False)

Load random samples.

Parameters:

Name Type Description Default
count int

The number of samples wanted

required
ignore_empty_annotations bool

If true, only return samples with at least 1 annotation

False

Returns:

Type Description
List[Dict[str, Union[np.ndarray, Any]]]

A list of samples satisfying input params

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
431
432
433
434
435
436
437
438
def get_random_samples(self, count: int, ignore_empty_annotations: bool = False) -> List[Dict[str, Union[np.ndarray, Any]]]:
    """Load random samples.

    :param count: The number of samples wanted
    :param ignore_empty_annotations: If true, only return samples with at least 1 annotation
    :return: A list of samples satisfying input params
    """
    return [self.get_random_sample(ignore_empty_annotations) for _ in range(count)]

get_sample(index, ignore_empty_annotations=False)

Get raw sample, before any transform (beside subclassing).

Parameters:

Name Type Description Default
index int

Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1

required
ignore_empty_annotations bool

If True, empty annotations will be ignored

False

Returns:

Type Description
Dict[str, Union[np.ndarray, Any]]

Sample, i.e. a dictionary including at least "image" and "target"

Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
384
385
386
387
388
389
390
391
392
def get_sample(self, index: int, ignore_empty_annotations: bool = False) -> Dict[str, Union[np.ndarray, Any]]:
    """Get raw sample, before any transform (beside subclassing).
    :param index:                       Index refers to the index of the sample in the dataset, AFTER filtering (if relevant). 0<=index<=len(dataset)-1
    :param ignore_empty_annotations:    If True, empty annotations will be ignored
    :return:                            Sample, i.e. a dictionary including at least "image" and "target"
    """
    sample_annotations = self._get_sample_annotations(index=index, ignore_empty_annotations=ignore_empty_annotations)
    image = self._load_resized_img(image_path=sample_annotations["img_path"])
    return {"image": image, **deepcopy(sample_annotations)}

plot(max_samples_per_plot=16, n_plots=1, plot_transformed_data=True, box_thickness=2)

Combine samples of images with bbox into plots and display the result.

Parameters:

Name Type Description Default
max_samples_per_plot int

Maximum number of images to be displayed per plot

16
n_plots int

Number of plots to display (each plot being a combination of img with bbox)

1
plot_transformed_data bool

If True, the plot will be over samples after applying transforms (i.e. on getitem). If False, the plot will be over the raw samples (i.e. on get_sample)

True

Returns:

Type Description
Source code in src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
def plot(
    self,
    max_samples_per_plot: int = 16,
    n_plots: int = 1,
    plot_transformed_data: bool = True,
    box_thickness: int = 2,
):
    """Combine samples of images with bbox into plots and display the result.

    :param max_samples_per_plot:    Maximum number of images to be displayed per plot
    :param n_plots:                 Number of plots to display (each plot being a combination of img with bbox)
    :param plot_transformed_data:   If True, the plot will be over samples after applying transforms (i.e. on __getitem__).
                                    If False, the plot will be over the raw samples (i.e. on get_sample)
    :param box_thickness:           Thickness of the bounding boxes drawn on the images.
    :return:
    """
    input_format = self.output_target_format if plot_transformed_data else self.original_target_format
    if isinstance(input_format, DetectionTargetsFormat):
        raise ValueError(
            "Plot is not supported for DetectionTargetsFormat. Please set original_target_format to be an instance of ConcatenateTransform instead."
        )

    for plot_i in range(n_plots):
        fig = plt.figure(figsize=(10, 10))

        # Smallest square grid that can hold `max_samples_per_plot` images.
        n_subplot = int(np.ceil(max_samples_per_plot**0.5))

        # Plot `max_samples_per_plot` images.
        for img_i in range(max_samples_per_plot):
            # BUGFIX: the stride between consecutive plots must be `max_samples_per_plot`,
            # not a hard-coded 16 (the old code broke for max_samples_per_plot != 16).
            index = img_i + plot_i * max_samples_per_plot

            # LOAD IMAGE/TARGETS
            if plot_transformed_data:
                # Access to the image and the target AFTER self.transform
                image, targets, *_ = self[index]
            else:
                # Access to the image and the target BEFORE self.transform
                sample = self.get_sample(index=index, ignore_empty_annotations=self.ignore_empty_annotations)
                image, targets = sample["image"], sample["target"]

            # FORMAT TARGETS
            if image.shape[0] in (1, 3):  # (C, H, W) -> (H, W, C)
                image = image.transpose((1, 2, 0))

            image = self._standardize_image(image)
            image = image.astype(np.uint8)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Detection dataset works with BGR images, so we have to convert to RGB

            # Convert to XYXY_LABEL format
            # NOTE(review): `box_thickness` is currently not forwarded to the visualization call —
            # presumably it was meant for `DetectionVisualization.visualize_image`; confirm against its API.
            targets_format_converter = ConcatenatedTensorFormatConverter(input_format=input_format, output_format=LABEL_XYXY, image_shape=image.shape)
            targets_label_xyxy = targets_format_converter(targets)

            image = DetectionVisualization.visualize_image(image_np=image, class_names=self.classes, target_boxes=targets_label_xyxy, gt_alpha=1)

            # BUGFIX: draw the (already RGB) image exactly once. The previous implementation called
            # `imshow` twice on the same axes — first with channels reversed back to BGR, then again
            # with RGB — relying on the second call overdrawing the first.
            plt.subplot(n_subplot, n_subplot, img_i + 1)
            plt.imshow(image)
            plt.axis("off")

        fig.tight_layout()
        plt.show()
        plt.close()
    # Redundant `plot_counter` bookkeeping removed: the `range(n_plots)` loop already
    # terminates after exactly `n_plots` iterations.

PascalVOCDetectionDataset

Bases: PascalVOCFormatDetectionDataset

Dataset for Pascal VOC object detection

Parameters:
    data_dir (str): Base directory where the dataset is stored.
    images_dir (str, optional): Directory containing all the images, relative to `data_dir`. Defaults to None.
    labels_dir (str, optional): Directory containing all the labels, relative to `data_dir`. Defaults to None.
    images_sub_directory (str, optional): Deprecated. Subdirectory within data_dir that includes images. Defaults to None.
    download (bool, optional): If True, download the dataset to `data_dir`. Defaults to False.

Dataset structure:

./data/pascal_voc
├─images
│   ├─ train2012
│   ├─ val2012
│   ├─ VOCdevkit
│   │    ├─ VOC2007
│   │    │  ├──JPEGImages
│   │    │  ├──SegmentationClass
│   │    │  ├──ImageSets
│   │    │  ├──ImageSets/Segmentation
│   │    │  ├──ImageSets/Main
│   │    │  ├──ImageSets/Layout
│   │    │  ├──Annotations
│   │    │  └──SegmentationObject
│   │    └──VOC2012
│   │       ├──JPEGImages
│   │       ├──SegmentationClass
│   │       ├──ImageSets
│   │       ├──ImageSets/Segmentation
│   │       ├──ImageSets/Main
│   │       ├──ImageSets/Action
│   │       ├──ImageSets/Layout
│   │       ├──Annotations
│   │       └──SegmentationObject
│   ├─train2007
│   ├─test2007
│   └─val2007
└─labels
    ├─train2012
    ├─val2012
    ├─train2007
    ├─test2007
    └─val2007

Note: If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

Usage: voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc", images_dir="images/train2012/JPEGImages", labels_dir="labels/train2012/Annotations", download=True)

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
@register_dataset(Datasets.PASCAL_VOC_DETECTION_DATASET)
class PascalVOCDetectionDataset(PascalVOCFormatDetectionDataset):
    """Dataset for Pascal VOC object detection

        Parameters:
            data_dir (str): Base directory where the dataset is stored.
            images_dir (str, optional): Directory containing all the images, relative to `data_dir`. Defaults to None.
            labels_dir (str, optional): Directory containing all the labels, relative to `data_dir`. Defaults to None.
            images_sub_directory (str, optional): Deprecated. Subdirectory within data_dir that includes images. Defaults to None.
            download (bool, optional): If True, download the dataset to `data_dir`. Defaults to False.

        Dataset structure:

        ./data/pascal_voc
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Note:
        If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

    Usage:
        voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc",
                                            images_dir="images/train2012/JPEGImages",
                                            labels_dir="labels/train2012/Annotations",
                                            download=True)
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        images_sub_directory: Optional[str] = None,
        images_dir: Optional[str] = None,
        labels_dir: Optional[str] = None,
        download: bool = False,
        max_num_samples: Optional[int] = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: Optional[List[AbstractDetectionTransform]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """
        Initialize the Pascal VOC Detection Dataset.

        :param data_dir:                Base directory where the dataset is stored.
        :param images_sub_directory:    Deprecated. Use 'images_dir' and 'labels_dir' instead.
        :param images_dir:              Directory containing all the images, relative to `data_dir`.
        :param labels_dir:              Directory containing all the labels, relative to `data_dir`.
        :param download:                If True, download the dataset to `data_dir`.
        :raises ValueError:             If neither ('images_dir' and 'labels_dir') nor 'images_sub_directory' is provided.
        """
        # BUGFIX: `transforms` previously used a shared mutable default argument (`[]`),
        # which would be shared across all instances created without an explicit value.
        transforms = transforms if transforms is not None else []

        # Check for deprecated usage alongside new parameters.
        if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
            # BUGFIX: `logging.Logger.warning` does not take a warning category; passing
            # `DeprecationWarning` as a positional arg made it a lazy %-format argument and
            # produced a logging error at emit time (the message has no placeholders).
            logger.warning(
                "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
                "Prefer using 'images_dir' and 'labels_dir' for future compatibility."
            )

        elif images_sub_directory is not None:
            # Derive the labels directory from the deprecated images path.
            images_dir = images_sub_directory
            labels_dir = images_sub_directory.replace("images", "labels")
        elif images_dir is None or labels_dir is None:
            raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

        if download:
            self.download(data_dir)

        super().__init__(
            data_dir=data_dir,
            images_dir=images_dir,
            labels_dir=labels_dir,
            max_num_samples=max_num_samples,
            cache_annotations=cache_annotations,
            input_dim=input_dim,
            transforms=transforms,
            class_inclusion_list=class_inclusion_list,
            ignore_empty_annotations=ignore_empty_annotations,
            verbose=verbose,
            show_all_warnings=show_all_warnings,
            cache=cache,
            cache_dir=cache_dir,
            all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
        )

    @staticmethod
    def download(data_dir: str) -> None:
        """Download Pascal dataset in XYXY_LABEL format.

        Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

        :param data_dir: Destination directory; images go under `data_dir/images`, labels under `data_dir/labels`.
        """

        def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
            """Parse and save the labels of an image in XYXY_LABEL format."""

            with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
                xml_parser = ElementTree.parse(f).getroot()

            labels = []
            for obj in xml_parser.iter("object"):
                cls = obj.find("name").text
                # Skip classes outside the 2012 class list and objects flagged "difficult".
                if cls in PASCAL_VOC_2012_CLASSES_LIST and int(obj.find("difficult").text) != 1:
                    xml_box = obj.find("bndbox")

                    def get_coord(box_coord):
                        return xml_box.find(box_coord).text

                    xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                    labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

            with open(new_label_path, "w") as f:
                f.write("\n".join(labels))

        urls = [
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
            "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
        ]  # 1.86G, 17125 images
        data_dir = Path(data_dir)
        download_and_untar_from_url(urls, dir=data_dir / "images")

        # Convert the VOCdevkit layout into flat per-split image/label directories.
        data_path = data_dir / "images" / "VOCdevkit"
        for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
            dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
            dest_imgs_path.mkdir(exist_ok=True, parents=True)

            dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
            dest_labels_path.mkdir(exist_ok=True, parents=True)

            with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
                image_ids = f.read().strip().split()

            # Renamed loop variable from `id` to avoid shadowing the builtin.
            for image_id in tqdm(image_ids, desc=f"{image_set}{year}"):
                img_path = data_path / f"VOC{year}/JPEGImages/{image_id}.jpg"
                new_img_path = dest_imgs_path / img_path.name
                new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
                img_path.rename(new_img_path)  # Move image to dest folder
                _parse_and_save_labels(data_path, new_label_path, year, image_id)

__init__(data_dir, images_sub_directory=None, images_dir=None, labels_dir=None, download=False, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], class_inclusion_list=None, ignore_empty_annotations=True, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Initialize the Pascal VOC Detection Dataset.

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@deprecated_parameter(
    "images_sub_directory",
    deprecated_since="3.7.0",
    removed_from="3.8.0",
    reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
)
def __init__(
    self,
    data_dir: str,
    images_sub_directory: Optional[str] = None,
    images_dir: Optional[str] = None,
    labels_dir: Optional[str] = None,
    download: bool = False,
    max_num_samples: Optional[int] = None,
    cache_annotations: bool = True,
    input_dim: Union[int, Tuple[int, int], None] = None,
    transforms: Optional[List[AbstractDetectionTransform]] = None,
    class_inclusion_list: Optional[List[str]] = None,
    ignore_empty_annotations: bool = True,
    verbose: bool = True,
    show_all_warnings: bool = False,
    cache=None,
    cache_dir=None,
):
    """
    Initialize the Pascal VOC Detection Dataset.

    :param data_dir:                Base directory where the dataset is stored.
    :param images_sub_directory:    Deprecated. Use 'images_dir' and 'labels_dir' instead.
    :param images_dir:              Directory containing all the images, relative to `data_dir`.
    :param labels_dir:              Directory containing all the labels, relative to `data_dir`.
    :param download:                If True, download the dataset to `data_dir`.
    :raises ValueError:             If neither ('images_dir' and 'labels_dir') nor 'images_sub_directory' is provided.
    """
    # BUGFIX: `transforms` previously used a shared mutable default argument (`[]`).
    transforms = transforms if transforms is not None else []

    # Check for deprecated usage alongside new parameters.
    if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
        # BUGFIX: `logging.Logger.warning` does not take a warning category; the old
        # `DeprecationWarning` positional arg was treated as a %-format argument and
        # caused a logging error at emit time.
        logger.warning(
            "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
            "Prefer using 'images_dir' and 'labels_dir' for future compatibility."
        )

    elif images_sub_directory is not None:
        # Derive the labels directory from the deprecated images path.
        images_dir = images_sub_directory
        labels_dir = images_sub_directory.replace("images", "labels")
    elif images_dir is None or labels_dir is None:
        raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

    if download:
        self.download(data_dir)

    super().__init__(
        data_dir=data_dir,
        images_dir=images_dir,
        labels_dir=labels_dir,
        max_num_samples=max_num_samples,
        cache_annotations=cache_annotations,
        input_dim=input_dim,
        transforms=transforms,
        class_inclusion_list=class_inclusion_list,
        ignore_empty_annotations=ignore_empty_annotations,
        verbose=verbose,
        show_all_warnings=show_all_warnings,
        cache=cache,
        cache_dir=cache_dir,
        all_classes_list=PASCAL_VOC_2012_CLASSES_LIST,
    )

download(data_dir) staticmethod

Download Pascal dataset in XYXY_LABEL format.

Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
@staticmethod
def download(data_dir: str) -> None:
    """Download Pascal dataset in XYXY_LABEL format.

    Data extracted from http://host.robots.ox.ac.uk/pascal/VOC/

    :param data_dir: Destination directory; images go under `data_dir/images`, labels under `data_dir/labels`.
    """

    def _parse_and_save_labels(path: str, new_label_path: str, year: str, image_id: str) -> None:
        """Parse and save the labels of an image in XYXY_LABEL format."""

        with open(f"{path}/VOC{year}/Annotations/{image_id}.xml") as f:
            xml_parser = ElementTree.parse(f).getroot()

        labels = []
        for obj in xml_parser.iter("object"):
            cls = obj.find("name").text
            # Skip classes outside the 2012 class list and objects flagged "difficult".
            if cls in PASCAL_VOC_2012_CLASSES_LIST and int(obj.find("difficult").text) != 1:
                xml_box = obj.find("bndbox")

                def get_coord(box_coord):
                    return xml_box.find(box_coord).text

                xmin, ymin, xmax, ymax = get_coord("xmin"), get_coord("ymin"), get_coord("xmax"), get_coord("ymax")
                labels.append(" ".join([xmin, ymin, xmax, ymax, str(PASCAL_VOC_2012_CLASSES_LIST.index(cls))]))

        with open(new_label_path, "w") as f:
            f.write("\n".join(labels))

    urls = [
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar",  # 439M 5011 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar",  # 430M, 4952 images
        "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
    ]  # 1.86G, 17125 images
    data_dir = Path(data_dir)
    download_and_untar_from_url(urls, dir=data_dir / "images")

    # Convert the VOCdevkit layout into flat per-split image/label directories.
    data_path = data_dir / "images" / "VOCdevkit"
    for year, image_set in ("2012", "train"), ("2012", "val"), ("2007", "train"), ("2007", "val"), ("2007", "test"):
        dest_imgs_path = data_dir / "images" / f"{image_set}{year}"
        dest_imgs_path.mkdir(exist_ok=True, parents=True)

        dest_labels_path = data_dir / "labels" / f"{image_set}{year}"
        dest_labels_path.mkdir(exist_ok=True, parents=True)

        with open(data_path / f"VOC{year}/ImageSets/Main/{image_set}.txt") as f:
            image_ids = f.read().strip().split()

        # Renamed loop variable from `id` to avoid shadowing the builtin.
        for image_id in tqdm(image_ids, desc=f"{image_set}{year}"):
            img_path = data_path / f"VOC{year}/JPEGImages/{image_id}.jpg"
            new_img_path = dest_imgs_path / img_path.name
            new_label_path = (dest_labels_path / img_path.name).with_suffix(".txt")
            img_path.rename(new_img_path)  # Move image to dest folder
            _parse_and_save_labels(data_path, new_label_path, year, image_id)

PascalVOCUnifiedDetectionTrainDataset

Bases: ConcatDataset

Unified Dataset for Pascal VOC object detection.

Unified Dataset class for training on Pascal VOC object detection datasets.

This class combines datasets from multiple years (e.g., 2007, 2012) into a single dataset for training purposes.

Parameters: data_dir (str): Base directory where the dataset is stored. input_dim (tuple): Input dimension that the images should be resized to. cache (optional): Cache configuration. cache_dir (optional): Directory for cache. transforms (List[AbstractDetectionTransform], optional): List of transforms to apply. class_inclusion_list (Optional[List[str]], optional): List of classes to include. max_num_samples (int, optional): Maximum number of samples to include from each dataset part. download (bool, optional): If True, downloads the dataset parts to data_dir. Defaults to False. images_dir (Optional[str], optional): Directory containing all the images, relative to data_dir. Should only be used without 'images_sub_directory'. labels_dir (Optional[str], optional): Directory containing all the labels, relative to data_dir. Should only be used without 'images_sub_directory'. images_sub_directory (Optional[str], optional): Deprecated. Use 'images_dir' and 'labels_dir' instead for future compatibility.

Example Dataset structure:

    ./data/pascal_voc/
    ├─images
    │   ├─ train2012
    │   ├─ val2012
    │   ├─ VOCdevkit
    │   │    ├─ VOC2007
    │   │    │  ├──JPEGImages
    │   │    │  ├──SegmentationClass
    │   │    │  ├──ImageSets
    │   │    │  ├──ImageSets/Segmentation
    │   │    │  ├──ImageSets/Main
    │   │    │  ├──ImageSets/Layout
    │   │    │  ├──Annotations
    │   │    │  └──SegmentationObject
    │   │    └──VOC2012
    │   │       ├──JPEGImages
    │   │       ├──SegmentationClass
    │   │       ├──ImageSets
    │   │       ├──ImageSets/Segmentation
    │   │       ├──ImageSets/Main
    │   │       ├──ImageSets/Action
    │   │       ├──ImageSets/Layout
    │   │       ├──Annotations
    │   │       └──SegmentationObject
    │   ├─train2007
    │   ├─test2007
    │   └─val2007
    └─labels
        ├─train2012
        ├─val2012
        ├─train2007
        ├─test2007
        └─val2007
    Usage:
unified_dataset = PascalVOCUnifiedDetectionTrainDataset(data_dir="./data/pascal_voc",
                                                        input_dim=(512, 512),
                                                        download=True,
                                                        images_dir="images",
                                                        labels_dir="labels")
Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_detection.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
class PascalVOCUnifiedDetectionTrainDataset(ConcatDataset):
    """Unified Dataset for Pascal VOC object detection.

    Unified Dataset class for training on Pascal VOC object detection datasets.

    This class combines datasets from multiple years (e.g., 2007, 2012) into a single dataset for training purposes.

    Parameters:
        data_dir (str): Base directory where the dataset is stored.
        input_dim (tuple): Input dimension that the images should be resized to.
        cache (optional): Cache configuration.
        cache_dir (optional): Directory for cache.
        transforms (List[AbstractDetectionTransform], optional): List of transforms to apply.
        class_inclusion_list (Optional[List[str]], optional): List of classes to include.
        max_num_samples (int, optional): Maximum number of samples to include from each dataset part.
        download (bool, optional): If True, downloads the dataset parts to `data_dir`. Defaults to False.
        images_dir (Optional[str], optional): Directory containing all the images, relative to `data_dir`. Should only be used without 'images_sub_directory'.
        labels_dir (Optional[str], optional): Directory containing all the labels, relative to `data_dir`. Should only be used without 'images_sub_directory'.
        images_sub_directory (Optional[str], optional): Deprecated. Use 'images_dir' and 'labels_dir' instead for future compatibility.

    Example Dataset structure:

        ./data/pascal_voc/
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Usage:
        unified_dataset = PascalVOCUnifiedDetectionTrainDataset(data_dir="./data/pascal_voc",
                                                                input_dim=(512, 512),
                                                                download=True,
                                                                images_dir="images",
                                                                labels_dir="labels")
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility. Please use " "'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        input_dim: tuple,
        cache=None,
        cache_dir=None,
        transforms: Optional[List[AbstractDetectionTransform]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        max_num_samples: Optional[int] = None,
        download: bool = False,
        images_dir: Optional[str] = None,
        labels_dir: Optional[str] = None,
        images_sub_directory: Optional[str] = None,  # Marked for deprecation.
    ):
        # BUGFIX: `transforms` previously used a shared mutable default argument (`[]`).
        transforms = transforms if transforms is not None else []

        if images_sub_directory is not None and (images_dir is not None or labels_dir is not None):
            # BUGFIX: removed the `DeprecationWarning` positional argument —
            # `logging.Logger.warning` treats extra positionals as %-format args,
            # which caused a logging error at emit time.
            logger.warning(
                "Both 'images_sub_directory' (deprecated) and 'images_dir'/'labels_dir' are provided. "
                "Prefer using 'images_dir' and 'labels_dir' for future compatibility."
            )
        if download:
            PascalVOCDetectionDataset.download(data_dir=data_dir)

        train_dataset_names = ["train2007", "val2007", "train2012", "val2012"]
        # Split the global sample budget (if any) as evenly as possible across the splits.
        if max_num_samples:
            max_num_samples_per_train_dataset = [len(segment) for segment in np.array_split(range(max_num_samples), len(train_dataset_names))]
        else:
            max_num_samples_per_train_dataset = [None] * len(train_dataset_names)

        train_sets = []
        for i, trainset_name in enumerate(train_dataset_names):
            dataset_kwargs = {
                "data_dir": data_dir,
                "input_dim": input_dim,
                "cache": cache,
                "cache_dir": cache_dir,
                "transforms": transforms,
                "class_inclusion_list": class_inclusion_list,
                "max_num_samples": max_num_samples_per_train_dataset[i],
            }
            if images_dir is not None and labels_dir is not None:
                dataset_kwargs["images_dir"] = os.path.join(images_dir, trainset_name)
                dataset_kwargs["labels_dir"] = os.path.join(labels_dir, trainset_name)
            elif images_sub_directory is not None:
                deprecated_images_path = os.path.join("images", trainset_name)
                deprecated_labels_path = os.path.join("labels", trainset_name)
                dataset_kwargs["images_dir"] = deprecated_images_path
                dataset_kwargs["labels_dir"] = deprecated_labels_path
            else:
                raise ValueError("You must provide either 'images_dir' and 'labels_dir', or the deprecated 'images_sub_directory'.")

            train_sets.append(PascalVOCDetectionDataset(**dataset_kwargs))

        # BUGFIX: `super().__init__(train_sets)` was previously indented inside the loop above,
        # re-initializing the ConcatDataset (and recomputing its cumulative sizes) on every
        # iteration. Initialize it exactly once, with the complete list of per-split datasets.
        super().__init__(train_sets)

PascalVOCFormatDetectionDataset

Bases: DetectionDataset

Dataset for Pascal VOC object detection

Parameters: data_dir (str): Base directory where the dataset is stored.

images_dir (Optional[str]): Directory containing all the images, relative to `data_dir`. Defaults to None.

labels_dir (Optional[str]): Directory containing all the labels, relative to `data_dir`. Defaults to None.

max_num_samples (Optional[int]): If not None, sets the maximum size of the dataset by only indexing the first
 n annotations/images. Defaults to None.

cache_annotations (bool): Whether to cache annotations. Reduces training time by pre-loading all annotations
 but requires more RAM. Defaults to True.

input_dim (Optional[Union[int, Tuple[int, int]]]): Image size when loaded, before transforms. Can be None, a scalar,
 or a tuple (height, width). Defaults to None.

transforms (List[AbstractDetectionTransform]): List of transforms to apply sequentially on each sample.
 Defaults to an empty list.

all_classes_list (Optional[List[str]]): All class names in the dataset. Defaults to an empty list.

class_inclusion_list (Optional[List[str]]): Subset of classes to include. Classes not in this list will be excluded.
 Adjust the number of model classes accordingly. Defaults to None.

ignore_empty_annotations (bool): If True and class_inclusion_list is not None, images without any target will be
 ignored. Defaults to True.

verbose (bool): If True, displays additional information (does not include warnings). Defaults to True.

show_all_warnings (bool): If True, displays all warnings. Defaults to False.

cache (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a
 future version.

cache_dir (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in
 a future version.



Dataset structure:

./data/pascal_voc
├─images
│   ├─ train2012
│   ├─ val2012
│   ├─ VOCdevkit
│   │    ├─ VOC2007
│   │    │  ├──JPEGImages
│   │    │  ├──SegmentationClass
│   │    │  ├──ImageSets
│   │    │  ├──ImageSets/Segmentation
│   │    │  ├──ImageSets/Main
│   │    │  ├──ImageSets/Layout
│   │    │  ├──Annotations
│   │    │  └──SegmentationObject
│   │    └──VOC2012
│   │       ├──JPEGImages
│   │       ├──SegmentationClass
│   │       ├──ImageSets
│   │       ├──ImageSets/Segmentation
│   │       ├──ImageSets/Main
│   │       ├──ImageSets/Action
│   │       ├──ImageSets/Layout
│   │       ├──Annotations
│   │       └──SegmentationObject
│   ├─train2007
│   ├─test2007
│   └─val2007
└─labels
    ├─train2012
    ├─val2012
    ├─train2007
    ├─test2007
    └─val2007

Note: If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

Usage: voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc", images_dir="images/train2012/JPEGImages", labels_dir="labels/train2012/Annotations", download=True)

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_format_detection.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
@register_dataset("PascalVOCFormatDetectionDataset")
class PascalVOCFormatDetectionDataset(DetectionDataset):
    """Dataset for Pascal VOC object detection

    Parameters:
        data_dir (str): Base directory where the dataset is stored.

        images_dir (Optional[str]): Directory containing all the images, relative to `data_dir`. Defaults to None.

        labels_dir (Optional[str]): Directory containing all the labels, relative to `data_dir`. Defaults to None.

        max_num_samples (Optional[int]): If not None, sets the maximum size of the dataset by only indexing the first
         n annotations/images. Defaults to None.

        cache_annotations (bool): Whether to cache annotations. Reduces training time by pre-loading all annotations
         but requires more RAM. Defaults to True.

        input_dim (Optional[Union[int, Tuple[int, int]]]): Image size when loaded, before transforms. Can be None, a scalar,
         or a tuple (height, width). Defaults to None.

        transforms (Optional[List[AbstractDetectionTransform]]): List of transforms to apply sequentially on each sample.
         Defaults to None, which is treated as an empty list.

        all_classes_list (Optional[List[str]]): All class names in the dataset. Defaults to None, which is treated as an
         empty list.

        class_inclusion_list (Optional[List[str]]): Subset of classes to include. Classes not in this list will be excluded.
         Adjust the number of model classes accordingly. Defaults to None.

        ignore_empty_annotations (bool): If True and class_inclusion_list is not None, images without any target will be
         ignored. Defaults to True.

        verbose (bool): If True, displays additional information (does not include warnings). Defaults to True.

        show_all_warnings (bool): If True, displays all warnings. Defaults to False.

        cache (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in a
         future version.

        cache_dir (Optional): Deprecated. This parameter is not used and setting it has no effect. Will be removed in
         a future version.



        Dataset structure:

        ./data/pascal_voc
        ├─images
        │   ├─ train2012
        │   ├─ val2012
        │   ├─ VOCdevkit
        │   │    ├─ VOC2007
        │   │    │  ├──JPEGImages
        │   │    │  ├──SegmentationClass
        │   │    │  ├──ImageSets
        │   │    │  ├──ImageSets/Segmentation
        │   │    │  ├──ImageSets/Main
        │   │    │  ├──ImageSets/Layout
        │   │    │  ├──Annotations
        │   │    │  └──SegmentationObject
        │   │    └──VOC2012
        │   │       ├──JPEGImages
        │   │       ├──SegmentationClass
        │   │       ├──ImageSets
        │   │       ├──ImageSets/Segmentation
        │   │       ├──ImageSets/Main
        │   │       ├──ImageSets/Action
        │   │       ├──ImageSets/Layout
        │   │       ├──Annotations
        │   │       └──SegmentationObject
        │   ├─train2007
        │   ├─test2007
        │   └─val2007
        └─labels
            ├─train2012
            ├─val2012
            ├─train2007
            ├─test2007
            └─val2007

    Note:
        If both 'images_sub_directory' and ('images_dir', 'labels_dir') are provided, a warning will be raised.

    Usage (via the `PascalVOCDetectionDataset` subclass, which adds the `download` parameter):
        voc_2012_train = PascalVOCDetectionDataset(data_dir="./data/pascal_voc",
                                            images_dir="images/train2012/JPEGImages",
                                            labels_dir="labels/train2012/Annotations",
                                            download=True)
    """

    @deprecated_parameter(
        "images_sub_directory",
        deprecated_since="3.7.0",
        removed_from="3.8.0",
        reason="Support of `images_sub_directory` is removed since it allows less flexibility." " Please use 'images_dir' and 'labels_dir' instead.",
    )
    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        labels_dir: str,
        max_num_samples: Optional[int] = None,
        cache_annotations: bool = True,
        input_dim: Union[int, Tuple[int, int], None] = None,
        transforms: Optional[List[AbstractDetectionTransform]] = None,
        all_classes_list: Optional[List[str]] = None,
        class_inclusion_list: Optional[List[str]] = None,
        ignore_empty_annotations: bool = True,
        verbose: bool = True,
        show_all_warnings: bool = False,
        cache=None,
        cache_dir=None,
    ):
        """
        Initialize the Pascal VOC Detection Dataset.

        See the class docstring for the description of every parameter.
        """

        self.data_dir = data_dir

        self.images_dir = os.path.join(data_dir, images_dir)
        self.labels_dir = os.path.join(data_dir, labels_dir)

        # `None` defaults avoid the shared-mutable-default-argument pitfall; the base class
        # still receives concrete lists, so downstream behavior is unchanged.
        super(PascalVOCFormatDetectionDataset, self).__init__(
            data_dir=data_dir,
            original_target_format=XYXY_LABEL,
            max_num_samples=max_num_samples,
            cache_annotations=cache_annotations,
            input_dim=input_dim,
            transforms=transforms if transforms is not None else [],
            all_classes_list=all_classes_list if all_classes_list is not None else [],
            class_inclusion_list=class_inclusion_list,
            ignore_empty_annotations=ignore_empty_annotations,
            verbose=verbose,
            show_all_warnings=show_all_warnings,
            cache=cache,
            cache_dir=cache_dir,
        )

    def _setup_data_source(self) -> int:
        """Initialize img_and_target_path_list and warn if label file is missing

        :return: Number of (img_path, target_path) pairs found.
        :raises FileNotFoundError: If the image directory is missing, contains no .jpg files,
                                   or no image has a matching label file.
        """
        if not Path(self.images_dir).exists():
            raise FileNotFoundError(f"{self.images_dir} not found.")

        img_files = list(sorted(glob.glob(os.path.join(self.images_dir, "*.jpg"))))
        if len(img_files) == 0:
            raise FileNotFoundError(f"No image files found in {self.images_dir}")

        # Swap the extension rather than str.replace(".jpg", ".txt"), which would corrupt
        # basenames containing ".jpg" in the middle (e.g. "a.jpg_crop.jpg").
        target_files = [os.path.join(self.labels_dir, os.path.splitext(os.path.basename(img_file))[0] + ".txt") for img_file in img_files]

        img_and_target_path_list = [(img_file, target_file) for img_file, target_file in zip(img_files, target_files) if os.path.exists(target_file)]
        if len(img_and_target_path_list) == 0:
            raise FileNotFoundError("No target files associated with the images were found")

        num_missing_files = len(img_files) - len(img_and_target_path_list)
        if num_missing_files > 0:
            logger.warning(f"{num_missing_files} label files were not loaded out of {len(img_files)} image files")

        self.img_and_target_path_list = img_and_target_path_list
        return len(self.img_and_target_path_list)

    def _load_annotation(self, sample_id: int) -> dict:
        """Load annotations for a given sample.

        :param sample_id: Index into `self.img_and_target_path_list`.
        :return: Annotation including:
                    - target in XYXY_LABEL format, rescaled to fit `input_dim` when it is set
                    - img_path
                    - resized_img_shape as (height, width)
        """
        img_path, target_path = self.img_and_target_path_list[sample_id]
        with open(target_path, "r") as file:
            target = np.array([x.split() for x in file.read().splitlines()], dtype=np.float32)
        if target.size == 0:
            # An empty label file yields a 1-D empty array; keep a 2-D (0, 5) shape so that
            # the slicing below and downstream consumers don't break on empty annotations.
            target = target.reshape(0, 5)

        height, width = get_image_size_from_path(img_path)
        if self.input_dim is not None:
            # input_dim is documented as (height, width): scale so the image fits inside it
            # while preserving aspect ratio (previous code swapped the indices, which was
            # only correct for square input_dim).
            r = min(self.input_dim[0] / height, self.input_dim[1] / width)
            target[:, :4] *= r
            resized_img_shape = (int(height * r), int(width * r))
        else:
            # input_dim defaults to None; in that case keep the native image size.
            resized_img_shape = (height, width)

        return {"img_path": img_path, "target": target, "resized_img_shape": resized_img_shape}

__init__(data_dir, images_dir, labels_dir, max_num_samples=None, cache_annotations=True, input_dim=None, transforms=[], all_classes_list=[], class_inclusion_list=None, ignore_empty_annotations=True, verbose=True, show_all_warnings=False, cache=None, cache_dir=None)

Initialize the Pascal VOC Detection Dataset.

Source code in src/super_gradients/training/datasets/detection_datasets/pascal_voc_format_detection.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151