Environment
pop_arg(arg_name, default_value=None)
Get the specified args and remove them from argv
Source code in latest/src/super_gradients/common/environment/argparse_utils.py (lines 12-23)
pop_local_rank()
Pop the Python arg "local-rank". If it exists, inform the user with a log message; otherwise return -1.
Source code in latest/src/super_gradients/common/environment/argparse_utils.py (lines 26-31)
add_params_to_cfg(cfg, params)
Add parameters to an existing config
Parameters:
Name | Type | Description | Default |
---|---|---|---|
cfg | DictConfig | OmegaConf config | required |
params | List[str] | List of parameters to add, in dotlist format (e.g. ["training_hyperparams.resume=True"]) | required |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 89-95)
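A minimal sketch of the dotlist usage described above (the config content here is hypothetical):
```python
from omegaconf import OmegaConf
from super_gradients.common.environment.cfg_utils import add_params_to_cfg

# Hypothetical config; any DictConfig works.
cfg = OmegaConf.create({"training_hyperparams": {"resume": False}})
add_params_to_cfg(cfg, params=["training_hyperparams.resume=True"])
assert cfg.training_hyperparams.resume is True
```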
load_arch_params(config_name, recipes_dir_path=None, overrides=None)
Load a single arch_params file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config_name | str | Name of the yaml to load (e.g. "resnet18_cifar_arch_params") | required |
recipes_dir_path | Optional[str] | Optional. Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include an "arch_params" folder, which itself should include the config file named after config_name. | None |
overrides | Optional[list] | List of hydra overrides for the config file | None |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 130-138)
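A minimal sketch, assuming the default recipes_dir_path resolves to the library's bundled recipes:
```python
from super_gradients.common.environment.cfg_utils import load_arch_params

arch_params = load_arch_params(config_name="resnet18_cifar_arch_params")
```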
load_dataset_params(config_name, recipes_dir_path=None, overrides=None)
Load a single dataset_params file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config_name | str | Name of the yaml to load (e.g. "cifar10_dataset_params") | required |
recipes_dir_path | Optional[str] | Optional. Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include a "dataset_params" folder, which itself should include the config file named after config_name. | None |
overrides | Optional[list] | List of hydra overrides for the config file | None |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 152-160)
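Usage mirrors load_arch_params above:
```python
from super_gradients.common.environment.cfg_utils import load_dataset_params

dataset_params = load_dataset_params(config_name="cifar10_dataset_params")
```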
load_experiment_cfg(experiment_name, ckpt_root_dir=None, run_id=None)
Load the hydra config associated to a specific experiment.
Background information: every time an experiment is launched from a recipe, all the hydra config params are stored in a hidden ".hydra" folder. This hidden folder is used here to recreate the exact same config as the one that was used to launch the experiment (including hydra overrides).
The motivation is to be able to resume or evaluate an experiment with the exact same config as the one used when the experiment was initially started, regardless of any change that might have since been introduced to the recipe, and while using the same overrides that were used for that experiment.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
experiment_name | str | Name of the experiment to resume | required |
ckpt_root_dir | Optional[str] | Directory including the checkpoints | None |
run_id | Optional[str] | Optional. Run id of the experiment. If None, the most recent run will be loaded. | None |
Returns:
Type | Description |
---|---|
DictConfig | The config that was used for that experiment |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 55-86)
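A minimal sketch; "my_experiment" is a hypothetical experiment name:
```python
from super_gradients.common.environment.cfg_utils import load_experiment_cfg

# With run_id=None, the most recent run of the experiment is used.
cfg = load_experiment_cfg(experiment_name="my_experiment")
```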
load_recipe(config_name, recipes_dir_path=None, overrides=None)
Load a single file from the recipe directory.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config_name | str | Name of the yaml to load (e.g. "cifar10_resnet") | required |
recipes_dir_path | Optional[str] | Optional. Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include a folder corresponding to the subconfig, which itself should include the config file named after config_name. | None |
overrides | Optional[list] | List of hydra overrides for the config file | None |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 34-52)
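A minimal sketch; the override shown is hypothetical, and any valid hydra override works:
```python
from super_gradients.common.environment.cfg_utils import load_recipe

cfg = load_recipe(config_name="cifar10_resnet", overrides=["training_hyperparams.max_epochs=1"])
```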
load_recipe_from_subconfig(config_name, config_type, recipes_dir_path=None, overrides=None)
Load a single file (e.g. "resnet18_cifar_arch_params") stored in a subconfig (e.g. "arch_params") of the recipe directory.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config_name | str | Name of the yaml to load (e.g. "resnet18_cifar_arch_params") | required |
config_type | str | Type of the subconfig (e.g. "arch_params") | required |
recipes_dir_path | Optional[str] | Optional. Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include a folder corresponding to the subconfig, which itself should include the config file named after config_name. | None |
overrides | Optional[list] | List of hydra overrides for the config file | None |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 98-127)
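A minimal sketch of the subconfig variant:
```python
from super_gradients.common.environment.cfg_utils import load_recipe_from_subconfig

arch_params = load_recipe_from_subconfig(
    config_name="resnet18_cifar_arch_params",
    config_type="arch_params",
)
```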
load_training_hyperparams(config_name, recipes_dir_path=None, overrides=None)
Load a single training_hyperparams file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config_name | str | Name of the yaml to load (e.g. "cifar10_resnet_train_params") | required |
recipes_dir_path | Optional[str] | Optional. Main directory where all recipes are stored (e.g. ../super_gradients/recipes). This directory should include a "training_hyperparams" folder, which itself should include the config file named after config_name. | None |
overrides | Optional[list] | List of hydra overrides for the config file | None |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 141-149)
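Usage mirrors the other loaders:
```python
from super_gradients.common.environment.cfg_utils import load_training_hyperparams

training_params = load_training_hyperparams(config_name="cifar10_resnet_train_params")
```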
override_cfg(cfg, overrides)
Override a config in place with a set of hydra overrides.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
cfg | DictConfig | OmegaConf config | required |
overrides | Union[DictConfig, Dict[str, Any]] | Dictionary-like object that will be used to override cfg | required |
Source code in latest/src/super_gradients/common/environment/cfg_utils.py (lines 163-169)
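A minimal sketch (config values hypothetical); note that cfg is mutated in place:
```python
from omegaconf import OmegaConf
from super_gradients.common.environment.cfg_utils import override_cfg

cfg = OmegaConf.create({"training_hyperparams": {"initial_lr": 0.1}})
override_cfg(cfg, overrides={"training_hyperparams": {"initial_lr": 0.01}})
```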
generate_run_id()
Generate a unique run ID based on the current timestamp.
Returns:
Type | Description |
---|---|
str | Unique run ID, in the format "RUN_<timestamp>" |
Source code in latest/src/super_gradients/common/environment/checkpoints_dir_utils.py (lines 20-26)
get_checkpoints_dir_path(experiment_name, ckpt_root_dir=None, run_id=None)
Get the directory that includes all the checkpoints (and logs) of an experiment. The expected layout is:
ckpt_root_dir
    - experiment_name
        - run_id
            - ...
        - ...
Parameters:
Name | Type | Description | Default |
---|---|---|---|
experiment_name | str | Name of the experiment. | required |
ckpt_root_dir | Optional[str] | Path to the directory where all the experiments are organised, each sub-folder representing a specific experiment. If None, SG will first check if a package named 'checkpoints' exists. If not, SG will look for the root of the project that includes the script that was launched. If not found, raise an error. | None |
run_id | Optional[str] | Optional. Run id of the experiment. If None, the most recent run will be loaded. | None |
Returns:
Type | Description |
---|---|
str | Path of the folder where the experiment checkpoints and logs will be stored. |
Source code in latest/src/super_gradients/common/environment/checkpoints_dir_utils.py (lines 96-115)
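A minimal sketch; "my_experiment" is hypothetical. Passing run_id=None already selects the most recent run, so the explicit lookup is shown only for illustration:
```python
from super_gradients.common.environment.checkpoints_dir_utils import (
    get_checkpoints_dir_path,
    get_latest_run_id,
)

run_id = get_latest_run_id(experiment_name="my_experiment")
ckpt_dir = get_checkpoints_dir_path(experiment_name="my_experiment", run_id=run_id)
```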
get_ckpt_local_path(experiment_name, ckpt_name, external_checkpoint_path, ckpt_root_dir=None, run_id=None)
Gets the local path to the checkpoint file, which will be:
- By default: YOUR_REPO_ROOT/super_gradients/checkpoints/experiment_name/ckpt_name.
- external_checkpoint_path, when external_checkpoint_path != None.
- ckpt_root_dir/experiment_name/ckpt_name, when ckpt_root_dir != None.
- If the checkpoint file is remotely located: when overwrite_local_checkpoint=True it will be saved in a temporary path which will be returned; otherwise it will be downloaded to YOUR_REPO_ROOT/super_gradients/checkpoints/experiment_name, overwriting YOUR_REPO_ROOT/super_gradients/checkpoints/experiment_name/ckpt_name if such a file exists.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
experiment_name | str | Name of the experiment. | required |
ckpt_name | str | Checkpoint filename | required |
external_checkpoint_path | str | Full path to the checkpoint file (which might be located outside the super_gradients/checkpoints directory) | required |
ckpt_root_dir | str | Path to the directory where all the experiments are organised, each sub-folder representing a specific experiment. If None, SG will first check if a package named 'checkpoints' exists. If not, SG will look for the root of the project that includes the script that was launched. If not found, raise an error. | None |
run_id | Optional[str] | Optional. Run id of the experiment. If None, the most recent run will be loaded. | None |
Returns:
Type | Description |
---|---|
str | Local path of the checkpoint file. |
Source code in latest/src/super_gradients/common/environment/checkpoints_dir_utils.py (lines 126-153)
get_latest_run_id(experiment_name, checkpoints_root_dir=None)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
experiment_name | str | Name of the experiment. | required |
checkpoints_root_dir | Optional[str] | Path to the directory where all the experiments are organised, each sub-folder representing a specific experiment. If None, SG will first check if a package named 'checkpoints' exists. If not, SG will look for the root of the project that includes the script that was launched. If not found, raise an error. | None |
Returns:
Type | Description |
---|---|
Optional[str] | Latest valid run ID, in the format "RUN_<timestamp>" |
Source code in latest/src/super_gradients/common/environment/checkpoints_dir_utils.py (lines 38-57)
get_project_checkpoints_dir_path()
Get the checkpoints directory at the root of the user's project. Create it if it doesn't exist. Return None if the project root is not found.
Source code in latest/src/super_gradients/common/environment/checkpoints_dir_utils.py (lines 83-93)
is_run_dir(dirname)
Check if a directory is a run directory.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dirname | str | Directory name. | required |
Returns:
Type | Description |
---|---|
bool | True if the directory is a run directory, False otherwise. |
Source code in latest/src/super_gradients/common/environment/checkpoints_dir_utils.py (lines 29-35)
broadcast_from_master(data)
Broadcast data from the master node to all other nodes. This may be required when you want to compute something only on the master node (e.g. a computationally heavy metric) and don't want to waste the CPU of the other nodes doing the same work simultaneously.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data | Any | Data to be broadcast from the master node (rank 0) | required |
Returns:
Type | Description |
---|---|
Any | Data from the rank 0 node |
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 157-171)
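A minimal sketch; heavy_metric is a hypothetical placeholder for an expensive computation:
```python
from super_gradients.common.environment.ddp_utils import broadcast_from_master, get_local_rank

def heavy_metric():
    # Hypothetical expensive computation, done only on rank 0.
    return 0.42

# Every rank receives the rank-0 result.
value = broadcast_from_master(heavy_metric() if get_local_rank() == 0 else None)
```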
execute_and_distribute_from_master(func)
Decorator to execute a function on the master process and distribute the result to all other processes. Useful in parallel computing scenarios where a computational task needs to be performed only on the master node (e.g., a computationally heavy calculation) and the result must be shared with other nodes without redundant computation.
Example usage:
>>> @execute_and_distribute_from_master
>>> def some_code_to_run(param1, param2):
>>>     return param1 + param2
The wrapped function will only be executed on the master node, and the result will be propagated to all other nodes.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
func | Callable[..., Any] | The function to be executed on the master process and whose result is to be distributed. | required |
Returns:
Type | Description |
---|---|
Callable[..., Any] | A wrapper function that encapsulates the execute-and-distribute logic. |
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 124-154)
find_free_port()
Find an available port on the current machine/node. Note: there is still a chance the port could be taken by another process.
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 75-83)
get_local_rank()
Returns the local rank if running in DDP, and 0 otherwise
Returns:
Type | Description |
---|---|
| local rank |
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 86-91)
get_world_size()
Returns the world size if running in DDP, and 1 otherwise
Returns:
Type | Description |
---|---|
int | world size |
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 104-113)
init_trainer()
Initialize the super_gradients environment.
This function should be the first thing to be called by any code running super_gradients.
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 14-21)
is_distributed()
Check if current process is a DDP subprocess.
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 24-26)
is_launched_using_sg()
Check if the current process is a subprocess launched using SG restart_script_with_ddp
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 29-31)
is_main_process()
Check if the current process is considered the main process (i.e. responsible for the sanity check, atexit upload, ...). The definition ensures that one and only one process satisfies this condition, regardless of how the run was started.
The rule is as follows:
- If not DDP: the main process is the current process.
- If DDP was launched using SuperGradients: the main process is the launching process (rank=-1).
- If DDP was launched with torch: the main process is rank 0.
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 34-51)
multi_process_safe(func)
A decorator for making sure a function runs only in the main process. If not in DDP mode (local_rank = -1), the function will run. If in DDP mode, the function will run only in the main process (local_rank = 0). This works only for functions with no return value.
Source code in latest/src/super_gradients/common/environment/ddp_utils.py (lines 54-72)
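A minimal sketch of the decorator:
```python
from super_gradients.common.environment.ddp_utils import multi_process_safe

@multi_process_safe
def log_once(message: str) -> None:
    # No return value, as required by the decorator.
    print(message)

log_once("printed once, even when several DDP processes are running")
```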
EnvironmentVariables
Class to dynamically get any environment variables.
Source code in latest/src/super_gradients/common/environment/env_variables.py (lines 5-46)
get_cpu_percent()
Average utilization over all CPUs.
Source code in latest/src/super_gradients/common/environment/monitoring/cpu.py (lines 4-6)
GPUStatAggregatorIterator
dataclass
Iterator over multiple StatAggregators that accumulate samples and aggregate them, one per NVIDIA device.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name | str | Name of the statistic | required |
sampling_fn | | How the statistic is sampled | required |
aggregate_fn | | How the statistic samples are aggregated | required |
Source code in latest/src/super_gradients/common/environment/monitoring/data_models.py (lines 44-68)
__iter__()
Iterate over the StatAggregator of each node
Source code in latest/src/super_gradients/common/environment/monitoring/data_models.py (lines 66-68)
__post_init__()
Initialize nvidia_management_lib and create a list of StatAggregator, one for each NVIDIA device.
Source code in latest/src/super_gradients/common/environment/monitoring/data_models.py (lines 58-64)
StatAggregator
dataclass
Accumulates statistic samples and aggregates them.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name | str | Name of the statistic | required |
sampling_fn | Callable | How the statistic is sampled | required |
aggregate_fn | Callable[[List[Any], float], float] | How the statistic samples are aggregated; has to take "samples: List[Any]" and "time: float" as parameters | required |
reset_callback_fn | Optional[Callable] | Optional, can be used to reset any system metric | None |
Source code in latest/src/super_gradients/common/environment/monitoring/data_models.py (lines 9-41)
get_disk_usage_percent()
Disk memory used in percent.
Source code in latest/src/super_gradients/common/environment/monitoring/disk.py (lines 9-11)
get_io_read_mb()
Number of MegaBytes read since import
Source code in latest/src/super_gradients/common/environment/monitoring/disk.py (lines 14-16)
get_io_write_mb()
Number of MegaBytes written since import
Source code in latest/src/super_gradients/common/environment/monitoring/disk.py (lines 19-21)
reset_io_read()
Reset the reference value of the disk IO read counter.
Source code in latest/src/super_gradients/common/environment/monitoring/disk.py (lines 24-27)
reset_io_write()
Reset the reference value of the disk IO write counter.
Source code in latest/src/super_gradients/common/environment/monitoring/disk.py (lines 30-33)
count_gpus()
Count how many GPUs NVIDIA detects.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 18-20)
get_device_memory_allocated_percent(gpu_index)
GPU memory allocated in percent of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 34-38)
get_device_memory_usage_percent(gpu_index)
GPU memory utilization in percent of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 28-31)
get_device_power_usage_percent(gpu_index)
GPU power usage in percent of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 59-64)
get_device_power_usage_w(gpu_index)
GPU power usage in Watts of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 53-56)
get_device_temperature_c(gpu_index)
GPU temperature in Celsius of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 47-50)
get_device_usage_percent(gpu_index)
GPU utilization in percent of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 41-44)
get_handle_by_index(gpu_index)
Get the device handle of a given GPU.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 23-25)
init_nvidia_management_lib()
Initialize nvml (the NVIDIA management library), which is required to use pynvml.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 13-15)
safe_init_nvidia_management_lib()
Initialize nvml (the NVIDIA management library), which is required to use pynvml. Return True on success.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/gpu.py (lines 4-10)
NVML_VALUE_NOT_AVAILABLE_uint = c_uint(-1)
module-attribute
Field Identifiers.
All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
NVMLError
Bases: Exception
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/pynvml.py (lines 622-668)
__new__(typ, value)
Maps value to a proper subclass of NVMLError. See the _extractNVMLErrorsAsClasses function for more details.
Source code in latest/src/super_gradients/common/environment/monitoring/gpu/pynvml.py (lines 648-657)
SystemMonitor
Monitor system statistics, such as CPU usage, GPU usage, etc., and write them to tensorboard.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tensorboard_writer | SummaryWriter | Tensorboard object that will be used to save the statistics | required |
extra_gpu_stats | bool | Set to True to get extra gpu statistics, such as gpu temperature, power usage, etc. Defaults to False, because these extras reduce tensorboard readability. | False |
Source code in latest/src/super_gradients/common/environment/monitoring/monitoring.py (lines 11-108)
start(tensorboard_writer)
classmethod
Instantiate a SystemMonitor in a multiprocess safe way.
Source code in latest/src/super_gradients/common/environment/monitoring/monitoring.py (lines 101-105)
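A minimal sketch; the log directory is hypothetical:
```python
from torch.utils.tensorboard import SummaryWriter

from super_gradients.common.environment.monitoring.monitoring import SystemMonitor

writer = SummaryWriter(log_dir="runs/my_experiment")
SystemMonitor.start(tensorboard_writer=writer)
```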
get_network_recv_mb()
Number of MegaBytes received since import
Source code in latest/src/super_gradients/common/environment/monitoring/network.py (lines 13-15)
get_network_sent_mb()
Number of MegaBytes sent since import
Source code in latest/src/super_gradients/common/environment/monitoring/network.py (lines 8-10)
reset_network_recv()
Reset the value of net_io_counters
Source code in latest/src/super_gradients/common/environment/monitoring/network.py (lines 24-27)
reset_network_sent()
Reset the value of net_io_counters
Source code in latest/src/super_gradients/common/environment/monitoring/network.py (lines 18-21)
average(samples, time_diff)
Average a list of values; return None for an empty list.
Source code in latest/src/super_gradients/common/environment/monitoring/utils.py (lines 4-6)
bytes_to_megabytes(b)
Convert bytes to megabytes
Source code in latest/src/super_gradients/common/environment/monitoring/utils.py (lines 14-17)
delta_per_s(samples, time_diff)
Compute the difference per second (e.g. megabytes per second); return None for an empty list.
Source code in latest/src/super_gradients/common/environment/monitoring/utils.py (lines 9-11)
virtual_memory_used_percent()
Virtual memory used in percent.
Source code in latest/src/super_gradients/common/environment/monitoring/virtual_memory.py (lines 4-6)
RecipeShortcutsCallback
Bases: Callback
Interpolates the shortcuts defined in variable_set.yaml:
- lr
- batch_size
- val_batch_size
- ema
- epochs
- resume: False
- num_workers
When any of the above are not set, they will be populated with the original values (for example, config.lr will be set from config.training_hyperparams.initial_lr) for clarity in the logs.
Source code in latest/src/super_gradients/common/environment/omegaconf_utils.py (lines 10-58)
get_cls(cls_path)
A resolver for Hydra/OmegaConf that allows getting a class instead of an instance. Usage: class_of_optimizer: ${class:torch.optim.Adam}
Source code in latest/src/super_gradients/common/environment/omegaconf_utils.py (lines 61-70)
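A minimal sketch, assuming register_hydra_resolvers (documented below) registers the ${class:...} resolver:
```python
import torch
from omegaconf import OmegaConf

from super_gradients.common.environment.omegaconf_utils import register_hydra_resolvers

register_hydra_resolvers()
cfg = OmegaConf.create({"class_of_optimizer": "${class:torch.optim.Adam}"})
assert cfg.class_of_optimizer is torch.optim.Adam  # the class itself, not an instance
```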
register_hydra_resolvers()
Register all the hydra resolvers required for the super-gradients recipes.
Source code in latest/src/super_gradients/common/environment/omegaconf_utils.py (lines 79-92)
normalize_path(path)
Normalize the directory or file path. Replace the Windows-style path separators (\) with Unix ones (/). This is necessary when running on Windows, since Hydra compose fails to find a configuration file if the config directory contains a backslash symbol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path | str | Input path string | required |
Returns:
Type | Description |
---|---|
str | Output path string with all \ symbols replaced with /. |
Source code in latest/src/super_gradients/common/environment/path_utils.py (lines 1-9)
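A minimal sketch:
```python
from super_gradients.common.environment.path_utils import normalize_path

print(normalize_path("C:\\projects\\my_recipes"))  # -> C:/projects/my_recipes
```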