diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2bb84bdfd..69e58b11b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,8 @@ gets uploaded to AzureML, by skipping all test folders.
 - ([#546](https://github.com/microsoft/InnerEye-DeepLearning/pull/546)) Environment and hello_world_model documentation updated
 - ([#525](https://github.com/microsoft/InnerEye-DeepLearning/pull/525)) Enable --store_dataset_sample
 - ([#495](https://github.com/microsoft/InnerEye-DeepLearning/pull/495)) Fix model comparison.
+- ([#547](https://github.com/microsoft/InnerEye-DeepLearning/pull/547)) Fix a bug where the parameter pl_find_unused_parameters was
+no longer used to initialize the DDP plugin.
 - ([#482](https://github.com/microsoft/InnerEye-DeepLearning/pull/482)) Check bool parameter is either true or false.
 - ([#475](https://github.com/microsoft/InnerEye-DeepLearning/pull/475)) Bug in AML SDK meant that we could not train any large models
 anymore because data loaders ran out of memory.
diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py
index 85c70acbe..287f4a0c1 100644
--- a/InnerEye/ML/model_training.py
+++ b/InnerEye/ML/model_training.py
@@ -4,18 +4,14 @@
 # ------------------------------------------------------------------------------------------
 import logging
 import os
-import subprocess
 import sys
 from pathlib import Path
-from time import sleep
 from typing import Any, Dict, Optional, Tuple, TypeVar
 
-import numpy as np
 from pytorch_lightning import LightningModule, Trainer, seed_everything
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from pytorch_lightning.plugins import DDPPlugin
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 from InnerEye.Azure.azure_runner import ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK
 from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
@@ -131,8 +127,15 @@ def create_lightning_trainer(container: LightningContainer,
     num_gpus = container.num_gpus_per_node
     effective_num_gpus = num_gpus * num_nodes
     # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
-    # For unit tests, only "ddp_spawn" works
-    accelerator = "ddp" if effective_num_gpus > 1 else None
+    if effective_num_gpus > 1:
+        accelerator: Optional[str] = "ddp"
+        # Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin prints out
+        # lengthy warnings about the performance impact of find_unused_parameters.
+        plugins = [DDPPlugin(num_nodes=num_nodes, sync_batchnorm=True,
+                             find_unused_parameters=container.pl_find_unused_parameters)]
+    else:
+        accelerator = None
+        plugins = []
     logging.info(f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")
     tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
     loggers = [tensorboard_logger, AzureMLLogger()]
@@ -172,6 +175,7 @@ def create_lightning_trainer(container: LightningContainer,
                       deterministic=deterministic,
                       benchmark=benchmark,
                       accelerator=accelerator,
+                      plugins=plugins,
                       max_epochs=container.num_epochs,
                       num_sanity_val_steps=container.pl_num_sanity_val_steps,
                       callbacks=callbacks,
@@ -269,8 +273,10 @@ def model_train(checkpoint_path: Optional[Path],
     # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
     # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
     if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
-        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, container.outputs_folder)  # type: ignore
-        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, container.outputs_folder)  # type: ignore
+        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path,  # type: ignore
+                                   container.outputs_folder)
+        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path,  # type: ignore
+                                   container.outputs_folder)
     # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
     # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
     # all necessary properties.
@@ -335,76 +341,3 @@ def aggregate_and_create_subject_metrics_file(outputs_folder: Path) -> None:
             else:
                 # For all files but the first one, cut off the header line.
                 f.write("\n".join(temp_file_contents.splitlines()[1:]))
-
-
-class InnerEyeDDPPlugin(DDPPlugin):
-    """
-    This is a temporary fix for the broken DDP plugin in Pytorch-Lightning v1.2.8.
-    Hopefully we can remove it once it is fixed in Pytorch-Lightning.
-    """
-
-    def _call_children_scripts(self) -> None:
-        # This is the only line changed compared to DDPPlugin
-        assert self.local_rank == 0
-
-        # The code below is the same as in the original DDPPlugin
-        self._check_can_spawn_children()
-        self._has_spawned_children = True
-
-        # DDP Environment variables
-        os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()  # type: ignore
-        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())  # type: ignore
-
-        # allow the user to pass the node rank
-        os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())  # type: ignore
-        os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())  # type: ignore
-
-        path_lib = os.path.abspath
-
-        # pull out the commands used to run the script and resolve the abs file path
-        command = sys.argv
-        try:
-            full_path = path_lib(command[0])
-        except Exception:
-            full_path = os.path.abspath(command[0])
-
-        command[0] = full_path
-        # use the same python interpreter and actually running
-        command = [sys.executable] + command
-
-        # the visible devices tell us how many GPUs we want to use.
-        # when the trainer script was called the device has already been scoped by the time
-        # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
-        # but forward the GPUs selected via environment variables
-        if self.parallel_devices is None:
-            raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)")
-
-        os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices])
-        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
-
-        if self.lightning_module.logger is not None:
-            os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version)
-
-        num_gpus = len(self.parallel_devices)
-        os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}"
-
-        self.interactive_ddp_procs = []
-
-        for local_rank in range(1, self.num_processes):  # type: ignore
-            env_copy = os.environ.copy()
-            env_copy["LOCAL_RANK"] = f"{local_rank}"
-
-            # remove env var if global seed not set
-            if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
-                del env_copy["PL_GLOBAL_SEED"]
-
-            # start process
-            # if hydra is available and initialized, make sure to set the cwd correctly
-            cwd: Optional[str] = None
-            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
-            self.interactive_ddp_procs.append(proc)
-
-            # starting all processes at once can cause issues
-            # with dataloaders delay between 1-10 seconds
-            delay = np.random.uniform(1, 5, 1)[0]
-            sleep(delay)
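
Note for reviewers: the substantive change is the new branch in create_lightning_trainer, which builds a DDPPlugin only when more than one GPU is in play and forwards container.pl_find_unused_parameters to it. Below is a minimal sketch of the resulting wiring, with the LightningContainer fields replaced by plain variables; the values are illustrative, not InnerEye defaults, and constructing the Trainer with gpus=4 assumes that many GPUs are actually visible.

```python
# Sketch only (not part of this PR): mirrors the new branch in
# create_lightning_trainer, with container fields as plain variables.
from typing import List, Optional

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPPlugin

num_nodes = 2                      # container.num_nodes
num_gpus = 4                       # container.num_gpus_per_node
pl_find_unused_parameters = False  # container.pl_find_unused_parameters

effective_num_gpus = num_gpus * num_nodes
if effective_num_gpus > 1:
    accelerator: Optional[str] = "ddp"
    # find_unused_parameters is passed through to torch DistributedDataParallel.
    plugins: List[DDPPlugin] = [DDPPlugin(num_nodes=num_nodes,
                                          sync_batchnorm=True,
                                          find_unused_parameters=pl_find_unused_parameters)]
else:
    accelerator = None
    plugins = []

trainer = Trainer(accelerator=accelerator,
                  plugins=plugins,
                  gpus=num_gpus,
                  num_nodes=num_nodes,
                  max_epochs=1)
```

With find_unused_parameters=False, DDP skips the per-iteration search for parameters that did not take part in producing the loss, which is faster; a model that genuinely leaves parameters unused in its forward pass must opt back in via pl_find_unused_parameters=True, otherwise DDP raises an error during backward.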
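
The context lines from aggregate_and_create_subject_metrics_file show the counterpart of the per-rank uploads in model_train: per-rank CSV files are concatenated into one, keeping only the first file's header. A sketch of that pattern follows; the aggregate_per_rank_csvs name and the "*_rank*.csv" glob are hypothetical, and InnerEye's actual file layout may differ.

```python
# Sketch only: concatenate per-rank CSV files, keeping a single header line.
from pathlib import Path

def aggregate_per_rank_csvs(outputs_folder: Path,
                            result_name: str = "subject_metrics.csv") -> None:
    per_rank_files = sorted(outputs_folder.glob("*_rank*.csv"))
    with (outputs_folder / result_name).open("w") as f:
        for i, csv_file in enumerate(per_rank_files):
            lines = csv_file.read_text().splitlines()
            if i == 0:
                f.write("\n".join(lines))
            else:
                # For all files but the first one, cut off the header line.
                f.write("\n".join(lines[1:]))
            f.write("\n")
```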