diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst
index dbd2dfd790bd6..83d7d1f60c891 100644
--- a/docs/source-pytorch/accelerators/gpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst
@@ -24,7 +24,7 @@ Lightning supports multiple ways of doing distributed training.
 - DistributedDataParallel (multiple-gpus across many machines)
     - Regular (``strategy='ddp'``)
     - Spawn (``strategy='ddp_spawn'``)
-    - Fork (``strategy='ddp_fork'``)
+    - Notebook/Fork (``strategy='ddp_notebook'``)
 - Horovod (``strategy='horovod'``) (multi-machine, multi-gpu, configured at runtime)
 - Bagua (``strategy='bagua'``) (multiple-gpus across many machines with advanced training algorithms)
@@ -101,7 +101,7 @@ There are cases in which it is NOT possible to use DDP. Examples are:
 - Jupyter Notebook, Google COLAB, Kaggle, etc.
 - You have a nested script without a root package
 
-In these situations you should use `dp` or `ddp_spawn` instead.
+In these situations you should use `ddp_notebook` or `dp` instead.
 
 Distributed Data Parallel 2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -201,18 +201,25 @@ You can then call your scripts anywhere
 
     python some_file.py --accelerator 'gpu' --devices 8 --strategy 'ddp'
 
-Distributed Data Parallel Fork
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Distributed Data Parallel in Notebooks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on:
+DDP Notebook/Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on:
+The Trainer enables it by default when such environments are detected.
 
 .. code-block:: python
 
     # train on 8 GPUs in a Jupyter notebook
+    trainer = Trainer(accelerator="gpu", devices=8)
+
+    # can be set explicitly
+    trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_notebook")
+
+    # can also be used in non-interactive environments
     trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_fork")
 
 Data Parallel (``strategy="dp"``) is the only other strategy supported in interactive environments but is slower, is discouraged by PyTorch and has other limitations.
-Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork for its speed and stability but it can only be used with scripts.
+Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork/Notebook for its speed and stability but it can only be used with scripts.
 
 
 Comparison of DDP variants and tradeoffs
@@ -225,7 +232,7 @@ Comparison of DDP variants and tradeoffs
    * -
      - DDP
      - DDP Spawn
-     - DDP Fork
+     - DDP Notebook/Fork
    * - Works in Jupyter notebooks / IPython environments
      - No
      - No
diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py
index a0b0a7865869f..b7bbe8dfce91d 100644
--- a/src/pytorch_lightning/strategies/ddp_spawn.py
+++ b/src/pytorch_lightning/strategies/ddp_spawn.py
@@ -53,6 +53,13 @@
 
 log = logging.getLogger(__name__)
 
+_DDP_FORK_ALIASES = (
+    "ddp_fork",
+    "ddp_fork_find_unused_parameters_false",
+    "ddp_notebook",
+    "ddp_notebook_find_unused_parameters_false",
+)
+
 
 class DDPSpawnStrategy(ParallelStrategy):
     """Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training
@@ -283,20 +290,22 @@ def post_training_step(self):
 
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
-        for start_method in ("spawn", "fork"):
+        entries = (
+            ("ddp_spawn", "spawn"),
+            ("ddp_spawn_find_unused_parameters_false", "spawn"),
+            ("ddp_fork", "fork"),
+            ("ddp_fork_find_unused_parameters_false", "fork"),
+            ("ddp_notebook", "fork"),
+            ("ddp_notebook_find_unused_parameters_false", "fork"),
+        )
+        for name, start_method in entries:
             strategy_registry.register(
-                f"ddp_{start_method}_find_unused_parameters_false",
+                name,
                 cls,
-                description=f"DDP {start_method.title()} strategy with `find_unused_parameters` as False",
+                description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
                 find_unused_parameters=False,
                 start_method=start_method,
             )
-            strategy_registry.register(
-                f"ddp_{start_method}",
-                cls,
-                description=f"DDP {start_method.title()} strategy",
-                start_method=start_method,
-            )
 
     def teardown(self) -> None:
         log.detail(f"{self.__class__.__name__}: tearing down strategy")
diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 9f87a68b4df7d..b20e40d1a5f0b 100644
--- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -72,6 +72,7 @@
     StrategyRegistry,
     TPUSpawnStrategy,
 )
+from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
 from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
 from pytorch_lightning.utilities import (
     _StrategyType,
@@ -614,10 +615,7 @@ def _check_strategy_and_fallback(self) -> None:
                 f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                 "but GPU accelerator is not used."
             )
-        if (
-            strategy_flag in ("ddp_fork", "ddp_fork_find_unused_parameters_false")
-            and "fork" not in torch.multiprocessing.get_all_start_methods()
-        ):
+        if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
             raise ValueError(
                 f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
                 f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead."
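Not part of the patch itself: a minimal sketch of how calling code could mirror the fork-availability guard that ``_check_strategy_and_fallback`` now applies to every name in ``_DDP_FORK_ALIASES``. The two-device setup is an assumption made for illustration.

.. code-block:: python

    import torch.multiprocessing as mp

    from pytorch_lightning import Trainer

    # "fork" is missing from the available start methods on platforms such as
    # Windows; that is the same condition the connector above rejects with a
    # ValueError for any of the _DDP_FORK_ALIASES, so fall back to "ddp_spawn".
    strategy = "ddp_notebook" if "fork" in mp.get_all_start_methods() else "ddp_spawn"
    trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy)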
diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py
index a04418b62ebd9..b29f983f31e53 100644
--- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py
+++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py
@@ -44,6 +44,7 @@
     DeepSpeedStrategy,
     SingleDeviceStrategy,
 )
+from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests_pytorch.helpers.runif import RunIf
@@ -749,7 +750,7 @@ def test_accelerator_specific_checkpoint_io(*_):
     assert trainer.strategy.checkpoint_io is ckpt_plugin
 
 
-@pytest.mark.parametrize("strategy", ["ddp_fork", "ddp_fork_find_unused_parameters_false"])
+@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
 @mock.patch(
     "pytorch_lightning.trainer.connectors.accelerator_connector.torch.multiprocessing.get_all_start_methods",
     return_value=[],
diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py
index f5576fa14eb8a..9cabe33b9f037 100644
--- a/tests/tests_pytorch/strategies/test_strategy_registry.py
+++ b/tests/tests_pytorch/strategies/test_strategy_registry.py
@@ -124,6 +124,12 @@ def test_fsdp_strategy_registry(tmpdir):
             {"find_unused_parameters": False, "start_method": "fork"},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_false",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": False, "start_method": "fork"},
+            marks=RunIf(skip_windows=True),
+        ),
         (
             "ddp_sharded_spawn_find_unused_parameters_false",
             DDPSpawnShardedStrategy,
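To complement the documentation change above, here is a hedged, self-contained sketch of what end-to-end use of the new ``ddp_notebook`` alias could look like in a notebook cell. The ``TinyModel`` class, the random dataset, and ``devices=2`` are invented for illustration and are not part of this patch.

.. code-block:: python

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    import pytorch_lightning as pl


    class TinyModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return torch.nn.functional.cross_entropy(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)


    # Random tensors keep the example self-contained. Because the worker
    # processes are forked rather than spawned, the notebook is not re-imported,
    # so no __main__ guard is required here.
    dataset = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
    trainer = pl.Trainer(accelerator="gpu", devices=2, strategy="ddp_notebook", max_epochs=1)
    trainer.fit(TinyModel(), DataLoader(dataset, batch_size=8))

Because ``ddp_notebook`` maps to the fork start method registered above, the same snippet would raise the new ``ValueError`` from ``_check_strategy_and_fallback`` on a platform without fork support.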