
Add ddp_notebook alias for ddp_fork #13744


Merged: 64 commits merged on Jul 23, 2022

Commits (64)
782ca4a
fork
awaelchli Jun 22, 2022
aefac45
dont set device
awaelchli Jun 23, 2022
efce3c4
parallel dev
awaelchli Jun 23, 2022
810c0ba
add cuda
awaelchli Jun 24, 2022
5fd9cda
update device count
awaelchli Jun 24, 2022
c1b4fd0
fork
awaelchli Jun 24, 2022
daa07ee
cuda available
awaelchli Jun 24, 2022
679f363
set device
awaelchli Jun 24, 2022
c43f827
update
awaelchli Jun 24, 2022
b7e529e
update
awaelchli Jun 24, 2022
b51d172
cuda available
awaelchli Jun 25, 2022
9b41941
formatting
awaelchli Jun 25, 2022
9cea979
unused import
awaelchli Jun 25, 2022
daccd21
test fixes
awaelchli Jun 27, 2022
0ccc3b9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 27, 2022
a080ec0
add docstring
awaelchli Jun 27, 2022
9a914be
Merge remote-tracking branch 'origin/feature/ddp-fork2' into feature/…
awaelchli Jun 27, 2022
eae67cd
update
awaelchli Jun 27, 2022
167a710
update
awaelchli Jun 27, 2022
1bdd79d
fix mocks in tests
awaelchli Jun 27, 2022
297b55a
refactor
awaelchli Jun 27, 2022
6c5b769
fix test
awaelchli Jun 27, 2022
2671810
update lite and enums
awaelchli Jun 27, 2022
ff2a825
typo
awaelchli Jun 27, 2022
0879751
update docs
awaelchli Jun 27, 2022
fe16575
add validation for forking on platforms
awaelchli Jun 27, 2022
582872c
debug no breaking change for devices=1
awaelchli Jun 27, 2022
da70271
fix typo in test
awaelchli Jun 27, 2022
3f9a872
update docstring
awaelchli Jun 27, 2022
7291fa3
added windows test for device parser
awaelchli Jun 27, 2022
785c830
add changelog
awaelchli Jun 27, 2022
dd043ad
add test
awaelchli Jun 27, 2022
da843ee
add tests
awaelchli Jun 27, 2022
1a63662
update error message
awaelchli Jun 27, 2022
093a52e
Comparison section
awaelchli Jun 28, 2022
3be3c17
Merge branch 'master' into feature/ddp-fork2
awaelchli Jun 29, 2022
7b3c132
fork docs
awaelchli Jun 29, 2022
1b95954
typing
awaelchli Jun 29, 2022
df031b3
Merge branch 'master' into feature/ddp-fork2
awaelchli Jun 29, 2022
6855e50
Merge branch 'master' into feature/ddp-fork2
awaelchli Jun 30, 2022
8df3457
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2022
d5c28b9
fix tests
awaelchli Jul 1, 2022
0152c39
Merge branch 'master' into feature/ddp-fork2
awaelchli Jul 1, 2022
a6a0d09
Update docs/source-pytorch/accelerators/gpu_intermediate.rst
awaelchli Jul 3, 2022
4ede4cb
Update docs/source-pytorch/accelerators/gpu_intermediate.rst
awaelchli Jul 3, 2022
877ed07
Update docs/source-pytorch/accelerators/gpu_intermediate.rst
awaelchli Jul 3, 2022
bf36259
reviews
awaelchli Jul 3, 2022
6263636
Merge remote-tracking branch 'origin/feature/ddp-fork2' into feature/…
awaelchli Jul 3, 2022
ca9a0b3
Merge branch 'master' into feature/ddp-fork2
awaelchli Jul 19, 2022
c9b2601
handle start methods
awaelchli Jul 19, 2022
cca1606
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 19, 2022
e6b19a1
update tests
awaelchli Jul 19, 2022
59c3735
Merge remote-tracking branch 'origin/feature/ddp-fork2' into feature/…
awaelchli Jul 19, 2022
cc06e12
update type
awaelchli Jul 19, 2022
b7ffaef
Merge branch 'master' into feature/ddp-fork2
awaelchli Jul 19, 2022
c5480a1
fix merge errors
awaelchli Jul 19, 2022
87cd344
update tests
awaelchli Jul 19, 2022
b686b3b
remove unused import
awaelchli Jul 19, 2022
d6e194e
Add ddp_notebook alias for ddp_fork
awaelchli Jul 19, 2022
f35a922
refactor
awaelchli Jul 19, 2022
1a21ab5
update docs
awaelchli Jul 20, 2022
4afb950
Merge branch 'master' into feature/ddp-fork2-notebook
awaelchli Jul 22, 2022
51283f4
fix merge error
awaelchli Jul 22, 2022
177c19b
fix merge error
awaelchli Jul 22, 2022
21 changes: 14 additions & 7 deletions docs/source-pytorch/accelerators/gpu_intermediate.rst
@@ -24,7 +24,7 @@ Lightning supports multiple ways of doing distributed training.
- DistributedDataParallel (multiple-gpus across many machines)
- Regular (``strategy='ddp'``)
- Spawn (``strategy='ddp_spawn'``)
- Fork (``strategy='ddp_fork'``)
- Notebook/Fork (``strategy='ddp_notebook'``)
- Horovod (``strategy='horovod'``) (multi-machine, multi-gpu, configured at runtime)
- Bagua (``strategy='bagua'``) (multiple-gpus across many machines with advanced training algorithms)

@@ -101,7 +101,7 @@ There are cases in which it is NOT possible to use DDP. Examples are:
- Jupyter Notebook, Google COLAB, Kaggle, etc.
- You have a nested script without a root package

In these situations you should use `dp` or `ddp_spawn` instead.
In these situations you should use `ddp_notebook` or `dp` instead.

Distributed Data Parallel 2
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -201,18 +201,25 @@ You can then call your scripts anywhere
python some_file.py --accelerator 'gpu' --devices 8 --strategy 'ddp'


Distributed Data Parallel Fork
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Distributed Data Parallel in Notebooks
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on:
DDP Notebook/Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on.
The Trainer enables it by default when such environments are detected.

.. code-block:: python

# train on 8 GPUs in a Jupyter notebook
trainer = Trainer(accelerator="gpu", devices=8)

# can be set explicitly
trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_notebook")

# can also be used in non-interactive environments
trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_fork")

Data Parallel (``strategy="dp"``) is the only other strategy supported in interactive environments but is slower, is discouraged by PyTorch and has other limitations.
Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork for its speed and stability but it can only be used with scripts.
Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork/Notebook for its speed and stability but it can only be used with scripts.


Comparison of DDP variants and tradeoffs
@@ -225,7 +232,7 @@ Comparison of DDP variants and tradeoffs
* -
- DDP
- DDP Spawn
- DDP Fork
- DDP Notebook/Fork
* - Works in Jupyter notebooks / IPython environments
- No
- No
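For readers skimming the diff, the documentation change above reduces to the following usage pattern. This is an illustrative sketch only, not code from this PR; the helper names `train_in_notebook` and `train_in_script` are hypothetical, and a machine with two CUDA devices is assumed.

```python
# Illustrative sketch of the documented behavior; not part of this PR's changes.
import pytorch_lightning as pl


def train_in_notebook(model: pl.LightningModule) -> None:
    # In Jupyter/Colab/Kaggle, "ddp_notebook" forks worker processes instead of
    # spawning them, so interactively defined code does not need to be importable.
    trainer = pl.Trainer(accelerator="gpu", devices=2, strategy="ddp_notebook")
    trainer.fit(model)


def train_in_script(model: pl.LightningModule) -> None:
    # In a plain Python script, regular DDP remains the recommended strategy.
    trainer = pl.Trainer(accelerator="gpu", devices=2, strategy="ddp")
    trainer.fit(model)
```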
27 changes: 18 additions & 9 deletions src/pytorch_lightning/strategies/ddp_spawn.py
@@ -53,6 +53,13 @@

log = logging.getLogger(__name__)

_DDP_FORK_ALIASES = (
"ddp_fork",
"ddp_fork_find_unused_parameters_false",
"ddp_notebook",
"ddp_notebook_find_unused_parameters_false",
)


class DDPSpawnStrategy(ParallelStrategy):
"""Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training
@@ -283,20 +290,22 @@ def post_training_step(self):

@classmethod
def register_strategies(cls, strategy_registry: Dict) -> None:
for start_method in ("spawn", "fork"):
entries = (
("ddp_spawn", "spawn"),
("ddp_spawn_find_unused_parameters_false", "spawn"),
("ddp_fork", "fork"),
("ddp_fork_find_unused_parameters_false", "fork"),
("ddp_notebook", "fork"),
("ddp_notebook_find_unused_parameters_false", "fork"),
)
for name, start_method in entries:
strategy_registry.register(
f"ddp_{start_method}_find_unused_parameters_false",
name,
cls,
description=f"DDP {start_method.title()} strategy with `find_unused_parameters` as False",
description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
find_unused_parameters=False,
start_method=start_method,
)
strategy_registry.register(
f"ddp_{start_method}",
cls,
description=f"DDP {start_method.title()} strategy",
start_method=start_method,
)

def teardown(self) -> None:
log.detail(f"{self.__class__.__name__}: tearing down strategy")
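For orientation, the registration refactor above swaps the old loop over start methods for an explicit list of (name, start_method) pairs, which is what makes the `ddp_notebook` aliases possible. Below is a simplified standalone sketch of that mapping; a plain dict stands in for Lightning's `StrategyRegistry`, so this is illustrative, not the actual implementation.

```python
# Simplified sketch of the alias -> (start_method, find_unused_parameters) mapping
# registered above; a plain dict stands in for Lightning's StrategyRegistry.
from typing import Dict, Tuple


def build_alias_table() -> Dict[str, Tuple[str, bool]]:
    entries = (
        ("ddp_spawn", "spawn"),
        ("ddp_spawn_find_unused_parameters_false", "spawn"),
        ("ddp_fork", "fork"),
        ("ddp_fork_find_unused_parameters_false", "fork"),
        ("ddp_notebook", "fork"),  # new alias, same behavior as ddp_fork
        ("ddp_notebook_find_unused_parameters_false", "fork"),
    )
    table = {}
    for name, start_method in entries:
        find_unused = not name.endswith("_find_unused_parameters_false")
        table[name] = (start_method, find_unused)
    return table


assert build_alias_table()["ddp_notebook"] == ("fork", True)
assert build_alias_table()["ddp_notebook_find_unused_parameters_false"] == ("fork", False)
```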
@@ -72,6 +72,7 @@
StrategyRegistry,
TPUSpawnStrategy,
)
from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
from pytorch_lightning.utilities import (
_StrategyType,
@@ -614,10 +615,7 @@ def _check_strategy_and_fallback(self) -> None:
f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
"but GPU accelerator is not used."
)
if (
strategy_flag in ("ddp_fork", "ddp_fork_find_unused_parameters_false")
and "fork" not in torch.multiprocessing.get_all_start_methods()
):
if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
raise ValueError(
f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
f" platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
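The connector change above routes all four fork-based aliases through a single platform check instead of hard-coding the two `ddp_fork` names. A minimal standalone sketch of that check (error wording paraphrased), relying only on `torch.multiprocessing.get_all_start_methods()`:

```python
# Standalone sketch of the platform validation performed by the connector above.
import torch.multiprocessing as mp

_DDP_FORK_ALIASES = (
    "ddp_fork",
    "ddp_fork_find_unused_parameters_false",
    "ddp_notebook",
    "ddp_notebook_find_unused_parameters_false",
)


def validate_fork_strategy(strategy_flag: str) -> None:
    # Forking is unavailable on Windows, where only "spawn" is offered.
    if strategy_flag in _DDP_FORK_ALIASES and "fork" not in mp.get_all_start_methods():
        raise ValueError(
            f"Strategy '{strategy_flag}' requires process forking, which is not"
            " supported on this platform. Use 'ddp_spawn' instead."
        )
```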
@@ -44,6 +44,7 @@
DeepSpeedStrategy,
SingleDeviceStrategy,
)
from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests_pytorch.helpers.runif import RunIf
@@ -749,7 +750,7 @@ def test_accelerator_specific_checkpoint_io(*_):
assert trainer.strategy.checkpoint_io is ckpt_plugin


@pytest.mark.parametrize("strategy", ["ddp_fork", "ddp_fork_find_unused_parameters_false"])
@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
@mock.patch(
"pytorch_lightning.trainer.connectors.accelerator_connector.torch.multiprocessing.get_all_start_methods",
return_value=[],
6 changes: 6 additions & 0 deletions tests/tests_pytorch/strategies/test_strategy_registry.py
@@ -124,6 +124,12 @@ def test_fsdp_strategy_registry(tmpdir):
{"find_unused_parameters": False, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
(
"ddp_sharded_spawn_find_unused_parameters_false",
DDPSpawnShardedStrategy,
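As a quick sanity check beyond the parametrized tests above, the new aliases can also be inspected interactively; a sketch assuming the dict-like registry access implied by the imports earlier in this PR (entry layout may differ across Lightning versions):

```python
# Interactive check that the new aliases are registered; not part of the test suite.
from pytorch_lightning.strategies import StrategyRegistry
from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES

for name in _DDP_FORK_ALIASES:
    assert name in StrategyRegistry, f"{name} is not registered"
    print(name, StrategyRegistry[name])
```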