From b9fc5b8ed12ced8d20616274c4af13a7a7279715 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 14:14:55 +0200 Subject: [PATCH 01/40] update --- MANIFEST.in | 27 ++++ examples/app_multi_node/app.py | 5 + examples/app_multi_node/{ => bare}/.gitignore | 0 .../app_multi_node/{ => bare}/multi_node.py | 0 .../{ => bare}/requirements.txt | 0 examples/app_multi_node/train.py | 7 + setup.py | 2 +- src/lightning_app/components/training.py | 144 ++++++++++++++++++ src/lightning_app/core/flow.py | 2 + src/lightning_app/structures/dict.py | 7 +- .../utilities/packaging/cloud_compute.py | 5 + 11 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 examples/app_multi_node/app.py rename examples/app_multi_node/{ => bare}/.gitignore (100%) rename examples/app_multi_node/{ => bare}/multi_node.py (100%) rename examples/app_multi_node/{ => bare}/requirements.txt (100%) create mode 100644 examples/app_multi_node/train.py create mode 100644 src/lightning_app/components/training.py diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..2b1bf5a0d9d99 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,30 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py new file mode 100644 index 0000000000000..23e77c9ced766 --- /dev/null +++ b/examples/app_multi_node/app.py @@ -0,0 +1,5 @@ +from lightning import LightningApp +from lightning.app.components.training import LightningTrainingComponent +from lightning_app.utilities.packaging.cloud_compute import CloudCompute + +app = LightningApp(LightningTrainingComponent("train.py", num_nodes=2, cloud_compute=CloudCompute("cpu"))) diff --git a/examples/app_multi_node/.gitignore b/examples/app_multi_node/bare/.gitignore similarity index 100% rename from examples/app_multi_node/.gitignore rename to examples/app_multi_node/bare/.gitignore diff --git a/examples/app_multi_node/multi_node.py b/examples/app_multi_node/bare/multi_node.py similarity index 100% rename from examples/app_multi_node/multi_node.py rename to examples/app_multi_node/bare/multi_node.py diff --git a/examples/app_multi_node/requirements.txt b/examples/app_multi_node/bare/requirements.txt similarity index 100% rename from examples/app_multi_node/requirements.txt rename to examples/app_multi_node/bare/requirements.txt diff --git a/examples/app_multi_node/train.py 
b/examples/app_multi_node/train.py new file mode 100644 index 0000000000000..d312d86bb780a --- /dev/null +++ b/examples/app_multi_node/train.py @@ -0,0 +1,7 @@ +from lightning.pytorch import Trainer +from lightning.pytorch.demos.boring_classes import BoringModel + +if __name__ == "__main__": + model = BoringModel() + trainer = Trainer(max_epochs=1, devices=2, strategy="ddp") + trainer.fit(model) diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py new file mode 100644 index 0000000000000..349fb5644a440 --- /dev/null +++ b/src/lightning_app/components/training.py @@ -0,0 +1,144 @@ +import os +from typing import List, Optional, Tuple, Union + +from lightning import CloudCompute +from lightning_app import LightningFlow, structures +from lightning_app.components.python import TracerPythonScript +from lightning_app.utilities.imports import _is_pytorch_lightning_available + +if _is_pytorch_lightning_available(): + from pytorch_lightning import Callback + + class IntrospectionCallback(Callback): + def on_train_start(self, trainer, pl_module): + print(trainer.strategy) + print(trainer.world_size) + print(pl_module) + + +class _LightningTrainerWork(TracerPythonScript): + def __init__( + self, + script_path: str, + script_args: Optional[Union[list, str]] = None, + node_rank: int = 1, + num_nodes: int = 1, + global_rank: int = 0, + sanity_serving: bool = False, + cloud_compute: Optional[CloudCompute] = None, + **kwargs, + ): + super().__init__( + script_path, script_args, raise_exception=True, parallel=True, cloud_compute=cloud_compute, **kwargs + ) + self.node_rank = node_rank + self.num_nodes = num_nodes + self.global_rank = global_rank + self.best_model_path: None + self.best_model_score = None + self.sanity_serving = sanity_serving + self.has_finished = False + + def configure_tracer(self): + from pytorch_lightning import Trainer + + tracer = super().configure_tracer() + tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + return tracer + + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): + if not internal_urls: + print(f"The node {self.node_rank} started !") + return + + print(f"Internal URLS: {internal_urls}") + master_address = str(internal_urls[0][0]) + master_port = str(internal_urls[0][1]) + devices = self.cloud_compute.devices + + distributed_env_vars = { + "NODE_RANK": str(self.node_rank), + "LOCAL_RANK": str(self.global_rank), + "GLOBAL_RANK": str(self.global_rank), + "MASTER_ADDRESS": master_address, + "MASTER_PORT": master_port, + "WORLD_SIZE": str(self.num_nodes * devices), + } + print(distributed_env_vars) + os.environ.update(distributed_env_vars) + return super().run() + + def on_after_run(self, script_globals): + # TODO: Why does it hang there. 
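+        # The traced script appears not to return on its own once training
+        # finishes (see the TODO above), so completion is recorded in the work's
+        # state and the process is terminated explicitly.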
+ self.has_finished = True + raise SystemExit(0) + + def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): + from pytorch_lightning.serve import ServableModuleValidator + + callbacks = kwargs.get("callbacks", []) + if self.sanity_serving: + callbacks = callbacks + [ServableModuleValidator()] + callbacks += [IntrospectionCallback()] + kwargs["callbacks"] = callbacks + return {}, args, kwargs + + +class LightningTrainingComponent(LightningFlow): + def __init__( + self, + script_path: str, + script_args: Optional[Union[list, str]] = None, + num_nodes: int = 1, + cloud_compute: CloudCompute = CloudCompute("cpu"), + sanity_serving: bool = False, + ): + super().__init__() + self.ws = structures.Dict() + self.has_initialized = False + self.script_path = script_path + self.script_args = script_args + self.num_nodes = num_nodes + self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute + self.sanity_serving = sanity_serving + + def run(self): + if not self.has_initialized: + for node_rank in range(self.num_nodes): + + if self.is_running_in_cloud: + devices = self._cloud_compute.devices + global_rank = (node_rank + 1) * devices - 1 if node_rank else 0 + work_node_rank = node_rank + else: + global_rank = node_rank + work_node_rank = 0 + + self.ws[str(node_rank)] = _LightningTrainerWork( + script_path=self.script_path, + script_args=self.script_args, + cloud_compute=self._cloud_compute, + node_rank=work_node_rank, + global_rank=global_rank, + sanity_serving=self.sanity_serving, + num_nodes=self.num_nodes, + ) + + self.has_initialized = True + + for work in self.ws.values(): + if self.ready: + internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] + work.run(internal_urls) + if all(w.has_finished for w in self.ws.values()): + self._exit("Finished training") + else: + work.run() + + @property + def ready(self) -> bool: + return all(w.internal_ip for w in self.ws.values()) + + @property + def is_running_in_cloud(self) -> bool: + return "LIGHTNING_APP_STATE_URL" in os.environ diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index a5dcfd0a77e2e..4e6d7ee15e398 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -207,8 +207,10 @@ def _attach_backend(flow: "LightningFlow", backend): structure = getattr(flow, struct_name) for flow in structure.flows: LightningFlow._attach_backend(flow, backend) + flow._backend = backend for work in structure.works: backend._wrap_run_method(_LightningAppRef().get_current(), work) + work._backend = backend for name in flow._structures: getattr(flow, name)._backend = backend diff --git a/src/lightning_app/structures/dict.py b/src/lightning_app/structures/dict.py index 2aa02d4ebfa50..93e2b161b2e7a 100644 --- a/src/lightning_app/structures/dict.py +++ b/src/lightning_app/structures/dict.py @@ -58,7 +58,10 @@ def __init__(self, **kwargs: T): def __setitem__(self, k, v): from lightning_app import LightningFlow, LightningWork - if "." in k: + if not isinstance(k, str): + raise Exception("The provided key should be an string") + + if isinstance(k, str) and "." in k: raise Exception(f"The provided name {k} contains . 
which is forbidden.") if self._backend: @@ -67,7 +70,7 @@ def __setitem__(self, k, v): _set_child_name(self, v, k) elif isinstance(v, LightningWork): self._backend._wrap_run_method(_LightningAppRef().get_current(), v) - v._name = f"{self.name}.{k}" + v._name = f"{self.name}.{k}" super().__setitem__(k, v) @property diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 6527911855bae..a36d875c85982 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -58,3 +58,8 @@ def to_dict(self): @classmethod def from_dict(cls, d): return cls(**d["__cloud_compute__"]) + + @property + def devices(self) -> int: + # TODO: Add a resolver here. + return 1 From 5e607e308aed6ffdc46d06254f5acc35e0d31e0c Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 14:15:49 +0200 Subject: [PATCH 02/40] update --- MANIFEST.in | 27 --------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 2b1bf5a0d9d99..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,30 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ From ee161b33e6669b1227a14e4c3ace39e0ddb6ae85 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 14:16:41 +0200 Subject: [PATCH 03/40] update --- examples/app_multi_node/app.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 23e77c9ced766..51db0a4080d9d 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -2,4 +2,10 @@ from lightning.app.components.training import LightningTrainingComponent from lightning_app.utilities.packaging.cloud_compute import CloudCompute -app = LightningApp(LightningTrainingComponent("train.py", num_nodes=2, cloud_compute=CloudCompute("cpu"))) +app 
= LightningApp( + LightningTrainingComponent( + "train.py", + num_nodes=2, + cloud_compute=CloudCompute("gpu"), + ), +) From 3fffc0bde2d5040951fd68be20569cfc1c943e04 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 19:04:09 +0200 Subject: [PATCH 04/40] update --- MANIFEST.in | 3 ++ examples/app_multi_node/.lightning | 2 +- setup.py | 2 +- src/lightning_app/components/training.py | 36 ++++++++++++------- .../utilities/packaging/cloud_compute.py | 5 +++ 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..c22e9b09d4985 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,6 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7befcc74ea6d3..2e661fac0e588 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: multi-node-demo +name: '18' diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 349fb5644a440..3e919eef56312 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -4,16 +4,6 @@ from lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript -from lightning_app.utilities.imports import _is_pytorch_lightning_available - -if _is_pytorch_lightning_available(): - from pytorch_lightning import Callback - - class IntrospectionCallback(Callback): - def on_train_start(self, trainer, pl_module): - print(trainer.strategy) - print(trainer.world_size) - print(pl_module) class _LightningTrainerWork(TracerPythonScript): @@ -24,6 +14,7 @@ def __init__( node_rank: int = 1, num_nodes: int = 1, global_rank: int = 0, + local_rank: int = 0, sanity_serving: bool = False, cloud_compute: Optional[CloudCompute] = None, **kwargs, @@ -34,6 +25,7 @@ def __init__( self.node_rank = node_rank self.num_nodes = num_nodes self.global_rank = global_rank + self.local_rank = local_rank self.best_model_path: None self.best_model_score = None self.sanity_serving = sanity_serving @@ -51,20 +43,33 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): print(f"The node {self.node_rank} started !") return + import torch.distributed as dist + print(f"Internal URLS: {internal_urls}") master_address = str(internal_urls[0][0]) master_port = str(internal_urls[0][1]) devices = self.cloud_compute.devices + world_size = self.num_nodes * devices distributed_env_vars = { "NODE_RANK": str(self.node_rank), - "LOCAL_RANK": str(self.global_rank), + "LOCAL_RANK": str(self.local_rank), "GLOBAL_RANK": str(self.global_rank), "MASTER_ADDRESS": master_address, "MASTER_PORT": master_port, - "WORLD_SIZE": str(self.num_nodes * devices), + "WORLD_SIZE": str(world_size), } print(distributed_env_vars) + + backend = "gloo" if 
self.cloud_compute.accelerator == "cpu" else "nccl" + + dist.init_process_group( + backend=backend, + init_method=f"tcp://{master_address}:{master_port}", + world_size=world_size, + rank=self.global_rank, + ) + os.environ.update(distributed_env_vars) return super().run() @@ -79,8 +84,10 @@ def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): callbacks = kwargs.get("callbacks", []) if self.sanity_serving: callbacks = callbacks + [ServableModuleValidator()] - callbacks += [IntrospectionCallback()] kwargs["callbacks"] = callbacks + kwargs["devices"] = self.cloud_compute.devices + kwargs["num_nodes"] = self.num_nodes + kwargs["accelerator"] = "auto" return {}, args, kwargs @@ -110,9 +117,11 @@ def run(self): devices = self._cloud_compute.devices global_rank = (node_rank + 1) * devices - 1 if node_rank else 0 work_node_rank = node_rank + local_rank = 0 else: global_rank = node_rank work_node_rank = 0 + local_rank = node_rank self.ws[str(node_rank)] = _LightningTrainerWork( script_path=self.script_path, @@ -122,6 +131,7 @@ def run(self): global_rank=global_rank, sanity_serving=self.sanity_serving, num_nodes=self.num_nodes, + local_rank=local_rank, ) self.has_initialized = True diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index a36d875c85982..075c0c24e86a9 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -63,3 +63,8 @@ def from_dict(cls, d): def devices(self) -> int: # TODO: Add a resolver here. return 1 + + @property + def accelerator(self) -> str: + # TODO: Add a resolver here. + return self.name From 6ddd07abb8dd65d1bea36dd1005a739b6e20c189 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 19:40:09 +0200 Subject: [PATCH 05/40] update --- MANIFEST.in | 3 --- examples/app_multi_node/.lightning | 2 +- setup.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index c22e9b09d4985..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,6 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 2e661fac0e588..7befcc74ea6d3 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '18' +name: multi-node-demo diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ From acfb717cedd2eb1625021b25d27d093672aaa580 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:03:14 +0200 Subject: [PATCH 06/40] update --- examples/app_multi_node/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 51db0a4080d9d..0d63abb675005 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - 
cloud_compute=CloudCompute("gpu"), + cloud_compute=CloudCompute("gpu", preemptible=True), ), ) From 9dc18644ce798b7a29dca5a4f33e3d38097a71b9 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:11:04 +0200 Subject: [PATCH 07/40] update --- examples/app_multi_node/.lightning | 2 +- examples/app_multi_node/app.py | 2 +- src/lightning_app/components/training.py | 22 ++++++++++++++++--- .../utilities/packaging/cloud_compute.py | 8 +++++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7befcc74ea6d3..7e3ebdf752f5c 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: multi-node-demo +name: '22' diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 0d63abb675005..2829ea3b157c9 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("gpu", preemptible=True), + cloud_compute=CloudCompute("gpu-fast-multi", preemptible=True), ), ) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 3e919eef56312..e118aab4b78c9 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -1,3 +1,4 @@ +import logging import os from typing import List, Optional, Tuple, Union @@ -5,6 +6,8 @@ from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript +_logger = logging.getLogger(__name__) + class _LightningTrainerWork(TracerPythonScript): def __init__( @@ -40,12 +43,12 @@ def configure_tracer(self): def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): if not internal_urls: - print(f"The node {self.node_rank} started !") + _logger.info(f"The node {self.node_rank} started !") return import torch.distributed as dist - print(f"Internal URLS: {internal_urls}") + _logger.debug(f"Internal URLS: {internal_urls}") master_address = str(internal_urls[0][0]) master_port = str(internal_urls[0][1]) devices = self.cloud_compute.devices @@ -59,7 +62,7 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): "MASTER_PORT": master_port, "WORLD_SIZE": str(world_size), } - print(distributed_env_vars) + _logger.info(distributed_env_vars) backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" @@ -100,6 +103,16 @@ def __init__( cloud_compute: CloudCompute = CloudCompute("cpu"), sanity_serving: bool = False, ): + """This component enables to perform distributed training. + + Arguments: + script_path: Path to the script to be executed. + script_args: The arguments to be pass to the script. + num_nodes: Number of nodes. + cloud_compute: The cloud compute object used in the cloud. 
+ sanity_serving: Whether to validate the model correctly implements + the ServableModule API + """ super().__init__() self.ws = structures.Dict() self.has_initialized = False @@ -109,6 +122,9 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute self.sanity_serving = sanity_serving + if not self.is_running_in_cloud and num_nodes > 1: + _logger.info(f"This app is running locally, `num_nodes` would be mapped to devices * {num_nodes}.") + def run(self): if not self.has_initialized: for node_rank in range(self.num_nodes): diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 075c0c24e86a9..d181cd32204ec 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -1,6 +1,12 @@ from dataclasses import asdict, dataclass from typing import List, Optional, Union +_name_to_devices_map = { + "gpu": 1, + "gpu-fast": 1, + "gpu-fast-multi": 4, +} + @dataclass class CloudCompute: @@ -62,6 +68,8 @@ def from_dict(cls, d): @property def devices(self) -> int: # TODO: Add a resolver here. + if self.name in _name_to_devices_map: + return _name_to_devices_map[self.name] return 1 @property From abce33af56b89318568173e0f586883e9efa0a0a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:11:30 +0200 Subject: [PATCH 08/40] update --- examples/app_multi_node/.lightning | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7e3ebdf752f5c..7befcc74ea6d3 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '22' +name: multi-node-demo From 6e00b78e8092582322a0056957d3fc03dad3fe22 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:24:44 +0200 Subject: [PATCH 09/40] update --- src/lightning_app/utilities/packaging/cloud_compute.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index d181cd32204ec..64f4fbe8bce01 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -7,6 +7,11 @@ "gpu-fast-multi": 4, } +_short_name_to_instance_map = { + "gpu-fast": "p3.2xlarge", + "gpu-fast-multi": "p3.8xlarge", +} + @dataclass class CloudCompute: @@ -58,6 +63,9 @@ def __post_init__(self): self.name = self.name.lower() + if self.name in _short_name_to_instance_map: + self.name = _short_name_to_instance_map[self.name] + def to_dict(self): return {"__cloud_compute__": asdict(self)} From aa548c94497f728cec43c585e619e15ac070461a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 21:10:03 +0200 Subject: [PATCH 10/40] update --- examples/app_multi_node/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index d312d86bb780a..b5e83d905047d 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -3,5 +3,5 @@ if __name__ == "__main__": model = BoringModel() - trainer = Trainer(max_epochs=1, devices=2, strategy="ddp") + trainer = Trainer(max_epochs=1, strategy="ddp") trainer.fit(model) From 7b8e831602668a91a513a45b520b49f87cafe16e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:26:45 +0200 Subject: [PATCH 11/40] update --- 
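A note on the diff below: the work exports the distributed rendezvous and trainer settings as environment variables rather than forwarding them to the Trainer as keyword arguments. A minimal sketch of that handoff, with purely illustrative values (the address, port and counts are placeholders, not taken from the patch; torch.distributed and Lightning read MASTER_ADDR/MASTER_PORT, whereas the diff exports MASTER_ADDRESS):

    import os

    # Illustrative values; in the app they come from the works' internal IPs and ports.
    os.environ.update({
        "MASTER_ADDR": "10.0.0.1",
        "MASTER_PORT": "29500",
        "NODE_RANK": "0",
        "WORLD_SIZE": "8",                  # num_nodes * devices
        "PL_TRAINER_NUM_NODES": "2",
        "PL_TRAINER_DEVICES": "4",
        "PL_TRAINER_STRATEGY": "ddp",
        "PL_TRAINER_ACCELERATOR": "auto",
    })

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel

    # Assumes the Trainer picks up PL_TRAINER_* defaults from the environment,
    # which is what the component relies on instead of passing kwargs.
    trainer = Trainer(max_epochs=1)
    trainer.fit(BoringModel())
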
MANIFEST.in | 60 ++++++++++ examples/app_multi_node/.lightning | 2 +- examples/app_multi_node/app.py | 2 +- examples/app_multi_node/train.py | 1 + setup.py | 2 +- src/lightning_app/components/training.py | 108 ++++++++++++++---- .../utilities/packaging/cloud_compute.py | 9 -- src/pytorch_lightning/accelerators/cuda.py | 4 +- src/pytorch_lightning/strategies/ddp.py | 10 +- .../strategies/launchers/subprocess_script.py | 7 ++ .../connectors/accelerator_connector.py | 33 ++++++ src/pytorch_lightning/trainer/trainer.py | 12 ++ .../utilities/distributed.py | 3 + 13 files changed, 218 insertions(+), 35 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..297119d8e20b3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,63 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7befcc74ea6d3..ccc66f8a00a3a 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: multi-node-demo +name: '59' diff --git 
a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 2829ea3b157c9..fc92b83647860 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("gpu-fast-multi", preemptible=True), + cloud_compute=CloudCompute("gpu-fast-multi"), ), ) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index b5e83d905047d..ec82459279640 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -4,4 +4,5 @@ if __name__ == "__main__": model = BoringModel() trainer = Trainer(max_epochs=1, strategy="ddp") + print("Strategy", trainer.strategy.__dict__) trainer.fit(model) diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index e118aab4b78c9..c9cfdb55415f3 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -5,11 +5,57 @@ from lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript +from pytorch_lightning.plugins.environments import ClusterEnvironment _logger = logging.getLogger(__name__) -class _LightningTrainerWork(TracerPythonScript): +class _Environment(ClusterEnvironment): + + def __init__(self, main_address, main_port, world_size, global_rank, node_rank): + self._main_address = main_address + self._main_port = main_port + self._world_size = world_size + self._global_rank = global_rank + self._node_rank = node_rank + self._local_rank = None + + def detect(self): + return True + + @property + def creates_processes_externally(self) -> bool: + return False + + @property + def main_address(self): + return self._main_address + + @property + def main_port(self) -> int: + return self._main_port + + def global_rank(self): + return self._global_rank + + def node_rank(self) -> int: + return self._node_rank + + def world_size(self): + return self._world_size + + def set_world_size(self, size: int) -> None: + self._world_size = size + + def set_global_rank(self, rank: int) -> None: + self._global_rank = rank + + def local_rank(self): + if self._local_rank is None: + return 0 + return self._local_rank + +class PyTorchLightningPythonScript(TracerPythonScript): def __init__( self, script_path: str, @@ -33,6 +79,9 @@ def __init__( self.best_model_score = None self.sanity_serving = sanity_serving self.has_finished = False + self.master_address = None + self.master_port = None + self.world_size = None def configure_tracer(self): from pytorch_lightning import Trainer @@ -49,29 +98,34 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): import torch.distributed as dist _logger.debug(f"Internal URLS: {internal_urls}") - master_address = str(internal_urls[0][0]) - master_port = str(internal_urls[0][1]) + + self.master_address = str(internal_urls[0][0]) + self.master_port = str(internal_urls[0][1]) devices = self.cloud_compute.devices - world_size = self.num_nodes * devices + self.world_size = 
self.num_nodes * devices + + backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" distributed_env_vars = { + "MASTER_ADDRESS": self.master_address, + "MASTER_PORT": self.master_port, "NODE_RANK": str(self.node_rank), - "LOCAL_RANK": str(self.local_rank), - "GLOBAL_RANK": str(self.global_rank), - "MASTER_ADDRESS": master_address, - "MASTER_PORT": master_port, - "WORLD_SIZE": str(world_size), + "WORLD_SIZE": str(self.world_size), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_STRATEGY": "ddp", + "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), + "PL_TRAINER_ACCELERATOR": "auto", } _logger.info(distributed_env_vars) - backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" + # backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" - dist.init_process_group( - backend=backend, - init_method=f"tcp://{master_address}:{master_port}", - world_size=world_size, - rank=self.global_rank, - ) + # dist.init_process_group( + # backend=backend, + # init_method=f"tcp://{master_address}:{master_port}", + # world_size=world_size, + # rank=self.global_rank, + # ) os.environ.update(distributed_env_vars) return super().run() @@ -88,11 +142,25 @@ def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): if self.sanity_serving: callbacks = callbacks + [ServableModuleValidator()] kwargs["callbacks"] = callbacks - kwargs["devices"] = self.cloud_compute.devices - kwargs["num_nodes"] = self.num_nodes + if self.is_running_in_cloud: + kwargs["num_nodes"] = self.num_nodes + kwargs["devices"] = self.cloud_compute.devices + else: + kwargs["num_nodes"] = 1 kwargs["accelerator"] = "auto" + # kwargs["plugins"] = _Environment( + # main_address=self.master_address, + # main_port=self.master_port, + # world_size=self.world_size, + # global_rank=self.global_rank, + # node_rank=self.node_rank, + # ) return {}, args, kwargs + @property + def is_running_in_cloud(self) -> bool: + return "LIGHTNING_APP_STATE_URL" in os.environ + class LightningTrainingComponent(LightningFlow): def __init__( @@ -131,7 +199,7 @@ def run(self): if self.is_running_in_cloud: devices = self._cloud_compute.devices - global_rank = (node_rank + 1) * devices - 1 if node_rank else 0 + global_rank = node_rank * devices if node_rank else 0 work_node_rank = node_rank local_rank = 0 else: @@ -139,7 +207,7 @@ def run(self): work_node_rank = 0 local_rank = node_rank - self.ws[str(node_rank)] = _LightningTrainerWork( + self.ws[str(node_rank)] = PyTorchLightningPythonScript( script_path=self.script_path, script_args=self.script_args, cloud_compute=self._cloud_compute, diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 64f4fbe8bce01..41158f0c2a57c 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -7,12 +7,6 @@ "gpu-fast-multi": 4, } -_short_name_to_instance_map = { - "gpu-fast": "p3.2xlarge", - "gpu-fast-multi": "p3.8xlarge", -} - - @dataclass class CloudCompute: """ @@ -63,9 +57,6 @@ def __post_init__(self): self.name = self.name.lower() - if self.name in _short_name_to_instance_map: - self.name = _short_name_to_instance_map[self.name] - def to_dict(self): return {"__cloud_compute__": asdict(self)} diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index a474ef9a99031..4d6b9bebc2d25 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ 
b/src/pytorch_lightning/accelerators/cuda.py @@ -79,7 +79,9 @@ def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: @staticmethod def get_parallel_devices(devices: List[int]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - return [torch.device("cuda", i) for i in devices] + parallel_devices = [torch.device("cuda", i) for i in devices] + print("get_parallel_devices", parallel_devices) + return parallel_devices @staticmethod def auto_device_count() -> int: diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 922730df35269..8fe2fcf3cd9db 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -83,8 +83,8 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[callable] = None, - ddp_comm_wrapper: Optional[callable] = None, + ddp_comm_hook: Optional[Callable] = None, + ddp_comm_wrapper: Optional[Callable] = None, model_averaging_period: Optional[int] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, @@ -216,9 +216,15 @@ def _get_process_group_backend(self) -> str: def set_world_ranks(self) -> None: if self.cluster_environment is None: return + print(f"node_rank: {self.node_rank}") + print(f"num_processes: {self.num_processes}") + print(f"local_rank: {self.local_rank}") + print("num_nodes", self.num_nodes) self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) rank_zero_only.rank = self.cluster_environment.global_rank() + print(f"global_rank: {rank_zero_only.rank}") + print("world_size", self.cluster_environment.world_size()) def pre_configure_ddp(self) -> None: # if unset, default `find_unused_parameters` `True` diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 5a8632fb87306..9630eb812ce5a 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -88,8 +88,11 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer`. **kwargs: Optional keyword arguments to be passed to the given function. 
""" + print("creates_processes_externally", self.cluster_environment.creates_processes_externally) if not self.cluster_environment.creates_processes_externally: + print("_call_children_scripts") self._call_children_scripts() + print("After creating") return function(*args, **kwargs) def _call_children_scripts(self) -> None: @@ -130,6 +133,8 @@ def _call_children_scripts(self) -> None: env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" + print(f"Creating {local_rank} {env_copy}") + # remove env var if global seed not set if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: del env_copy["PL_GLOBAL_SEED"] @@ -149,6 +154,8 @@ def _call_children_scripts(self) -> None: delay = np.random.uniform(1, 5, 1)[0] sleep(delay) + print("done !") + def _check_can_spawn_children(self) -> None: if self.cluster_environment.local_rank() != 0: raise RuntimeError( diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index dc8594bfd7021..bd4dee270c05c 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -151,6 +151,8 @@ def __init__( A. Class > str B. Strategy > Accelerator/precision/plugins """ + print("Accelerator Connector", num_nodes, devices, accelerator, strategy) + if deterministic: if benchmark is None: # Set benchmark to False to ensure determinism @@ -188,6 +190,8 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus + print("1") + self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, @@ -197,9 +201,17 @@ def __init__( amp_level=amp_level, sync_batchnorm=sync_batchnorm, ) + + print("2") + + + self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) + + print("3") + # 2. Instantiate Accelerator # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() @@ -207,9 +219,13 @@ def __init__( self._accelerator_flag = self._choose_accelerator() self._set_parallel_devices_and_init_accelerator() + print("4") + # 3. Instantiate ClusterEnvironment self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment() + print("5") + # 4. Instantiate Strategy - Part 1 if self._strategy_flag is None: self._strategy_flag = self._choose_strategy() @@ -217,12 +233,18 @@ def __init__( self._check_strategy_and_fallback() self._init_strategy() + print("6") + # 5. Instantiate Precision Plugin self.precision_plugin = self._check_and_init_precision() + print("7") + # 6. 
Instantiate Strategy - Part 2 self._lazy_init_strategy() + print("8") + def _init_deterministic(self, deterministic: Optional[Union[bool, _LITERAL_WARN]]) -> None: self.deterministic = deterministic or False # default to False if not set if _TORCH_GREATER_EQUAL_1_11 and deterministic == "warn": @@ -530,10 +552,12 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._devices_flag = self.accelerator.parse_devices(self._devices_flag) if not self._parallel_devices: self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) + print("Right there", self._parallel_devices) def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or self._devices_flag is None: self._devices_flag = self.accelerator.auto_device_count() + print(f"Auto device {self._devices_flag}") def _set_devices_flag_if_auto_select_gpus_passed(self) -> None: if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, CUDAAccelerator): @@ -770,24 +794,33 @@ def _validate_precision_choice(self) -> None: def _lazy_init_strategy(self) -> None: """Lazily set missing attributes on the previously instantiated strategy.""" + print("a") self.strategy.accelerator = self.accelerator if self.precision_plugin: self.strategy.precision_plugin = self.precision_plugin if self.checkpoint_io: self.strategy.checkpoint_io = self.checkpoint_io + print("b", self.cluster_environment) if hasattr(self.strategy, "cluster_environment"): self.strategy.cluster_environment = self.cluster_environment if hasattr(self.strategy, "parallel_devices"): + print("c", self.strategy.parallel_devices) if self.strategy.parallel_devices: self._parallel_devices = self.strategy.parallel_devices else: + print("c1") + #print(self._parallel_devices, os.environ) self.strategy.parallel_devices = self._parallel_devices + print("c2") if hasattr(self.strategy, "num_nodes"): + print("d", self._num_nodes_flag) self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "_layer_sync"): self.strategy._layer_sync = self._layer_sync if hasattr(self.strategy, "set_world_ranks"): + print("e") self.strategy.set_world_ranks() + print("f") self.strategy._configure_launcher() from pytorch_lightning.utilities import _IS_INTERACTIVE diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index d10225fea2d65..19ee61483e6ca 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -430,6 +430,8 @@ def __init__( # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) + print('before accelerator_connector') + self._accelerator_connector = AcceleratorConnector( num_processes=num_processes, devices=devices, @@ -449,12 +451,16 @@ def __init__( amp_level=amp_level, plugins=plugins, ) + print('after accelerator_connector') + self._logger_connector = LoggerConnector(self) self._callback_connector = CallbackConnector(self) self._checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) + print('_parse_loop_limits') + min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( min_steps, max_steps, min_epochs, max_epochs, max_time ) @@ -462,6 +468,8 @@ def __init__( training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) + print('TrainingEpochLoop') + # default .fit() loop self.fit_loop = fit_loop @@ -483,6 
+491,8 @@ def __init__( self._tested_ckpt_path: Optional[str] = None # TODO: remove in v1.8 self._predicted_ckpt_path: Optional[str] = None # TODO: remove in v1.8 + print('on_trainer_init') + # init callbacks # Declare attributes to be set in _callback_connector on_trainer_init self._callback_connector.on_trainer_init( @@ -507,6 +517,8 @@ def __init__( check_val_every_n_epoch, ) + print('on_trainer_init') + # gradient clipping if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)): raise TypeError(f"`gradient_clip_val` should be an int or a float. Got {gradient_clip_val}.") diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 361c6dd12beeb..63c43319b0b55 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -366,6 +366,9 @@ def init_dist_connection( if torch.distributed.is_initialized(): log.debug("torch.distributed is already initialized. Exiting early") return + + print(cluster_environment) + global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() os.environ["MASTER_ADDR"] = cluster_environment.main_address From b0a3c529dc93b2ef96a877c5509566af523f512c Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:27:12 +0200 Subject: [PATCH 12/40] update --- src/lightning_app/components/training.py | 2 +- .../utilities/packaging/cloud_compute.py | 1 + .../trainer/connectors/accelerator_connector.py | 4 +--- src/pytorch_lightning/trainer/trainer.py | 12 ++++++------ 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index c9cfdb55415f3..7698964d68312 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -11,7 +11,6 @@ class _Environment(ClusterEnvironment): - def __init__(self, main_address, main_port, world_size, global_rank, node_rank): self._main_address = main_address self._main_port = main_port @@ -55,6 +54,7 @@ def local_rank(self): return 0 return self._local_rank + class PyTorchLightningPythonScript(TracerPythonScript): def __init__( self, diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 41158f0c2a57c..d181cd32204ec 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -7,6 +7,7 @@ "gpu-fast-multi": 4, } + @dataclass class CloudCompute: """ diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index bd4dee270c05c..e19e211b64d16 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -204,8 +204,6 @@ def __init__( print("2") - - self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) @@ -809,7 +807,7 @@ def _lazy_init_strategy(self) -> None: self._parallel_devices = self.strategy.parallel_devices else: print("c1") - #print(self._parallel_devices, os.environ) + # print(self._parallel_devices, os.environ) self.strategy.parallel_devices = self._parallel_devices print("c2") if hasattr(self.strategy, "num_nodes"): 
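For reference while following the rank and world-size bookkeeping printed above, a small worked example of the arithmetic that DDPStrategy.set_world_ranks applies, assuming 2 nodes with 4 processes each purely for illustration:

    num_nodes, num_processes = 2, 4          # illustrative cluster shape

    # mirrors cluster_environment.set_world_size(num_nodes * num_processes)
    world_size = num_nodes * num_processes   # 8
    for node_rank in range(num_nodes):
        for local_rank in range(num_processes):
            # mirrors cluster_environment.set_global_rank(node_rank * num_processes + local_rank)
            global_rank = node_rank * num_processes + local_rank
            print(f"node={node_rank} local={local_rank} -> global={global_rank}/{world_size}")

    # Node 0 owns global ranks 0-3 and node 1 owns ranks 4-7; every process reaches
    # global rank 0 through MASTER_ADDR/MASTER_PORT to form the process group.
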
diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 19ee61483e6ca..affc7b3f9d140 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -430,7 +430,7 @@ def __init__( # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) - print('before accelerator_connector') + print("before accelerator_connector") self._accelerator_connector = AcceleratorConnector( num_processes=num_processes, @@ -451,7 +451,7 @@ def __init__( amp_level=amp_level, plugins=plugins, ) - print('after accelerator_connector') + print("after accelerator_connector") self._logger_connector = LoggerConnector(self) self._callback_connector = CallbackConnector(self) @@ -459,7 +459,7 @@ def __init__( self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) - print('_parse_loop_limits') + print("_parse_loop_limits") min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( min_steps, max_steps, min_epochs, max_epochs, max_time @@ -468,7 +468,7 @@ def __init__( training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) - print('TrainingEpochLoop') + print("TrainingEpochLoop") # default .fit() loop self.fit_loop = fit_loop @@ -491,7 +491,7 @@ def __init__( self._tested_ckpt_path: Optional[str] = None # TODO: remove in v1.8 self._predicted_ckpt_path: Optional[str] = None # TODO: remove in v1.8 - print('on_trainer_init') + print("on_trainer_init") # init callbacks # Declare attributes to be set in _callback_connector on_trainer_init @@ -517,7 +517,7 @@ def __init__( check_val_every_n_epoch, ) - print('on_trainer_init') + print("on_trainer_init") # gradient clipping if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)): From 04fe16d1623b18e3bf5707bee3fe6eca7b620453 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:52:31 +0200 Subject: [PATCH 13/40] update --- MANIFEST.in | 12 +++ examples/app_multi_node/.lightning | 2 +- src/lightning_app/components/training.py | 89 ++++--------------- .../strategies/launchers/subprocess_script.py | 13 +++ 4 files changed, 42 insertions(+), 74 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 297119d8e20b3..1b7de078d6b11 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -63,3 +63,15 @@ recursive-include src/lightning_app/cli/*-template * recursive-include src *.md recursive-include requirements *.txt recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index ccc66f8a00a3a..d6917f3c735da 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '59' +name: '62' diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 7698964d68312..87530f5cf5072 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -5,56 +5,10 @@ from 
lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript -from pytorch_lightning.plugins.environments import ClusterEnvironment _logger = logging.getLogger(__name__) -class _Environment(ClusterEnvironment): - def __init__(self, main_address, main_port, world_size, global_rank, node_rank): - self._main_address = main_address - self._main_port = main_port - self._world_size = world_size - self._global_rank = global_rank - self._node_rank = node_rank - self._local_rank = None - - def detect(self): - return True - - @property - def creates_processes_externally(self) -> bool: - return False - - @property - def main_address(self): - return self._main_address - - @property - def main_port(self) -> int: - return self._main_port - - def global_rank(self): - return self._global_rank - - def node_rank(self) -> int: - return self._node_rank - - def world_size(self): - return self._world_size - - def set_world_size(self, size: int) -> None: - self._world_size = size - - def set_global_rank(self, rank: int) -> None: - self._global_rank = rank - - def local_rank(self): - if self._local_rank is None: - return 0 - return self._local_rank - - class PyTorchLightningPythonScript(TracerPythonScript): def __init__( self, @@ -87,7 +41,7 @@ def configure_tracer(self): from pytorch_lightning import Trainer tracer = super().configure_tracer() - tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + # tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): @@ -95,8 +49,6 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): _logger.info(f"The node {self.node_rank} started !") return - import torch.distributed as dist - _logger.debug(f"Internal URLS: {internal_urls}") self.master_address = str(internal_urls[0][0]) @@ -104,8 +56,6 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): devices = self.cloud_compute.devices self.world_size = self.num_nodes * devices - backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" - distributed_env_vars = { "MASTER_ADDRESS": self.master_address, "MASTER_PORT": self.master_port, @@ -117,16 +67,6 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): "PL_TRAINER_ACCELERATOR": "auto", } _logger.info(distributed_env_vars) - - # backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" - - # dist.init_process_group( - # backend=backend, - # init_method=f"tcp://{master_address}:{master_port}", - # world_size=world_size, - # rank=self.global_rank, - # ) - os.environ.update(distributed_env_vars) return super().run() @@ -136,18 +76,21 @@ def on_after_run(self, script_globals): raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): - from pytorch_lightning.serve import ServableModuleValidator - - callbacks = kwargs.get("callbacks", []) - if self.sanity_serving: - callbacks = callbacks + [ServableModuleValidator()] - kwargs["callbacks"] = callbacks - if self.is_running_in_cloud: - kwargs["num_nodes"] = self.num_nodes - kwargs["devices"] = self.cloud_compute.devices - else: - kwargs["num_nodes"] = 1 - kwargs["accelerator"] = "auto" + if self.node_rank != 0 : + return {}, args, kwargs + + # from pytorch_lightning.serve import ServableModuleValidator + + # callbacks = kwargs.get("callbacks", []) + # if self.sanity_serving: + # callbacks = callbacks + 
[ServableModuleValidator()] + # kwargs["callbacks"] = callbacks + # if self.is_running_in_cloud: + # kwargs["num_nodes"] = self.num_nodes + # kwargs["devices"] = self.cloud_compute.devices + # else: + # kwargs["num_nodes"] = 1 + # kwargs["accelerator"] = "auto" # kwargs["plugins"] = _Environment( # main_address=self.master_address, # main_port=self.master_port, diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 9630eb812ce5a..822645a4f89f0 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -96,17 +96,23 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] return function(*args, **kwargs) def _call_children_scripts(self) -> None: + print("1") # bookkeeping of spawned processes self._check_can_spawn_children() + print("2") # DDP Environment variables os.environ["MASTER_ADDR"] = self.cluster_environment.main_address os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) + print("3") + # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank()) + print("4") + # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c` # See https://docs.python.org/3/reference/import.html#main-spec if __main__.__spec__ is None: # pragma: no-cover @@ -127,9 +133,16 @@ def _call_children_scripts(self) -> None: else: # Script called as `python -m a.b.c` command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] + print("5") + os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" + print("call_children_scripts", os.environ, self.num_processes) + + print("6") + for local_rank in range(1, self.num_processes): + print("7") env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" From 0e1b06e2ccb30735742d804b27a1e95befdd4a38 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:54:22 +0200 Subject: [PATCH 14/40] update --- MANIFEST.in | 72 ------------------------ setup.py | 2 +- src/lightning_app/components/training.py | 2 +- 3 files changed, 2 insertions(+), 74 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 1b7de078d6b11..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,75 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md 
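For context on the `_call_children_scripts` changes above: stripped of the debug prints, the spawning pattern is essentially the following sketch (the script name and process counts are placeholders, not values from the patch)::

    import os
    import subprocess
    import sys

    # One child process per additional local rank; each child gets its own
    # LOCAL_RANK while inheriting the rest of the parent's environment.
    num_processes, num_nodes = 2, 1
    os.environ["WORLD_SIZE"] = str(num_processes * num_nodes)
    for local_rank in range(1, num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = str(local_rank)
        subprocess.Popen([sys.executable, "train.py"], env=env_copy)  # placeholder script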
-recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 87530f5cf5072..194b070388404 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -76,7 +76,7 @@ def on_after_run(self, script_globals): raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): - if self.node_rank != 0 : + if self.node_rank != 0: return {}, args, kwargs # from pytorch_lightning.serve import ServableModuleValidator From a389de67bd7246ba223938cd61449910f72d9666 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 18:23:35 +0200 Subject: [PATCH 15/40] update --- examples/app_multi_node/train.py | 1 - src/lightning_app/components/training.py | 55 ++++--------------- .../strategies/launchers/subprocess_script.py | 14 +++-- .../utilities/distributed.py | 10 +++- 4 files changed, 27 insertions(+), 53 deletions(-) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index ec82459279640..b5e83d905047d 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -4,5 +4,4 @@ if __name__ == "__main__": model = BoringModel() trainer = 
Trainer(max_epochs=1, strategy="ddp") - print("Strategy", trainer.strategy.__dict__) trainer.fit(model) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 194b070388404..e464098087164 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -16,8 +16,6 @@ def __init__( script_args: Optional[Union[list, str]] = None, node_rank: int = 1, num_nodes: int = 1, - global_rank: int = 0, - local_rank: int = 0, sanity_serving: bool = False, cloud_compute: Optional[CloudCompute] = None, **kwargs, @@ -27,8 +25,6 @@ def __init__( ) self.node_rank = node_rank self.num_nodes = num_nodes - self.global_rank = global_rank - self.local_rank = local_rank self.best_model_path: None self.best_model_score = None self.sanity_serving = sanity_serving @@ -41,7 +37,7 @@ def configure_tracer(self): from pytorch_lightning import Trainer tracer = super().configure_tracer() - # tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): @@ -65,7 +61,9 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): "PL_TRAINER_STRATEGY": "ddp", "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), "PL_TRAINER_ACCELERATOR": "auto", + "PL_TORCH_DISTRIBUTED_BACKEND": "gloo", } + _logger.info(distributed_env_vars) os.environ.update(distributed_env_vars) return super().run() @@ -79,25 +77,12 @@ def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): if self.node_rank != 0: return {}, args, kwargs - # from pytorch_lightning.serve import ServableModuleValidator - - # callbacks = kwargs.get("callbacks", []) - # if self.sanity_serving: - # callbacks = callbacks + [ServableModuleValidator()] - # kwargs["callbacks"] = callbacks - # if self.is_running_in_cloud: - # kwargs["num_nodes"] = self.num_nodes - # kwargs["devices"] = self.cloud_compute.devices - # else: - # kwargs["num_nodes"] = 1 - # kwargs["accelerator"] = "auto" - # kwargs["plugins"] = _Environment( - # main_address=self.master_address, - # main_port=self.master_port, - # world_size=self.world_size, - # global_rank=self.global_rank, - # node_rank=self.node_rank, - # ) + from pytorch_lightning.serve import ServableModuleValidator + + callbacks = kwargs.get("callbacks", []) + if self.sanity_serving: + callbacks = callbacks + [ServableModuleValidator()] + kwargs["callbacks"] = callbacks return {}, args, kwargs @property @@ -133,32 +118,16 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute self.sanity_serving = sanity_serving - if not self.is_running_in_cloud and num_nodes > 1: - _logger.info(f"This app is running locally, `num_nodes` would be mapped to devices * {num_nodes}.") - def run(self): if not self.has_initialized: for node_rank in range(self.num_nodes): - - if self.is_running_in_cloud: - devices = self._cloud_compute.devices - global_rank = node_rank * devices if node_rank else 0 - work_node_rank = node_rank - local_rank = 0 - else: - global_rank = node_rank - work_node_rank = 0 - local_rank = node_rank - self.ws[str(node_rank)] = PyTorchLightningPythonScript( script_path=self.script_path, script_args=self.script_args, cloud_compute=self._cloud_compute, - node_rank=work_node_rank, - global_rank=global_rank, + node_rank=node_rank, sanity_serving=self.sanity_serving, num_nodes=self.num_nodes, - local_rank=local_rank, ) 
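The rank bookkeeping that this simplification leans on reduces to two formulas; a minimal sketch with illustrative helper names (not taken from the patch)::

    def world_size(num_nodes: int, devices_per_node: int) -> int:
        # every node contributes one process per device
        return num_nodes * devices_per_node

    def global_rank(node_rank: int, local_rank: int, devices_per_node: int) -> int:
        return node_rank * devices_per_node + local_rank

    assert world_size(num_nodes=2, devices_per_node=4) == 8
    assert global_rank(node_rank=1, local_rank=2, devices_per_node=4) == 6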
self.has_initialized = True @@ -175,7 +144,3 @@ def run(self): @property def ready(self) -> bool: return all(w.internal_ip for w in self.ws.values()) - - @property - def is_running_in_cloud(self) -> bool: - return "LIGHTNING_APP_STATE_URL" in os.environ diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 822645a4f89f0..46c00342dbfb4 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -133,12 +133,10 @@ def _call_children_scripts(self) -> None: else: # Script called as `python -m a.b.c` command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] - print("5") + print("5", self.num_processes, self.num_nodes) os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" - print("call_children_scripts", os.environ, self.num_processes) - print("6") for local_rank in range(1, self.num_processes): @@ -146,7 +144,7 @@ def _call_children_scripts(self) -> None: env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" - print(f"Creating {local_rank} {env_copy}") + print(f"Creating {local_rank}") # remove env var if global seed not set if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: @@ -154,19 +152,23 @@ def _call_children_scripts(self) -> None: # start process # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None + cwd: Optional[str] = os.getcwd() if _HYDRA_AVAILABLE: if HydraConfig.initialized(): cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] - subprocess.Popen(command, env=env_copy, cwd=cwd) + + print(command, cwd) + process = subprocess.Popen(command, env=env_copy, cwd=cwd, stderr=sys.stderr) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds delay = np.random.uniform(1, 5, 1)[0] sleep(delay) + print(process.returncode) + print("done !") def _check_can_spawn_children(self) -> None: diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 63c43319b0b55..903fe300a2de5 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -374,7 +374,15 @@ def init_dist_connection( os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) + torch.distributed.init_process_group( + torch_distributed_backend, + init_method=f"tcp://{cluster_environment.main_address}:{cluster_environment.main_port}", + rank=global_rank, + world_size=world_size, + **kwargs, + ) + + print("HERE") # on rank=0 let everyone know training is starting new_rank_zero_info( From 3c9d1f80dc8e82d73ab725c79a941f1b0a346bae Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:42:37 +0200 Subject: [PATCH 16/40] update --- MANIFEST.in | 51 ++++++++++++++++++ examples/app_multi_node/.lightning | 2 +- examples/app_multi_node/app.py | 2 +- setup.py | 2 +- src/lightning_app/components/training.py | 49 +++++++++-------- .../utilities/packaging/cloud_compute.py | 2 + src/pytorch_lightning/CHANGELOG.md | 6 
+++ src/pytorch_lightning/accelerators/cuda.py | 10 +--- src/pytorch_lightning/cli.py | 2 +- src/pytorch_lightning/lite/lite.py | 7 +-- src/pytorch_lightning/loggers/comet.py | 32 ++++++------ src/pytorch_lightning/loggers/csv_logs.py | 2 +- src/pytorch_lightning/loggers/mlflow.py | 4 +- src/pytorch_lightning/loggers/tensorboard.py | 2 +- src/pytorch_lightning/loggers/wandb.py | 2 +- src/pytorch_lightning/strategies/ddp.py | 10 +--- .../strategies/launchers/subprocess_script.py | 26 +--------- .../connectors/accelerator_connector.py | 52 ++++++------------- src/pytorch_lightning/trainer/trainer.py | 12 ----- .../utilities/distributed.py | 13 +---- src/pytorch_lightning/utilities/enums.py | 4 +- src/pytorch_lightning/utilities/logger.py | 6 ++- 22 files changed, 144 insertions(+), 154 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..4b0d22529d613 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,54 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index d6917f3c735da..9202c95a897f9 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '62' +name: '92' diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index fc92b83647860..586924062c45c 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ 
LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("gpu-fast-multi"), + cloud_compute=CloudCompute("cpu"), ), ) diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index e464098087164..7d3ac7903fc32 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -1,15 +1,16 @@ import logging import os -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union from lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript +from lightning_app.storage.path import Path _logger = logging.getLogger(__name__) -class PyTorchLightningPythonScript(TracerPythonScript): +class PyTorchLightningScriptRunner(TracerPythonScript): def __init__( self, script_path: str, @@ -25,13 +26,10 @@ def __init__( ) self.node_rank = node_rank self.num_nodes = num_nodes - self.best_model_path: None + self.best_model_path = None self.best_model_score = None self.sanity_serving = sanity_serving self.has_finished = False - self.master_address = None - self.master_port = None - self.world_size = None def configure_tracer(self): from pytorch_lightning import Trainer @@ -45,32 +43,37 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): _logger.info(f"The node {self.node_rank} started !") return - _logger.debug(f"Internal URLS: {internal_urls}") - - self.master_address = str(internal_urls[0][0]) - self.master_port = str(internal_urls[0][1]) - devices = self.cloud_compute.devices - self.world_size = self.num_nodes * devices + master_address = str(internal_urls[0][0]) + master_port = str(internal_urls[0][1]) distributed_env_vars = { - "MASTER_ADDRESS": self.master_address, - "MASTER_PORT": self.master_port, + "MASTER_ADDR": master_address, + "MASTER_PORT": master_port, "NODE_RANK": str(self.node_rank), - "WORLD_SIZE": str(self.world_size), + "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), "PL_TRAINER_NUM_NODES": str(self.num_nodes), "PL_TRAINER_STRATEGY": "ddp", "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), "PL_TRAINER_ACCELERATOR": "auto", - "PL_TORCH_DISTRIBUTED_BACKEND": "gloo", } - - _logger.info(distributed_env_vars) os.environ.update(distributed_env_vars) return super().run() def on_after_run(self, script_globals): - # TODO: Why does it hang there. + from pytorch_lightning import Trainer + from pytorch_lightning.utilities.cli import LightningCLI + + cli = [v for v in script_globals.values() if isinstance(v, LightningCLI)] + if cli: + trainer = cli[0].trainer + else: + trainer = [v for v in script_globals.values() if isinstance(v, Trainer)][0] + + if trainer.checkpoint_callback.best_model_score: + self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) + self.best_model_score = float(trainer.checkpoint_callback.best_model_score) self.has_finished = True + # TODO: Why does it hang there. 
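A self-contained sketch of the pattern `on_after_run` uses above, i.e. fishing the user's `Trainer` (or `LightningCLI`) out of the traced script globals and reading its checkpoint callback; the classes below are stand-ins so the snippet runs without pytorch_lightning installed::

    class _Checkpoint:  # stand-in for a ModelCheckpoint callback
        best_model_path = "epoch=0-step=10.ckpt"
        best_model_score = 0.42

    class _Trainer:  # stand-in for pytorch_lightning.Trainer
        checkpoint_callback = _Checkpoint()

    script_globals = {"model": object(), "trainer": _Trainer()}
    trainer = next(v for v in script_globals.values() if isinstance(v, _Trainer))
    if trainer.checkpoint_callback.best_model_score:
        print(trainer.checkpoint_callback.best_model_path)
        print(float(trainer.checkpoint_callback.best_model_score))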
raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): @@ -96,10 +99,11 @@ def __init__( script_path: str, script_args: Optional[Union[list, str]] = None, num_nodes: int = 1, - cloud_compute: CloudCompute = CloudCompute("cpu"), + cloud_compute: CloudCompute = CloudCompute("default"), sanity_serving: bool = False, + script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, ): - """This component enables to perform distributed training. + """This component enables to perform distributed multi-node multi-gpus training. Arguments: script_path: Path to the script to be executed. @@ -117,11 +121,12 @@ def __init__( self.num_nodes = num_nodes self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute self.sanity_serving = sanity_serving + self._script_runner = script_runner def run(self): if not self.has_initialized: for node_rank in range(self.num_nodes): - self.ws[str(node_rank)] = PyTorchLightningPythonScript( + self.ws[str(node_rank)] = self._script_runner( script_path=self.script_path, script_args=self.script_args, cloud_compute=self._cloud_compute, diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index d181cd32204ec..dcec4cf858828 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -2,6 +2,8 @@ from typing import List, Optional, Union _name_to_devices_map = { + "default": 2, + "cpu": 2, "gpu": 1, "gpu-fast": 1, "gpu-fast-multi": 4, diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index af53c9b063853..b2c7ca54e68a7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -110,6 +110,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) + + - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) @@ -166,6 +169,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
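The behaviour described in the changelog entry above amounts to roughly this backend resolution (a sketch, not the actual implementation; the `getattr` guard is only there so it also runs on torch builds without an MPS backend)::

    import torch

    def choose_gpu_backend() -> str:
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        raise RuntimeError("No supported gpu backend found!")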
### Deprecated +- Deprecated `pytorch_lightning.accelerators.gpu.GPUAccelerator` in favor of `pytorch_lightning.accelerators.cuda.CUDAAccelerator` ([#13636](https://github.com/Lightning-AI/lightning/pull/13636)) + + - Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index 4d6b9bebc2d25..1c69015546976 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -79,9 +79,7 @@ def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: @staticmethod def get_parallel_devices(devices: List[int]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - parallel_devices = [torch.device("cuda", i) for i in devices] - print("get_parallel_devices", parallel_devices) - return parallel_devices + return [torch.device("cuda", i) for i in devices] @staticmethod def auto_device_count() -> int: @@ -99,12 +97,6 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - # temporarily enable "gpu" to point to the CUDA Accelerator - accelerator_registry.register( - "gpu", - cls, - description=f"{cls.__class__.__name__}", - ) def teardown(self) -> None: # clean up memory diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index 169f16b66cd33..d3990d79c5c88 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -27,7 +27,7 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn -_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.10.2") +_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.12.0") if _JSONARGPARSE_SIGNATURES_AVAILABLE: import docstring_parser diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 86bddaf676e01..0195e6852eb28 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -54,7 +54,8 @@ class LightningLite(ABC): - Multi-node support. Args: - accelerator: The hardware to run on. Possible choices are: ``"cpu"``, ``"gpu"``, ``"tpu"``, ``"auto"``. + accelerator: The hardware to run on. Possible choices are: + ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. strategy: Strategy for how to run across multiple devices. Possible choices are: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``. devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``. 
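As a reminder of what `get_parallel_devices` returns after the cleanup above: it simply maps device indices to `torch.device` objects, which can be constructed even without a CUDA build::

    import torch

    devices = [0, 1]
    parallel_devices = [torch.device("cuda", i) for i in devices]
    print(parallel_devices)  # [device(type='cuda', index=0), device(type='cuda', index=1)]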
@@ -436,7 +437,7 @@ def _get_distributed_sampler(dataloader: DataLoader, **kwargs: Any) -> Distribut return DistributedSamplerWrapper(dataloader.sampler, **kwargs) def _check_accelerator_support(self, accelerator: Optional[Union[str, Accelerator]]) -> None: - supported = [t.value.lower() for t in self._supported_device_types()] + ["auto"] + supported = [t.value.lower() for t in self._supported_device_types()] + ["gpu", "auto"] valid = accelerator is None or isinstance(accelerator, Accelerator) or accelerator in supported if not valid: raise MisconfigurationException( @@ -457,7 +458,7 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N def _supported_device_types() -> Sequence[_AcceleratorType]: return ( _AcceleratorType.CPU, - _AcceleratorType.GPU, + _AcceleratorType.CUDA, _AcceleratorType.TPU, _AcceleratorType.MPS, ) diff --git a/src/pytorch_lightning/loggers/comet.py b/src/pytorch_lightning/loggers/comet.py index 2b853f59259ff..363d47c1166e6 100644 --- a/src/pytorch_lightning/loggers/comet.py +++ b/src/pytorch_lightning/loggers/comet.py @@ -21,7 +21,7 @@ from argparse import Namespace from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union -from torch import is_tensor, Tensor +from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment @@ -141,7 +141,7 @@ def __init__( prefix: str = "", agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, - **kwargs, + **kwargs: Any, ): if comet_ml is None: raise ModuleNotFoundError( @@ -149,6 +149,8 @@ def __init__( ) super().__init__(agg_key_funcs=agg_key_funcs, agg_default_func=agg_default_func) self._experiment = None + self._save_dir: Optional[str] + self.rest_api_key: Optional[str] # Determine online or offline mode based on which arguments were passed to CometLogger api_key = api_key or comet_ml.config.get_api_key(None, comet_ml.config.get_config()) @@ -170,12 +172,12 @@ def __init__( log.info(f"CometLogger will be initialized in {self.mode} mode") - self._project_name = project_name - self._experiment_key = experiment_key - self._experiment_name = experiment_name - self._prefix = prefix - self._kwargs = kwargs - self._future_experiment_key = None + self._project_name: Optional[str] = project_name + self._experiment_key: Optional[str] = experiment_key + self._experiment_name: Optional[str] = experiment_name + self._prefix: str = prefix + self._kwargs: Any = kwargs + self._future_experiment_key: Optional[str] = None if rest_api_key is not None: # Comet.ml rest API, used to determine version number @@ -185,9 +187,7 @@ def __init__( self.rest_api_key = None self.comet_api = None - self._kwargs = kwargs - - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> Union[CometExperiment, CometExistingExperiment, CometOfflineExperiment]: r""" @@ -240,19 +240,19 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_parameters(params) @rank_zero_only - def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, Union[Tensor, float]], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" # Comet.ml expects metrics to be a dictionary of detached tensors on CPU metrics_without_epoch = metrics.copy() for key, val in 
metrics_without_epoch.items(): - if is_tensor(val): + if isinstance(val, Tensor): metrics_without_epoch[key] = val.cpu().detach() epoch = metrics_without_epoch.pop("epoch", None) metrics_without_epoch = _add_prefix(metrics_without_epoch, self._prefix, self.LOGGER_JOIN_CHAR) self.experiment.log_metrics(metrics_without_epoch, step=step, epoch=epoch) - def reset_experiment(self): + def reset_experiment(self) -> None: self._experiment = None @rank_zero_only @@ -326,7 +326,7 @@ def version(self) -> str: return self._future_experiment_key - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: state = self.__dict__.copy() # Save the experiment id in case an experiment object already exists, @@ -340,6 +340,6 @@ def __getstate__(self): state["_experiment"] = None return state - def log_graph(self, model: "pl.LightningModule", input_array=None) -> None: + def log_graph(self, model: "pl.LightningModule", input_array: Optional[Tensor] = None) -> None: if self._experiment is not None: self._experiment.set_model_graph(model) diff --git a/src/pytorch_lightning/loggers/csv_logs.py b/src/pytorch_lightning/loggers/csv_logs.py index 72d21ae2c4974..45d5fffb51e33 100644 --- a/src/pytorch_lightning/loggers/csv_logs.py +++ b/src/pytorch_lightning/loggers/csv_logs.py @@ -195,7 +195,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_hparams(params) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) self.experiment.log_metrics(metrics, step) if step is not None and (step + 1) % self._flush_logs_every_n_steps == 0: diff --git a/src/pytorch_lightning/loggers/mlflow.py b/src/pytorch_lightning/loggers/mlflow.py index 313fcfe07f10e..5675a3bd9fc67 100644 --- a/src/pytorch_lightning/loggers/mlflow.py +++ b/src/pytorch_lightning/loggers/mlflow.py @@ -20,7 +20,7 @@ import re from argparse import Namespace from time import time -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Mapping, Optional, Union from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment from pytorch_lightning.utilities.imports import _module_available @@ -230,7 +230,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_param(self.run_id, k, v) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 12ec2e21b84ce..dacecf129523b 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -216,7 +216,7 @@ def log_hyperparams( writer.add_summary(sei) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/loggers/wandb.py 
b/src/pytorch_lightning/loggers/wandb.py index bc2a84dc82b00..8e30827759b99 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -379,7 +379,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.config.update(params, allow_val_change=True) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 8fe2fcf3cd9db..922730df35269 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -83,8 +83,8 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[Callable] = None, - ddp_comm_wrapper: Optional[Callable] = None, + ddp_comm_hook: Optional[callable] = None, + ddp_comm_wrapper: Optional[callable] = None, model_averaging_period: Optional[int] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, @@ -216,15 +216,9 @@ def _get_process_group_backend(self) -> str: def set_world_ranks(self) -> None: if self.cluster_environment is None: return - print(f"node_rank: {self.node_rank}") - print(f"num_processes: {self.num_processes}") - print(f"local_rank: {self.local_rank}") - print("num_nodes", self.num_nodes) self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) rank_zero_only.rank = self.cluster_environment.global_rank() - print(f"global_rank: {rank_zero_only.rank}") - print("world_size", self.cluster_environment.world_size()) def pre_configure_ddp(self) -> None: # if unset, default `find_unused_parameters` `True` diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 46c00342dbfb4..5a8632fb87306 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -88,31 +88,22 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer`. **kwargs: Optional keyword arguments to be passed to the given function. 
""" - print("creates_processes_externally", self.cluster_environment.creates_processes_externally) if not self.cluster_environment.creates_processes_externally: - print("_call_children_scripts") self._call_children_scripts() - print("After creating") return function(*args, **kwargs) def _call_children_scripts(self) -> None: - print("1") # bookkeeping of spawned processes self._check_can_spawn_children() - print("2") # DDP Environment variables os.environ["MASTER_ADDR"] = self.cluster_environment.main_address os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) - print("3") - # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank()) - print("4") - # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c` # See https://docs.python.org/3/reference/import.html#main-spec if __main__.__spec__ is None: # pragma: no-cover @@ -133,44 +124,31 @@ def _call_children_scripts(self) -> None: else: # Script called as `python -m a.b.c` command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] - print("5", self.num_processes, self.num_nodes) - os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" - print("6") - for local_rank in range(1, self.num_processes): - print("7") env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" - print(f"Creating {local_rank}") - # remove env var if global seed not set if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: del env_copy["PL_GLOBAL_SEED"] # start process # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = os.getcwd() + cwd: Optional[str] = None if _HYDRA_AVAILABLE: if HydraConfig.initialized(): cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] - - print(command, cwd) - process = subprocess.Popen(command, env=env_copy, cwd=cwd, stderr=sys.stderr) + subprocess.Popen(command, env=env_copy, cwd=cwd) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds delay = np.random.uniform(1, 5, 1)[0] sleep(delay) - print(process.returncode) - - print("done !") - def _check_can_spawn_children(self) -> None: if self.cluster_environment.local_rank() != 0: raise RuntimeError( diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index e19e211b64d16..bd879cf85ff7a 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -151,8 +151,6 @@ def __init__( A. Class > str B. Strategy > Accelerator/precision/plugins """ - print("Accelerator Connector", num_nodes, devices, accelerator, strategy) - if deterministic: if benchmark is None: # Set benchmark to False to ensure determinism @@ -190,8 +188,6 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus - print("1") - self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, @@ -201,29 +197,23 @@ def __init__( amp_level=amp_level, sync_batchnorm=sync_batchnorm, ) - - print("2") - self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) - - print("3") - # 2. 
Instantiate Accelerator - # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() + + # handle `auto`, `None` and `gpu` if self._accelerator_flag == "auto" or self._accelerator_flag is None: - self._accelerator_flag = self._choose_accelerator() - self._set_parallel_devices_and_init_accelerator() + self._accelerator_flag = self._choose_auto_accelerator() + elif self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() - print("4") + self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment() - print("5") - # 4. Instantiate Strategy - Part 1 if self._strategy_flag is None: self._strategy_flag = self._choose_strategy() @@ -231,18 +221,12 @@ def __init__( self._check_strategy_and_fallback() self._init_strategy() - print("6") - # 5. Instantiate Precision Plugin self.precision_plugin = self._check_and_init_precision() - print("7") - # 6. Instantiate Strategy - Part 2 self._lazy_init_strategy() - print("8") - def _init_deterministic(self, deterministic: Optional[Union[bool, _LITERAL_WARN]]) -> None: self.deterministic = deterministic or False # default to False if not set if _TORCH_GREATER_EQUAL_1_11 and deterministic == "warn": @@ -300,7 +284,7 @@ def _check_config_and_set_final_flags( if ( accelerator is not None and accelerator not in self._accelerator_types - and accelerator != "auto" + and accelerator not in ("auto", "gpu") and not isinstance(accelerator, Accelerator) ): raise ValueError( @@ -507,7 +491,7 @@ def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: if isinstance(self._strategy_flag, IPUStrategy): self._accelerator_flag = "ipu" - def _choose_accelerator(self) -> str: + def _choose_auto_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if self._accelerator_flag == "auto": if _TPU_AVAILABLE: @@ -522,6 +506,15 @@ def _choose_accelerator(self) -> str: return "cuda" return "cpu" + @staticmethod + def _choose_gpu_accelerator_backend() -> str: + if MPSAccelerator.is_available(): + return "mps" + if CUDAAccelerator.is_available(): + return "cuda" + + raise MisconfigurationException("No supported gpu backend found!") + def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._accelerator_flag, Accelerator): self.accelerator: Accelerator = self._accelerator_flag @@ -550,12 +543,10 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._devices_flag = self.accelerator.parse_devices(self._devices_flag) if not self._parallel_devices: self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) - print("Right there", self._parallel_devices) def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or self._devices_flag is None: self._devices_flag = self.accelerator.auto_device_count() - print(f"Auto device {self._devices_flag}") def _set_devices_flag_if_auto_select_gpus_passed(self) -> None: if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, CUDAAccelerator): @@ -792,33 +783,24 @@ def _validate_precision_choice(self) -> None: def _lazy_init_strategy(self) -> None: """Lazily set missing attributes on the previously instantiated strategy.""" - print("a") self.strategy.accelerator = self.accelerator if self.precision_plugin: self.strategy.precision_plugin = self.precision_plugin if self.checkpoint_io: 
self.strategy.checkpoint_io = self.checkpoint_io - print("b", self.cluster_environment) if hasattr(self.strategy, "cluster_environment"): self.strategy.cluster_environment = self.cluster_environment if hasattr(self.strategy, "parallel_devices"): - print("c", self.strategy.parallel_devices) if self.strategy.parallel_devices: self._parallel_devices = self.strategy.parallel_devices else: - print("c1") - # print(self._parallel_devices, os.environ) self.strategy.parallel_devices = self._parallel_devices - print("c2") if hasattr(self.strategy, "num_nodes"): - print("d", self._num_nodes_flag) self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "_layer_sync"): self.strategy._layer_sync = self._layer_sync if hasattr(self.strategy, "set_world_ranks"): - print("e") self.strategy.set_world_ranks() - print("f") self.strategy._configure_launcher() from pytorch_lightning.utilities import _IS_INTERACTIVE diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index affc7b3f9d140..d10225fea2d65 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -430,8 +430,6 @@ def __init__( # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) - print("before accelerator_connector") - self._accelerator_connector = AcceleratorConnector( num_processes=num_processes, devices=devices, @@ -451,16 +449,12 @@ def __init__( amp_level=amp_level, plugins=plugins, ) - print("after accelerator_connector") - self._logger_connector = LoggerConnector(self) self._callback_connector = CallbackConnector(self) self._checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) - print("_parse_loop_limits") - min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( min_steps, max_steps, min_epochs, max_epochs, max_time ) @@ -468,8 +462,6 @@ def __init__( training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) - print("TrainingEpochLoop") - # default .fit() loop self.fit_loop = fit_loop @@ -491,8 +483,6 @@ def __init__( self._tested_ckpt_path: Optional[str] = None # TODO: remove in v1.8 self._predicted_ckpt_path: Optional[str] = None # TODO: remove in v1.8 - print("on_trainer_init") - # init callbacks # Declare attributes to be set in _callback_connector on_trainer_init self._callback_connector.on_trainer_init( @@ -517,8 +507,6 @@ def __init__( check_val_every_n_epoch, ) - print("on_trainer_init") - # gradient clipping if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)): raise TypeError(f"`gradient_clip_val` should be an int or a float. Got {gradient_clip_val}.") diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 903fe300a2de5..361c6dd12beeb 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -366,23 +366,12 @@ def init_dist_connection( if torch.distributed.is_initialized(): log.debug("torch.distributed is already initialized. 
Exiting early") return - - print(cluster_environment) - global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch.distributed.init_process_group( - torch_distributed_backend, - init_method=f"tcp://{cluster_environment.main_address}:{cluster_environment.main_port}", - rank=global_rank, - world_size=world_size, - **kwargs, - ) - - print("HERE") + torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) # on rank=0 let everyone know training is starting new_rank_zero_info( diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index 91f8466b77500..d7d3a14ec924a 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -244,7 +244,7 @@ class _AcceleratorType(LightningEnum): >>> _AcceleratorType.CPU == _AcceleratorType.from_str('cpu') True >>> # you can match the type with string - >>> _AcceleratorType.GPU == 'GPU' + >>> _AcceleratorType.CUDA == 'CUDA' True >>> # which is case invariant >>> _AcceleratorType.TPU in ('tpu', 'CPU') @@ -252,7 +252,7 @@ class _AcceleratorType(LightningEnum): """ CPU = "CPU" - GPU = "GPU" + CUDA = "CUDA" IPU = "IPU" TPU = "TPU" HPU = "HPU" diff --git a/src/pytorch_lightning/utilities/logger.py b/src/pytorch_lightning/utilities/logger.py index 07ecf4c3c0ca0..24d75e4f41034 100644 --- a/src/pytorch_lightning/utilities/logger.py +++ b/src/pytorch_lightning/utilities/logger.py @@ -14,7 +14,7 @@ """Utilities for loggers.""" from argparse import Namespace -from typing import Any, Dict, Generator, List, MutableMapping, Optional, Union +from typing import Any, Dict, Generator, List, Mapping, MutableMapping, Optional, Union import numpy as np import torch @@ -132,7 +132,9 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: return params -def _add_prefix(metrics: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: +def _add_prefix( + metrics: Mapping[str, Union[Tensor, float]], prefix: str, separator: str +) -> Mapping[str, Union[Tensor, float]]: """Insert prefix before each key in a dict, separated by the separator. 
Args: From fa28c53890db2946afba70647318dc8dfce4844e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:43:28 +0200 Subject: [PATCH 17/40] update --- MANIFEST.in | 51 --------------------------------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 52 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 4b0d22529d613..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,54 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ From a087275e42c27620cf84d85310cd1d3a842b415f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:45:42 +0200 Subject: [PATCH 18/40] update --- src/lightning_app/utilities/packaging/cloud_compute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index dcec4cf858828..e7b05cd6548aa 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ 
b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union _name_to_devices_map = { - "default": 2, - "cpu": 2, + "default": 1, + "cpu": 1, "gpu": 1, "gpu-fast": 1, "gpu-fast-multi": 4, From b8c1ff349e13a210bd50c51392626e859640d398 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:51:06 +0200 Subject: [PATCH 19/40] update --- examples/app_multi_node/.lightning | 2 +- src/lightning_app/components/training.py | 3 ++- src/lightning_app/utilities/packaging/cloud_compute.py | 6 ------ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 9202c95a897f9..7befcc74ea6d3 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '92' +name: multi-node-demo diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 7d3ac7903fc32..17d7e46b7d207 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -142,7 +142,8 @@ def run(self): internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] work.run(internal_urls) if all(w.has_finished for w in self.ws.values()): - self._exit("Finished training") + for w in self.ws.values(): + w.stop() else: work.run() diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index e7b05cd6548aa..0733e8cd816ed 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -69,12 +69,6 @@ def from_dict(cls, d): @property def devices(self) -> int: - # TODO: Add a resolver here. if self.name in _name_to_devices_map: return _name_to_devices_map[self.name] return 1 - - @property - def accelerator(self) -> str: - # TODO: Add a resolver here. 
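The `devices` resolution that survives this cleanup is a plain dictionary lookup with a single-device fallback, e.g.::

    _name_to_devices_map = {"default": 1, "cpu": 1, "gpu": 1, "gpu-fast": 1, "gpu-fast-multi": 4}

    def devices(name: str) -> int:
        # unknown instance names fall back to a single device
        return _name_to_devices_map.get(name, 1)

    assert devices("gpu-fast-multi") == 4
    assert devices("something-custom") == 1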
- return self.name From 7860ce406a109d94c40b8714526a96634b69b747 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:46:51 +0200 Subject: [PATCH 20/40] update --- tests/tests_app/components/test_training.py | 0 tests/tests_app_examples/test_multi_node.py | 29 +++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/tests_app/components/test_training.py create mode 100644 tests/tests_app_examples/test_multi_node.py diff --git a/tests/tests_app/components/test_training.py b/tests/tests_app/components/test_training.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_app_examples/test_multi_node.py b/tests/tests_app_examples/test_multi_node.py new file mode 100644 index 0000000000000..4b5c80c0cd9cb --- /dev/null +++ b/tests/tests_app_examples/test_multi_node.py @@ -0,0 +1,29 @@ +import os + +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import application_testing, LightningTestApp + + +class LightningTestMultiNodeApp(LightningTestApp): + def on_before_run_once(self): + res = super().on_before_run_once() + if all(w.has_finished for w in self.works): + return True + return res + + +def test_multi_node_example(): + cwd = os.getcwd() + new_cwd = os.path.join(_PROJECT_ROOT, "examples/app_multi_node") + os.chdir(new_cwd) + command_line = [ + "app.py", + "--blocking", + "False", + "--open-ui", + "False", + ] + result = application_testing(LightningTestMultiNodeApp, command_line) + assert result.exit_code == 0 + os.chdir(cwd) From 0701fc8e008139fec9329816404b3cb5da2e6c53 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:47:12 +0200 Subject: [PATCH 21/40] update --- tests/tests_app/components/test_training.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/tests_app/components/test_training.py diff --git a/tests/tests_app/components/test_training.py b/tests/tests_app/components/test_training.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From c0df7c342ac9ac1ef55ef9fcd660f641ba57e0c0 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:52:20 +0200 Subject: [PATCH 22/40] update --- .gitignore | 1 + examples/app_multi_node/app.py | 2 +- src/lightning_app/components/training.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7040a912974e1..0f03c69600bed 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ src/lightning_app/ui/* *examples/template_react_ui* hars* artifacts/* +*docs/examples* diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 586924062c45c..3750e67c485f3 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -1,6 +1,6 @@ from lightning import LightningApp from lightning.app.components.training import LightningTrainingComponent -from lightning_app.utilities.packaging.cloud_compute import CloudCompute +from lightning.app.utilities.packaging.cloud_compute import CloudCompute app = LightningApp( LightningTrainingComponent( diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 17d7e46b7d207..768d4a3b7f7e2 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -105,6 +105,20 @@ def __init__( ): """This component enables to perform distributed multi-node multi-gpus training. 
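Both the test added above and the flow's `run` loop key off the same completion check; in isolation it is just (stand-in `Work` class for illustration)::

    class Work:
        def __init__(self, has_finished: bool = False):
            self.has_finished = has_finished

    works = {"0": Work(has_finished=True), "1": Work(has_finished=True)}
    if all(w.has_finished for w in works.values()):
        print("all nodes finished, stopping the works")  # w.stop() on the real LightningWork objects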
+ Example:: + + from lightning import LightningApp + from lightning.app.components.training import LightningTrainingComponent + from lightning.app.utilities.packaging.cloud_compute import CloudCompute + + app = LightningApp( + LightningTrainingComponent( + "train.py", + num_nodes=2, + cloud_compute=CloudCompute("gpu"), + ), + ) + Arguments: script_path: Path to the script to be executed. script_args: The arguments to be pass to the script. From e596856a289979b9b23a2882ac104b88a7c822b4 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:55:24 +0200 Subject: [PATCH 23/40] update --- src/lightning_app/components/training.py | 31 ++++++++++++------------ 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 768d4a3b7f7e2..af5c904b8a83f 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -46,17 +46,18 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): master_address = str(internal_urls[0][0]) master_port = str(internal_urls[0][1]) - distributed_env_vars = { - "MASTER_ADDR": master_address, - "MASTER_PORT": master_port, - "NODE_RANK": str(self.node_rank), - "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), - "PL_TRAINER_NUM_NODES": str(self.num_nodes), - "PL_TRAINER_STRATEGY": "ddp", - "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), - "PL_TRAINER_ACCELERATOR": "auto", - } - os.environ.update(distributed_env_vars) + os.environ.update( + { + "MASTER_ADDR": master_address, + "MASTER_PORT": master_port, + "NODE_RANK": str(self.node_rank), + "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_STRATEGY": "ddp", + "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), + "PL_TRAINER_ACCELERATOR": "auto", + } + ) return super().run() def on_after_run(self, script_globals): @@ -103,7 +104,7 @@ def __init__( sanity_serving: bool = False, script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, ): - """This component enables to perform distributed multi-node multi-gpus training. + """This component enables to perform distributed multi-node multi-devices training. 
Example:: @@ -133,7 +134,7 @@ def __init__( self.script_path = script_path self.script_args = script_args self.num_nodes = num_nodes - self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute + self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner @@ -152,7 +153,7 @@ def run(self): self.has_initialized = True for work in self.ws.values(): - if self.ready: + if self._ready: internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] work.run(internal_urls) if all(w.has_finished for w in self.ws.values()): @@ -162,5 +163,5 @@ def run(self): work.run() @property - def ready(self) -> bool: + def _ready(self) -> bool: return all(w.internal_ip for w in self.ws.values()) From 253aa4323a483eeb46c24e86729ef90882ef2a4e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 15:59:28 +0200 Subject: [PATCH 24/40] update --- src/lightning_app/components/python/tracer.py | 32 +++++- src/lightning_app/components/training.py | 108 +++++++++++------- src/lightning_app/source_code/local.py | 1 + .../utilities/packaging/cloud_compute.py | 14 --- .../utilities/packaging/tarfile.py | 26 +++++ 5 files changed, 121 insertions(+), 60 deletions(-) create mode 100644 src/lightning_app/utilities/packaging/tarfile.py diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index fa955646acbbf..57f5b3cbdccf6 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -2,16 +2,23 @@ import os import signal import sys -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, TypedDict, Union from lightning_app import LightningWork +from lightning_app.storage.drive import Drive from lightning_app.storage.payload import Payload from lightning_app.utilities.app_helpers import _collect_child_process_pids +from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile from lightning_app.utilities.tracer import Tracer logger = logging.getLogger(__name__) +class Code(TypedDict): + drive: Drive + name: str + + class TracerPythonScript(LightningWork): def on_before_run(self): """Called before the python script is executed.""" @@ -101,13 +108,34 @@ def __init__( self.outputs = outputs or [] for name in self.outputs: setattr(self, name, None) + self.params = None + + def run(self, params: Optional[Dict[str, Any]] = None, code: Optional[Code] = None, **kwargs): + """ + Arguments: + params: A dictionary of arguments to be be added to script_args + code: A dictionary with a drive and a file name to get retrieve + """ + + if params: + self.params = params + self.script_args.extend([f"--{k}={v}" for k, v in params.items()]) + + if code: + raise Exception(code) + clean_tarfile(code["name"], "r:gz") + code["drive"].get(code["name"]) + extract_tarfile(code["name"], ".", "r:gz") + os.remove(code["name"]) - def run(self, **kwargs): if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") + kwargs = {k: v.value if isinstance(v, Payload) else v for k, v in kwargs.items()} + init_globals = globals() init_globals.update(kwargs) + self.on_before_run() env_copy = os.environ.copy() if self.env: diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index af5c904b8a83f..dea0bfeda5324 100644 --- a/src/lightning_app/components/training.py +++ 
b/src/lightning_app/components/training.py @@ -1,6 +1,6 @@ import logging import os -from typing import List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union from lightning import CloudCompute from lightning_app import LightningFlow, structures @@ -19,10 +19,18 @@ def __init__( num_nodes: int = 1, sanity_serving: bool = False, cloud_compute: Optional[CloudCompute] = None, + parallel: bool = True, + raise_exception: bool = True, + env: Optional[Dict[str, Any]] = None, **kwargs, ): super().__init__( - script_path, script_args, raise_exception=True, parallel=True, cloud_compute=cloud_compute, **kwargs + script_path, + script_args, + raise_exception=raise_exception, + parallel=parallel, + cloud_compute=cloud_compute, + **kwargs, ) self.node_rank = node_rank self.num_nodes = num_nodes @@ -30,6 +38,7 @@ def __init__( self.best_model_score = None self.sanity_serving = sanity_serving self.has_finished = False + self.env = env def configure_tracer(self): from pytorch_lightning import Trainer @@ -38,44 +47,46 @@ def configure_tracer(self): tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer - def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs): if not internal_urls: + # Note: This is called only once. _logger.info(f"The node {self.node_rank} started !") return - master_address = str(internal_urls[0][0]) - master_port = str(internal_urls[0][1]) - - os.environ.update( - { - "MASTER_ADDR": master_address, - "MASTER_PORT": master_port, - "NODE_RANK": str(self.node_rank), - "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), - "PL_TRAINER_NUM_NODES": str(self.num_nodes), - "PL_TRAINER_STRATEGY": "ddp", - "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), - "PL_TRAINER_ACCELERATOR": "auto", - } - ) - return super().run() + if self.env: + os.environ.update(self.env) + + distributed_env_vars = { + "MASTER_ADDR": internal_urls[0][0], + "MASTER_PORT": str(internal_urls[0][1]), + "NODE_RANK": str(self.node_rank), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_DEVICES": "auto", + "PL_TRAINER_ACCELERATOR": "auto", + } + + os.environ.update(distributed_env_vars) + return super().run(**kwargs) def on_after_run(self, script_globals): from pytorch_lightning import Trainer - from pytorch_lightning.utilities.cli import LightningCLI - - cli = [v for v in script_globals.values() if isinstance(v, LightningCLI)] - if cli: - trainer = cli[0].trainer + from pytorch_lightning.cli import LightningCLI + + for v in script_globals.values(): + if isinstance(v, LightningCLI): + trainer = v.trainer + break + elif isinstance(v, Trainer): + trainer = v + break else: - trainer = [v for v in script_globals.values() if isinstance(v, Trainer)][0] + raise RuntimeError("No trainer instance found.") if trainer.checkpoint_callback.best_model_score: self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) self.best_model_score = float(trainer.checkpoint_callback.best_model_score) + self.has_finished = True - # TODO: Why does it hang there. 
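With the environment variables above injected into each node's process, the traced script no longer needs any distributed configuration of its own: `MASTER_ADDR`, `MASTER_PORT` and `NODE_RANK` wire up the process group, while the `PL_TRAINER_*` variables act as defaults for the corresponding `Trainer` arguments. A minimal sketch of the user-facing script this enables, mirroring the simplified multi-node example later in the series (imports shown with the `pytorch_lightning` namespace used elsewhere in this component)::

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel

    if __name__ == "__main__":
        model = BoringModel()
        # num_nodes, devices and accelerator are picked up from the PL_TRAINER_*
        # environment variables set by PyTorchLightningScriptRunner above.
        trainer = Trainer(max_epochs=1)
        trainer.fit(model)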
- raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): if self.node_rank != 0: @@ -103,6 +114,7 @@ def __init__( cloud_compute: CloudCompute = CloudCompute("default"), sanity_serving: bool = False, script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, + **kwargs, ): """This component enables to perform distributed multi-node multi-devices training. @@ -129,7 +141,7 @@ def __init__( the ServableModule API """ super().__init__() - self.ws = structures.Dict() + self.ws = structures.List() self.has_initialized = False self.script_path = script_path self.script_args = script_args @@ -137,31 +149,39 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner + self._kwargs = kwargs - def run(self): + def run(self, **kwargs): if not self.has_initialized: for node_rank in range(self.num_nodes): - self.ws[str(node_rank)] = self._script_runner( - script_path=self.script_path, - script_args=self.script_args, - cloud_compute=self._cloud_compute, - node_rank=node_rank, - sanity_serving=self.sanity_serving, - num_nodes=self.num_nodes, + self.ws.append( + self._script_runner( + script_path=self.script_path, + script_args=self.script_args, + cloud_compute=self._cloud_compute, + node_rank=node_rank, + sanity_serving=self.sanity_serving, + num_nodes=self.num_nodes, + **self._kwargs, + ) ) self.has_initialized = True - for work in self.ws.values(): - if self._ready: - internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] - work.run(internal_urls) - if all(w.has_finished for w in self.ws.values()): - for w in self.ws.values(): + for work in self.ws: + if all(w.internal_ip for w in self.ws): + internal_urls = [(w.internal_ip, w.port) for w in self.ws] + work.run(internal_urls=internal_urls, **kwargs) + if all(w.has_finished for w in self.ws): + for w in self.ws: w.stop() else: work.run() @property - def _ready(self) -> bool: - return all(w.internal_ip for w in self.ws.values()) + def best_model_score(self) -> Optional[float]: + return self.ws[0].best_model_score + + @property + def best_model_paths(self) -> List[Optional[Path]]: + return [self.ws[node_idx].best_mode_path for node_idx in range(len(self.ws))] diff --git a/src/lightning_app/source_code/local.py b/src/lightning_app/source_code/local.py index a42347ac42101..05669dff2f6a5 100644 --- a/src/lightning_app/source_code/local.py +++ b/src/lightning_app/source_code/local.py @@ -94,6 +94,7 @@ def upload(self, url: str) -> None: raise OSError( "cannot upload directory code whose total fize size is greater than 2GB (2e9 bytes)" ) from None + uploader = FileUploader( presigned_url=url, source_file=str(self.package_path), diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 0733e8cd816ed..6527911855bae 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -1,14 +1,6 @@ from dataclasses import asdict, dataclass from typing import List, Optional, Union -_name_to_devices_map = { - "default": 1, - "cpu": 1, - "gpu": 1, - "gpu-fast": 1, - "gpu-fast-multi": 4, -} - @dataclass class CloudCompute: @@ -66,9 +58,3 @@ def to_dict(self): @classmethod def from_dict(cls, d): return cls(**d["__cloud_compute__"]) - - @property - def devices(self) -> int: - if self.name in _name_to_devices_map: - return _name_to_devices_map[self.name] - return 1 diff --git 
a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py new file mode 100644 index 0000000000000..6e8a6e52aecc7 --- /dev/null +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -0,0 +1,26 @@ +import os +import shutil +import tarfile + + +def clean_tarfile(file_path: str, mode): + if os.path.exists(file_path): + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + p = member.path + if p != "." and os.path.exists(p): + if os.path.isfile(p): + os.remove(p) + else: + shutil.rmtree(p) + os.remove(file_path) + + +def extract_tarfile(file_path: str, extract_path: str, mode: str): + if os.path.exists(file_path): + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + try: + tar_ref.extract(member, path=extract_path, set_attrs=False) + except PermissionError: + raise PermissionError(f"Could not extract tar file {file_path}") From 0373fc769b1591837ecaa1bec8bbe0e323591f40 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 16:11:01 +0200 Subject: [PATCH 25/40] update --- examples/app_multi_node/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index b5e83d905047d..f14809354f405 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -3,5 +3,5 @@ if __name__ == "__main__": model = BoringModel() - trainer = Trainer(max_epochs=1, strategy="ddp") + trainer = Trainer(max_epochs=1) trainer.fit(model) From 71f1cfec9851f9017a9f367e20c5e5ba30964592 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:41:34 +0200 Subject: [PATCH 26/40] update --- examples/app_multi_node/app.py | 2 +- src/lightning_app/components/python/tracer.py | 34 +++++++++---- src/lightning_app/components/training.py | 19 ++++--- src/lightning_app/testing/testing.py | 3 ++ src/lightning_app/utilities/network.py | 2 +- .../utilities/packaging/tarfile.py | 47 ++++++++++------- src/lightning_app/utilities/proxies.py | 5 +- src/lightning_app/utilities/state.py | 8 +++ .../components/python/test_python.py | 50 +++++++++++++++++++ 9 files changed, 133 insertions(+), 37 deletions(-) diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 3750e67c485f3..6e405a346a143 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("cpu"), + cloud_compute=CloudCompute("gpu-fast-multi"), ), ) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index 57f5b3cbdccf6..4b0b736b78b24 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -2,6 +2,7 @@ import os import signal import sys +from copy import deepcopy from typing import Any, Dict, List, Optional, TypedDict, Union from lightning_app import LightningWork @@ -38,6 +39,7 @@ def __init__( script_args: Optional[Union[list, str]] = None, outputs: Optional[List[str]] = None, env: Optional[Dict] = None, + code: Optional[Code] = None, **kwargs, ): """The TracerPythonScript class enables to easily run a python script. 
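The two helpers introduced in `tarfile.py` are the building blocks for shipping user code to a running work: `extract_tarfile` unpacks an uploaded archive next to the script, and `clean_tarfile` removes whatever a previous archive extracted, plus the archive itself, before the next run. A round-trip sketch, assuming an illustrative `code.tar.gz` built locally::

    import tarfile

    from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile

    # Package a script into an archive, as a client would before uploading it to a Drive.
    with tarfile.open("code.tar.gz", "w:gz") as tar:
        tar.add("train.py")

    # On the receiving side: unpack the archive into the working directory.
    extract_tarfile("code.tar.gz", ".", "r:gz")

    # Before the next run: remove the extracted members and the archive itself.
    clean_tarfile("code.tar.gz", "r:gz")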
@@ -104,29 +106,37 @@ def __init__( if isinstance(script_args, str): script_args = script_args.split(" ") self.script_args = script_args if script_args else [] + self.original_args = deepcopy(self.script_args) self.env = env self.outputs = outputs or [] for name in self.outputs: setattr(self, name, None) self.params = None + self._code = code + self.restart_count = 0 - def run(self, params: Optional[Dict[str, Any]] = None, code: Optional[Code] = None, **kwargs): + def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[int] = None, **kwargs): """ Arguments: params: A dictionary of arguments to be be added to script_args - code: A dictionary with a drive and a file name to get retrieve + code: A dictionary with a drive and a file name to retrieve """ + if restart_count: + self.restart_count = restart_count if params: self.params = params - self.script_args.extend([f"--{k}={v}" for k, v in params.items()]) + self.script_args = self.original_args + [self._to_script_args(k, v) for k, v in params.items()] - if code: - raise Exception(code) - clean_tarfile(code["name"], "r:gz") - code["drive"].get(code["name"]) - extract_tarfile(code["name"], ".", "r:gz") - os.remove(code["name"]) + if self._code: + drive = self._code["drive"] + name = self._code["name"] + if os.path.exists(name): + clean_tarfile(name, "r:gz") + + if name in drive.list(): + drive.get(name) + extract_tarfile(name, ".", "r:gz") if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") @@ -153,5 +163,11 @@ def on_exit(self): for child_pid in _collect_child_process_pids(os.getpid()): os.kill(child_pid, signal.SIGTERM) + @staticmethod + def _to_script_args(k: str, v: str) -> str: + if k.startswith("--"): + return f"{k}={v}" + return f"--{k}={v}" + __all__ = ["TracerPythonScript"] diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index dea0bfeda5324..91ba5786fbd7b 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -36,6 +36,7 @@ def __init__( self.num_nodes = num_nodes self.best_model_path = None self.best_model_score = None + self.monitor = None self.sanity_serving = sanity_serving self.has_finished = False self.env = env @@ -47,11 +48,11 @@ def configure_tracer(self): tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer - def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs): + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs) -> None: if not internal_urls: # Note: This is called only once. 
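Because `run()` now rebuilds `script_args` from `original_args` on every call, passing `params` stays idempotent across restarts instead of appending duplicate flags. A small sketch of the resulting behaviour, mirroring the unit test added in this patch (the script name and arguments are illustrative)::

    from lightning_app.components.python import TracerPythonScript
    from lightning_app.testing.testing import run_work_isolated

    work = TracerPythonScript("train.py", script_args=["--b=1"], raise_exception=False)

    # Each key becomes a "--key=value" flag (a leading "--" is added unless the key
    # already has one) appended to the original arguments on every run.
    run_work_isolated(work, params={"a": "1"}, restart_count=1)
    assert work.script_args == ["--b=1", "--a=1"]
    assert work.restart_count == 1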
_logger.info(f"The node {self.node_rank} started !") - return + return None if self.env: os.environ.update(self.env) @@ -82,9 +83,13 @@ def on_after_run(self, script_globals): else: raise RuntimeError("No trainer instance found.") + self.monitor = trainer.checkpoint_callback.monitor + if trainer.checkpoint_callback.best_model_score: self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) self.best_model_score = float(trainer.checkpoint_callback.best_model_score) + else: + self.best_model_path = Path(trainer.checkpoint_callback.last_model_path) self.has_finished = True @@ -114,7 +119,7 @@ def __init__( cloud_compute: CloudCompute = CloudCompute("default"), sanity_serving: bool = False, script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, - **kwargs, + **script_runner_kwargs, ): """This component enables to perform distributed multi-node multi-devices training. @@ -149,9 +154,9 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner - self._kwargs = kwargs + self._script_runner_kwargs = script_runner_kwargs - def run(self, **kwargs): + def run(self, **run_kwargs): if not self.has_initialized: for node_rank in range(self.num_nodes): self.ws.append( @@ -162,7 +167,7 @@ def run(self, **kwargs): node_rank=node_rank, sanity_serving=self.sanity_serving, num_nodes=self.num_nodes, - **self._kwargs, + **self._script_runner_kwargs, ) ) @@ -171,7 +176,7 @@ def run(self, **kwargs): for work in self.ws: if all(w.internal_ip for w in self.ws): internal_urls = [(w.internal_ip, w.port) for w in self.ws] - work.run(internal_urls=internal_urls, **kwargs) + work.run(internal_urls=internal_urls, **run_kwargs) if all(w.has_finished for w in self.ws): for w in self.ws: w.stop() diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index bdf37cacf04a7..cc03f5badec2b 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -23,6 +23,7 @@ from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.imports import _is_playwright_available, requires from lightning_app.utilities.network import _configure_session, LightningClient +from lightning_app.utilities.proxies import ProxyWorkRun if _is_playwright_available(): import playwright @@ -114,6 +115,8 @@ def run_work_isolated(work, *args, start_server: bool = False, **kwargs): # pop the stopped status. 
call_hash = work._calls["latest_call_hash"] work._calls[call_hash]["statuses"].pop(-1) + if isinstance(work.run, ProxyWorkRun): + work.run = work.run.work_run def browser_context_args(browser_context_args: Dict) -> Dict: diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 98c7db3d46ff8..a9ebcf37ab564 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -48,7 +48,7 @@ def _configure_session() -> Session: return http -def _check_service_url_is_ready(url: str, timeout: float = 0.5) -> bool: +def _check_service_url_is_ready(url: str, timeout: float = 1) -> bool: try: response = requests.get(url, timeout=timeout) return response.status_code in (200, 404) diff --git a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py index 6e8a6e52aecc7..ca945baf95ca8 100644 --- a/src/lightning_app/utilities/packaging/tarfile.py +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -3,24 +3,37 @@ import tarfile -def clean_tarfile(file_path: str, mode): +def clean_tarfile(file_path: str, mode: str) -> None: + """This utility removes all files extracted from a tarfile.""" + + if not os.path.exists(file_path): + return None + + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + p = member.path + if p == "." or not os.path.exists(p): + continue + try: + if os.path.isfile(p): + os.remove(p) + else: + shutil.rmtree(p) + except (FileNotFoundError, OSError, PermissionError): + pass + if os.path.exists(file_path): - with tarfile.open(file_path, mode=mode) as tar_ref: - for member in tar_ref.getmembers(): - p = member.path - if p != "." and os.path.exists(p): - if os.path.isfile(p): - os.remove(p) - else: - shutil.rmtree(p) os.remove(file_path) -def extract_tarfile(file_path: str, extract_path: str, mode: str): - if os.path.exists(file_path): - with tarfile.open(file_path, mode=mode) as tar_ref: - for member in tar_ref.getmembers(): - try: - tar_ref.extract(member, path=extract_path, set_attrs=False) - except PermissionError: - raise PermissionError(f"Could not extract tar file {file_path}") +def extract_tarfile(file_path: str, extract_path: str, mode: str) -> None: + """This utility extract all files from a tarfile.""" + if not os.path.exists(file_path): + return None + + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + try: + tar_ref.extract(member, path=extract_path, set_attrs=False) + except PermissionError: + raise PermissionError(f"Could not extract tar file {file_path}") diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index c33e41bb70203..ce7a768e78a0b 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -408,8 +408,9 @@ def run_once(self): persist_artifacts(work=self.work) # 15. Destroy the state observer. - self.state_observer.join(0) - self.state_observer = None + if self.state_observer: + self.state_observer.join(0) + self.state_observer = None # 15. An asynchronous work shouldn't return a return value. 
if ret is not None: diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 0802a426e7349..3c16a7b4cdb11 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -187,6 +187,14 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) + elif name in self._state.get("structures", {}): + return AppState( + self._host, + self._port, + last_state=self._last_state["structures"][name], + state=self._state["structures"][name], + ) + raise AttributeError( f"Failed to access '{name}' through `AppState`. The state provides:" f" Variables: {list(self._state['vars'].keys())}," diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 61969ef1c4c51..6f739d84c0a4b 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -1,11 +1,15 @@ import os +import tarfile import pytest from tests_app import _PROJECT_ROOT from lightning_app.components.python import PopenPythonScript, TracerPythonScript +from lightning_app.components.python.tracer import Code +from lightning_app.storage.drive import Drive from lightning_app.testing.helpers import RunIf from lightning_app.testing.testing import run_work_isolated +from lightning_app.utilities.component import _set_work_context COMPONENTS_SCRIPTS_FOLDER = str(os.path.join(_PROJECT_ROOT, "tests/tests_app/components/python/scripts/")) @@ -69,3 +73,49 @@ def test_tracer_python_script_with_kwargs(): ) run_work_isolated(python_script) assert python_script.has_failed + + +def test_tracer_with_code(): + + drive = Drive("lit://code") + drive.component_name = "something" + code = Code(drive=drive, name="sample.tar.gz") + + with open("file.py", "w") as f: + f.write('raise Exception("An error")') + + with tarfile.open("sample.tar.gz", "w:gz") as tar: + tar.add("file.py") + + drive.put("sample.tar.gz") + os.remove("file.py") + os.remove("sample.tar.gz") + + python_script = TracerPythonScript("file.py", script_args=["--b=1"], raise_exception=False, code=code) + run_work_isolated(python_script, params={"a": "1"}, restart_count=0) + assert python_script.status.message == "An error" + + with open("file.py", "w") as f: + f.write("import sys\n") + f.write("print(sys.argv)\n") + + with tarfile.open("sample.tar.gz", "w:gz") as tar: + tar.add("file.py") + + _set_work_context() + drive.put("sample.tar.gz") + os.remove("file.py") + os.remove("sample.tar.gz") + + with open("file.py", "w") as f: + f.write('raise Exception("An error")') + + call_hash = python_script._calls["latest_call_hash"] + python_script._calls[call_hash]["statuses"].pop(-1) + python_script._calls[call_hash]["statuses"].pop(-1) + + run_work_isolated(python_script, params={"a": "1"}, restart_count=1) + assert python_script.has_succeeded + assert python_script.script_args == ["--b=1", "--a=1"] + os.remove("file.py") + os.remove("sample.tar.gz") From e5a4a0911b4ee16b32b10bf4d2ae86aa62c830e2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:42:50 +0200 Subject: [PATCH 27/40] update --- src/lightning_app/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7d0dcb589b9e3..d34a16f4f4aaa 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
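Combined with the tarfile helpers, the `code` argument gives the tracer a simple code-shipping path: the caller archives the sources, uploads them to a `Drive`, and each run of the work pulls and unpacks the latest archive before executing the script, exactly as exercised by `test_tracer_with_code` above. A condensed sketch of that flow (file and drive names are illustrative)::

    import tarfile

    from lightning_app.components.python import TracerPythonScript
    from lightning_app.components.python.tracer import Code
    from lightning_app.storage.drive import Drive

    drive = Drive("lit://code")
    drive.component_name = "uploader"  # set explicitly, as in the test, since no app is running

    with tarfile.open("sample.tar.gz", "w:gz") as tar:
        tar.add("train.py")
    drive.put("sample.tar.gz")

    # The work downloads and extracts "sample.tar.gz" from the drive on each run,
    # so an updated archive is picked up without rebuilding the work itself.
    work = TracerPythonScript("train.py", code=Code(drive=drive, name="sample.tar.gz"))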
- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) +- Add `LightningTrainingComponent` that orchestrate multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) + ### Changed - Update the Lightning App docs ([#13537](https://github.com/PyTorchLightning/pytorch-lightning/pull/13537)) From 7cc1c3982c3f496ad8a845e8c0940b0b40109f79 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:46:08 +0200 Subject: [PATCH 28/40] update --- src/lightning_app/components/python/tracer.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index 4b0b736b78b24..b761e79f8a30a 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -112,14 +112,15 @@ def __init__( for name in self.outputs: setattr(self, name, None) self.params = None - self._code = code + self.drive = code.get("drive") if code else None + self.code_name = code.get("name") if code else None self.restart_count = 0 def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[int] = None, **kwargs): """ Arguments: - params: A dictionary of arguments to be be added to script_args - code: A dictionary with a drive and a file name to retrieve + params: A dictionary of arguments to be be added to script_args. + restart_count: Pass an incrementing counter to enable re-execution the work. """ if restart_count: self.restart_count = restart_count @@ -128,15 +129,14 @@ def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[i self.params = params self.script_args = self.original_args + [self._to_script_args(k, v) for k, v in params.items()] - if self._code: - drive = self._code["drive"] - name = self._code["name"] - if os.path.exists(name): - clean_tarfile(name, "r:gz") + if self.drive: + assert self.code_name + if os.path.exists(self.code_name): + clean_tarfile(self.code_name, "r:gz") - if name in drive.list(): - drive.get(name) - extract_tarfile(name, ".", "r:gz") + if self.code_name in self.drive.list(): + self.drive.get(self.code_name) + extract_tarfile(self.code_name, ".", "r:gz") if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") From eec5e6dbe6c1126091f0ceebdc80620b414fbf6d Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:48:31 +0200 Subject: [PATCH 29/40] update --- src/lightning_app/utilities/state.py | 8 -------- tests/tests_app/components/python/test_python.py | 4 +++- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 3c16a7b4cdb11..0802a426e7349 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -187,14 +187,6 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) - elif name in self._state.get("structures", {}): - return AppState( - self._host, - self._port, - last_state=self._last_state["structures"][name], - state=self._state["structures"][name], - ) - raise AttributeError( f"Failed to access '{name}' through `AppState`. 
The state provides:" f" Variables: {list(self._state['vars'].keys())}," diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 6f739d84c0a4b..85197f6f8f7ef 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -75,7 +75,9 @@ def test_tracer_python_script_with_kwargs(): assert python_script.has_failed -def test_tracer_with_code(): +def test_tracer_component_with_code(): + """This test ensures the Tracer Component gets the latest code from the code object is provided and arguments + are cleaned.""" drive = Drive("lit://code") drive.component_name = "something" From 17b8c96fd3415e67de15e2ff98cb5f5583faca13 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:51:29 +0200 Subject: [PATCH 30/40] update --- docs/source-app/api_reference/components.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source-app/api_reference/components.rst b/docs/source-app/api_reference/components.rst index 76a99402ddecc..c5f99f0f96629 100644 --- a/docs/source-app/api_reference/components.rst +++ b/docs/source-app/api_reference/components.rst @@ -20,5 +20,6 @@ ___________________ ~python.popen.PopenPythonScript ~python.tracer.TracerPythonScript + ~training.LightningTrainingComponent ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI From 01bcc2976a6888db08ab7bd44c2d1b6aa1f1e022 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:06:51 -0700 Subject: [PATCH 31/40] Update tests/tests_app/components/python/test_python.py --- tests/tests_app/components/python/test_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 85197f6f8f7ef..bcb176bdf0184 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -76,7 +76,7 @@ def test_tracer_python_script_with_kwargs(): def test_tracer_component_with_code(): - """This test ensures the Tracer Component gets the latest code from the code object is provided and arguments + """This test ensures the Tracer Component gets the latest code from the code object that is provided and arguments are cleaned.""" drive = Drive("lit://code") From ba572cb9b3f992abe27c5b8c752588f83676aad9 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:06:59 -0700 Subject: [PATCH 32/40] Update src/lightning_app/utilities/packaging/tarfile.py --- src/lightning_app/utilities/packaging/tarfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py index ca945baf95ca8..123e4e2e0942a 100644 --- a/src/lightning_app/utilities/packaging/tarfile.py +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -27,7 +27,7 @@ def clean_tarfile(file_path: str, mode: str) -> None: def extract_tarfile(file_path: str, extract_path: str, mode: str) -> None: - """This utility extract all files from a tarfile.""" + """This utility extracts all files from a tarfile.""" if not os.path.exists(file_path): return None From 2f5e4b0ffb80dfbcc92317e88a572e49a66d5e98 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:06 -0700 Subject: [PATCH 33/40] Update src/lightning_app/components/training.py --- src/lightning_app/components/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 91ba5786fbd7b..068fed6740084 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -142,7 +142,7 @@ def __init__( script_args: The arguments to be pass to the script. num_nodes: Number of nodes. cloud_compute: The cloud compute object used in the cloud. - sanity_serving: Whether to validate the model correctly implements + sanity_serving: Whether to validate that the model correctly implements the ServableModule API """ super().__init__() From 1e000e5a5416ee889d4df572df1fa4f9048946a7 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:15 -0700 Subject: [PATCH 34/40] Update src/lightning_app/components/training.py --- src/lightning_app/components/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 068fed6740084..9773fe9670e52 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -121,7 +121,7 @@ def __init__( script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, **script_runner_kwargs, ): - """This component enables to perform distributed multi-node multi-devices training. + """This component enables performing distributed multi-node multi-device training. Example:: From 4ff559fada5ddaf829f5b5fd7c22d81c8067ea50 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:22 -0700 Subject: [PATCH 35/40] Update src/lightning_app/components/python/tracer.py --- src/lightning_app/components/python/tracer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index b761e79f8a30a..b98c782e138e4 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -120,7 +120,7 @@ def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[i """ Arguments: params: A dictionary of arguments to be be added to script_args. - restart_count: Pass an incrementing counter to enable re-execution the work. + restart_count: Passes an incrementing counter to enable the re-execution of LightningWorks. """ if restart_count: self.restart_count = restart_count From 4c9efe49d0cfa1c2e9fc64a986f17244f5283764 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:29 -0700 Subject: [PATCH 36/40] Update src/lightning_app/CHANGELOG.md --- src/lightning_app/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index d34a16f4f4aaa..ab8eb25adea85 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,7 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) -- Add `LightningTrainingComponent` that orchestrate multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) +- Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) ### Changed From d49ed8f22859cdf5bac1cc3d3ebd24495e214e20 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Jul 2022 09:08:20 +0000 Subject: [PATCH 37/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_app/components/python/test_python.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index bcb176bdf0184..678655d6ee908 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -76,8 +76,8 @@ def test_tracer_python_script_with_kwargs(): def test_tracer_component_with_code(): - """This test ensures the Tracer Component gets the latest code from the code object that is provided and arguments - are cleaned.""" + """This test ensures the Tracer Component gets the latest code from the code object that is provided and + arguments are cleaned.""" drive = Drive("lit://code") drive.component_name = "something" From 3888ba15208656a0761e58c1bb1cf7071cf2dfda Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 12:49:17 +0200 Subject: [PATCH 38/40] update --- src/lightning_app/utilities/state.py | 13 ++++++++++++- tests/tests_app/utilities/test_state.py | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 0802a426e7349..c644f25439140 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -168,7 +168,7 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. self._request_state() - if name in self._state["vars"]: + if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict): return _maybe_create_drive("root." + ".".join(self._my_affiliation), value) @@ -187,12 +187,23 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) + elif name in self._state.get("structures", {}): + return AppState( + self._host, + self._port, + last_state=self._last_state["structures"][name], + state=self._state["structures"][name], + ) + raise AttributeError( f"Failed to access '{name}' through `AppState`. 
The state provides:" f" Variables: {list(self._state['vars'].keys())}," f" Components: {list(self._state.get('flows', {}).keys()) + list(self._state.get('works', {}).keys())}", ) + def __getitem__(self, key: str): + return self.__getattr__(key) + def __setattr__(self, name: str, value: Any) -> None: if name in self._APP_PRIVATE_KEYS: object.__setattr__(self, name, value) diff --git a/tests/tests_app/utilities/test_state.py b/tests/tests_app/utilities/test_state.py index 0740ffc615b87..49b68619cbecc 100644 --- a/tests/tests_app/utilities/test_state.py +++ b/tests/tests_app/utilities/test_state.py @@ -7,6 +7,7 @@ import lightning_app from lightning_app import LightningApp, LightningFlow, LightningWork +from lightning_app.structures import Dict, List from lightning_app.utilities.app_helpers import AppStatePlugin, BaseStatePlugin from lightning_app.utilities.state import AppState @@ -280,3 +281,22 @@ def test_app_state_with_no_env_var(**__): assert state._host == "http://127.0.0.1" assert state._port == 7501 assert state._url == "http://127.0.0.1:7501" + + +class FlowStructures(LightningFlow): + def __init__(self): + super().__init__() + self.w_list = List(Work(), Work()) + self.w_dict = Dict(**{"0": Work(), "1": Work()}) + + def run(self): + self._exit() + + +def test_app_state_with_structures(): + app = LightningApp(FlowStructures()) + state = AppState() + state._last_state = app.state + state._state = app.state + assert state.w_list["0"].counter == 0 + assert state.w_dict["0"].counter == 0 From a72397bfb7b84e404ef19c579d206a30c92821b2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 19:14:20 +0200 Subject: [PATCH 39/40] update --- src/lightning_app/runners/backends/backend.py | 1 - src/lightning_app/utilities/state.py | 49 ++++++++++++++++++- tests/tests_app/utilities/test_state.py | 23 ++++++++- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py index c370c7098b778..87bb103823fd2 100644 --- a/src/lightning_app/runners/backends/backend.py +++ b/src/lightning_app/runners/backends/backend.py @@ -87,7 +87,6 @@ def _prepare_queues(self, app): app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) app.delta_queue = self.queues.get_delta_queue(**kw) - app.error_queue = self.queues.get_error_queue(**kw) app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw) app.api_delta_queue = self.queues.get_api_delta_queue(**kw) app.request_queues = {} diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index c644f25439140..1242cbe1f622d 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -3,7 +3,7 @@ import logging import os from copy import deepcopy -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from deepdiff import DeepDiff from requests import Session @@ -168,6 +168,11 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. 
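The structures support and `__getitem__` added here, together with the `items()` and `__len__` helpers introduced just below, are what let a frontend walk the works held inside flow structures through `AppState`, the same access pattern `test_app_state_with_structures` exercises. A sketch of how a UI callback might consume them, assuming a flow that exposes a structures list of works under the attribute `ws` (as the training component does; the attribute read per work is illustrative)::

    from lightning_app.utilities.state import AppState

    def render(state: AppState) -> None:
        # Structures now behave like mappings: sized, indexable and iterable via items().
        print(f"{len(state.ws)} works")
        for name, work_state in state.ws.items():
            print(name, work_state.has_finished)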
self._request_state() + # import streamlit as st + + # st.write(name) + # st.write(self._state) + if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict): @@ -237,6 +242,48 @@ def __repr__(self) -> str: def __bool__(self) -> bool: return bool(self._state) + def __len__(self) -> int: + # The state needs to be fetched on access if it doesn't exist. + self._request_state() + + keys = [] + for component in ["flows", "works", "structures"]: + keys.extend(list(self._state.get(component, {}))) + return len(keys) + + def items(self) -> List[Dict[str, Any]]: + # The state needs to be fetched on access if it doesn't exist. + self._request_state() + + items = [] + for component in ["flows", "works"]: + state = self._state.get(component, {}) + last_state = self._last_state.get(component, {}) + for name, state_value in state.items(): + v = AppState( + self._host, + self._port, + last_state=last_state[name], + state=state_value, + ) + items.append((name, v)) + + structures = self._state.get("structures", {}) + last_structures = self._last_state.get("structures", {}) + if structures: + for component in ["flows", "works"]: + state = structures.get(component, {}) + last_state = last_structures.get(component, {}) + for name, state_value in state.items(): + v = AppState( + self._host, + self._port, + last_state=last_state[name], + state=state_value, + ) + items.append((name, v)) + return items + @staticmethod def _configure_session() -> Session: return _configure_session() diff --git a/tests/tests_app/utilities/test_state.py b/tests/tests_app/utilities/test_state.py index 49b68619cbecc..3b9f1b790cfc7 100644 --- a/tests/tests_app/utilities/test_state.py +++ b/tests/tests_app/utilities/test_state.py @@ -287,7 +287,17 @@ class FlowStructures(LightningFlow): def __init__(self): super().__init__() self.w_list = List(Work(), Work()) - self.w_dict = Dict(**{"0": Work(), "1": Work()}) + self.w_dict = Dict(**{"toto": Work(), "toto_2": Work()}) + + def run(self): + self._exit() + + +class FlowStructuresEmpty(LightningFlow): + def __init__(self): + super().__init__() + self.w_list = List() + self.w_dict = Dict() def run(self): self._exit() @@ -299,4 +309,13 @@ def test_app_state_with_structures(): state._last_state = app.state state._state = app.state assert state.w_list["0"].counter == 0 - assert state.w_dict["0"].counter == 0 + assert len(state.w_list) == 2 + assert state.w_dict["toto"].counter == 0 + assert [k for k, _ in state.w_dict.items()] == ["toto", "toto_2"] + assert [k for k, _ in state.w_list.items()] == ["0", "1"] + + app = LightningApp(FlowStructuresEmpty()) + state = AppState() + state._last_state = app.state + state._state = app.state + assert state.w_list From d3ee31b057f0e58890af02f2f0d7c8d9c22c654f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 29 Jul 2022 15:29:03 +0200 Subject: [PATCH 40/40] update --- src/lightning_app/core/flow.py | 1 - src/lightning_app/utilities/proxies.py | 5 ++--- src/lightning_app/utilities/state.py | 5 ----- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index c7f769d5e7212..f6b6e34e81538 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -207,7 +207,6 @@ def _attach_backend(flow: "LightningFlow", backend): structure = getattr(flow, struct_name) for flow in structure.flows: LightningFlow._attach_backend(flow, backend) - flow._backend = backend for work in structure.works: 
backend._wrap_run_method(_LightningAppRef().get_current(), work) work._backend = backend diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index ce7a768e78a0b..c33e41bb70203 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -408,9 +408,8 @@ def run_once(self): persist_artifacts(work=self.work) # 15. Destroy the state observer. - if self.state_observer: - self.state_observer.join(0) - self.state_observer = None + self.state_observer.join(0) + self.state_observer = None # 15. An asynchronous work shouldn't return a return value. if ret is not None: diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 1242cbe1f622d..5cd7979de09d9 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -168,11 +168,6 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. self._request_state() - # import streamlit as st - - # st.write(name) - # st.write(self._state) - if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict):
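With the series complete, the component can be dropped into a parent flow like any other `LightningFlow`, and the flow-level properties added along the way expose the training results to the rest of the app. A closing sketch of that usage, assuming the traced `train.py` configures a checkpoint callback so that `best_model_score` gets populated::

    from lightning import LightningApp, LightningFlow
    from lightning.app.components.training import LightningTrainingComponent
    from lightning.app.utilities.packaging.cloud_compute import CloudCompute

    class RootFlow(LightningFlow):
        def __init__(self):
            super().__init__()
            self.train = LightningTrainingComponent(
                "train.py",
                num_nodes=2,
                cloud_compute=CloudCompute("gpu-fast-multi"),
            )

        def run(self):
            self.train.run()
            # Proxied from the rank-0 work; stays None until a checkpoint callback
            # has reported a score on that node.
            if self.train.best_model_score is not None:
                print("Best model score:", self.train.best_model_score)

    app = LightningApp(RootFlow())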