From b9fc5b8ed12ced8d20616274c4af13a7a7279715 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 14:14:55 +0200 Subject: [PATCH 01/40] update --- MANIFEST.in | 27 ++++ examples/app_multi_node/app.py | 5 + examples/app_multi_node/{ => bare}/.gitignore | 0 .../app_multi_node/{ => bare}/multi_node.py | 0 .../{ => bare}/requirements.txt | 0 examples/app_multi_node/train.py | 7 + setup.py | 2 +- src/lightning_app/components/training.py | 144 ++++++++++++++++++ src/lightning_app/core/flow.py | 2 + src/lightning_app/structures/dict.py | 7 +- .../utilities/packaging/cloud_compute.py | 5 + 11 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 examples/app_multi_node/app.py rename examples/app_multi_node/{ => bare}/.gitignore (100%) rename examples/app_multi_node/{ => bare}/multi_node.py (100%) rename examples/app_multi_node/{ => bare}/requirements.txt (100%) create mode 100644 examples/app_multi_node/train.py create mode 100644 src/lightning_app/components/training.py diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..2b1bf5a0d9d99 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,30 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py new file mode 100644 index 0000000000000..23e77c9ced766 --- /dev/null +++ b/examples/app_multi_node/app.py @@ -0,0 +1,5 @@ +from lightning import LightningApp +from lightning.app.components.training import LightningTrainingComponent +from lightning_app.utilities.packaging.cloud_compute import CloudCompute + +app = LightningApp(LightningTrainingComponent("train.py", num_nodes=2, cloud_compute=CloudCompute("cpu"))) diff --git a/examples/app_multi_node/.gitignore b/examples/app_multi_node/bare/.gitignore similarity index 100% rename from examples/app_multi_node/.gitignore rename to examples/app_multi_node/bare/.gitignore diff --git a/examples/app_multi_node/multi_node.py b/examples/app_multi_node/bare/multi_node.py similarity index 100% rename from examples/app_multi_node/multi_node.py rename to examples/app_multi_node/bare/multi_node.py diff --git a/examples/app_multi_node/requirements.txt b/examples/app_multi_node/bare/requirements.txt similarity index 100% rename from examples/app_multi_node/requirements.txt rename to examples/app_multi_node/bare/requirements.txt diff --git a/examples/app_multi_node/train.py 
b/examples/app_multi_node/train.py new file mode 100644 index 0000000000000..d312d86bb780a --- /dev/null +++ b/examples/app_multi_node/train.py @@ -0,0 +1,7 @@ +from lightning.pytorch import Trainer +from lightning.pytorch.demos.boring_classes import BoringModel + +if __name__ == "__main__": + model = BoringModel() + trainer = Trainer(max_epochs=1, devices=2, strategy="ddp") + trainer.fit(model) diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py new file mode 100644 index 0000000000000..349fb5644a440 --- /dev/null +++ b/src/lightning_app/components/training.py @@ -0,0 +1,144 @@ +import os +from typing import List, Optional, Tuple, Union + +from lightning import CloudCompute +from lightning_app import LightningFlow, structures +from lightning_app.components.python import TracerPythonScript +from lightning_app.utilities.imports import _is_pytorch_lightning_available + +if _is_pytorch_lightning_available(): + from pytorch_lightning import Callback + + class IntrospectionCallback(Callback): + def on_train_start(self, trainer, pl_module): + print(trainer.strategy) + print(trainer.world_size) + print(pl_module) + + +class _LightningTrainerWork(TracerPythonScript): + def __init__( + self, + script_path: str, + script_args: Optional[Union[list, str]] = None, + node_rank: int = 1, + num_nodes: int = 1, + global_rank: int = 0, + sanity_serving: bool = False, + cloud_compute: Optional[CloudCompute] = None, + **kwargs, + ): + super().__init__( + script_path, script_args, raise_exception=True, parallel=True, cloud_compute=cloud_compute, **kwargs + ) + self.node_rank = node_rank + self.num_nodes = num_nodes + self.global_rank = global_rank + self.best_model_path: None + self.best_model_score = None + self.sanity_serving = sanity_serving + self.has_finished = False + + def configure_tracer(self): + from pytorch_lightning import Trainer + + tracer = super().configure_tracer() + tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + return tracer + + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): + if not internal_urls: + print(f"The node {self.node_rank} started !") + return + + print(f"Internal URLS: {internal_urls}") + master_address = str(internal_urls[0][0]) + master_port = str(internal_urls[0][1]) + devices = self.cloud_compute.devices + + distributed_env_vars = { + "NODE_RANK": str(self.node_rank), + "LOCAL_RANK": str(self.global_rank), + "GLOBAL_RANK": str(self.global_rank), + "MASTER_ADDRESS": master_address, + "MASTER_PORT": master_port, + "WORLD_SIZE": str(self.num_nodes * devices), + } + print(distributed_env_vars) + os.environ.update(distributed_env_vars) + return super().run() + + def on_after_run(self, script_globals): + # TODO: Why does it hang there. 
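+        # The traced script appears not to return on its own once training
+        # finishes (see the TODO above), so completion is recorded in the work's
+        # state and the process is terminated explicitly.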
+ self.has_finished = True + raise SystemExit(0) + + def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): + from pytorch_lightning.serve import ServableModuleValidator + + callbacks = kwargs.get("callbacks", []) + if self.sanity_serving: + callbacks = callbacks + [ServableModuleValidator()] + callbacks += [IntrospectionCallback()] + kwargs["callbacks"] = callbacks + return {}, args, kwargs + + +class LightningTrainingComponent(LightningFlow): + def __init__( + self, + script_path: str, + script_args: Optional[Union[list, str]] = None, + num_nodes: int = 1, + cloud_compute: CloudCompute = CloudCompute("cpu"), + sanity_serving: bool = False, + ): + super().__init__() + self.ws = structures.Dict() + self.has_initialized = False + self.script_path = script_path + self.script_args = script_args + self.num_nodes = num_nodes + self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute + self.sanity_serving = sanity_serving + + def run(self): + if not self.has_initialized: + for node_rank in range(self.num_nodes): + + if self.is_running_in_cloud: + devices = self._cloud_compute.devices + global_rank = (node_rank + 1) * devices - 1 if node_rank else 0 + work_node_rank = node_rank + else: + global_rank = node_rank + work_node_rank = 0 + + self.ws[str(node_rank)] = _LightningTrainerWork( + script_path=self.script_path, + script_args=self.script_args, + cloud_compute=self._cloud_compute, + node_rank=work_node_rank, + global_rank=global_rank, + sanity_serving=self.sanity_serving, + num_nodes=self.num_nodes, + ) + + self.has_initialized = True + + for work in self.ws.values(): + if self.ready: + internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] + work.run(internal_urls) + if all(w.has_finished for w in self.ws.values()): + self._exit("Finished training") + else: + work.run() + + @property + def ready(self) -> bool: + return all(w.internal_ip for w in self.ws.values()) + + @property + def is_running_in_cloud(self) -> bool: + return "LIGHTNING_APP_STATE_URL" in os.environ diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index a5dcfd0a77e2e..4e6d7ee15e398 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -207,8 +207,10 @@ def _attach_backend(flow: "LightningFlow", backend): structure = getattr(flow, struct_name) for flow in structure.flows: LightningFlow._attach_backend(flow, backend) + flow._backend = backend for work in structure.works: backend._wrap_run_method(_LightningAppRef().get_current(), work) + work._backend = backend for name in flow._structures: getattr(flow, name)._backend = backend diff --git a/src/lightning_app/structures/dict.py b/src/lightning_app/structures/dict.py index 2aa02d4ebfa50..93e2b161b2e7a 100644 --- a/src/lightning_app/structures/dict.py +++ b/src/lightning_app/structures/dict.py @@ -58,7 +58,10 @@ def __init__(self, **kwargs: T): def __setitem__(self, k, v): from lightning_app import LightningFlow, LightningWork - if "." in k: + if not isinstance(k, str): + raise Exception("The provided key should be an string") + + if isinstance(k, str) and "." in k: raise Exception(f"The provided name {k} contains . 
which is forbidden.") if self._backend: @@ -67,7 +70,7 @@ def __setitem__(self, k, v): _set_child_name(self, v, k) elif isinstance(v, LightningWork): self._backend._wrap_run_method(_LightningAppRef().get_current(), v) - v._name = f"{self.name}.{k}" + v._name = f"{self.name}.{k}" super().__setitem__(k, v) @property diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 6527911855bae..a36d875c85982 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -58,3 +58,8 @@ def to_dict(self): @classmethod def from_dict(cls, d): return cls(**d["__cloud_compute__"]) + + @property + def devices(self) -> int: + # TODO: Add a resolver here. + return 1 From 5e607e308aed6ffdc46d06254f5acc35e0d31e0c Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 14:15:49 +0200 Subject: [PATCH 02/40] update --- MANIFEST.in | 27 --------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 2b1bf5a0d9d99..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,30 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ From ee161b33e6669b1227a14e4c3ace39e0ddb6ae85 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 14:16:41 +0200 Subject: [PATCH 03/40] update --- examples/app_multi_node/app.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 23e77c9ced766..51db0a4080d9d 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -2,4 +2,10 @@ from lightning.app.components.training import LightningTrainingComponent from lightning_app.utilities.packaging.cloud_compute import CloudCompute -app = LightningApp(LightningTrainingComponent("train.py", num_nodes=2, cloud_compute=CloudCompute("cpu"))) +app 
= LightningApp( + LightningTrainingComponent( + "train.py", + num_nodes=2, + cloud_compute=CloudCompute("gpu"), + ), +) From 3fffc0bde2d5040951fd68be20569cfc1c943e04 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 19:04:09 +0200 Subject: [PATCH 04/40] update --- MANIFEST.in | 3 ++ examples/app_multi_node/.lightning | 2 +- setup.py | 2 +- src/lightning_app/components/training.py | 36 ++++++++++++------- .../utilities/packaging/cloud_compute.py | 5 +++ 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..c22e9b09d4985 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,6 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7befcc74ea6d3..2e661fac0e588 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: multi-node-demo +name: '18' diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 349fb5644a440..3e919eef56312 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -4,16 +4,6 @@ from lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript -from lightning_app.utilities.imports import _is_pytorch_lightning_available - -if _is_pytorch_lightning_available(): - from pytorch_lightning import Callback - - class IntrospectionCallback(Callback): - def on_train_start(self, trainer, pl_module): - print(trainer.strategy) - print(trainer.world_size) - print(pl_module) class _LightningTrainerWork(TracerPythonScript): @@ -24,6 +14,7 @@ def __init__( node_rank: int = 1, num_nodes: int = 1, global_rank: int = 0, + local_rank: int = 0, sanity_serving: bool = False, cloud_compute: Optional[CloudCompute] = None, **kwargs, @@ -34,6 +25,7 @@ def __init__( self.node_rank = node_rank self.num_nodes = num_nodes self.global_rank = global_rank + self.local_rank = local_rank self.best_model_path: None self.best_model_score = None self.sanity_serving = sanity_serving @@ -51,20 +43,33 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): print(f"The node {self.node_rank} started !") return + import torch.distributed as dist + print(f"Internal URLS: {internal_urls}") master_address = str(internal_urls[0][0]) master_port = str(internal_urls[0][1]) devices = self.cloud_compute.devices + world_size = self.num_nodes * devices distributed_env_vars = { "NODE_RANK": str(self.node_rank), - "LOCAL_RANK": str(self.global_rank), + "LOCAL_RANK": str(self.local_rank), "GLOBAL_RANK": str(self.global_rank), "MASTER_ADDRESS": master_address, "MASTER_PORT": master_port, - "WORLD_SIZE": str(self.num_nodes * devices), + "WORLD_SIZE": str(world_size), } print(distributed_env_vars) + + backend = "gloo" if 
self.cloud_compute.accelerator == "cpu" else "nccl" + + dist.init_process_group( + backend=backend, + init_method=f"tcp://{master_address}:{master_port}", + world_size=world_size, + rank=self.global_rank, + ) + os.environ.update(distributed_env_vars) return super().run() @@ -79,8 +84,10 @@ def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): callbacks = kwargs.get("callbacks", []) if self.sanity_serving: callbacks = callbacks + [ServableModuleValidator()] - callbacks += [IntrospectionCallback()] kwargs["callbacks"] = callbacks + kwargs["devices"] = self.cloud_compute.devices + kwargs["num_nodes"] = self.num_nodes + kwargs["accelerator"] = "auto" return {}, args, kwargs @@ -110,9 +117,11 @@ def run(self): devices = self._cloud_compute.devices global_rank = (node_rank + 1) * devices - 1 if node_rank else 0 work_node_rank = node_rank + local_rank = 0 else: global_rank = node_rank work_node_rank = 0 + local_rank = node_rank self.ws[str(node_rank)] = _LightningTrainerWork( script_path=self.script_path, @@ -122,6 +131,7 @@ def run(self): global_rank=global_rank, sanity_serving=self.sanity_serving, num_nodes=self.num_nodes, + local_rank=local_rank, ) self.has_initialized = True diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index a36d875c85982..075c0c24e86a9 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -63,3 +63,8 @@ def from_dict(cls, d): def devices(self) -> int: # TODO: Add a resolver here. return 1 + + @property + def accelerator(self) -> str: + # TODO: Add a resolver here. + return self.name From 6ddd07abb8dd65d1bea36dd1005a739b6e20c189 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 19:40:09 +0200 Subject: [PATCH 05/40] update --- MANIFEST.in | 3 --- examples/app_multi_node/.lightning | 2 +- setup.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index c22e9b09d4985..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,6 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 2e661fac0e588..7befcc74ea6d3 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '18' +name: multi-node-demo diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ From acfb717cedd2eb1625021b25d27d093672aaa580 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:03:14 +0200 Subject: [PATCH 06/40] update --- examples/app_multi_node/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 51db0a4080d9d..0d63abb675005 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - 
cloud_compute=CloudCompute("gpu"), + cloud_compute=CloudCompute("gpu", preemptible=True), ), ) From 9dc18644ce798b7a29dca5a4f33e3d38097a71b9 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:11:04 +0200 Subject: [PATCH 07/40] update --- examples/app_multi_node/.lightning | 2 +- examples/app_multi_node/app.py | 2 +- src/lightning_app/components/training.py | 22 ++++++++++++++++--- .../utilities/packaging/cloud_compute.py | 8 +++++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7befcc74ea6d3..7e3ebdf752f5c 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: multi-node-demo +name: '22' diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 0d63abb675005..2829ea3b157c9 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("gpu", preemptible=True), + cloud_compute=CloudCompute("gpu-fast-multi", preemptible=True), ), ) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 3e919eef56312..e118aab4b78c9 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -1,3 +1,4 @@ +import logging import os from typing import List, Optional, Tuple, Union @@ -5,6 +6,8 @@ from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript +_logger = logging.getLogger(__name__) + class _LightningTrainerWork(TracerPythonScript): def __init__( @@ -40,12 +43,12 @@ def configure_tracer(self): def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): if not internal_urls: - print(f"The node {self.node_rank} started !") + _logger.info(f"The node {self.node_rank} started !") return import torch.distributed as dist - print(f"Internal URLS: {internal_urls}") + _logger.debug(f"Internal URLS: {internal_urls}") master_address = str(internal_urls[0][0]) master_port = str(internal_urls[0][1]) devices = self.cloud_compute.devices @@ -59,7 +62,7 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): "MASTER_PORT": master_port, "WORLD_SIZE": str(world_size), } - print(distributed_env_vars) + _logger.info(distributed_env_vars) backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" @@ -100,6 +103,16 @@ def __init__( cloud_compute: CloudCompute = CloudCompute("cpu"), sanity_serving: bool = False, ): + """This component enables to perform distributed training. + + Arguments: + script_path: Path to the script to be executed. + script_args: The arguments to be pass to the script. + num_nodes: Number of nodes. + cloud_compute: The cloud compute object used in the cloud. 
+ sanity_serving: Whether to validate the model correctly implements + the ServableModule API + """ super().__init__() self.ws = structures.Dict() self.has_initialized = False @@ -109,6 +122,9 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute self.sanity_serving = sanity_serving + if not self.is_running_in_cloud and num_nodes > 1: + _logger.info(f"This app is running locally, `num_nodes` would be mapped to devices * {num_nodes}.") + def run(self): if not self.has_initialized: for node_rank in range(self.num_nodes): diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 075c0c24e86a9..d181cd32204ec 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -1,6 +1,12 @@ from dataclasses import asdict, dataclass from typing import List, Optional, Union +_name_to_devices_map = { + "gpu": 1, + "gpu-fast": 1, + "gpu-fast-multi": 4, +} + @dataclass class CloudCompute: @@ -62,6 +68,8 @@ def from_dict(cls, d): @property def devices(self) -> int: # TODO: Add a resolver here. + if self.name in _name_to_devices_map: + return _name_to_devices_map[self.name] return 1 @property From abce33af56b89318568173e0f586883e9efa0a0a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:11:30 +0200 Subject: [PATCH 08/40] update --- examples/app_multi_node/.lightning | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7e3ebdf752f5c..7befcc74ea6d3 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '22' +name: multi-node-demo From 6e00b78e8092582322a0056957d3fc03dad3fe22 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 20:24:44 +0200 Subject: [PATCH 09/40] update --- src/lightning_app/utilities/packaging/cloud_compute.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index d181cd32204ec..64f4fbe8bce01 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -7,6 +7,11 @@ "gpu-fast-multi": 4, } +_short_name_to_instance_map = { + "gpu-fast": "p3.2xlarge", + "gpu-fast-multi": "p3.8xlarge", +} + @dataclass class CloudCompute: @@ -58,6 +63,9 @@ def __post_init__(self): self.name = self.name.lower() + if self.name in _short_name_to_instance_map: + self.name = _short_name_to_instance_map[self.name] + def to_dict(self): return {"__cloud_compute__": asdict(self)} From aa548c94497f728cec43c585e619e15ac070461a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 23 Jul 2022 21:10:03 +0200 Subject: [PATCH 10/40] update --- examples/app_multi_node/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index d312d86bb780a..b5e83d905047d 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -3,5 +3,5 @@ if __name__ == "__main__": model = BoringModel() - trainer = Trainer(max_epochs=1, devices=2, strategy="ddp") + trainer = Trainer(max_epochs=1, strategy="ddp") trainer.fit(model) From 7b8e831602668a91a513a45b520b49f87cafe16e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:26:45 +0200 Subject: [PATCH 11/40] update --- 
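A note on the diff below: the work exports the distributed rendezvous and trainer settings as environment variables rather than forwarding them to the Trainer as keyword arguments. A minimal sketch of that handoff, with purely illustrative values (the address, port and counts are placeholders, not taken from the patch; torch.distributed and Lightning read MASTER_ADDR/MASTER_PORT, whereas the diff exports MASTER_ADDRESS):

    import os

    # Illustrative values; in the app they come from the works' internal IPs and ports.
    os.environ.update({
        "MASTER_ADDR": "10.0.0.1",
        "MASTER_PORT": "29500",
        "NODE_RANK": "0",
        "WORLD_SIZE": "8",                  # num_nodes * devices
        "PL_TRAINER_NUM_NODES": "2",
        "PL_TRAINER_DEVICES": "4",
        "PL_TRAINER_STRATEGY": "ddp",
        "PL_TRAINER_ACCELERATOR": "auto",
    })

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel

    # Assumes the Trainer picks up PL_TRAINER_* defaults from the environment,
    # which is what the component relies on instead of passing kwargs.
    trainer = Trainer(max_epochs=1)
    trainer.fit(BoringModel())
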
MANIFEST.in | 60 ++++++++++ examples/app_multi_node/.lightning | 2 +- examples/app_multi_node/app.py | 2 +- examples/app_multi_node/train.py | 1 + setup.py | 2 +- src/lightning_app/components/training.py | 108 ++++++++++++++---- .../utilities/packaging/cloud_compute.py | 9 -- src/pytorch_lightning/accelerators/cuda.py | 4 +- src/pytorch_lightning/strategies/ddp.py | 10 +- .../strategies/launchers/subprocess_script.py | 7 ++ .../connectors/accelerator_connector.py | 33 ++++++ src/pytorch_lightning/trainer/trainer.py | 12 ++ .../utilities/distributed.py | 3 + 13 files changed, 218 insertions(+), 35 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..297119d8e20b3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,63 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 7befcc74ea6d3..ccc66f8a00a3a 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: multi-node-demo +name: '59' diff --git 
a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 2829ea3b157c9..fc92b83647860 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("gpu-fast-multi", preemptible=True), + cloud_compute=CloudCompute("gpu-fast-multi"), ), ) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index b5e83d905047d..ec82459279640 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -4,4 +4,5 @@ if __name__ == "__main__": model = BoringModel() trainer = Trainer(max_epochs=1, strategy="ddp") + print("Strategy", trainer.strategy.__dict__) trainer.fit(model) diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index e118aab4b78c9..c9cfdb55415f3 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -5,11 +5,57 @@ from lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript +from pytorch_lightning.plugins.environments import ClusterEnvironment _logger = logging.getLogger(__name__) -class _LightningTrainerWork(TracerPythonScript): +class _Environment(ClusterEnvironment): + + def __init__(self, main_address, main_port, world_size, global_rank, node_rank): + self._main_address = main_address + self._main_port = main_port + self._world_size = world_size + self._global_rank = global_rank + self._node_rank = node_rank + self._local_rank = None + + def detect(self): + return True + + @property + def creates_processes_externally(self) -> bool: + return False + + @property + def main_address(self): + return self._main_address + + @property + def main_port(self) -> int: + return self._main_port + + def global_rank(self): + return self._global_rank + + def node_rank(self) -> int: + return self._node_rank + + def world_size(self): + return self._world_size + + def set_world_size(self, size: int) -> None: + self._world_size = size + + def set_global_rank(self, rank: int) -> None: + self._global_rank = rank + + def local_rank(self): + if self._local_rank is None: + return 0 + return self._local_rank + +class PyTorchLightningPythonScript(TracerPythonScript): def __init__( self, script_path: str, @@ -33,6 +79,9 @@ def __init__( self.best_model_score = None self.sanity_serving = sanity_serving self.has_finished = False + self.master_address = None + self.master_port = None + self.world_size = None def configure_tracer(self): from pytorch_lightning import Trainer @@ -49,29 +98,34 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): import torch.distributed as dist _logger.debug(f"Internal URLS: {internal_urls}") - master_address = str(internal_urls[0][0]) - master_port = str(internal_urls[0][1]) + + self.master_address = str(internal_urls[0][0]) + self.master_port = str(internal_urls[0][1]) devices = self.cloud_compute.devices - world_size = self.num_nodes * devices + self.world_size = 
self.num_nodes * devices + + backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" distributed_env_vars = { + "MASTER_ADDRESS": self.master_address, + "MASTER_PORT": self.master_port, "NODE_RANK": str(self.node_rank), - "LOCAL_RANK": str(self.local_rank), - "GLOBAL_RANK": str(self.global_rank), - "MASTER_ADDRESS": master_address, - "MASTER_PORT": master_port, - "WORLD_SIZE": str(world_size), + "WORLD_SIZE": str(self.world_size), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_STRATEGY": "ddp", + "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), + "PL_TRAINER_ACCELERATOR": "auto", } _logger.info(distributed_env_vars) - backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" + # backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" - dist.init_process_group( - backend=backend, - init_method=f"tcp://{master_address}:{master_port}", - world_size=world_size, - rank=self.global_rank, - ) + # dist.init_process_group( + # backend=backend, + # init_method=f"tcp://{master_address}:{master_port}", + # world_size=world_size, + # rank=self.global_rank, + # ) os.environ.update(distributed_env_vars) return super().run() @@ -88,11 +142,25 @@ def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): if self.sanity_serving: callbacks = callbacks + [ServableModuleValidator()] kwargs["callbacks"] = callbacks - kwargs["devices"] = self.cloud_compute.devices - kwargs["num_nodes"] = self.num_nodes + if self.is_running_in_cloud: + kwargs["num_nodes"] = self.num_nodes + kwargs["devices"] = self.cloud_compute.devices + else: + kwargs["num_nodes"] = 1 kwargs["accelerator"] = "auto" + # kwargs["plugins"] = _Environment( + # main_address=self.master_address, + # main_port=self.master_port, + # world_size=self.world_size, + # global_rank=self.global_rank, + # node_rank=self.node_rank, + # ) return {}, args, kwargs + @property + def is_running_in_cloud(self) -> bool: + return "LIGHTNING_APP_STATE_URL" in os.environ + class LightningTrainingComponent(LightningFlow): def __init__( @@ -131,7 +199,7 @@ def run(self): if self.is_running_in_cloud: devices = self._cloud_compute.devices - global_rank = (node_rank + 1) * devices - 1 if node_rank else 0 + global_rank = node_rank * devices if node_rank else 0 work_node_rank = node_rank local_rank = 0 else: @@ -139,7 +207,7 @@ def run(self): work_node_rank = 0 local_rank = node_rank - self.ws[str(node_rank)] = _LightningTrainerWork( + self.ws[str(node_rank)] = PyTorchLightningPythonScript( script_path=self.script_path, script_args=self.script_args, cloud_compute=self._cloud_compute, diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 64f4fbe8bce01..41158f0c2a57c 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -7,12 +7,6 @@ "gpu-fast-multi": 4, } -_short_name_to_instance_map = { - "gpu-fast": "p3.2xlarge", - "gpu-fast-multi": "p3.8xlarge", -} - - @dataclass class CloudCompute: """ @@ -63,9 +57,6 @@ def __post_init__(self): self.name = self.name.lower() - if self.name in _short_name_to_instance_map: - self.name = _short_name_to_instance_map[self.name] - def to_dict(self): return {"__cloud_compute__": asdict(self)} diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index a474ef9a99031..4d6b9bebc2d25 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ 
b/src/pytorch_lightning/accelerators/cuda.py @@ -79,7 +79,9 @@ def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: @staticmethod def get_parallel_devices(devices: List[int]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - return [torch.device("cuda", i) for i in devices] + parallel_devices = [torch.device("cuda", i) for i in devices] + print("get_parallel_devices", parallel_devices) + return parallel_devices @staticmethod def auto_device_count() -> int: diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 922730df35269..8fe2fcf3cd9db 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -83,8 +83,8 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[callable] = None, - ddp_comm_wrapper: Optional[callable] = None, + ddp_comm_hook: Optional[Callable] = None, + ddp_comm_wrapper: Optional[Callable] = None, model_averaging_period: Optional[int] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, @@ -216,9 +216,15 @@ def _get_process_group_backend(self) -> str: def set_world_ranks(self) -> None: if self.cluster_environment is None: return + print(f"node_rank: {self.node_rank}") + print(f"num_processes: {self.num_processes}") + print(f"local_rank: {self.local_rank}") + print("num_nodes", self.num_nodes) self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) rank_zero_only.rank = self.cluster_environment.global_rank() + print(f"global_rank: {rank_zero_only.rank}") + print("world_size", self.cluster_environment.world_size()) def pre_configure_ddp(self) -> None: # if unset, default `find_unused_parameters` `True` diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 5a8632fb87306..9630eb812ce5a 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -88,8 +88,11 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer`. **kwargs: Optional keyword arguments to be passed to the given function. 
""" + print("creates_processes_externally", self.cluster_environment.creates_processes_externally) if not self.cluster_environment.creates_processes_externally: + print("_call_children_scripts") self._call_children_scripts() + print("After creating") return function(*args, **kwargs) def _call_children_scripts(self) -> None: @@ -130,6 +133,8 @@ def _call_children_scripts(self) -> None: env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" + print(f"Creating {local_rank} {env_copy}") + # remove env var if global seed not set if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: del env_copy["PL_GLOBAL_SEED"] @@ -149,6 +154,8 @@ def _call_children_scripts(self) -> None: delay = np.random.uniform(1, 5, 1)[0] sleep(delay) + print("done !") + def _check_can_spawn_children(self) -> None: if self.cluster_environment.local_rank() != 0: raise RuntimeError( diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index dc8594bfd7021..bd4dee270c05c 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -151,6 +151,8 @@ def __init__( A. Class > str B. Strategy > Accelerator/precision/plugins """ + print("Accelerator Connector", num_nodes, devices, accelerator, strategy) + if deterministic: if benchmark is None: # Set benchmark to False to ensure determinism @@ -188,6 +190,8 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus + print("1") + self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, @@ -197,9 +201,17 @@ def __init__( amp_level=amp_level, sync_batchnorm=sync_batchnorm, ) + + print("2") + + + self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) + + print("3") + # 2. Instantiate Accelerator # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() @@ -207,9 +219,13 @@ def __init__( self._accelerator_flag = self._choose_accelerator() self._set_parallel_devices_and_init_accelerator() + print("4") + # 3. Instantiate ClusterEnvironment self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment() + print("5") + # 4. Instantiate Strategy - Part 1 if self._strategy_flag is None: self._strategy_flag = self._choose_strategy() @@ -217,12 +233,18 @@ def __init__( self._check_strategy_and_fallback() self._init_strategy() + print("6") + # 5. Instantiate Precision Plugin self.precision_plugin = self._check_and_init_precision() + print("7") + # 6. 
Instantiate Strategy - Part 2 self._lazy_init_strategy() + print("8") + def _init_deterministic(self, deterministic: Optional[Union[bool, _LITERAL_WARN]]) -> None: self.deterministic = deterministic or False # default to False if not set if _TORCH_GREATER_EQUAL_1_11 and deterministic == "warn": @@ -530,10 +552,12 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._devices_flag = self.accelerator.parse_devices(self._devices_flag) if not self._parallel_devices: self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) + print("Right there", self._parallel_devices) def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or self._devices_flag is None: self._devices_flag = self.accelerator.auto_device_count() + print(f"Auto device {self._devices_flag}") def _set_devices_flag_if_auto_select_gpus_passed(self) -> None: if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, CUDAAccelerator): @@ -770,24 +794,33 @@ def _validate_precision_choice(self) -> None: def _lazy_init_strategy(self) -> None: """Lazily set missing attributes on the previously instantiated strategy.""" + print("a") self.strategy.accelerator = self.accelerator if self.precision_plugin: self.strategy.precision_plugin = self.precision_plugin if self.checkpoint_io: self.strategy.checkpoint_io = self.checkpoint_io + print("b", self.cluster_environment) if hasattr(self.strategy, "cluster_environment"): self.strategy.cluster_environment = self.cluster_environment if hasattr(self.strategy, "parallel_devices"): + print("c", self.strategy.parallel_devices) if self.strategy.parallel_devices: self._parallel_devices = self.strategy.parallel_devices else: + print("c1") + #print(self._parallel_devices, os.environ) self.strategy.parallel_devices = self._parallel_devices + print("c2") if hasattr(self.strategy, "num_nodes"): + print("d", self._num_nodes_flag) self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "_layer_sync"): self.strategy._layer_sync = self._layer_sync if hasattr(self.strategy, "set_world_ranks"): + print("e") self.strategy.set_world_ranks() + print("f") self.strategy._configure_launcher() from pytorch_lightning.utilities import _IS_INTERACTIVE diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index d10225fea2d65..19ee61483e6ca 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -430,6 +430,8 @@ def __init__( # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) + print('before accelerator_connector') + self._accelerator_connector = AcceleratorConnector( num_processes=num_processes, devices=devices, @@ -449,12 +451,16 @@ def __init__( amp_level=amp_level, plugins=plugins, ) + print('after accelerator_connector') + self._logger_connector = LoggerConnector(self) self._callback_connector = CallbackConnector(self) self._checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) + print('_parse_loop_limits') + min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( min_steps, max_steps, min_epochs, max_epochs, max_time ) @@ -462,6 +468,8 @@ def __init__( training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) + print('TrainingEpochLoop') + # default .fit() loop self.fit_loop = fit_loop @@ -483,6 
+491,8 @@ def __init__( self._tested_ckpt_path: Optional[str] = None # TODO: remove in v1.8 self._predicted_ckpt_path: Optional[str] = None # TODO: remove in v1.8 + print('on_trainer_init') + # init callbacks # Declare attributes to be set in _callback_connector on_trainer_init self._callback_connector.on_trainer_init( @@ -507,6 +517,8 @@ def __init__( check_val_every_n_epoch, ) + print('on_trainer_init') + # gradient clipping if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)): raise TypeError(f"`gradient_clip_val` should be an int or a float. Got {gradient_clip_val}.") diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 361c6dd12beeb..63c43319b0b55 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -366,6 +366,9 @@ def init_dist_connection( if torch.distributed.is_initialized(): log.debug("torch.distributed is already initialized. Exiting early") return + + print(cluster_environment) + global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() os.environ["MASTER_ADDR"] = cluster_environment.main_address From b0a3c529dc93b2ef96a877c5509566af523f512c Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:27:12 +0200 Subject: [PATCH 12/40] update --- src/lightning_app/components/training.py | 2 +- .../utilities/packaging/cloud_compute.py | 1 + .../trainer/connectors/accelerator_connector.py | 4 +--- src/pytorch_lightning/trainer/trainer.py | 12 ++++++------ 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index c9cfdb55415f3..7698964d68312 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -11,7 +11,6 @@ class _Environment(ClusterEnvironment): - def __init__(self, main_address, main_port, world_size, global_rank, node_rank): self._main_address = main_address self._main_port = main_port @@ -55,6 +54,7 @@ def local_rank(self): return 0 return self._local_rank + class PyTorchLightningPythonScript(TracerPythonScript): def __init__( self, diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 41158f0c2a57c..d181cd32204ec 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -7,6 +7,7 @@ "gpu-fast-multi": 4, } + @dataclass class CloudCompute: """ diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index bd4dee270c05c..e19e211b64d16 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -204,8 +204,6 @@ def __init__( print("2") - - self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) @@ -809,7 +807,7 @@ def _lazy_init_strategy(self) -> None: self._parallel_devices = self.strategy.parallel_devices else: print("c1") - #print(self._parallel_devices, os.environ) + # print(self._parallel_devices, os.environ) self.strategy.parallel_devices = self._parallel_devices print("c2") if hasattr(self.strategy, "num_nodes"): 
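For reference while following the rank and world-size bookkeeping printed above, a small worked example of the arithmetic that DDPStrategy.set_world_ranks applies, assuming 2 nodes with 4 processes each purely for illustration:

    num_nodes, num_processes = 2, 4          # illustrative cluster shape

    # mirrors cluster_environment.set_world_size(num_nodes * num_processes)
    world_size = num_nodes * num_processes   # 8
    for node_rank in range(num_nodes):
        for local_rank in range(num_processes):
            # mirrors cluster_environment.set_global_rank(node_rank * num_processes + local_rank)
            global_rank = node_rank * num_processes + local_rank
            print(f"node={node_rank} local={local_rank} -> global={global_rank}/{world_size}")

    # Node 0 owns global ranks 0-3 and node 1 owns ranks 4-7; every process reaches
    # global rank 0 through MASTER_ADDR/MASTER_PORT to form the process group.
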
diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 19ee61483e6ca..affc7b3f9d140 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -430,7 +430,7 @@ def __init__( # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) - print('before accelerator_connector') + print("before accelerator_connector") self._accelerator_connector = AcceleratorConnector( num_processes=num_processes, @@ -451,7 +451,7 @@ def __init__( amp_level=amp_level, plugins=plugins, ) - print('after accelerator_connector') + print("after accelerator_connector") self._logger_connector = LoggerConnector(self) self._callback_connector = CallbackConnector(self) @@ -459,7 +459,7 @@ def __init__( self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) - print('_parse_loop_limits') + print("_parse_loop_limits") min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( min_steps, max_steps, min_epochs, max_epochs, max_time @@ -468,7 +468,7 @@ def __init__( training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) - print('TrainingEpochLoop') + print("TrainingEpochLoop") # default .fit() loop self.fit_loop = fit_loop @@ -491,7 +491,7 @@ def __init__( self._tested_ckpt_path: Optional[str] = None # TODO: remove in v1.8 self._predicted_ckpt_path: Optional[str] = None # TODO: remove in v1.8 - print('on_trainer_init') + print("on_trainer_init") # init callbacks # Declare attributes to be set in _callback_connector on_trainer_init @@ -517,7 +517,7 @@ def __init__( check_val_every_n_epoch, ) - print('on_trainer_init') + print("on_trainer_init") # gradient clipping if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)): From 04fe16d1623b18e3bf5707bee3fe6eca7b620453 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:52:31 +0200 Subject: [PATCH 13/40] update --- MANIFEST.in | 12 +++ examples/app_multi_node/.lightning | 2 +- src/lightning_app/components/training.py | 89 ++++--------------- .../strategies/launchers/subprocess_script.py | 13 +++ 4 files changed, 42 insertions(+), 74 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 297119d8e20b3..1b7de078d6b11 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -63,3 +63,15 @@ recursive-include src/lightning_app/cli/*-template * recursive-include src *.md recursive-include requirements *.txt recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index ccc66f8a00a3a..d6917f3c735da 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '59' +name: '62' diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 7698964d68312..87530f5cf5072 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -5,56 +5,10 @@ from 
lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript -from pytorch_lightning.plugins.environments import ClusterEnvironment _logger = logging.getLogger(__name__) -class _Environment(ClusterEnvironment): - def __init__(self, main_address, main_port, world_size, global_rank, node_rank): - self._main_address = main_address - self._main_port = main_port - self._world_size = world_size - self._global_rank = global_rank - self._node_rank = node_rank - self._local_rank = None - - def detect(self): - return True - - @property - def creates_processes_externally(self) -> bool: - return False - - @property - def main_address(self): - return self._main_address - - @property - def main_port(self) -> int: - return self._main_port - - def global_rank(self): - return self._global_rank - - def node_rank(self) -> int: - return self._node_rank - - def world_size(self): - return self._world_size - - def set_world_size(self, size: int) -> None: - self._world_size = size - - def set_global_rank(self, rank: int) -> None: - self._global_rank = rank - - def local_rank(self): - if self._local_rank is None: - return 0 - return self._local_rank - - class PyTorchLightningPythonScript(TracerPythonScript): def __init__( self, @@ -87,7 +41,7 @@ def configure_tracer(self): from pytorch_lightning import Trainer tracer = super().configure_tracer() - tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + # tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): @@ -95,8 +49,6 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): _logger.info(f"The node {self.node_rank} started !") return - import torch.distributed as dist - _logger.debug(f"Internal URLS: {internal_urls}") self.master_address = str(internal_urls[0][0]) @@ -104,8 +56,6 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): devices = self.cloud_compute.devices self.world_size = self.num_nodes * devices - backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" - distributed_env_vars = { "MASTER_ADDRESS": self.master_address, "MASTER_PORT": self.master_port, @@ -117,16 +67,6 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): "PL_TRAINER_ACCELERATOR": "auto", } _logger.info(distributed_env_vars) - - # backend = "gloo" if self.cloud_compute.accelerator == "cpu" else "nccl" - - # dist.init_process_group( - # backend=backend, - # init_method=f"tcp://{master_address}:{master_port}", - # world_size=world_size, - # rank=self.global_rank, - # ) - os.environ.update(distributed_env_vars) return super().run() @@ -136,18 +76,21 @@ def on_after_run(self, script_globals): raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): - from pytorch_lightning.serve import ServableModuleValidator - - callbacks = kwargs.get("callbacks", []) - if self.sanity_serving: - callbacks = callbacks + [ServableModuleValidator()] - kwargs["callbacks"] = callbacks - if self.is_running_in_cloud: - kwargs["num_nodes"] = self.num_nodes - kwargs["devices"] = self.cloud_compute.devices - else: - kwargs["num_nodes"] = 1 - kwargs["accelerator"] = "auto" + if self.node_rank != 0 : + return {}, args, kwargs + + # from pytorch_lightning.serve import ServableModuleValidator + + # callbacks = kwargs.get("callbacks", []) + # if self.sanity_serving: + # callbacks = callbacks + 
[ServableModuleValidator()] + # kwargs["callbacks"] = callbacks + # if self.is_running_in_cloud: + # kwargs["num_nodes"] = self.num_nodes + # kwargs["devices"] = self.cloud_compute.devices + # else: + # kwargs["num_nodes"] = 1 + # kwargs["accelerator"] = "auto" # kwargs["plugins"] = _Environment( # main_address=self.master_address, # main_port=self.master_port, diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 9630eb812ce5a..822645a4f89f0 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -96,17 +96,23 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] return function(*args, **kwargs) def _call_children_scripts(self) -> None: + print("1") # bookkeeping of spawned processes self._check_can_spawn_children() + print("2") # DDP Environment variables os.environ["MASTER_ADDR"] = self.cluster_environment.main_address os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) + print("3") + # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank()) + print("4") + # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c` # See https://docs.python.org/3/reference/import.html#main-spec if __main__.__spec__ is None: # pragma: no-cover @@ -127,9 +133,16 @@ def _call_children_scripts(self) -> None: else: # Script called as `python -m a.b.c` command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] + print("5") + os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" + print("call_children_scripts", os.environ, self.num_processes) + + print("6") + for local_rank in range(1, self.num_processes): + print("7") env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" From 0e1b06e2ccb30735742d804b27a1e95befdd4a38 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 15:54:22 +0200 Subject: [PATCH 14/40] update --- MANIFEST.in | 72 ------------------------ setup.py | 2 +- src/lightning_app/components/training.py | 2 +- 3 files changed, 2 insertions(+), 74 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 1b7de078d6b11..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,75 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md 
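For context on the `_call_children_scripts` changes above: stripped of the debug prints, the spawning pattern is essentially the following sketch (the script name and process counts are placeholders, not values from the patch)::

    import os
    import subprocess
    import sys

    # One child process per additional local rank; each child gets its own
    # LOCAL_RANK while inheriting the rest of the parent's environment.
    num_processes, num_nodes = 2, 1
    os.environ["WORLD_SIZE"] = str(num_processes * num_nodes)
    for local_rank in range(1, num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = str(local_rank)
        subprocess.Popen([sys.executable, "train.py"], env=env_copy)  # placeholder script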
-recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 87530f5cf5072..194b070388404 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -76,7 +76,7 @@ def on_after_run(self, script_globals): raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): - if self.node_rank != 0 : + if self.node_rank != 0: return {}, args, kwargs # from pytorch_lightning.serve import ServableModuleValidator From a389de67bd7246ba223938cd61449910f72d9666 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 18:23:35 +0200 Subject: [PATCH 15/40] update --- examples/app_multi_node/train.py | 1 - src/lightning_app/components/training.py | 55 ++++--------------- .../strategies/launchers/subprocess_script.py | 14 +++-- .../utilities/distributed.py | 10 +++- 4 files changed, 27 insertions(+), 53 deletions(-) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index ec82459279640..b5e83d905047d 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -4,5 +4,4 @@ if __name__ == "__main__": model = BoringModel() trainer = 
Trainer(max_epochs=1, strategy="ddp") - print("Strategy", trainer.strategy.__dict__) trainer.fit(model) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 194b070388404..e464098087164 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -16,8 +16,6 @@ def __init__( script_args: Optional[Union[list, str]] = None, node_rank: int = 1, num_nodes: int = 1, - global_rank: int = 0, - local_rank: int = 0, sanity_serving: bool = False, cloud_compute: Optional[CloudCompute] = None, **kwargs, @@ -27,8 +25,6 @@ def __init__( ) self.node_rank = node_rank self.num_nodes = num_nodes - self.global_rank = global_rank - self.local_rank = local_rank self.best_model_path: None self.best_model_score = None self.sanity_serving = sanity_serving @@ -41,7 +37,7 @@ def configure_tracer(self): from pytorch_lightning import Trainer tracer = super().configure_tracer() - # tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): @@ -65,7 +61,9 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): "PL_TRAINER_STRATEGY": "ddp", "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), "PL_TRAINER_ACCELERATOR": "auto", + "PL_TORCH_DISTRIBUTED_BACKEND": "gloo", } + _logger.info(distributed_env_vars) os.environ.update(distributed_env_vars) return super().run() @@ -79,25 +77,12 @@ def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): if self.node_rank != 0: return {}, args, kwargs - # from pytorch_lightning.serve import ServableModuleValidator - - # callbacks = kwargs.get("callbacks", []) - # if self.sanity_serving: - # callbacks = callbacks + [ServableModuleValidator()] - # kwargs["callbacks"] = callbacks - # if self.is_running_in_cloud: - # kwargs["num_nodes"] = self.num_nodes - # kwargs["devices"] = self.cloud_compute.devices - # else: - # kwargs["num_nodes"] = 1 - # kwargs["accelerator"] = "auto" - # kwargs["plugins"] = _Environment( - # main_address=self.master_address, - # main_port=self.master_port, - # world_size=self.world_size, - # global_rank=self.global_rank, - # node_rank=self.node_rank, - # ) + from pytorch_lightning.serve import ServableModuleValidator + + callbacks = kwargs.get("callbacks", []) + if self.sanity_serving: + callbacks = callbacks + [ServableModuleValidator()] + kwargs["callbacks"] = callbacks return {}, args, kwargs @property @@ -133,32 +118,16 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute self.sanity_serving = sanity_serving - if not self.is_running_in_cloud and num_nodes > 1: - _logger.info(f"This app is running locally, `num_nodes` would be mapped to devices * {num_nodes}.") - def run(self): if not self.has_initialized: for node_rank in range(self.num_nodes): - - if self.is_running_in_cloud: - devices = self._cloud_compute.devices - global_rank = node_rank * devices if node_rank else 0 - work_node_rank = node_rank - local_rank = 0 - else: - global_rank = node_rank - work_node_rank = 0 - local_rank = node_rank - self.ws[str(node_rank)] = PyTorchLightningPythonScript( script_path=self.script_path, script_args=self.script_args, cloud_compute=self._cloud_compute, - node_rank=work_node_rank, - global_rank=global_rank, + node_rank=node_rank, sanity_serving=self.sanity_serving, num_nodes=self.num_nodes, - local_rank=local_rank, ) 
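The rank bookkeeping that this simplification leans on reduces to two formulas; a minimal sketch with illustrative helper names (not taken from the patch)::

    def world_size(num_nodes: int, devices_per_node: int) -> int:
        # every node contributes one process per device
        return num_nodes * devices_per_node

    def global_rank(node_rank: int, local_rank: int, devices_per_node: int) -> int:
        return node_rank * devices_per_node + local_rank

    assert world_size(num_nodes=2, devices_per_node=4) == 8
    assert global_rank(node_rank=1, local_rank=2, devices_per_node=4) == 6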
self.has_initialized = True @@ -175,7 +144,3 @@ def run(self): @property def ready(self) -> bool: return all(w.internal_ip for w in self.ws.values()) - - @property - def is_running_in_cloud(self) -> bool: - return "LIGHTNING_APP_STATE_URL" in os.environ diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 822645a4f89f0..46c00342dbfb4 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -133,12 +133,10 @@ def _call_children_scripts(self) -> None: else: # Script called as `python -m a.b.c` command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] - print("5") + print("5", self.num_processes, self.num_nodes) os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" - print("call_children_scripts", os.environ, self.num_processes) - print("6") for local_rank in range(1, self.num_processes): @@ -146,7 +144,7 @@ def _call_children_scripts(self) -> None: env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" - print(f"Creating {local_rank} {env_copy}") + print(f"Creating {local_rank}") # remove env var if global seed not set if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: @@ -154,19 +152,23 @@ def _call_children_scripts(self) -> None: # start process # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None + cwd: Optional[str] = os.getcwd() if _HYDRA_AVAILABLE: if HydraConfig.initialized(): cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] - subprocess.Popen(command, env=env_copy, cwd=cwd) + + print(command, cwd) + process = subprocess.Popen(command, env=env_copy, cwd=cwd, stderr=sys.stderr) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds delay = np.random.uniform(1, 5, 1)[0] sleep(delay) + print(process.returncode) + print("done !") def _check_can_spawn_children(self) -> None: diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 63c43319b0b55..903fe300a2de5 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -374,7 +374,15 @@ def init_dist_connection( os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) + torch.distributed.init_process_group( + torch_distributed_backend, + init_method=f"tcp://{cluster_environment.main_address}:{cluster_environment.main_port}", + rank=global_rank, + world_size=world_size, + **kwargs, + ) + + print("HERE") # on rank=0 let everyone know training is starting new_rank_zero_info( From 3c9d1f80dc8e82d73ab725c79a941f1b0a346bae Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:42:37 +0200 Subject: [PATCH 16/40] update --- MANIFEST.in | 51 ++++++++++++++++++ examples/app_multi_node/.lightning | 2 +- examples/app_multi_node/app.py | 2 +- setup.py | 2 +- src/lightning_app/components/training.py | 49 +++++++++-------- .../utilities/packaging/cloud_compute.py | 2 + src/pytorch_lightning/CHANGELOG.md | 6 
+++ src/pytorch_lightning/accelerators/cuda.py | 10 +--- src/pytorch_lightning/cli.py | 2 +- src/pytorch_lightning/lite/lite.py | 7 +-- src/pytorch_lightning/loggers/comet.py | 32 ++++++------ src/pytorch_lightning/loggers/csv_logs.py | 2 +- src/pytorch_lightning/loggers/mlflow.py | 4 +- src/pytorch_lightning/loggers/tensorboard.py | 2 +- src/pytorch_lightning/loggers/wandb.py | 2 +- src/pytorch_lightning/strategies/ddp.py | 10 +--- .../strategies/launchers/subprocess_script.py | 26 +--------- .../connectors/accelerator_connector.py | 52 ++++++------------- src/pytorch_lightning/trainer/trainer.py | 12 ----- .../utilities/distributed.py | 13 +---- src/pytorch_lightning/utilities/enums.py | 4 +- src/pytorch_lightning/utilities/logger.py | 6 ++- 22 files changed, 144 insertions(+), 154 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a8dbcff69b631..4b0d22529d613 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,54 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * +recursive-include src *.md +recursive-include requirements *.txt +recursive-include src/lightning_app/cli/*-template * diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index d6917f3c735da..9202c95a897f9 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '62' +name: '92' diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index fc92b83647860..586924062c45c 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ 
LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("gpu-fast-multi"), + cloud_compute=CloudCompute("cpu"), ), ) diff --git a/setup.py b/setup.py index a542b3c1e0291..6d271cc40b0aa 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") +_PACKAGE_NAME = "" _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index e464098087164..7d3ac7903fc32 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -1,15 +1,16 @@ import logging import os -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union from lightning import CloudCompute from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript +from lightning_app.storage.path import Path _logger = logging.getLogger(__name__) -class PyTorchLightningPythonScript(TracerPythonScript): +class PyTorchLightningScriptRunner(TracerPythonScript): def __init__( self, script_path: str, @@ -25,13 +26,10 @@ def __init__( ) self.node_rank = node_rank self.num_nodes = num_nodes - self.best_model_path: None + self.best_model_path = None self.best_model_score = None self.sanity_serving = sanity_serving self.has_finished = False - self.master_address = None - self.master_port = None - self.world_size = None def configure_tracer(self): from pytorch_lightning import Trainer @@ -45,32 +43,37 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): _logger.info(f"The node {self.node_rank} started !") return - _logger.debug(f"Internal URLS: {internal_urls}") - - self.master_address = str(internal_urls[0][0]) - self.master_port = str(internal_urls[0][1]) - devices = self.cloud_compute.devices - self.world_size = self.num_nodes * devices + master_address = str(internal_urls[0][0]) + master_port = str(internal_urls[0][1]) distributed_env_vars = { - "MASTER_ADDRESS": self.master_address, - "MASTER_PORT": self.master_port, + "MASTER_ADDR": master_address, + "MASTER_PORT": master_port, "NODE_RANK": str(self.node_rank), - "WORLD_SIZE": str(self.world_size), + "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), "PL_TRAINER_NUM_NODES": str(self.num_nodes), "PL_TRAINER_STRATEGY": "ddp", "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), "PL_TRAINER_ACCELERATOR": "auto", - "PL_TORCH_DISTRIBUTED_BACKEND": "gloo", } - - _logger.info(distributed_env_vars) os.environ.update(distributed_env_vars) return super().run() def on_after_run(self, script_globals): - # TODO: Why does it hang there. + from pytorch_lightning import Trainer + from pytorch_lightning.utilities.cli import LightningCLI + + cli = [v for v in script_globals.values() if isinstance(v, LightningCLI)] + if cli: + trainer = cli[0].trainer + else: + trainer = [v for v in script_globals.values() if isinstance(v, Trainer)][0] + + if trainer.checkpoint_callback.best_model_score: + self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) + self.best_model_score = float(trainer.checkpoint_callback.best_model_score) self.has_finished = True + # TODO: Why does it hang there. 
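A self-contained sketch of the pattern `on_after_run` uses above, i.e. fishing the user's `Trainer` (or `LightningCLI`) out of the traced script globals and reading its checkpoint callback; the classes below are stand-ins so the snippet runs without pytorch_lightning installed::

    class _Checkpoint:  # stand-in for a ModelCheckpoint callback
        best_model_path = "epoch=0-step=10.ckpt"
        best_model_score = 0.42

    class _Trainer:  # stand-in for pytorch_lightning.Trainer
        checkpoint_callback = _Checkpoint()

    script_globals = {"model": object(), "trainer": _Trainer()}
    trainer = next(v for v in script_globals.values() if isinstance(v, _Trainer))
    if trainer.checkpoint_callback.best_model_score:
        print(trainer.checkpoint_callback.best_model_path)
        print(float(trainer.checkpoint_callback.best_model_score))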
raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): @@ -96,10 +99,11 @@ def __init__( script_path: str, script_args: Optional[Union[list, str]] = None, num_nodes: int = 1, - cloud_compute: CloudCompute = CloudCompute("cpu"), + cloud_compute: CloudCompute = CloudCompute("default"), sanity_serving: bool = False, + script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, ): - """This component enables to perform distributed training. + """This component enables to perform distributed multi-node multi-gpus training. Arguments: script_path: Path to the script to be executed. @@ -117,11 +121,12 @@ def __init__( self.num_nodes = num_nodes self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute self.sanity_serving = sanity_serving + self._script_runner = script_runner def run(self): if not self.has_initialized: for node_rank in range(self.num_nodes): - self.ws[str(node_rank)] = PyTorchLightningPythonScript( + self.ws[str(node_rank)] = self._script_runner( script_path=self.script_path, script_args=self.script_args, cloud_compute=self._cloud_compute, diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index d181cd32204ec..dcec4cf858828 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -2,6 +2,8 @@ from typing import List, Optional, Union _name_to_devices_map = { + "default": 2, + "cpu": 2, "gpu": 1, "gpu-fast": 1, "gpu-fast-multi": 4, diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index af53c9b063853..b2c7ca54e68a7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -110,6 +110,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) + + - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) @@ -166,6 +169,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
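The behaviour described in the changelog entry above amounts to roughly this backend resolution (a sketch, not the actual implementation; the `getattr` guard is only there so it also runs on torch builds without an MPS backend)::

    import torch

    def choose_gpu_backend() -> str:
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        raise RuntimeError("No supported gpu backend found!")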
### Deprecated +- Deprecated `pytorch_lightning.accelerators.gpu.GPUAccelerator` in favor of `pytorch_lightning.accelerators.cuda.CUDAAccelerator` ([#13636](https://github.com/Lightning-AI/lightning/pull/13636)) + + - Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index 4d6b9bebc2d25..1c69015546976 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -79,9 +79,7 @@ def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: @staticmethod def get_parallel_devices(devices: List[int]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - parallel_devices = [torch.device("cuda", i) for i in devices] - print("get_parallel_devices", parallel_devices) - return parallel_devices + return [torch.device("cuda", i) for i in devices] @staticmethod def auto_device_count() -> int: @@ -99,12 +97,6 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - # temporarily enable "gpu" to point to the CUDA Accelerator - accelerator_registry.register( - "gpu", - cls, - description=f"{cls.__class__.__name__}", - ) def teardown(self) -> None: # clean up memory diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index 169f16b66cd33..d3990d79c5c88 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -27,7 +27,7 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn -_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.10.2") +_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.12.0") if _JSONARGPARSE_SIGNATURES_AVAILABLE: import docstring_parser diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 86bddaf676e01..0195e6852eb28 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -54,7 +54,8 @@ class LightningLite(ABC): - Multi-node support. Args: - accelerator: The hardware to run on. Possible choices are: ``"cpu"``, ``"gpu"``, ``"tpu"``, ``"auto"``. + accelerator: The hardware to run on. Possible choices are: + ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. strategy: Strategy for how to run across multiple devices. Possible choices are: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``. devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``. 
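As a reminder of what `get_parallel_devices` returns after the cleanup above: it simply maps device indices to `torch.device` objects, which can be constructed even without a CUDA build::

    import torch

    devices = [0, 1]
    parallel_devices = [torch.device("cuda", i) for i in devices]
    print(parallel_devices)  # [device(type='cuda', index=0), device(type='cuda', index=1)]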
@@ -436,7 +437,7 @@ def _get_distributed_sampler(dataloader: DataLoader, **kwargs: Any) -> Distribut return DistributedSamplerWrapper(dataloader.sampler, **kwargs) def _check_accelerator_support(self, accelerator: Optional[Union[str, Accelerator]]) -> None: - supported = [t.value.lower() for t in self._supported_device_types()] + ["auto"] + supported = [t.value.lower() for t in self._supported_device_types()] + ["gpu", "auto"] valid = accelerator is None or isinstance(accelerator, Accelerator) or accelerator in supported if not valid: raise MisconfigurationException( @@ -457,7 +458,7 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N def _supported_device_types() -> Sequence[_AcceleratorType]: return ( _AcceleratorType.CPU, - _AcceleratorType.GPU, + _AcceleratorType.CUDA, _AcceleratorType.TPU, _AcceleratorType.MPS, ) diff --git a/src/pytorch_lightning/loggers/comet.py b/src/pytorch_lightning/loggers/comet.py index 2b853f59259ff..363d47c1166e6 100644 --- a/src/pytorch_lightning/loggers/comet.py +++ b/src/pytorch_lightning/loggers/comet.py @@ -21,7 +21,7 @@ from argparse import Namespace from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union -from torch import is_tensor, Tensor +from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment @@ -141,7 +141,7 @@ def __init__( prefix: str = "", agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, - **kwargs, + **kwargs: Any, ): if comet_ml is None: raise ModuleNotFoundError( @@ -149,6 +149,8 @@ def __init__( ) super().__init__(agg_key_funcs=agg_key_funcs, agg_default_func=agg_default_func) self._experiment = None + self._save_dir: Optional[str] + self.rest_api_key: Optional[str] # Determine online or offline mode based on which arguments were passed to CometLogger api_key = api_key or comet_ml.config.get_api_key(None, comet_ml.config.get_config()) @@ -170,12 +172,12 @@ def __init__( log.info(f"CometLogger will be initialized in {self.mode} mode") - self._project_name = project_name - self._experiment_key = experiment_key - self._experiment_name = experiment_name - self._prefix = prefix - self._kwargs = kwargs - self._future_experiment_key = None + self._project_name: Optional[str] = project_name + self._experiment_key: Optional[str] = experiment_key + self._experiment_name: Optional[str] = experiment_name + self._prefix: str = prefix + self._kwargs: Any = kwargs + self._future_experiment_key: Optional[str] = None if rest_api_key is not None: # Comet.ml rest API, used to determine version number @@ -185,9 +187,7 @@ def __init__( self.rest_api_key = None self.comet_api = None - self._kwargs = kwargs - - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> Union[CometExperiment, CometExistingExperiment, CometOfflineExperiment]: r""" @@ -240,19 +240,19 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_parameters(params) @rank_zero_only - def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, Union[Tensor, float]], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" # Comet.ml expects metrics to be a dictionary of detached tensors on CPU metrics_without_epoch = metrics.copy() for key, val in 
metrics_without_epoch.items(): - if is_tensor(val): + if isinstance(val, Tensor): metrics_without_epoch[key] = val.cpu().detach() epoch = metrics_without_epoch.pop("epoch", None) metrics_without_epoch = _add_prefix(metrics_without_epoch, self._prefix, self.LOGGER_JOIN_CHAR) self.experiment.log_metrics(metrics_without_epoch, step=step, epoch=epoch) - def reset_experiment(self): + def reset_experiment(self) -> None: self._experiment = None @rank_zero_only @@ -326,7 +326,7 @@ def version(self) -> str: return self._future_experiment_key - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: state = self.__dict__.copy() # Save the experiment id in case an experiment object already exists, @@ -340,6 +340,6 @@ def __getstate__(self): state["_experiment"] = None return state - def log_graph(self, model: "pl.LightningModule", input_array=None) -> None: + def log_graph(self, model: "pl.LightningModule", input_array: Optional[Tensor] = None) -> None: if self._experiment is not None: self._experiment.set_model_graph(model) diff --git a/src/pytorch_lightning/loggers/csv_logs.py b/src/pytorch_lightning/loggers/csv_logs.py index 72d21ae2c4974..45d5fffb51e33 100644 --- a/src/pytorch_lightning/loggers/csv_logs.py +++ b/src/pytorch_lightning/loggers/csv_logs.py @@ -195,7 +195,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_hparams(params) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) self.experiment.log_metrics(metrics, step) if step is not None and (step + 1) % self._flush_logs_every_n_steps == 0: diff --git a/src/pytorch_lightning/loggers/mlflow.py b/src/pytorch_lightning/loggers/mlflow.py index 313fcfe07f10e..5675a3bd9fc67 100644 --- a/src/pytorch_lightning/loggers/mlflow.py +++ b/src/pytorch_lightning/loggers/mlflow.py @@ -20,7 +20,7 @@ import re from argparse import Namespace from time import time -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Mapping, Optional, Union from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment from pytorch_lightning.utilities.imports import _module_available @@ -230,7 +230,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_param(self.run_id, k, v) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 12ec2e21b84ce..dacecf129523b 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -216,7 +216,7 @@ def log_hyperparams( writer.add_summary(sei) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/loggers/wandb.py 
b/src/pytorch_lightning/loggers/wandb.py index bc2a84dc82b00..8e30827759b99 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -379,7 +379,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.config.update(params, allow_val_change=True) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 8fe2fcf3cd9db..922730df35269 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -83,8 +83,8 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[Callable] = None, - ddp_comm_wrapper: Optional[Callable] = None, + ddp_comm_hook: Optional[callable] = None, + ddp_comm_wrapper: Optional[callable] = None, model_averaging_period: Optional[int] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, @@ -216,15 +216,9 @@ def _get_process_group_backend(self) -> str: def set_world_ranks(self) -> None: if self.cluster_environment is None: return - print(f"node_rank: {self.node_rank}") - print(f"num_processes: {self.num_processes}") - print(f"local_rank: {self.local_rank}") - print("num_nodes", self.num_nodes) self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) rank_zero_only.rank = self.cluster_environment.global_rank() - print(f"global_rank: {rank_zero_only.rank}") - print("world_size", self.cluster_environment.world_size()) def pre_configure_ddp(self) -> None: # if unset, default `find_unused_parameters` `True` diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 46c00342dbfb4..5a8632fb87306 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -88,31 +88,22 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer`. **kwargs: Optional keyword arguments to be passed to the given function. 
""" - print("creates_processes_externally", self.cluster_environment.creates_processes_externally) if not self.cluster_environment.creates_processes_externally: - print("_call_children_scripts") self._call_children_scripts() - print("After creating") return function(*args, **kwargs) def _call_children_scripts(self) -> None: - print("1") # bookkeeping of spawned processes self._check_can_spawn_children() - print("2") # DDP Environment variables os.environ["MASTER_ADDR"] = self.cluster_environment.main_address os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) - print("3") - # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank()) - print("4") - # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c` # See https://docs.python.org/3/reference/import.html#main-spec if __main__.__spec__ is None: # pragma: no-cover @@ -133,44 +124,31 @@ def _call_children_scripts(self) -> None: else: # Script called as `python -m a.b.c` command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] - print("5", self.num_processes, self.num_nodes) - os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" - print("6") - for local_rank in range(1, self.num_processes): - print("7") env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" - print(f"Creating {local_rank}") - # remove env var if global seed not set if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: del env_copy["PL_GLOBAL_SEED"] # start process # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = os.getcwd() + cwd: Optional[str] = None if _HYDRA_AVAILABLE: if HydraConfig.initialized(): cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] - - print(command, cwd) - process = subprocess.Popen(command, env=env_copy, cwd=cwd, stderr=sys.stderr) + subprocess.Popen(command, env=env_copy, cwd=cwd) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds delay = np.random.uniform(1, 5, 1)[0] sleep(delay) - print(process.returncode) - - print("done !") - def _check_can_spawn_children(self) -> None: if self.cluster_environment.local_rank() != 0: raise RuntimeError( diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index e19e211b64d16..bd879cf85ff7a 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -151,8 +151,6 @@ def __init__( A. Class > str B. Strategy > Accelerator/precision/plugins """ - print("Accelerator Connector", num_nodes, devices, accelerator, strategy) - if deterministic: if benchmark is None: # Set benchmark to False to ensure determinism @@ -190,8 +188,6 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus - print("1") - self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, @@ -201,29 +197,23 @@ def __init__( amp_level=amp_level, sync_batchnorm=sync_batchnorm, ) - - print("2") - self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) - - print("3") - # 2. 
Instantiate Accelerator - # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() + + # handle `auto`, `None` and `gpu` if self._accelerator_flag == "auto" or self._accelerator_flag is None: - self._accelerator_flag = self._choose_accelerator() - self._set_parallel_devices_and_init_accelerator() + self._accelerator_flag = self._choose_auto_accelerator() + elif self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() - print("4") + self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment() - print("5") - # 4. Instantiate Strategy - Part 1 if self._strategy_flag is None: self._strategy_flag = self._choose_strategy() @@ -231,18 +221,12 @@ def __init__( self._check_strategy_and_fallback() self._init_strategy() - print("6") - # 5. Instantiate Precision Plugin self.precision_plugin = self._check_and_init_precision() - print("7") - # 6. Instantiate Strategy - Part 2 self._lazy_init_strategy() - print("8") - def _init_deterministic(self, deterministic: Optional[Union[bool, _LITERAL_WARN]]) -> None: self.deterministic = deterministic or False # default to False if not set if _TORCH_GREATER_EQUAL_1_11 and deterministic == "warn": @@ -300,7 +284,7 @@ def _check_config_and_set_final_flags( if ( accelerator is not None and accelerator not in self._accelerator_types - and accelerator != "auto" + and accelerator not in ("auto", "gpu") and not isinstance(accelerator, Accelerator) ): raise ValueError( @@ -507,7 +491,7 @@ def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: if isinstance(self._strategy_flag, IPUStrategy): self._accelerator_flag = "ipu" - def _choose_accelerator(self) -> str: + def _choose_auto_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if self._accelerator_flag == "auto": if _TPU_AVAILABLE: @@ -522,6 +506,15 @@ def _choose_accelerator(self) -> str: return "cuda" return "cpu" + @staticmethod + def _choose_gpu_accelerator_backend() -> str: + if MPSAccelerator.is_available(): + return "mps" + if CUDAAccelerator.is_available(): + return "cuda" + + raise MisconfigurationException("No supported gpu backend found!") + def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._accelerator_flag, Accelerator): self.accelerator: Accelerator = self._accelerator_flag @@ -550,12 +543,10 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._devices_flag = self.accelerator.parse_devices(self._devices_flag) if not self._parallel_devices: self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) - print("Right there", self._parallel_devices) def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or self._devices_flag is None: self._devices_flag = self.accelerator.auto_device_count() - print(f"Auto device {self._devices_flag}") def _set_devices_flag_if_auto_select_gpus_passed(self) -> None: if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, CUDAAccelerator): @@ -792,33 +783,24 @@ def _validate_precision_choice(self) -> None: def _lazy_init_strategy(self) -> None: """Lazily set missing attributes on the previously instantiated strategy.""" - print("a") self.strategy.accelerator = self.accelerator if self.precision_plugin: self.strategy.precision_plugin = self.precision_plugin if self.checkpoint_io: 
self.strategy.checkpoint_io = self.checkpoint_io - print("b", self.cluster_environment) if hasattr(self.strategy, "cluster_environment"): self.strategy.cluster_environment = self.cluster_environment if hasattr(self.strategy, "parallel_devices"): - print("c", self.strategy.parallel_devices) if self.strategy.parallel_devices: self._parallel_devices = self.strategy.parallel_devices else: - print("c1") - # print(self._parallel_devices, os.environ) self.strategy.parallel_devices = self._parallel_devices - print("c2") if hasattr(self.strategy, "num_nodes"): - print("d", self._num_nodes_flag) self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "_layer_sync"): self.strategy._layer_sync = self._layer_sync if hasattr(self.strategy, "set_world_ranks"): - print("e") self.strategy.set_world_ranks() - print("f") self.strategy._configure_launcher() from pytorch_lightning.utilities import _IS_INTERACTIVE diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index affc7b3f9d140..d10225fea2d65 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -430,8 +430,6 @@ def __init__( # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) - print("before accelerator_connector") - self._accelerator_connector = AcceleratorConnector( num_processes=num_processes, devices=devices, @@ -451,16 +449,12 @@ def __init__( amp_level=amp_level, plugins=plugins, ) - print("after accelerator_connector") - self._logger_connector = LoggerConnector(self) self._callback_connector = CallbackConnector(self) self._checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) - print("_parse_loop_limits") - min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( min_steps, max_steps, min_epochs, max_epochs, max_time ) @@ -468,8 +462,6 @@ def __init__( training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) - print("TrainingEpochLoop") - # default .fit() loop self.fit_loop = fit_loop @@ -491,8 +483,6 @@ def __init__( self._tested_ckpt_path: Optional[str] = None # TODO: remove in v1.8 self._predicted_ckpt_path: Optional[str] = None # TODO: remove in v1.8 - print("on_trainer_init") - # init callbacks # Declare attributes to be set in _callback_connector on_trainer_init self._callback_connector.on_trainer_init( @@ -517,8 +507,6 @@ def __init__( check_val_every_n_epoch, ) - print("on_trainer_init") - # gradient clipping if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)): raise TypeError(f"`gradient_clip_val` should be an int or a float. Got {gradient_clip_val}.") diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 903fe300a2de5..361c6dd12beeb 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -366,23 +366,12 @@ def init_dist_connection( if torch.distributed.is_initialized(): log.debug("torch.distributed is already initialized. 
Exiting early") return - - print(cluster_environment) - global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch.distributed.init_process_group( - torch_distributed_backend, - init_method=f"tcp://{cluster_environment.main_address}:{cluster_environment.main_port}", - rank=global_rank, - world_size=world_size, - **kwargs, - ) - - print("HERE") + torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) # on rank=0 let everyone know training is starting new_rank_zero_info( diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index 91f8466b77500..d7d3a14ec924a 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -244,7 +244,7 @@ class _AcceleratorType(LightningEnum): >>> _AcceleratorType.CPU == _AcceleratorType.from_str('cpu') True >>> # you can match the type with string - >>> _AcceleratorType.GPU == 'GPU' + >>> _AcceleratorType.CUDA == 'CUDA' True >>> # which is case invariant >>> _AcceleratorType.TPU in ('tpu', 'CPU') @@ -252,7 +252,7 @@ class _AcceleratorType(LightningEnum): """ CPU = "CPU" - GPU = "GPU" + CUDA = "CUDA" IPU = "IPU" TPU = "TPU" HPU = "HPU" diff --git a/src/pytorch_lightning/utilities/logger.py b/src/pytorch_lightning/utilities/logger.py index 07ecf4c3c0ca0..24d75e4f41034 100644 --- a/src/pytorch_lightning/utilities/logger.py +++ b/src/pytorch_lightning/utilities/logger.py @@ -14,7 +14,7 @@ """Utilities for loggers.""" from argparse import Namespace -from typing import Any, Dict, Generator, List, MutableMapping, Optional, Union +from typing import Any, Dict, Generator, List, Mapping, MutableMapping, Optional, Union import numpy as np import torch @@ -132,7 +132,9 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: return params -def _add_prefix(metrics: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: +def _add_prefix( + metrics: Mapping[str, Union[Tensor, float]], prefix: str, separator: str +) -> Mapping[str, Union[Tensor, float]]: """Insert prefix before each key in a dict, separated by the separator. 
Args: From fa28c53890db2946afba70647318dc8dfce4844e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:43:28 +0200 Subject: [PATCH 17/40] update --- MANIFEST.in | 51 --------------------------------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 52 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 4b0d22529d613..a8dbcff69b631 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,54 +3,3 @@ exclude requirements.txt exclude __pycache__ include .actions/setup_tools.py include *.cff # citation info -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * -recursive-include src *.md -recursive-include requirements *.txt -recursive-include src/lightning_app/cli/*-template * diff --git a/setup.py b/setup.py index 6d271cc40b0aa..a542b3c1e0291 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ from setuptools import setup -_PACKAGE_NAME = "" +_PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ From a087275e42c27620cf84d85310cd1d3a842b415f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:45:42 +0200 Subject: [PATCH 18/40] update --- src/lightning_app/utilities/packaging/cloud_compute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index dcec4cf858828..e7b05cd6548aa 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ 
b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union _name_to_devices_map = { - "default": 2, - "cpu": 2, + "default": 1, + "cpu": 1, "gpu": 1, "gpu-fast": 1, "gpu-fast-multi": 4, From b8c1ff349e13a210bd50c51392626e859640d398 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 20:51:06 +0200 Subject: [PATCH 19/40] update --- examples/app_multi_node/.lightning | 2 +- src/lightning_app/components/training.py | 3 ++- src/lightning_app/utilities/packaging/cloud_compute.py | 6 ------ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/app_multi_node/.lightning b/examples/app_multi_node/.lightning index 9202c95a897f9..7befcc74ea6d3 100644 --- a/examples/app_multi_node/.lightning +++ b/examples/app_multi_node/.lightning @@ -1 +1 @@ -name: '92' +name: multi-node-demo diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 7d3ac7903fc32..17d7e46b7d207 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -142,7 +142,8 @@ def run(self): internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] work.run(internal_urls) if all(w.has_finished for w in self.ws.values()): - self._exit("Finished training") + for w in self.ws.values(): + w.stop() else: work.run() diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index e7b05cd6548aa..0733e8cd816ed 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -69,12 +69,6 @@ def from_dict(cls, d): @property def devices(self) -> int: - # TODO: Add a resolver here. if self.name in _name_to_devices_map: return _name_to_devices_map[self.name] return 1 - - @property - def accelerator(self) -> str: - # TODO: Add a resolver here. 
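The `devices` resolution that survives this cleanup is a plain dictionary lookup with a single-device fallback, e.g.::

    _name_to_devices_map = {"default": 1, "cpu": 1, "gpu": 1, "gpu-fast": 1, "gpu-fast-multi": 4}

    def devices(name: str) -> int:
        # unknown instance names fall back to a single device
        return _name_to_devices_map.get(name, 1)

    assert devices("gpu-fast-multi") == 4
    assert devices("something-custom") == 1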
- return self.name From 7860ce406a109d94c40b8714526a96634b69b747 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:46:51 +0200 Subject: [PATCH 20/40] update --- tests/tests_app/components/test_training.py | 0 tests/tests_app_examples/test_multi_node.py | 29 +++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/tests_app/components/test_training.py create mode 100644 tests/tests_app_examples/test_multi_node.py diff --git a/tests/tests_app/components/test_training.py b/tests/tests_app/components/test_training.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_app_examples/test_multi_node.py b/tests/tests_app_examples/test_multi_node.py new file mode 100644 index 0000000000000..4b5c80c0cd9cb --- /dev/null +++ b/tests/tests_app_examples/test_multi_node.py @@ -0,0 +1,29 @@ +import os + +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import application_testing, LightningTestApp + + +class LightningTestMultiNodeApp(LightningTestApp): + def on_before_run_once(self): + res = super().on_before_run_once() + if all(w.has_finished for w in self.works): + return True + return res + + +def test_multi_node_example(): + cwd = os.getcwd() + new_cwd = os.path.join(_PROJECT_ROOT, "examples/app_multi_node") + os.chdir(new_cwd) + command_line = [ + "app.py", + "--blocking", + "False", + "--open-ui", + "False", + ] + result = application_testing(LightningTestMultiNodeApp, command_line) + assert result.exit_code == 0 + os.chdir(cwd) From 0701fc8e008139fec9329816404b3cb5da2e6c53 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:47:12 +0200 Subject: [PATCH 21/40] update --- tests/tests_app/components/test_training.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/tests_app/components/test_training.py diff --git a/tests/tests_app/components/test_training.py b/tests/tests_app/components/test_training.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From c0df7c342ac9ac1ef55ef9fcd660f641ba57e0c0 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:52:20 +0200 Subject: [PATCH 22/40] update --- .gitignore | 1 + examples/app_multi_node/app.py | 2 +- src/lightning_app/components/training.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7040a912974e1..0f03c69600bed 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ src/lightning_app/ui/* *examples/template_react_ui* hars* artifacts/* +*docs/examples* diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 586924062c45c..3750e67c485f3 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -1,6 +1,6 @@ from lightning import LightningApp from lightning.app.components.training import LightningTrainingComponent -from lightning_app.utilities.packaging.cloud_compute import CloudCompute +from lightning.app.utilities.packaging.cloud_compute import CloudCompute app = LightningApp( LightningTrainingComponent( diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 17d7e46b7d207..768d4a3b7f7e2 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -105,6 +105,20 @@ def __init__( ): """This component enables to perform distributed multi-node multi-gpus training. 
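Both the test added above and the flow's `run` loop key off the same completion check; in isolation it is just (stand-in `Work` class for illustration)::

    class Work:
        def __init__(self, has_finished: bool = False):
            self.has_finished = has_finished

    works = {"0": Work(has_finished=True), "1": Work(has_finished=True)}
    if all(w.has_finished for w in works.values()):
        print("all nodes finished, stopping the works")  # w.stop() on the real LightningWork objects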
+ Example:: + + from lightning import LightningApp + from lightning.app.components.training import LightningTrainingComponent + from lightning.app.utilities.packaging.cloud_compute import CloudCompute + + app = LightningApp( + LightningTrainingComponent( + "train.py", + num_nodes=2, + cloud_compute=CloudCompute("gpu"), + ), + ) + Arguments: script_path: Path to the script to be executed. script_args: The arguments to be pass to the script. From e596856a289979b9b23a2882ac104b88a7c822b4 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 09:55:24 +0200 Subject: [PATCH 23/40] update --- src/lightning_app/components/training.py | 31 ++++++++++++------------ 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 768d4a3b7f7e2..af5c904b8a83f 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -46,17 +46,18 @@ def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): master_address = str(internal_urls[0][0]) master_port = str(internal_urls[0][1]) - distributed_env_vars = { - "MASTER_ADDR": master_address, - "MASTER_PORT": master_port, - "NODE_RANK": str(self.node_rank), - "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), - "PL_TRAINER_NUM_NODES": str(self.num_nodes), - "PL_TRAINER_STRATEGY": "ddp", - "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), - "PL_TRAINER_ACCELERATOR": "auto", - } - os.environ.update(distributed_env_vars) + os.environ.update( + { + "MASTER_ADDR": master_address, + "MASTER_PORT": master_port, + "NODE_RANK": str(self.node_rank), + "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_STRATEGY": "ddp", + "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), + "PL_TRAINER_ACCELERATOR": "auto", + } + ) return super().run() def on_after_run(self, script_globals): @@ -103,7 +104,7 @@ def __init__( sanity_serving: bool = False, script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, ): - """This component enables to perform distributed multi-node multi-gpus training. + """This component enables to perform distributed multi-node multi-devices training. 
Example:: @@ -133,7 +134,7 @@ def __init__( self.script_path = script_path self.script_args = script_args self.num_nodes = num_nodes - self._cloud_compute = cloud_compute # TODO: Add support for cloudCOmpute + self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner @@ -152,7 +153,7 @@ def run(self): self.has_initialized = True for work in self.ws.values(): - if self.ready: + if self._ready: internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] work.run(internal_urls) if all(w.has_finished for w in self.ws.values()): @@ -162,5 +163,5 @@ def run(self): work.run() @property - def ready(self) -> bool: + def _ready(self) -> bool: return all(w.internal_ip for w in self.ws.values()) From 253aa4323a483eeb46c24e86729ef90882ef2a4e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 15:59:28 +0200 Subject: [PATCH 24/40] update --- src/lightning_app/components/python/tracer.py | 32 +++++- src/lightning_app/components/training.py | 108 +++++++++++------- src/lightning_app/source_code/local.py | 1 + .../utilities/packaging/cloud_compute.py | 14 --- .../utilities/packaging/tarfile.py | 26 +++++ 5 files changed, 121 insertions(+), 60 deletions(-) create mode 100644 src/lightning_app/utilities/packaging/tarfile.py diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index fa955646acbbf..57f5b3cbdccf6 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -2,16 +2,23 @@ import os import signal import sys -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, TypedDict, Union from lightning_app import LightningWork +from lightning_app.storage.drive import Drive from lightning_app.storage.payload import Payload from lightning_app.utilities.app_helpers import _collect_child_process_pids +from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile from lightning_app.utilities.tracer import Tracer logger = logging.getLogger(__name__) +class Code(TypedDict): + drive: Drive + name: str + + class TracerPythonScript(LightningWork): def on_before_run(self): """Called before the python script is executed.""" @@ -101,13 +108,34 @@ def __init__( self.outputs = outputs or [] for name in self.outputs: setattr(self, name, None) + self.params = None + + def run(self, params: Optional[Dict[str, Any]] = None, code: Optional[Code] = None, **kwargs): + """ + Arguments: + params: A dictionary of arguments to be be added to script_args + code: A dictionary with a drive and a file name to get retrieve + """ + + if params: + self.params = params + self.script_args.extend([f"--{k}={v}" for k, v in params.items()]) + + if code: + raise Exception(code) + clean_tarfile(code["name"], "r:gz") + code["drive"].get(code["name"]) + extract_tarfile(code["name"], ".", "r:gz") + os.remove(code["name"]) - def run(self, **kwargs): if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") + kwargs = {k: v.value if isinstance(v, Payload) else v for k, v in kwargs.items()} + init_globals = globals() init_globals.update(kwargs) + self.on_before_run() env_copy = os.environ.copy() if self.env: diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index af5c904b8a83f..dea0bfeda5324 100644 --- a/src/lightning_app/components/training.py +++ 
b/src/lightning_app/components/training.py @@ -1,6 +1,6 @@ import logging import os -from typing import List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union from lightning import CloudCompute from lightning_app import LightningFlow, structures @@ -19,10 +19,18 @@ def __init__( num_nodes: int = 1, sanity_serving: bool = False, cloud_compute: Optional[CloudCompute] = None, + parallel: bool = True, + raise_exception: bool = True, + env: Optional[Dict[str, Any]] = None, **kwargs, ): super().__init__( - script_path, script_args, raise_exception=True, parallel=True, cloud_compute=cloud_compute, **kwargs + script_path, + script_args, + raise_exception=raise_exception, + parallel=parallel, + cloud_compute=cloud_compute, + **kwargs, ) self.node_rank = node_rank self.num_nodes = num_nodes @@ -30,6 +38,7 @@ def __init__( self.best_model_score = None self.sanity_serving = sanity_serving self.has_finished = False + self.env = env def configure_tracer(self): from pytorch_lightning import Trainer @@ -38,44 +47,46 @@ def configure_tracer(self): tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer - def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None): + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs): if not internal_urls: + # Note: This is called only once. _logger.info(f"The node {self.node_rank} started !") return - master_address = str(internal_urls[0][0]) - master_port = str(internal_urls[0][1]) - - os.environ.update( - { - "MASTER_ADDR": master_address, - "MASTER_PORT": master_port, - "NODE_RANK": str(self.node_rank), - "WORLD_SIZE": str(self.num_nodes * self.cloud_compute.devices), - "PL_TRAINER_NUM_NODES": str(self.num_nodes), - "PL_TRAINER_STRATEGY": "ddp", - "PL_TRAINER_DEVICES": str(self.cloud_compute.devices), - "PL_TRAINER_ACCELERATOR": "auto", - } - ) - return super().run() + if self.env: + os.environ.update(self.env) + + distributed_env_vars = { + "MASTER_ADDR": internal_urls[0][0], + "MASTER_PORT": str(internal_urls[0][1]), + "NODE_RANK": str(self.node_rank), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_DEVICES": "auto", + "PL_TRAINER_ACCELERATOR": "auto", + } + + os.environ.update(distributed_env_vars) + return super().run(**kwargs) def on_after_run(self, script_globals): from pytorch_lightning import Trainer - from pytorch_lightning.utilities.cli import LightningCLI - - cli = [v for v in script_globals.values() if isinstance(v, LightningCLI)] - if cli: - trainer = cli[0].trainer + from pytorch_lightning.cli import LightningCLI + + for v in script_globals.values(): + if isinstance(v, LightningCLI): + trainer = v.trainer + break + elif isinstance(v, Trainer): + trainer = v + break else: - trainer = [v for v in script_globals.values() if isinstance(v, Trainer)][0] + raise RuntimeError("No trainer instance found.") if trainer.checkpoint_callback.best_model_score: self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) self.best_model_score = float(trainer.checkpoint_callback.best_model_score) + self.has_finished = True - # TODO: Why does it hang there. 
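With the environment variables above injected into each node's process, the traced script no longer needs any distributed configuration of its own: `MASTER_ADDR`, `MASTER_PORT` and `NODE_RANK` wire up the process group, while the `PL_TRAINER_*` variables act as defaults for the corresponding `Trainer` arguments. A minimal sketch of the user-facing script this enables, mirroring the simplified multi-node example later in the series (imports shown with the `pytorch_lightning` namespace used elsewhere in this component)::

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel

    if __name__ == "__main__":
        model = BoringModel()
        # num_nodes, devices and accelerator are picked up from the PL_TRAINER_*
        # environment variables set by PyTorchLightningScriptRunner above.
        trainer = Trainer(max_epochs=1)
        trainer.fit(model)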
- raise SystemExit(0) def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): if self.node_rank != 0: @@ -103,6 +114,7 @@ def __init__( cloud_compute: CloudCompute = CloudCompute("default"), sanity_serving: bool = False, script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, + **kwargs, ): """This component enables to perform distributed multi-node multi-devices training. @@ -129,7 +141,7 @@ def __init__( the ServableModule API """ super().__init__() - self.ws = structures.Dict() + self.ws = structures.List() self.has_initialized = False self.script_path = script_path self.script_args = script_args @@ -137,31 +149,39 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner + self._kwargs = kwargs - def run(self): + def run(self, **kwargs): if not self.has_initialized: for node_rank in range(self.num_nodes): - self.ws[str(node_rank)] = self._script_runner( - script_path=self.script_path, - script_args=self.script_args, - cloud_compute=self._cloud_compute, - node_rank=node_rank, - sanity_serving=self.sanity_serving, - num_nodes=self.num_nodes, + self.ws.append( + self._script_runner( + script_path=self.script_path, + script_args=self.script_args, + cloud_compute=self._cloud_compute, + node_rank=node_rank, + sanity_serving=self.sanity_serving, + num_nodes=self.num_nodes, + **self._kwargs, + ) ) self.has_initialized = True - for work in self.ws.values(): - if self._ready: - internal_urls = [(w.internal_ip, w.port) for w in self.ws.values()] - work.run(internal_urls) - if all(w.has_finished for w in self.ws.values()): - for w in self.ws.values(): + for work in self.ws: + if all(w.internal_ip for w in self.ws): + internal_urls = [(w.internal_ip, w.port) for w in self.ws] + work.run(internal_urls=internal_urls, **kwargs) + if all(w.has_finished for w in self.ws): + for w in self.ws: w.stop() else: work.run() @property - def _ready(self) -> bool: - return all(w.internal_ip for w in self.ws.values()) + def best_model_score(self) -> Optional[float]: + return self.ws[0].best_model_score + + @property + def best_model_paths(self) -> List[Optional[Path]]: + return [self.ws[node_idx].best_mode_path for node_idx in range(len(self.ws))] diff --git a/src/lightning_app/source_code/local.py b/src/lightning_app/source_code/local.py index a42347ac42101..05669dff2f6a5 100644 --- a/src/lightning_app/source_code/local.py +++ b/src/lightning_app/source_code/local.py @@ -94,6 +94,7 @@ def upload(self, url: str) -> None: raise OSError( "cannot upload directory code whose total fize size is greater than 2GB (2e9 bytes)" ) from None + uploader = FileUploader( presigned_url=url, source_file=str(self.package_path), diff --git a/src/lightning_app/utilities/packaging/cloud_compute.py b/src/lightning_app/utilities/packaging/cloud_compute.py index 0733e8cd816ed..6527911855bae 100644 --- a/src/lightning_app/utilities/packaging/cloud_compute.py +++ b/src/lightning_app/utilities/packaging/cloud_compute.py @@ -1,14 +1,6 @@ from dataclasses import asdict, dataclass from typing import List, Optional, Union -_name_to_devices_map = { - "default": 1, - "cpu": 1, - "gpu": 1, - "gpu-fast": 1, - "gpu-fast-multi": 4, -} - @dataclass class CloudCompute: @@ -66,9 +58,3 @@ def to_dict(self): @classmethod def from_dict(cls, d): return cls(**d["__cloud_compute__"]) - - @property - def devices(self) -> int: - if self.name in _name_to_devices_map: - return _name_to_devices_map[self.name] - return 1 diff --git 
a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py new file mode 100644 index 0000000000000..6e8a6e52aecc7 --- /dev/null +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -0,0 +1,26 @@ +import os +import shutil +import tarfile + + +def clean_tarfile(file_path: str, mode): + if os.path.exists(file_path): + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + p = member.path + if p != "." and os.path.exists(p): + if os.path.isfile(p): + os.remove(p) + else: + shutil.rmtree(p) + os.remove(file_path) + + +def extract_tarfile(file_path: str, extract_path: str, mode: str): + if os.path.exists(file_path): + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + try: + tar_ref.extract(member, path=extract_path, set_attrs=False) + except PermissionError: + raise PermissionError(f"Could not extract tar file {file_path}") From 0373fc769b1591837ecaa1bec8bbe0e323591f40 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 26 Jul 2022 16:11:01 +0200 Subject: [PATCH 25/40] update --- examples/app_multi_node/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py index b5e83d905047d..f14809354f405 100644 --- a/examples/app_multi_node/train.py +++ b/examples/app_multi_node/train.py @@ -3,5 +3,5 @@ if __name__ == "__main__": model = BoringModel() - trainer = Trainer(max_epochs=1, strategy="ddp") + trainer = Trainer(max_epochs=1) trainer.fit(model) From 71f1cfec9851f9017a9f367e20c5e5ba30964592 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:41:34 +0200 Subject: [PATCH 26/40] update --- examples/app_multi_node/app.py | 2 +- src/lightning_app/components/python/tracer.py | 34 +++++++++---- src/lightning_app/components/training.py | 19 ++++--- src/lightning_app/testing/testing.py | 3 ++ src/lightning_app/utilities/network.py | 2 +- .../utilities/packaging/tarfile.py | 47 ++++++++++------- src/lightning_app/utilities/proxies.py | 5 +- src/lightning_app/utilities/state.py | 8 +++ .../components/python/test_python.py | 50 +++++++++++++++++++ 9 files changed, 133 insertions(+), 37 deletions(-) diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py index 3750e67c485f3..6e405a346a143 100644 --- a/examples/app_multi_node/app.py +++ b/examples/app_multi_node/app.py @@ -6,6 +6,6 @@ LightningTrainingComponent( "train.py", num_nodes=2, - cloud_compute=CloudCompute("cpu"), + cloud_compute=CloudCompute("gpu-fast-multi"), ), ) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index 57f5b3cbdccf6..4b0b736b78b24 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -2,6 +2,7 @@ import os import signal import sys +from copy import deepcopy from typing import Any, Dict, List, Optional, TypedDict, Union from lightning_app import LightningWork @@ -38,6 +39,7 @@ def __init__( script_args: Optional[Union[list, str]] = None, outputs: Optional[List[str]] = None, env: Optional[Dict] = None, + code: Optional[Code] = None, **kwargs, ): """The TracerPythonScript class enables to easily run a python script. 
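The two helpers introduced in `tarfile.py` are the building blocks for shipping user code to a running work: `extract_tarfile` unpacks an uploaded archive next to the script, and `clean_tarfile` removes whatever a previous archive extracted, plus the archive itself, before the next run. A round-trip sketch, assuming an illustrative `code.tar.gz` built locally::

    import tarfile

    from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile

    # Package a script into an archive, as a client would before uploading it to a Drive.
    with tarfile.open("code.tar.gz", "w:gz") as tar:
        tar.add("train.py")

    # On the receiving side: unpack the archive into the working directory.
    extract_tarfile("code.tar.gz", ".", "r:gz")

    # Before the next run: remove the extracted members and the archive itself.
    clean_tarfile("code.tar.gz", "r:gz")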
@@ -104,29 +106,37 @@ def __init__( if isinstance(script_args, str): script_args = script_args.split(" ") self.script_args = script_args if script_args else [] + self.original_args = deepcopy(self.script_args) self.env = env self.outputs = outputs or [] for name in self.outputs: setattr(self, name, None) self.params = None + self._code = code + self.restart_count = 0 - def run(self, params: Optional[Dict[str, Any]] = None, code: Optional[Code] = None, **kwargs): + def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[int] = None, **kwargs): """ Arguments: params: A dictionary of arguments to be be added to script_args - code: A dictionary with a drive and a file name to get retrieve + code: A dictionary with a drive and a file name to retrieve """ + if restart_count: + self.restart_count = restart_count if params: self.params = params - self.script_args.extend([f"--{k}={v}" for k, v in params.items()]) + self.script_args = self.original_args + [self._to_script_args(k, v) for k, v in params.items()] - if code: - raise Exception(code) - clean_tarfile(code["name"], "r:gz") - code["drive"].get(code["name"]) - extract_tarfile(code["name"], ".", "r:gz") - os.remove(code["name"]) + if self._code: + drive = self._code["drive"] + name = self._code["name"] + if os.path.exists(name): + clean_tarfile(name, "r:gz") + + if name in drive.list(): + drive.get(name) + extract_tarfile(name, ".", "r:gz") if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") @@ -153,5 +163,11 @@ def on_exit(self): for child_pid in _collect_child_process_pids(os.getpid()): os.kill(child_pid, signal.SIGTERM) + @staticmethod + def _to_script_args(k: str, v: str) -> str: + if k.startswith("--"): + return f"{k}={v}" + return f"--{k}={v}" + __all__ = ["TracerPythonScript"] diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index dea0bfeda5324..91ba5786fbd7b 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -36,6 +36,7 @@ def __init__( self.num_nodes = num_nodes self.best_model_path = None self.best_model_score = None + self.monitor = None self.sanity_serving = sanity_serving self.has_finished = False self.env = env @@ -47,11 +48,11 @@ def configure_tracer(self): tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) return tracer - def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs): + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs) -> None: if not internal_urls: # Note: This is called only once. 
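Because `run()` now rebuilds `script_args` from `original_args` on every call, passing `params` stays idempotent across restarts instead of appending duplicate flags. A small sketch of the resulting behaviour, mirroring the unit test added in this patch (the script name and arguments are illustrative)::

    from lightning_app.components.python import TracerPythonScript
    from lightning_app.testing.testing import run_work_isolated

    work = TracerPythonScript("train.py", script_args=["--b=1"], raise_exception=False)

    # Each key becomes a "--key=value" flag (a leading "--" is added unless the key
    # already has one) appended to the original arguments on every run.
    run_work_isolated(work, params={"a": "1"}, restart_count=1)
    assert work.script_args == ["--b=1", "--a=1"]
    assert work.restart_count == 1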
_logger.info(f"The node {self.node_rank} started !") - return + return None if self.env: os.environ.update(self.env) @@ -82,9 +83,13 @@ def on_after_run(self, script_globals): else: raise RuntimeError("No trainer instance found.") + self.monitor = trainer.checkpoint_callback.monitor + if trainer.checkpoint_callback.best_model_score: self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) self.best_model_score = float(trainer.checkpoint_callback.best_model_score) + else: + self.best_model_path = Path(trainer.checkpoint_callback.last_model_path) self.has_finished = True @@ -114,7 +119,7 @@ def __init__( cloud_compute: CloudCompute = CloudCompute("default"), sanity_serving: bool = False, script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, - **kwargs, + **script_runner_kwargs, ): """This component enables to perform distributed multi-node multi-devices training. @@ -149,9 +154,9 @@ def __init__( self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner - self._kwargs = kwargs + self._script_runner_kwargs = script_runner_kwargs - def run(self, **kwargs): + def run(self, **run_kwargs): if not self.has_initialized: for node_rank in range(self.num_nodes): self.ws.append( @@ -162,7 +167,7 @@ def run(self, **kwargs): node_rank=node_rank, sanity_serving=self.sanity_serving, num_nodes=self.num_nodes, - **self._kwargs, + **self._script_runner_kwargs, ) ) @@ -171,7 +176,7 @@ def run(self, **kwargs): for work in self.ws: if all(w.internal_ip for w in self.ws): internal_urls = [(w.internal_ip, w.port) for w in self.ws] - work.run(internal_urls=internal_urls, **kwargs) + work.run(internal_urls=internal_urls, **run_kwargs) if all(w.has_finished for w in self.ws): for w in self.ws: w.stop() diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index bdf37cacf04a7..cc03f5badec2b 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -23,6 +23,7 @@ from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.imports import _is_playwright_available, requires from lightning_app.utilities.network import _configure_session, LightningClient +from lightning_app.utilities.proxies import ProxyWorkRun if _is_playwright_available(): import playwright @@ -114,6 +115,8 @@ def run_work_isolated(work, *args, start_server: bool = False, **kwargs): # pop the stopped status. 
call_hash = work._calls["latest_call_hash"] work._calls[call_hash]["statuses"].pop(-1) + if isinstance(work.run, ProxyWorkRun): + work.run = work.run.work_run def browser_context_args(browser_context_args: Dict) -> Dict: diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 98c7db3d46ff8..a9ebcf37ab564 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -48,7 +48,7 @@ def _configure_session() -> Session: return http -def _check_service_url_is_ready(url: str, timeout: float = 0.5) -> bool: +def _check_service_url_is_ready(url: str, timeout: float = 1) -> bool: try: response = requests.get(url, timeout=timeout) return response.status_code in (200, 404) diff --git a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py index 6e8a6e52aecc7..ca945baf95ca8 100644 --- a/src/lightning_app/utilities/packaging/tarfile.py +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -3,24 +3,37 @@ import tarfile -def clean_tarfile(file_path: str, mode): +def clean_tarfile(file_path: str, mode: str) -> None: + """This utility removes all files extracted from a tarfile.""" + + if not os.path.exists(file_path): + return None + + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + p = member.path + if p == "." or not os.path.exists(p): + continue + try: + if os.path.isfile(p): + os.remove(p) + else: + shutil.rmtree(p) + except (FileNotFoundError, OSError, PermissionError): + pass + if os.path.exists(file_path): - with tarfile.open(file_path, mode=mode) as tar_ref: - for member in tar_ref.getmembers(): - p = member.path - if p != "." and os.path.exists(p): - if os.path.isfile(p): - os.remove(p) - else: - shutil.rmtree(p) os.remove(file_path) -def extract_tarfile(file_path: str, extract_path: str, mode: str): - if os.path.exists(file_path): - with tarfile.open(file_path, mode=mode) as tar_ref: - for member in tar_ref.getmembers(): - try: - tar_ref.extract(member, path=extract_path, set_attrs=False) - except PermissionError: - raise PermissionError(f"Could not extract tar file {file_path}") +def extract_tarfile(file_path: str, extract_path: str, mode: str) -> None: + """This utility extract all files from a tarfile.""" + if not os.path.exists(file_path): + return None + + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + try: + tar_ref.extract(member, path=extract_path, set_attrs=False) + except PermissionError: + raise PermissionError(f"Could not extract tar file {file_path}") diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index c33e41bb70203..ce7a768e78a0b 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -408,8 +408,9 @@ def run_once(self): persist_artifacts(work=self.work) # 15. Destroy the state observer. - self.state_observer.join(0) - self.state_observer = None + if self.state_observer: + self.state_observer.join(0) + self.state_observer = None # 15. An asynchronous work shouldn't return a return value. 
if ret is not None: diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 0802a426e7349..3c16a7b4cdb11 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -187,6 +187,14 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) + elif name in self._state.get("structures", {}): + return AppState( + self._host, + self._port, + last_state=self._last_state["structures"][name], + state=self._state["structures"][name], + ) + raise AttributeError( f"Failed to access '{name}' through `AppState`. The state provides:" f" Variables: {list(self._state['vars'].keys())}," diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 61969ef1c4c51..6f739d84c0a4b 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -1,11 +1,15 @@ import os +import tarfile import pytest from tests_app import _PROJECT_ROOT from lightning_app.components.python import PopenPythonScript, TracerPythonScript +from lightning_app.components.python.tracer import Code +from lightning_app.storage.drive import Drive from lightning_app.testing.helpers import RunIf from lightning_app.testing.testing import run_work_isolated +from lightning_app.utilities.component import _set_work_context COMPONENTS_SCRIPTS_FOLDER = str(os.path.join(_PROJECT_ROOT, "tests/tests_app/components/python/scripts/")) @@ -69,3 +73,49 @@ def test_tracer_python_script_with_kwargs(): ) run_work_isolated(python_script) assert python_script.has_failed + + +def test_tracer_with_code(): + + drive = Drive("lit://code") + drive.component_name = "something" + code = Code(drive=drive, name="sample.tar.gz") + + with open("file.py", "w") as f: + f.write('raise Exception("An error")') + + with tarfile.open("sample.tar.gz", "w:gz") as tar: + tar.add("file.py") + + drive.put("sample.tar.gz") + os.remove("file.py") + os.remove("sample.tar.gz") + + python_script = TracerPythonScript("file.py", script_args=["--b=1"], raise_exception=False, code=code) + run_work_isolated(python_script, params={"a": "1"}, restart_count=0) + assert python_script.status.message == "An error" + + with open("file.py", "w") as f: + f.write("import sys\n") + f.write("print(sys.argv)\n") + + with tarfile.open("sample.tar.gz", "w:gz") as tar: + tar.add("file.py") + + _set_work_context() + drive.put("sample.tar.gz") + os.remove("file.py") + os.remove("sample.tar.gz") + + with open("file.py", "w") as f: + f.write('raise Exception("An error")') + + call_hash = python_script._calls["latest_call_hash"] + python_script._calls[call_hash]["statuses"].pop(-1) + python_script._calls[call_hash]["statuses"].pop(-1) + + run_work_isolated(python_script, params={"a": "1"}, restart_count=1) + assert python_script.has_succeeded + assert python_script.script_args == ["--b=1", "--a=1"] + os.remove("file.py") + os.remove("sample.tar.gz") From e5a4a0911b4ee16b32b10bf4d2ae86aa62c830e2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:42:50 +0200 Subject: [PATCH 27/40] update --- src/lightning_app/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7d0dcb589b9e3..d34a16f4f4aaa 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
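Combined with the tarfile helpers, the `code` argument gives the tracer a simple code-shipping path: the caller archives the sources, uploads them to a `Drive`, and each run of the work pulls and unpacks the latest archive before executing the script, exactly as exercised by `test_tracer_with_code` above. A condensed sketch of that flow (file and drive names are illustrative)::

    import tarfile

    from lightning_app.components.python import TracerPythonScript
    from lightning_app.components.python.tracer import Code
    from lightning_app.storage.drive import Drive

    drive = Drive("lit://code")
    drive.component_name = "uploader"  # set explicitly, as in the test, since no app is running

    with tarfile.open("sample.tar.gz", "w:gz") as tar:
        tar.add("train.py")
    drive.put("sample.tar.gz")

    # The work downloads and extracts "sample.tar.gz" from the drive on each run,
    # so an updated archive is picked up without rebuilding the work itself.
    work = TracerPythonScript("train.py", code=Code(drive=drive, name="sample.tar.gz"))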
- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) +- Add `LightningTrainingComponent` that orchestrate multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) + ### Changed - Update the Lightning App docs ([#13537](https://github.com/PyTorchLightning/pytorch-lightning/pull/13537)) From 7cc1c3982c3f496ad8a845e8c0940b0b40109f79 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:46:08 +0200 Subject: [PATCH 28/40] update --- src/lightning_app/components/python/tracer.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index 4b0b736b78b24..b761e79f8a30a 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -112,14 +112,15 @@ def __init__( for name in self.outputs: setattr(self, name, None) self.params = None - self._code = code + self.drive = code.get("drive") if code else None + self.code_name = code.get("name") if code else None self.restart_count = 0 def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[int] = None, **kwargs): """ Arguments: - params: A dictionary of arguments to be be added to script_args - code: A dictionary with a drive and a file name to retrieve + params: A dictionary of arguments to be be added to script_args. + restart_count: Pass an incrementing counter to enable re-execution the work. """ if restart_count: self.restart_count = restart_count @@ -128,15 +129,14 @@ def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[i self.params = params self.script_args = self.original_args + [self._to_script_args(k, v) for k, v in params.items()] - if self._code: - drive = self._code["drive"] - name = self._code["name"] - if os.path.exists(name): - clean_tarfile(name, "r:gz") + if self.drive: + assert self.code_name + if os.path.exists(self.code_name): + clean_tarfile(self.code_name, "r:gz") - if name in drive.list(): - drive.get(name) - extract_tarfile(name, ".", "r:gz") + if self.code_name in self.drive.list(): + self.drive.get(self.code_name) + extract_tarfile(self.code_name, ".", "r:gz") if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") From eec5e6dbe6c1126091f0ceebdc80620b414fbf6d Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:48:31 +0200 Subject: [PATCH 29/40] update --- src/lightning_app/utilities/state.py | 8 -------- tests/tests_app/components/python/test_python.py | 4 +++- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 3c16a7b4cdb11..0802a426e7349 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -187,14 +187,6 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) - elif name in self._state.get("structures", {}): - return AppState( - self._host, - self._port, - last_state=self._last_state["structures"][name], - state=self._state["structures"][name], - ) - raise AttributeError( f"Failed to access '{name}' through `AppState`. 
The state provides:" f" Variables: {list(self._state['vars'].keys())}," diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 6f739d84c0a4b..85197f6f8f7ef 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -75,7 +75,9 @@ def test_tracer_python_script_with_kwargs(): assert python_script.has_failed -def test_tracer_with_code(): +def test_tracer_component_with_code(): + """This test ensures the Tracer Component gets the latest code from the code object is provided and arguments + are cleaned.""" drive = Drive("lit://code") drive.component_name = "something" From 17b8c96fd3415e67de15e2ff98cb5f5583faca13 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 10:51:29 +0200 Subject: [PATCH 30/40] update --- docs/source-app/api_reference/components.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source-app/api_reference/components.rst b/docs/source-app/api_reference/components.rst index 76a99402ddecc..c5f99f0f96629 100644 --- a/docs/source-app/api_reference/components.rst +++ b/docs/source-app/api_reference/components.rst @@ -20,5 +20,6 @@ ___________________ ~python.popen.PopenPythonScript ~python.tracer.TracerPythonScript + ~training.LightningTrainingComponent ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI From 01bcc2976a6888db08ab7bd44c2d1b6aa1f1e022 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:06:51 -0700 Subject: [PATCH 31/40] Update tests/tests_app/components/python/test_python.py --- tests/tests_app/components/python/test_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 85197f6f8f7ef..bcb176bdf0184 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -76,7 +76,7 @@ def test_tracer_python_script_with_kwargs(): def test_tracer_component_with_code(): - """This test ensures the Tracer Component gets the latest code from the code object is provided and arguments + """This test ensures the Tracer Component gets the latest code from the code object that is provided and arguments are cleaned.""" drive = Drive("lit://code") From ba572cb9b3f992abe27c5b8c752588f83676aad9 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:06:59 -0700 Subject: [PATCH 32/40] Update src/lightning_app/utilities/packaging/tarfile.py --- src/lightning_app/utilities/packaging/tarfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py index ca945baf95ca8..123e4e2e0942a 100644 --- a/src/lightning_app/utilities/packaging/tarfile.py +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -27,7 +27,7 @@ def clean_tarfile(file_path: str, mode: str) -> None: def extract_tarfile(file_path: str, extract_path: str, mode: str) -> None: - """This utility extract all files from a tarfile.""" + """This utility extracts all files from a tarfile.""" if not os.path.exists(file_path): return None From 2f5e4b0ffb80dfbcc92317e88a572e49a66d5e98 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:06 -0700 Subject: [PATCH 33/40] Update src/lightning_app/components/training.py --- src/lightning_app/components/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 91ba5786fbd7b..068fed6740084 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -142,7 +142,7 @@ def __init__( script_args: The arguments to be pass to the script. num_nodes: Number of nodes. cloud_compute: The cloud compute object used in the cloud. - sanity_serving: Whether to validate the model correctly implements + sanity_serving: Whether to validate that the model correctly implements the ServableModule API """ super().__init__() From 1e000e5a5416ee889d4df572df1fa4f9048946a7 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:15 -0700 Subject: [PATCH 34/40] Update src/lightning_app/components/training.py --- src/lightning_app/components/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 068fed6740084..9773fe9670e52 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -121,7 +121,7 @@ def __init__( script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, **script_runner_kwargs, ): - """This component enables to perform distributed multi-node multi-devices training. + """This component enables performing distributed multi-node multi-device training. Example:: From 4ff559fada5ddaf829f5b5fd7c22d81c8067ea50 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:22 -0700 Subject: [PATCH 35/40] Update src/lightning_app/components/python/tracer.py --- src/lightning_app/components/python/tracer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index b761e79f8a30a..b98c782e138e4 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -120,7 +120,7 @@ def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[i """ Arguments: params: A dictionary of arguments to be be added to script_args. - restart_count: Pass an incrementing counter to enable re-execution the work. + restart_count: Passes an incrementing counter to enable the re-execution of LightningWorks. """ if restart_count: self.restart_count = restart_count From 4c9efe49d0cfa1c2e9fc64a986f17244f5283764 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 28 Jul 2022 02:07:29 -0700 Subject: [PATCH 36/40] Update src/lightning_app/CHANGELOG.md --- src/lightning_app/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index d34a16f4f4aaa..ab8eb25adea85 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,7 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) -- Add `LightningTrainingComponent` that orchestrate multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) +- Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) ### Changed From d49ed8f22859cdf5bac1cc3d3ebd24495e214e20 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Jul 2022 09:08:20 +0000 Subject: [PATCH 37/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_app/components/python/test_python.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index bcb176bdf0184..678655d6ee908 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -76,8 +76,8 @@ def test_tracer_python_script_with_kwargs(): def test_tracer_component_with_code(): - """This test ensures the Tracer Component gets the latest code from the code object that is provided and arguments - are cleaned.""" + """This test ensures the Tracer Component gets the latest code from the code object that is provided and + arguments are cleaned.""" drive = Drive("lit://code") drive.component_name = "something" From 3888ba15208656a0761e58c1bb1cf7071cf2dfda Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 12:49:17 +0200 Subject: [PATCH 38/40] update --- src/lightning_app/utilities/state.py | 13 ++++++++++++- tests/tests_app/utilities/test_state.py | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 0802a426e7349..c644f25439140 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -168,7 +168,7 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. self._request_state() - if name in self._state["vars"]: + if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict): return _maybe_create_drive("root." + ".".join(self._my_affiliation), value) @@ -187,12 +187,23 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) + elif name in self._state.get("structures", {}): + return AppState( + self._host, + self._port, + last_state=self._last_state["structures"][name], + state=self._state["structures"][name], + ) + raise AttributeError( f"Failed to access '{name}' through `AppState`. 
The state provides:" f" Variables: {list(self._state['vars'].keys())}," f" Components: {list(self._state.get('flows', {}).keys()) + list(self._state.get('works', {}).keys())}", ) + def __getitem__(self, key: str): + return self.__getattr__(key) + def __setattr__(self, name: str, value: Any) -> None: if name in self._APP_PRIVATE_KEYS: object.__setattr__(self, name, value) diff --git a/tests/tests_app/utilities/test_state.py b/tests/tests_app/utilities/test_state.py index 0740ffc615b87..49b68619cbecc 100644 --- a/tests/tests_app/utilities/test_state.py +++ b/tests/tests_app/utilities/test_state.py @@ -7,6 +7,7 @@ import lightning_app from lightning_app import LightningApp, LightningFlow, LightningWork +from lightning_app.structures import Dict, List from lightning_app.utilities.app_helpers import AppStatePlugin, BaseStatePlugin from lightning_app.utilities.state import AppState @@ -280,3 +281,22 @@ def test_app_state_with_no_env_var(**__): assert state._host == "http://127.0.0.1" assert state._port == 7501 assert state._url == "http://127.0.0.1:7501" + + +class FlowStructures(LightningFlow): + def __init__(self): + super().__init__() + self.w_list = List(Work(), Work()) + self.w_dict = Dict(**{"0": Work(), "1": Work()}) + + def run(self): + self._exit() + + +def test_app_state_with_structures(): + app = LightningApp(FlowStructures()) + state = AppState() + state._last_state = app.state + state._state = app.state + assert state.w_list["0"].counter == 0 + assert state.w_dict["0"].counter == 0 From a72397bfb7b84e404ef19c579d206a30c92821b2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 28 Jul 2022 19:14:20 +0200 Subject: [PATCH 39/40] update --- src/lightning_app/runners/backends/backend.py | 1 - src/lightning_app/utilities/state.py | 49 ++++++++++++++++++- tests/tests_app/utilities/test_state.py | 23 ++++++++- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py index c370c7098b778..87bb103823fd2 100644 --- a/src/lightning_app/runners/backends/backend.py +++ b/src/lightning_app/runners/backends/backend.py @@ -87,7 +87,6 @@ def _prepare_queues(self, app): app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) app.delta_queue = self.queues.get_delta_queue(**kw) - app.error_queue = self.queues.get_error_queue(**kw) app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw) app.api_delta_queue = self.queues.get_api_delta_queue(**kw) app.request_queues = {} diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index c644f25439140..1242cbe1f622d 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -3,7 +3,7 @@ import logging import os from copy import deepcopy -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from deepdiff import DeepDiff from requests import Session @@ -168,6 +168,11 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. 
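The structures support and `__getitem__` added here, together with the `items()` and `__len__` helpers introduced just below, are what let a frontend walk the works held inside flow structures through `AppState`, the same access pattern `test_app_state_with_structures` exercises. A sketch of how a UI callback might consume them, assuming a flow that exposes a structures list of works under the attribute `ws` (as the training component does; the attribute read per work is illustrative)::

    from lightning_app.utilities.state import AppState

    def render(state: AppState) -> None:
        # Structures now behave like mappings: sized, indexable and iterable via items().
        print(f"{len(state.ws)} works")
        for name, work_state in state.ws.items():
            print(name, work_state.has_finished)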
self._request_state() + # import streamlit as st + + # st.write(name) + # st.write(self._state) + if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict): @@ -237,6 +242,48 @@ def __repr__(self) -> str: def __bool__(self) -> bool: return bool(self._state) + def __len__(self) -> int: + # The state needs to be fetched on access if it doesn't exist. + self._request_state() + + keys = [] + for component in ["flows", "works", "structures"]: + keys.extend(list(self._state.get(component, {}))) + return len(keys) + + def items(self) -> List[Dict[str, Any]]: + # The state needs to be fetched on access if it doesn't exist. + self._request_state() + + items = [] + for component in ["flows", "works"]: + state = self._state.get(component, {}) + last_state = self._last_state.get(component, {}) + for name, state_value in state.items(): + v = AppState( + self._host, + self._port, + last_state=last_state[name], + state=state_value, + ) + items.append((name, v)) + + structures = self._state.get("structures", {}) + last_structures = self._last_state.get("structures", {}) + if structures: + for component in ["flows", "works"]: + state = structures.get(component, {}) + last_state = last_structures.get(component, {}) + for name, state_value in state.items(): + v = AppState( + self._host, + self._port, + last_state=last_state[name], + state=state_value, + ) + items.append((name, v)) + return items + @staticmethod def _configure_session() -> Session: return _configure_session() diff --git a/tests/tests_app/utilities/test_state.py b/tests/tests_app/utilities/test_state.py index 49b68619cbecc..3b9f1b790cfc7 100644 --- a/tests/tests_app/utilities/test_state.py +++ b/tests/tests_app/utilities/test_state.py @@ -287,7 +287,17 @@ class FlowStructures(LightningFlow): def __init__(self): super().__init__() self.w_list = List(Work(), Work()) - self.w_dict = Dict(**{"0": Work(), "1": Work()}) + self.w_dict = Dict(**{"toto": Work(), "toto_2": Work()}) + + def run(self): + self._exit() + + +class FlowStructuresEmpty(LightningFlow): + def __init__(self): + super().__init__() + self.w_list = List() + self.w_dict = Dict() def run(self): self._exit() @@ -299,4 +309,13 @@ def test_app_state_with_structures(): state._last_state = app.state state._state = app.state assert state.w_list["0"].counter == 0 - assert state.w_dict["0"].counter == 0 + assert len(state.w_list) == 2 + assert state.w_dict["toto"].counter == 0 + assert [k for k, _ in state.w_dict.items()] == ["toto", "toto_2"] + assert [k for k, _ in state.w_list.items()] == ["0", "1"] + + app = LightningApp(FlowStructuresEmpty()) + state = AppState() + state._last_state = app.state + state._state = app.state + assert state.w_list From d3ee31b057f0e58890af02f2f0d7c8d9c22c654f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 29 Jul 2022 15:29:03 +0200 Subject: [PATCH 40/40] update --- src/lightning_app/core/flow.py | 1 - src/lightning_app/utilities/proxies.py | 5 ++--- src/lightning_app/utilities/state.py | 5 ----- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index c7f769d5e7212..f6b6e34e81538 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -207,7 +207,6 @@ def _attach_backend(flow: "LightningFlow", backend): structure = getattr(flow, struct_name) for flow in structure.flows: LightningFlow._attach_backend(flow, backend) - flow._backend = backend for work in structure.works: 
backend._wrap_run_method(_LightningAppRef().get_current(), work) work._backend = backend diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index ce7a768e78a0b..c33e41bb70203 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -408,9 +408,8 @@ def run_once(self): persist_artifacts(work=self.work) # 15. Destroy the state observer. - if self.state_observer: - self.state_observer.join(0) - self.state_observer = None + self.state_observer.join(0) + self.state_observer = None # 15. An asynchronous work shouldn't return a return value. if ret is not None: diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 1242cbe1f622d..5cd7979de09d9 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -168,11 +168,6 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. self._request_state() - # import streamlit as st - - # st.write(name) - # st.write(self._state) - if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict):
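With the series complete, the component can be dropped into a parent flow like any other `LightningFlow`, and the flow-level properties added along the way expose the training results to the rest of the app. A closing sketch of that usage, assuming the traced `train.py` configures a checkpoint callback so that `best_model_score` gets populated::

    from lightning import LightningApp, LightningFlow
    from lightning.app.components.training import LightningTrainingComponent
    from lightning.app.utilities.packaging.cloud_compute import CloudCompute

    class RootFlow(LightningFlow):
        def __init__(self):
            super().__init__()
            self.train = LightningTrainingComponent(
                "train.py",
                num_nodes=2,
                cloud_compute=CloudCompute("gpu-fast-multi"),
            )

        def run(self):
            self.train.run()
            # Proxied from the rank-0 work; stays None until a checkpoint callback
            # has reported a score on that node.
            if self.train.best_model_score is not None:
                print("Best model score:", self.train.best_model_score)

    app = LightningApp(RootFlow())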