From 9188cbe427b9d599cd25bc8426fa45cf6ae7b41a Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Mon, 7 Feb 2022 10:00:33 -0500 Subject: [PATCH 01/56] Add cpu metrics to cpu accelerator --- pytorch_lightning/accelerators/cpu.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 69bef65644431..30f68804c3144 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -13,6 +13,7 @@ # limitations under the License. from typing import Any, Dict, List, Union +import psutil import torch from pytorch_lightning.accelerators.accelerator import Accelerator @@ -32,11 +33,13 @@ def setup_environment(self, root_device: torch.device) -> None: """ super().setup_environment(root_device) if root_device.type != "cpu": - raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") + raise MisconfigurationException( + f"Device should be CPU, got {root_device} instead." + ) - def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: - """CPU device stats aren't supported yet.""" - return {} + def get_device_stats(self, device: _DEVICE) -> dict[str, Any]: + """Get CPU stats from psutil.""" + return get_cpu_process_metrics() @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> int: @@ -67,3 +70,11 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) + +def get_cpu_process_metrics() -> Dict[str, float]: + metrics = { + f"vm_percent": psutil.virtual_memory().percent, + f"cpu_percent": psutil.cpu_percent(), + f"swap_percent": psutil.swap_memory().percent, + } + return metrics From efd5d3bfe6e765675d980c1ce7f52e2f5b00f549 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Mon, 7 Feb 2022 10:06:36 -0500 Subject: [PATCH 02/56] Add cpu metrics to device stats monitor --- .../callbacks/device_stats_monitor.py | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 0929358cf0f74..63945940fff35 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -21,6 +21,7 @@ from typing import Any, Dict, Optional import pytorch_lightning as pl +from pytorch_lightning.accelerators.cpu import get_cpu_process_metrics from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -32,6 +33,9 @@ class DeviceStatsMonitor(Callback): Automatically monitors and logs device stats during training stage. ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. + Args: + cpu_stats: if True, will also record CPU stats in addition to accelerator stats (default: True). + Raises: MisconfigurationException: If ``Trainer`` has no logger. @@ -43,17 +47,26 @@ class DeviceStatsMonitor(Callback): >>> trainer = Trainer(callbacks=[device_stats]) # doctest: +SKIP """ - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def __init__(self, cpu_stats: bool = True) -> None: + super().__init__() + self.cpu_stats = cpu_stats + + def setup( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + stage: Optional[str] = None, + ) -> None: if not trainer.loggers: raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.") - def on_train_batch_start( + def _on_train_batch_common( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int, - unused: int = 0, + key: str, ) -> None: if not trainer.loggers: raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") @@ -63,10 +76,24 @@ def on_train_batch_start( device = trainer.strategy.root_device device_stats = trainer.accelerator.get_device_stats(device) + + if self.cpu_stats: + device_stats.update(get_cpu_process_metrics()) + for logger in trainer.loggers: separator = logger.group_separator - prefixed_device_stats = _prefix_metric_keys(device_stats, "on_train_batch_start", separator) - logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) + prefixed_device_stats = _prefix_metric_keys(device_stats, key, separator) + logger.log_metrics(prefixed_device_stats, step=trainer.global_step) + + def on_train_batch_start( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + batch: Any, + batch_idx: int, + unused: Optional[int] = 0, + ) -> None: + self._on_train_batch_common(trainer, pl_module, batch, batch_idx, "on_train_batch_start") def on_train_batch_end( self, @@ -77,18 +104,7 @@ def on_train_batch_end( batch_idx: int, unused: int = 0, ) -> None: - if not trainer.loggers: - raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") - - if not trainer._logger_connector.should_update_logs: - return - - device = trainer.strategy.root_device - device_stats = trainer.accelerator.get_device_stats(device) - for logger in trainer.loggers: - separator = logger.group_separator - prefixed_device_stats = _prefix_metric_keys(device_stats, "on_train_batch_end", separator) - logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) + self._on_train_batch_common(trainer, pl_module, batch, batch_idx, "on_train_batch_end") def _prefix_metric_keys(metrics_dict: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: From 29f918d43d8ae7fd4ba0666445aff636d47e4091 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Mon, 7 Feb 2022 11:00:30 -0500 Subject: [PATCH 03/56] Add tests to make sure cpu stats are added correctly --- pytorch_lightning/accelerators/cpu.py | 2 +- .../callbacks/device_stats_monitor.py | 3 +- tests/callbacks/test_device_stats_monitor.py | 122 ++++++++++++++++++ 3 files changed, 125 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 30f68804c3144..4a2c3d14ed4ee 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -71,7 +71,7 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: description=f"{cls.__class__.__name__}", ) -def get_cpu_process_metrics() -> Dict[str, float]: +def get_cpu_process_metrics() -> dict[str, float]: metrics = { f"vm_percent": psutil.virtual_memory().percent, f"cpu_percent": psutil.cpu_percent(), diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 63945940fff35..c463abb39708f 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -77,7 +77,8 @@ def _on_train_batch_common( device = trainer.strategy.root_device device_stats = trainer.accelerator.get_device_stats(device) - if self.cpu_stats: + if self.cpu_stats and device.type != "cpu": + # Don't query CPU stats twice if CPU is accelerator device_stats.update(get_cpu_process_metrics()) for logger in trainer.loggers: diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index c3108a8cef9aa..63c7dd2ed8e6a 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from typing import Dict, Optional +from unittest import mock import pytest @@ -54,6 +55,127 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) +@RunIf(max_torch="1.7") +@RunIf(min_gpus=1) +def test_device_stats_gpu_from_nvidia(tmpdir): + """Test GPU stats are logged using a logger with Pytorch < 1.8.0.""" + model = BoringModel() + device_stats = DeviceStatsMonitor() + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] + for f in fields: + assert any(f in h for h in metrics.keys()) + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=7, + log_every_n_steps=1, + accelerator="gpu", + devices=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + enable_checkpointing=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + +@RunIf(max_torch="1.7") +@RunIf(min_gpus=1) +def test_device_stats_gpu_from_nvidia_and_cpu(tmpdir): + """Test GPU stats + CPU stats are logged using a logger with Pytorch < 1.8.0.""" + model = BoringModel() + device_stats = DeviceStatsMonitor(cpu_stats=True) + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = [ + "utilization.gpu", + "memory.used", + "memory.free", + "utilization.memory", + "vm_percent", + "cpu_percent", + "swap_percent", + ] + for f in fields: + assert any(f in h for h in metrics.keys()) + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=7, + log_every_n_steps=1, + accelerator="gpu", + devices=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + enable_checkpointing=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + +def test_device_stats_cpu(tmpdir): + """Test CPU stats are logged when no accelerator is used.""" + model = BoringModel() + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = ["vm_percent", "cpu_percent", "swap_percent"] + for f in fields: + assert any(f in h for h in metrics.keys()) + + device_stats = DeviceStatsMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=7, + log_every_n_steps=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + enable_checkpointing=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + +@mock.patch("pytorch_lightning.accelerators.cpu.get_cpu_process_metrics") +@mock.patch("pytorch_lightning.callbacks.device_stats_monitor.get_cpu_process_metrics") +def test_device_stats_cpu_queried_once(cpu_metrics_device_stats_mock, cpu_metrics_cpu_accelerator_mock, tmpdir): + """Make sure that get_cpu_process_metrics is only queried once if the accelerator is CPU and cpu_stats=True.""" + model = BoringModel() + + device_stats = DeviceStatsMonitor(cpu_stats=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=1, + log_every_n_steps=1, + callbacks=[device_stats], + logger=True, + enable_checkpointing=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + # Note that you need to mock where the function is imported + # (not where it is defined). Please see the following for + # an explanation: https://docs.python.org/3/library/unittest.mock.html#where-to-patch + assert cpu_metrics_device_stats_mock.call_count == 0 # called inside DeviceStatsMonitor + assert cpu_metrics_cpu_accelerator_mock.call_count == 2 # called inside CPUAccelerator + + @RunIf(tpu=True) def test_device_stats_monitor_tpu(tmpdir): """Test TPU stats are logged using a logger.""" From dbb67795f303a5070c06f0abfb05be2d2b51090d Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Wed, 9 Feb 2022 09:01:14 -0500 Subject: [PATCH 04/56] Check psutil availability before running --- pytorch_lightning/accelerators/cpu.py | 11 ++++++++++- pytorch_lightning/utilities/imports.py | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 4a2c3d14ed4ee..f4bb5583be21e 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -13,14 +13,17 @@ # limitations under the License. from typing import Any, Dict, List, Union -import psutil import torch from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.types import _DEVICE +if _PSUTIL_AVAILABLE: + import psutil + class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" @@ -72,6 +75,12 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: ) def get_cpu_process_metrics() -> dict[str, float]: + if not _PSUTIL_AVAILABLE: + raise ModuleNotFoundError( + "`get_cpu_process_metrics` requires `psutil` to be installed. " + + "Install it by running `pip install -U psutil`." + ) + metrics = { f"vm_percent": psutil.virtual_memory().percent, f"cpu_percent": psutil.cpu_percent(), diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 9bf0fdd046134..a33b3adf059a1 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -104,6 +104,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") _FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available("torch.distributed.group") +_HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") _HOROVOD_AVAILABLE = _module_available("horovod.torch") _HYDRA_AVAILABLE = _package_available("hydra") _HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental") @@ -113,7 +114,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _NEPTUNE_GREATER_EQUAL_0_9 = _NEPTUNE_AVAILABLE and _compare_version("neptune", operator.ge, "0.9.0") _OMEGACONF_AVAILABLE = _package_available("omegaconf") _POPTORCH_AVAILABLE = _package_available("poptorch") -_HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") +_PSUTIL_AVAILABLE = _package_available("psutil") _RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) _TORCHTEXT_AVAILABLE = _package_available("torchtext") From 54a70e33a977bd296c6b772929ab1f033b32be1c Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Wed, 9 Feb 2022 09:09:18 -0500 Subject: [PATCH 05/56] Add cpu_ prefix to all logged cpu values --- pytorch_lightning/accelerators/cpu.py | 4 ++-- tests/callbacks/test_device_stats_monitor.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index f4bb5583be21e..48276c85a854c 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -82,8 +82,8 @@ def get_cpu_process_metrics() -> dict[str, float]: ) metrics = { - f"vm_percent": psutil.virtual_memory().percent, + f"cpu_vm_percent": psutil.virtual_memory().percent, f"cpu_percent": psutil.cpu_percent(), - f"swap_percent": psutil.swap_memory().percent, + f"cpu_swap_percent": psutil.swap_memory().percent, } return metrics diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 63c7dd2ed8e6a..13898cb6fbf86 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -100,9 +100,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> "memory.used", "memory.free", "utilization.memory", - "vm_percent", + "cpu_vm_percent", "cpu_percent", - "swap_percent", + "cpu_swap_percent", ] for f in fields: assert any(f in h for h in metrics.keys()) @@ -130,7 +130,7 @@ def test_device_stats_cpu(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = ["vm_percent", "cpu_percent", "swap_percent"] + fields = ["cpu_vm_percent", "cpu_percent", "cpu_swap_percent"] for f in fields: assert any(f in h for h in metrics.keys()) From 47002e5aee4f9d6fbaa90973ac7db3338233798a Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Wed, 9 Feb 2022 09:13:56 -0500 Subject: [PATCH 06/56] Asserting CPU metric keys aren't in GPU only logger --- tests/callbacks/test_device_stats_monitor.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 13898cb6fbf86..576f452183246 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -25,6 +25,12 @@ from tests.helpers import BoringModel from tests.helpers.runif import RunIf +CPU_METRIC_KEYS = [ + "cpu_vm_percent", + "cpu_percent", + "cpu_swap_percent", +] + @RunIf(min_gpus=1) def test_device_stats_gpu_from_torch(tmpdir): @@ -69,6 +75,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> for f in fields: assert any(f in h for h in metrics.keys()) + for f in CPU_METRIC_KEYS: + assert not any(f in h for h in metrics.keys()), "CPU Stats should not be included" + trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, @@ -100,10 +109,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> "memory.used", "memory.free", "utilization.memory", - "cpu_vm_percent", - "cpu_percent", - "cpu_swap_percent", - ] + ] + CPU_METRIC_KEYS for f in fields: assert any(f in h for h in metrics.keys()) @@ -130,7 +136,7 @@ def test_device_stats_cpu(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = ["cpu_vm_percent", "cpu_percent", "cpu_swap_percent"] + fields = CPU_METRIC_KEYS for f in fields: assert any(f in h for h in metrics.keys()) From 17d48d7fa25f3d2e7b8271e84b8567b2394d9432 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Wed, 9 Feb 2022 09:16:44 -0500 Subject: [PATCH 07/56] Rename to _get_and_log_device_stats --- pytorch_lightning/callbacks/device_stats_monitor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index c463abb39708f..553ed135c22cd 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -60,7 +60,7 @@ def setup( if not trainer.loggers: raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.") - def _on_train_batch_common( + def _get_and_log_device_stats( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", @@ -94,7 +94,7 @@ def on_train_batch_start( batch_idx: int, unused: Optional[int] = 0, ) -> None: - self._on_train_batch_common(trainer, pl_module, batch, batch_idx, "on_train_batch_start") + self._get_and_log_device_stats(trainer, pl_module, batch, batch_idx, "on_train_batch_start") def on_train_batch_end( self, @@ -105,7 +105,7 @@ def on_train_batch_end( batch_idx: int, unused: int = 0, ) -> None: - self._on_train_batch_common(trainer, pl_module, batch, batch_idx, "on_train_batch_end") + self._get_and_log_device_stats(trainer, pl_module, batch, batch_idx, "on_train_batch_end") def _prefix_metric_keys(metrics_dict: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: From edb93abe4b72509ea031a4ebcefd18f52427441a Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Mon, 14 Feb 2022 12:08:01 -0500 Subject: [PATCH 08/56] Addd CPUDeviceStatsEnum --- pytorch_lightning/accelerators/cpu.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 48276c85a854c..685a8fade41a7 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -17,6 +17,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import device_parser +from pytorch_lightning.utilities.enums import LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.types import _DEVICE @@ -36,9 +37,7 @@ def setup_environment(self, root_device: torch.device) -> None: """ super().setup_environment(root_device) if root_device.type != "cpu": - raise MisconfigurationException( - f"Device should be CPU, got {root_device} instead." - ) + raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") def get_device_stats(self, device: _DEVICE) -> dict[str, Any]: """Get CPU stats from psutil.""" @@ -74,6 +73,14 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: description=f"{cls.__class__.__name__}", ) +class CPUDeviceStatsEnum(LightningEnum): + """Enum for CPU device stats.""" + + CPU_VM_PERCENT = "cpu_vm_percent" + CPU_PERCENT = "cpu_percent" + CPU_SWAP_PERCENT = "cpu_swap_percent" + + def get_cpu_process_metrics() -> dict[str, float]: if not _PSUTIL_AVAILABLE: raise ModuleNotFoundError( @@ -82,8 +89,8 @@ def get_cpu_process_metrics() -> dict[str, float]: ) metrics = { - f"cpu_vm_percent": psutil.virtual_memory().percent, - f"cpu_percent": psutil.cpu_percent(), - f"cpu_swap_percent": psutil.swap_memory().percent, + CPUDeviceStatsEnum.CPU_VM_PERCENT: psutil.virtual_memory().percent, + CPUDeviceStatsEnum.CPU_PERCENT: psutil.cpu_percent(), + CPUDeviceStatsEnum.CPU_SWAP_PERCENT: psutil.swap_memory().percent, } return metrics From 06dd8c496e9e09dfdd75e499c1e12c75cfb6162c Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 18 Feb 2022 13:10:00 -0500 Subject: [PATCH 09/56] Fix mypy issues --- pytorch_lightning/accelerators/cpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 685a8fade41a7..c72d904495b32 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -93,4 +93,5 @@ def get_cpu_process_metrics() -> dict[str, float]: CPUDeviceStatsEnum.CPU_PERCENT: psutil.cpu_percent(), CPUDeviceStatsEnum.CPU_SWAP_PERCENT: psutil.swap_memory().percent, } - return metrics + # Convert the keys from enum to strs + return {str(k): v for k, v in metrics.items()} From 3bffb4812d4d758e7cc7c50fa88257e517f9f68e Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 18 Feb 2022 14:52:47 -0500 Subject: [PATCH 10/56] CHange cpu metrics enum to constant str --- pytorch_lightning/accelerators/cpu.py | 27 +++++++++----------- tests/callbacks/test_device_stats_monitor.py | 11 +++++--- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index c72d904495b32..ea76bc3bfbe8c 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -17,7 +17,6 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import device_parser -from pytorch_lightning.utilities.enums import LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.types import _DEVICE @@ -25,6 +24,11 @@ if _PSUTIL_AVAILABLE: import psutil +# CPU device metrics +CPU_VM_PERCENT = "cpu_vm_percent" +CPU_PERCENT = "cpu_percent" +CPU_SWAP_PERCENT = "cpu_swap_percent" + class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" @@ -37,7 +41,9 @@ def setup_environment(self, root_device: torch.device) -> None: """ super().setup_environment(root_device) if root_device.type != "cpu": - raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") + raise MisconfigurationException( + f"Device should be CPU, got {root_device} instead." + ) def get_device_stats(self, device: _DEVICE) -> dict[str, Any]: """Get CPU stats from psutil.""" @@ -73,14 +79,6 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: description=f"{cls.__class__.__name__}", ) -class CPUDeviceStatsEnum(LightningEnum): - """Enum for CPU device stats.""" - - CPU_VM_PERCENT = "cpu_vm_percent" - CPU_PERCENT = "cpu_percent" - CPU_SWAP_PERCENT = "cpu_swap_percent" - - def get_cpu_process_metrics() -> dict[str, float]: if not _PSUTIL_AVAILABLE: raise ModuleNotFoundError( @@ -89,9 +87,8 @@ def get_cpu_process_metrics() -> dict[str, float]: ) metrics = { - CPUDeviceStatsEnum.CPU_VM_PERCENT: psutil.virtual_memory().percent, - CPUDeviceStatsEnum.CPU_PERCENT: psutil.cpu_percent(), - CPUDeviceStatsEnum.CPU_SWAP_PERCENT: psutil.swap_memory().percent, + CPU_VM_PERCENT: psutil.virtual_memory().percent, + CPU_PERCENT: psutil.cpu_percent(), + CPU_SWAP_PERCENT: psutil.swap_memory().percent, } - # Convert the keys from enum to strs - return {str(k): v for k, v in metrics.items()} + return metrics diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 576f452183246..57a8ed79fff71 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -17,6 +17,11 @@ import pytest from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.cpu import ( + CPU_PERCENT, + CPU_SWAP_PERCENT, + CPU_VM_PERCENT, +) from pytorch_lightning.callbacks import DeviceStatsMonitor from pytorch_lightning.callbacks.device_stats_monitor import _prefix_metric_keys from pytorch_lightning.loggers import CSVLogger @@ -26,9 +31,9 @@ from tests.helpers.runif import RunIf CPU_METRIC_KEYS = [ - "cpu_vm_percent", - "cpu_percent", - "cpu_swap_percent", + CPU_VM_PERCENT, + CPU_SWAP_PERCENT, + CPU_PERCENT, ] From 74e90696f045dbe8a198829fa393d5b4a220455e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Feb 2022 19:58:29 +0000 Subject: [PATCH 11/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/cpu.py | 4 +--- tests/callbacks/test_device_stats_monitor.py | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index ea76bc3bfbe8c..4507ec549fb6f 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -41,9 +41,7 @@ def setup_environment(self, root_device: torch.device) -> None: """ super().setup_environment(root_device) if root_device.type != "cpu": - raise MisconfigurationException( - f"Device should be CPU, got {root_device} instead." - ) + raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") def get_device_stats(self, device: _DEVICE) -> dict[str, Any]: """Get CPU stats from psutil.""" diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 57a8ed79fff71..5f8dac681469f 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -17,11 +17,7 @@ import pytest from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.cpu import ( - CPU_PERCENT, - CPU_SWAP_PERCENT, - CPU_VM_PERCENT, -) +from pytorch_lightning.accelerators.cpu import CPU_PERCENT, CPU_SWAP_PERCENT, CPU_VM_PERCENT from pytorch_lightning.callbacks import DeviceStatsMonitor from pytorch_lightning.callbacks.device_stats_monitor import _prefix_metric_keys from pytorch_lightning.loggers import CSVLogger From 317c35409e3f33a7abe55e907f0210bac01d7ae7 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 12:41:29 -0500 Subject: [PATCH 12/56] Make CPU metric constants private Co-authored-by: ananthsub --- pytorch_lightning/accelerators/cpu.py | 12 ++++++------ tests/callbacks/test_device_stats_monitor.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 4507ec549fb6f..f70d7cacbff9e 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -25,9 +25,9 @@ import psutil # CPU device metrics -CPU_VM_PERCENT = "cpu_vm_percent" -CPU_PERCENT = "cpu_percent" -CPU_SWAP_PERCENT = "cpu_swap_percent" +_CPU_VM_PERCENT = "cpu_vm_percent" +_CPU_PERCENT = "cpu_percent" +_CPU_SWAP_PERCENT = "cpu_swap_percent" class CPUAccelerator(Accelerator): @@ -85,8 +85,8 @@ def get_cpu_process_metrics() -> dict[str, float]: ) metrics = { - CPU_VM_PERCENT: psutil.virtual_memory().percent, - CPU_PERCENT: psutil.cpu_percent(), - CPU_SWAP_PERCENT: psutil.swap_memory().percent, + _CPU_VM_PERCENT: psutil.virtual_memory().percent, + _CPU_PERCENT: psutil.cpu_percent(), + _CPU_SWAP_PERCENT: psutil.swap_memory().percent, } return metrics diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 5f8dac681469f..ced97a7f2d8d1 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -27,9 +27,9 @@ from tests.helpers.runif import RunIf CPU_METRIC_KEYS = [ - CPU_VM_PERCENT, - CPU_SWAP_PERCENT, - CPU_PERCENT, + _CPU_VM_PERCENT, + _CPU_SWAP_PERCENT, + _CPU_PERCENT, ] From 10029b9f00a5c141a895d4bc4e1b7222d5e60994 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 12:43:37 -0500 Subject: [PATCH 13/56] Update CPU metric imports --- tests/callbacks/test_device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index ced97a7f2d8d1..6bffcac89d4c2 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -17,7 +17,7 @@ import pytest from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.cpu import CPU_PERCENT, CPU_SWAP_PERCENT, CPU_VM_PERCENT +from pytorch_lightning.accelerators.cpu import _CPU_PERCENT, _CPU_SWAP_PERCENT, _CPU_VM_PERCENT from pytorch_lightning.callbacks import DeviceStatsMonitor from pytorch_lightning.callbacks.device_stats_monitor import _prefix_metric_keys from pytorch_lightning.loggers import CSVLogger From 14cae030a99fe96fafdf4375c481c56a66be81bf Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 16:09:22 -0500 Subject: [PATCH 14/56] Update docstring and debugging.rst --- pytorch_lightning/callbacks/device_stats_monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 553ed135c22cd..dc44c3fd65261 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -34,7 +34,8 @@ class DeviceStatsMonitor(Callback): is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. Args: - cpu_stats: if True, will also record CPU stats in addition to accelerator stats (default: True). + cpu_stats: if True, will also record CPU stats in addition to accelerator stats. + Default: ``True``. Raises: MisconfigurationException: From e1878264bba652d82dc380555c09613677d3a17f Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 16:11:49 -0500 Subject: [PATCH 15/56] Update docstring and debugging.rst --- pytorch_lightning/callbacks/device_stats_monitor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index dc44c3fd65261..542705da4ae93 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -49,7 +49,6 @@ class DeviceStatsMonitor(Callback): """ def __init__(self, cpu_stats: bool = True) -> None: - super().__init__() self.cpu_stats = cpu_stats def setup( From 23d42600c9ca637d0845b2a5984c939400df6fdf Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 16:13:36 -0500 Subject: [PATCH 16/56] Replace metrics.keys() w metrics --- tests/callbacks/test_device_stats_monitor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 6bffcac89d4c2..d0cb544848515 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -44,7 +44,7 @@ class DebugLogger(CSVLogger): def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] for f in fields: - assert any(f in h for h in metrics.keys()) + assert any(f in h for h in metrics) trainer = Trainer( default_root_dir=tmpdir, @@ -74,10 +74,10 @@ class DebugLogger(CSVLogger): def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] for f in fields: - assert any(f in h for h in metrics.keys()) + assert any(f in h for h in metrics) for f in CPU_METRIC_KEYS: - assert not any(f in h for h in metrics.keys()), "CPU Stats should not be included" + assert not any(f in h for h in metrics), "CPU Stats should not be included" trainer = Trainer( default_root_dir=tmpdir, @@ -112,7 +112,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> "utilization.memory", ] + CPU_METRIC_KEYS for f in fields: - assert any(f in h for h in metrics.keys()) + assert any(f in h for h in metrics) trainer = Trainer( default_root_dir=tmpdir, @@ -139,7 +139,7 @@ class DebugLogger(CSVLogger): def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: fields = CPU_METRIC_KEYS for f in fields: - assert any(f in h for h in metrics.keys()) + assert any(f in h for h in metrics) device_stats = DeviceStatsMonitor() trainer = Trainer( @@ -195,7 +195,7 @@ class DebugLogger(CSVLogger): def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: fields = ["avg. free memory (MB)", "avg. peak memory (MB)"] for f in fields: - assert any(f in h for h in metrics.keys()) + assert any(f in h for h in metrics) trainer = Trainer( default_root_dir=tmpdir, From c245ff14d20775baef1fc2fc17b87b6ae3d16f7e Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 17:22:04 -0500 Subject: [PATCH 17/56] Updated GPU tests --- tests/callbacks/test_device_stats_monitor.py | 49 ++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index d0cb544848515..f52a9314d0acb 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -67,7 +67,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> def test_device_stats_gpu_from_nvidia(tmpdir): """Test GPU stats are logged using a logger with Pytorch < 1.8.0.""" model = BoringModel() - device_stats = DeviceStatsMonitor() + device_stats = DeviceStatsMonitor(cpu_stats=False) class DebugLogger(CSVLogger): @rank_zero_only @@ -76,9 +76,6 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> for f in fields: assert any(f in h for h in metrics) - for f in CPU_METRIC_KEYS: - assert not any(f in h for h in metrics), "CPU Stats should not be included" - trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, @@ -95,12 +92,11 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) -@RunIf(max_torch="1.7") @RunIf(min_gpus=1) def test_device_stats_gpu_from_nvidia_and_cpu(tmpdir): - """Test GPU stats + CPU stats are logged using a logger with Pytorch < 1.8.0.""" + """Test GPU stats + CPU stats are logged using a logger.""" model = BoringModel() - device_stats = DeviceStatsMonitor(cpu_stats=True) + device_stats = DeviceStatsMonitor() class DebugLogger(CSVLogger): @rank_zero_only @@ -117,7 +113,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, - limit_train_batches=7, + limit_train_batches=2, log_every_n_steps=1, accelerator="gpu", devices=1, @@ -129,6 +125,43 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) +@RunIf(min_gpus=1) +def test_device_stats_gpu_from_nvidia_no_cpu(tmpdir): + """Test only GPU stat stats are logged using a logger.""" + model = BoringModel() + device_stats = DeviceStatsMonitor(cpu_stats=False) + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = [ + "utilization.gpu", + "memory.used", + "memory.free", + "utilization.memory", + ] + for f in fields: + assert any(f in h for h in metrics) + + for f in CPU_METRIC_KEYS: + assert not any(f in h for h in metrics), "CPU Stats should not be included" + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=2, + log_every_n_steps=1, + accelerator="gpu", + devices=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + enable_checkpointing=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + def test_device_stats_cpu(tmpdir): """Test CPU stats are logged when no accelerator is used.""" From 43ce53e27d46d0045e81ae34d23c62f61dfe6077 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 19:31:09 -0500 Subject: [PATCH 18/56] Clean up comments + strings Co-authored-by: Jirka Borovec --- pytorch_lightning/accelerators/cpu.py | 4 ++-- pytorch_lightning/callbacks/device_stats_monitor.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index f70d7cacbff9e..9c7c67d056dc0 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -80,8 +80,8 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: def get_cpu_process_metrics() -> dict[str, float]: if not _PSUTIL_AVAILABLE: raise ModuleNotFoundError( - "`get_cpu_process_metrics` requires `psutil` to be installed. " - + "Install it by running `pip install -U psutil`." + "`get_cpu_process_metrics` requires `psutil` to be installed." + " Install it by running `pip install -U psutil`." ) metrics = { diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 542705da4ae93..dd14bcc295ef6 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -35,7 +35,6 @@ class DeviceStatsMonitor(Callback): Args: cpu_stats: if True, will also record CPU stats in addition to accelerator stats. - Default: ``True``. Raises: MisconfigurationException: From 57eb885ec83de1b33204306b960f6cf6db065259 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Fri, 25 Feb 2022 20:13:19 -0500 Subject: [PATCH 19/56] Updated GPU metrics that are checked --- tests/callbacks/test_device_stats_monitor.py | 31 ++++++++++++-------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index f52a9314d0acb..26e98e376e33c 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -42,7 +42,11 @@ def test_device_stats_gpu_from_torch(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] + fields = [ + "allocated_bytes.all.freed", + "inactive_split.all.peak", + "reserved_bytes.large_pool.peak", + ] for f in fields: assert any(f in h for h in metrics) @@ -72,7 +76,12 @@ def test_device_stats_gpu_from_nvidia(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] + fields = [ + "utilization.gpu", + "memory.used", + "memory.free", + "utilization.memory", + ] for f in fields: assert any(f in h for h in metrics) @@ -101,11 +110,10 @@ def test_device_stats_gpu_from_nvidia_and_cpu(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + # Just need to check one of the GPU metrics to make sure + # the GPU metrics are logged fields = [ - "utilization.gpu", - "memory.used", - "memory.free", - "utilization.memory", + "allocated.all.current", ] + CPU_METRIC_KEYS for f in fields: assert any(f in h for h in metrics) @@ -125,6 +133,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) + @RunIf(min_gpus=1) def test_device_stats_gpu_from_nvidia_no_cpu(tmpdir): """Test only GPU stat stats are logged using a logger.""" @@ -134,15 +143,14 @@ def test_device_stats_gpu_from_nvidia_no_cpu(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + # Just need to check one of the GPU metrics to make sure + # the GPU metrics are logged fields = [ - "utilization.gpu", - "memory.used", - "memory.free", - "utilization.memory", + "allocated.all.current", ] for f in fields: assert any(f in h for h in metrics) - + for f in CPU_METRIC_KEYS: assert not any(f in h for h in metrics), "CPU Stats should not be included" @@ -162,7 +170,6 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) - def test_device_stats_cpu(tmpdir): """Test CPU stats are logged when no accelerator is used.""" model = BoringModel() From 374d1ab38ee9bcca12ab436d4443ee4055c8063c Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Sat, 26 Feb 2022 07:29:55 -0500 Subject: [PATCH 20/56] Updated GPU metric key --- tests/callbacks/test_device_stats_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 26e98e376e33c..ce3b0d1dd907a 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -113,7 +113,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> # Just need to check one of the GPU metrics to make sure # the GPU metrics are logged fields = [ - "allocated.all.current", + "active.all.allocated", ] + CPU_METRIC_KEYS for f in fields: assert any(f in h for h in metrics) @@ -146,7 +146,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> # Just need to check one of the GPU metrics to make sure # the GPU metrics are logged fields = [ - "allocated.all.current", + "active.all.allocated", ] for f in fields: assert any(f in h for h in metrics) From 5012b8c6280191a65ea12c6d3f8528571fb4cd0d Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Sat, 26 Feb 2022 18:11:18 -0500 Subject: [PATCH 21/56] Add back-ticks around code in error message + docs Co-authored-by: Rohit Gupta --- pytorch_lightning/callbacks/device_stats_monitor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index dd14bcc295ef6..a31eea49b7cae 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -34,7 +34,7 @@ class DeviceStatsMonitor(Callback): is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. Args: - cpu_stats: if True, will also record CPU stats in addition to accelerator stats. + cpu_stats: if ``True``, will also record CPU stats in addition to accelerator stats. Raises: MisconfigurationException: @@ -57,7 +57,9 @@ def setup( stage: Optional[str] = None, ) -> None: if not trainer.loggers: - raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.") + raise MisconfigurationException( + "Cannot use `DeviceStatsMonitor` callback with `Trainer` that has no logger." + ) def _get_and_log_device_stats( self, From c7816bff6b28263b5052b6d8b60189e770234a53 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Sat, 26 Feb 2022 18:25:13 -0500 Subject: [PATCH 22/56] Parameterized torch + cpu tests and updated docstrings --- tests/callbacks/test_device_stats_monitor.py | 46 ++++---------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index ce3b0d1dd907a..788b96884c815 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -101,44 +101,12 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) +@pytest.mark.parametrize("cpu_stats", [True, False]) @RunIf(min_gpus=1) -def test_device_stats_gpu_from_nvidia_and_cpu(tmpdir): - """Test GPU stats + CPU stats are logged using a logger.""" +def test_device_stats_gpu_from_torch_toggle_cpu(tmpdir, cpu_stats): + """Test only CPU stats can be enabled/disabled when using GPU.""" model = BoringModel() - device_stats = DeviceStatsMonitor() - - class DebugLogger(CSVLogger): - @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - # Just need to check one of the GPU metrics to make sure - # the GPU metrics are logged - fields = [ - "active.all.allocated", - ] + CPU_METRIC_KEYS - for f in fields: - assert any(f in h for h in metrics) - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=2, - log_every_n_steps=1, - accelerator="gpu", - devices=1, - callbacks=[device_stats], - logger=DebugLogger(tmpdir), - enable_checkpointing=False, - enable_progress_bar=False, - ) - - trainer.fit(model) - - -@RunIf(min_gpus=1) -def test_device_stats_gpu_from_nvidia_no_cpu(tmpdir): - """Test only GPU stat stats are logged using a logger.""" - model = BoringModel() - device_stats = DeviceStatsMonitor(cpu_stats=False) + device_stats = DeviceStatsMonitor(cpu_stats=cpu_stats) class DebugLogger(CSVLogger): @rank_zero_only @@ -148,10 +116,14 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> fields = [ "active.all.allocated", ] + + # If cpu stats, also check CPU metric keys are logged + fields += CPU_METRIC_KEYS if cpu_stats else [] for f in fields: assert any(f in h for h in metrics) - for f in CPU_METRIC_KEYS: + # If not cpu stats, make sure CPU metric keys aren't logged + if not cpu_stats: assert not any(f in h for h in metrics), "CPU Stats should not be included" trainer = Trainer( From 73d38fbe79c10306024ffadc38a3930ce2913b5e Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Sat, 26 Feb 2022 18:26:15 -0500 Subject: [PATCH 23/56] Only run GPU torch metrics logged for > 1.8 Co-authored-by: Rohit Gupta --- tests/callbacks/test_device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 788b96884c815..d3f2386e22e73 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -102,7 +102,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> @pytest.mark.parametrize("cpu_stats", [True, False]) -@RunIf(min_gpus=1) +@RunIf(min_gpus=1, min_torch="1.8") def test_device_stats_gpu_from_torch_toggle_cpu(tmpdir, cpu_stats): """Test only CPU stats can be enabled/disabled when using GPU.""" model = BoringModel() From ea0765ead83d3addc4d6d7a4c006f6193e7eb3eb Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sun, 27 Feb 2022 15:37:24 +0530 Subject: [PATCH 24/56] update test --- tests/callbacks/test_device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index d3f2386e22e73..3128656d7de72 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -240,7 +240,7 @@ def test_device_stats_monitor_no_logger(tmpdir): enable_progress_bar=False, ) - with pytest.raises(MisconfigurationException, match="Trainer that has no logger."): + with pytest.raises(MisconfigurationException, match="`Trainer` that has no logger."): trainer.fit(model) From 5d1c0bcdce5a25a2d7c9e03c3113840119ab644f Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Sun, 27 Feb 2022 17:01:49 -0500 Subject: [PATCH 25/56] Remove mocking comment + reduce steps --- tests/callbacks/test_device_stats_monitor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 3128656d7de72..dbfebaac611b4 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -128,8 +128,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer = Trainer( default_root_dir=tmpdir, - max_epochs=2, + max_epochs=1, limit_train_batches=2, + limit_val_batches=2, log_every_n_steps=1, accelerator="gpu", devices=1, @@ -156,8 +157,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> device_stats = DeviceStatsMonitor() trainer = Trainer( default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=7, + max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, log_every_n_steps=1, callbacks=[device_stats], logger=DebugLogger(tmpdir), @@ -188,9 +190,6 @@ def test_device_stats_cpu_queried_once(cpu_metrics_device_stats_mock, cpu_metric trainer.fit(model) - # Note that you need to mock where the function is imported - # (not where it is defined). Please see the following for - # an explanation: https://docs.python.org/3/library/unittest.mock.html#where-to-patch assert cpu_metrics_device_stats_mock.call_count == 0 # called inside DeviceStatsMonitor assert cpu_metrics_cpu_accelerator_mock.call_count == 2 # called inside CPUAccelerator From 0e9020070db290bd99808fd30bddfdb440c61ed5 Mon Sep 17 00:00:00 2001 From: Eric Wiener Date: Sun, 27 Feb 2022 17:15:40 -0500 Subject: [PATCH 26/56] Update change log --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f201d6ccf62d3..8fca1e36f1b0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -534,6 +534,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue with resuming from a checkpoint trained with QAT ([#11346](https://github.com/PyTorchLightning/pytorch-lightning/pull/11346)) +- Added CPU metric tracking to `DeviceStatsMonitor` ([#11795](https://github.com/PyTorchLightning/pytorch-lightning/pull/11795)) + + ## [1.5.10] - 2022-02-08 ### Fixed From 4e54f3242fdd3899e5f58281323e7529aeab0cc5 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 26 Apr 2022 18:44:58 +0530 Subject: [PATCH 27/56] Update docs --- docs/source/tuning/profiler_basic.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/tuning/profiler_basic.rst b/docs/source/tuning/profiler_basic.rst index 899e657904d46..6dc365b4afd1b 100644 --- a/docs/source/tuning/profiler_basic.rst +++ b/docs/source/tuning/profiler_basic.rst @@ -119,3 +119,6 @@ This can be measured with the :class:`~pytorch_lightning.callbacks.device_stats_ from pytorch_lightning.callbacks import DeviceStatsMonitor trainer = Trainer(callbacks=[DeviceStatsMonitor()]) + +CPU metrics will be tracked by default regardless of the accelerator used. To disable logging +CPU metrics, you can specify ``DeviceStatsMonitor(cpu_stats=False)``. From c6864defcdfb4c21ff97fdbc1bdb2e1397e0f56e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Apr 2022 13:12:41 +0000 Subject: [PATCH 28/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/cpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 9c7c67d056dc0..3b88fa65e8330 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -77,6 +77,7 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: description=f"{cls.__class__.__name__}", ) + def get_cpu_process_metrics() -> dict[str, float]: if not _PSUTIL_AVAILABLE: raise ModuleNotFoundError( From 4ccf4d77aff1ba3ec3d222399260abaa80b520aa Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 28 Apr 2022 00:55:14 +0530 Subject: [PATCH 29/56] Update cpu stats logging logic --- pytorch_lightning/callbacks/device_stats_monitor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index a31eea49b7cae..88bf8b62d10d5 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -34,7 +34,7 @@ class DeviceStatsMonitor(Callback): is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. Args: - cpu_stats: if ``True``, will also record CPU stats in addition to accelerator stats. + cpu_stats: if ``True``, it will record CPU stats. Raises: MisconfigurationException: @@ -47,7 +47,7 @@ class DeviceStatsMonitor(Callback): >>> trainer = Trainer(callbacks=[device_stats]) # doctest: +SKIP """ - def __init__(self, cpu_stats: bool = True) -> None: + def __init__(self, cpu_stats: Optional[bool] = None) -> None: self.cpu_stats = cpu_stats def setup( @@ -75,8 +75,11 @@ def _get_and_log_device_stats( if not trainer._logger_connector.should_update_logs: return + device_stats = {} device = trainer.strategy.root_device - device_stats = trainer.accelerator.get_device_stats(device) + + if self.cpu_stats is None or self.cpu_stats: + device_stats = trainer.accelerator.get_device_stats(device) if self.cpu_stats and device.type != "cpu": # Don't query CPU stats twice if CPU is accelerator From ce6be7f16169b3b306e615e44345244e05a18503 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 28 Apr 2022 01:12:47 +0530 Subject: [PATCH 30/56] fix errors --- pytorch_lightning/accelerators/cpu.py | 27 ++----------------- .../callbacks/device_stats_monitor.py | 2 +- pytorch_lightning/utilities/memory.py | 25 +++++++++++++++++ 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 3b88fa65e8330..2654c440b7491 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -18,17 +18,9 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE +from pytorch_lightning.utilities.memory import get_cpu_process_metrics from pytorch_lightning.utilities.types import _DEVICE -if _PSUTIL_AVAILABLE: - import psutil - -# CPU device metrics -_CPU_VM_PERCENT = "cpu_vm_percent" -_CPU_PERCENT = "cpu_percent" -_CPU_SWAP_PERCENT = "cpu_swap_percent" - class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" @@ -43,7 +35,7 @@ def setup_environment(self, root_device: torch.device) -> None: if root_device.type != "cpu": raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") - def get_device_stats(self, device: _DEVICE) -> dict[str, Any]: + def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Get CPU stats from psutil.""" return get_cpu_process_metrics() @@ -76,18 +68,3 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - - -def get_cpu_process_metrics() -> dict[str, float]: - if not _PSUTIL_AVAILABLE: - raise ModuleNotFoundError( - "`get_cpu_process_metrics` requires `psutil` to be installed." - " Install it by running `pip install -U psutil`." - ) - - metrics = { - _CPU_VM_PERCENT: psutil.virtual_memory().percent, - _CPU_PERCENT: psutil.cpu_percent(), - _CPU_SWAP_PERCENT: psutil.swap_memory().percent, - } - return metrics diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 88bf8b62d10d5..68f86764b2c6e 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -21,9 +21,9 @@ from typing import Any, Dict, Optional import pytorch_lightning as pl -from pytorch_lightning.accelerators.cpu import get_cpu_process_metrics from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.memory import get_cpu_process_metrics from pytorch_lightning.utilities.types import STEP_OUTPUT from pytorch_lightning.utilities.warnings import rank_zero_deprecation diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index f42f662d2fc65..b184a98cb776e 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -24,6 +24,10 @@ from torch.nn import Module from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE + +if _PSUTIL_AVAILABLE: + import psutil def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any: @@ -141,3 +145,24 @@ def get_model_size_mb(model: Module) -> float: torch.save(model.state_dict(), model_size) size_mb = model_size.getbuffer().nbytes / 1e6 return size_mb + + +# CPU device metrics +_CPU_VM_PERCENT = "cpu_vm_percent" +_CPU_PERCENT = "cpu_percent" +_CPU_SWAP_PERCENT = "cpu_swap_percent" + + +def get_cpu_process_metrics() -> Dict[str, float]: + if not _PSUTIL_AVAILABLE: + raise ModuleNotFoundError( + "`get_cpu_process_metrics` requires `psutil` to be installed." + " Install it by running `pip install -U psutil`." + ) + + metrics = { + _CPU_VM_PERCENT: psutil.virtual_memory().percent, + _CPU_PERCENT: psutil.cpu_percent(), + _CPU_SWAP_PERCENT: psutil.swap_memory().percent, + } + return metrics From 629bd29aae181dbadaffa0b734e94efa371c2a29 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 28 Apr 2022 11:09:57 +0530 Subject: [PATCH 31/56] Update docstring --- pytorch_lightning/callbacks/device_stats_monitor.py | 6 +++++- tests/callbacks/test_device_stats_monitor.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 68f86764b2c6e..b121a5a751887 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -34,7 +34,11 @@ class DeviceStatsMonitor(Callback): is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. Args: - cpu_stats: if ``True``, it will record CPU stats. + cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. + It will raise a warning if `psutil` is not installed till version 1.7.0. + if ``True``, it will log CPU stats regardless of the accelerator, and it will + raise an exception if `psutil` is not installed. + if ``False``, it will not log CPU stats regardless of the accelerator. Raises: MisconfigurationException: diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index dbfebaac611b4..b35f918084a6a 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -17,11 +17,11 @@ import pytest from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.cpu import _CPU_PERCENT, _CPU_SWAP_PERCENT, _CPU_VM_PERCENT from pytorch_lightning.callbacks import DeviceStatsMonitor from pytorch_lightning.callbacks.device_stats_monitor import _prefix_metric_keys from pytorch_lightning.loggers import CSVLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.memory import _CPU_PERCENT, _CPU_SWAP_PERCENT, _CPU_VM_PERCENT from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests.helpers import BoringModel from tests.helpers.runif import RunIf From 59453d65a1d2e27e9e676a6f4cee7e69b67ce976 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 28 Apr 2022 12:43:52 +0530 Subject: [PATCH 32/56] Update DeviceStatsMonitor and tests --- .../callbacks/device_stats_monitor.py | 19 +++++++++++--- tests/callbacks/test_device_stats_monitor.py | 25 +++++++++++-------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index b121a5a751887..189d71e1e1277 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -23,9 +23,10 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.memory import get_cpu_process_metrics from pytorch_lightning.utilities.types import STEP_OUTPUT -from pytorch_lightning.utilities.warnings import rank_zero_deprecation +from pytorch_lightning.utilities.warnings import rank_zero_deprecation, rank_zero_warn class DeviceStatsMonitor(Callback): @@ -35,7 +36,7 @@ class DeviceStatsMonitor(Callback): Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. - It will raise a warning if `psutil` is not installed till version 1.7.0. + It will raise a warning if `psutil` is not installed till version 1.9.0. if ``True``, it will log CPU stats regardless of the accelerator, and it will raise an exception if `psutil` is not installed. if ``False``, it will not log CPU stats regardless of the accelerator. @@ -82,8 +83,18 @@ def _get_and_log_device_stats( device_stats = {} device = trainer.strategy.root_device - if self.cpu_stats is None or self.cpu_stats: - device_stats = trainer.accelerator.get_device_stats(device) + if self.cpu_stats is None: + if device.type == "cpu" and not _PSUTIL_AVAILABLE: + rank_zero_warn( + "`psutil` is not installed. `DeviceStatsMonitor` will not log CPU stats." + " To install `psutil`, run `pip install psutil`." + " It will raise an exception if `psutil` is not installed post version 1.9.0." + ) + return + elif not self.cpu_stats and device.type == "cpu": + return + + device_stats = trainer.accelerator.get_device_stats(device) if self.cpu_stats and device.type != "cpu": # Don't query CPU stats twice if CPU is accelerator diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index b35f918084a6a..c9d987d43b72f 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -87,8 +87,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer = Trainer( default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=7, + max_epochs=1, + limit_train_batches=2, log_every_n_steps=1, accelerator="gpu", devices=1, @@ -101,7 +101,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) -@pytest.mark.parametrize("cpu_stats", [True, False]) +@pytest.mark.parametrize("cpu_stats", [None, True, False]) @RunIf(min_gpus=1, min_torch="1.8") def test_device_stats_gpu_from_torch_toggle_cpu(tmpdir, cpu_stats): """Test only CPU stats can be enabled/disabled when using GPU.""" @@ -118,13 +118,13 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> ] # If cpu stats, also check CPU metric keys are logged - fields += CPU_METRIC_KEYS if cpu_stats else [] + fields = CPU_METRIC_KEYS if cpu_stats or cpu_stats is None else [] for f in fields: assert any(f in h for h in metrics) - # If not cpu stats, make sure CPU metric keys aren't logged - if not cpu_stats: - assert not any(f in h for h in metrics), "CPU Stats should not be included" + # If not cpu stats, make sure CPU metric keys aren't logged + if cpu_stats is not None and not cpu_stats: + assert not any(f in h for h in metrics), "CPU Stats should not be included" trainer = Trainer( default_root_dir=tmpdir, @@ -143,18 +143,23 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) -def test_device_stats_cpu(tmpdir): +@pytest.mark.parametrize("cpu_stats", [None, True, False]) +def test_device_stats_cpu(tmpdir, cpu_stats): """Test CPU stats are logged when no accelerator is used.""" model = BoringModel() class DebugLogger(CSVLogger): @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = CPU_METRIC_KEYS + fields = CPU_METRIC_KEYS if cpu_stats or cpu_stats is None else [] for f in fields: assert any(f in h for h in metrics) - device_stats = DeviceStatsMonitor() + # If not cpu stats, make sure CPU metric keys aren't logged + if cpu_stats is not None and not cpu_stats: + assert not any(f in h for h in metrics), "CPU Stats should not be included" + + device_stats = DeviceStatsMonitor(cpu_stats=cpu_stats) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, From e97ab3168356d4a5328d5c2549f86868dce081d0 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 28 Apr 2022 13:26:34 +0530 Subject: [PATCH 33/56] Update RunIf --- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/memory.py | 2 +- tests/helpers/runif.py | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 289b7faa431e2..414e8d04962ff 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -46,6 +46,7 @@ _module_available, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, + _PSUTIL_AVAILABLE, _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index b184a98cb776e..5930b738de909 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -156,7 +156,7 @@ def get_model_size_mb(model: Module) -> float: def get_cpu_process_metrics() -> Dict[str, float]: if not _PSUTIL_AVAILABLE: raise ModuleNotFoundError( - "`get_cpu_process_metrics` requires `psutil` to be installed." + "Fetching CPU device stats requires `psutil` to be installed." " Install it by running `pip install -U psutil`." ) diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 5a2464f6fd6ba..cc07ca9006fe6 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -30,6 +30,7 @@ _HPU_AVAILABLE, _IPU_AVAILABLE, _OMEGACONF_AVAILABLE, + _PSUTIL_AVAILABLE, _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, _TORCH_QUANTIZE_AVAILABLE, @@ -84,6 +85,7 @@ def __new__( omegaconf: bool = False, slow: bool = False, bagua: bool = False, + psutil: bool = False, **kwargs, ): """ @@ -111,6 +113,7 @@ def __new__( omegaconf: Require that omry/omegaconf is installed. slow: Mark the test as slow, our CI will run it in a separate job. bagua: Require that BaguaSys/bagua is installed. + psutil: Require that psutil is installed. **kwargs: Any :class:`pytest.mark.skipif` keyword arguments. """ conditions = [] @@ -231,6 +234,10 @@ def __new__( conditions.append(not _BAGUA_AVAILABLE or sys.platform in ("win32", "darwin")) reasons.append("Bagua") + if psutil: + conditions.append(not _PSUTIL_AVAILABLE) + reasons.append("psutil") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs From ae4a2d920abb4a6e97361fd1f3ada2fe1bb60584 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 28 Apr 2022 14:27:53 +0530 Subject: [PATCH 34/56] Update tests --- .../callbacks/device_stats_monitor.py | 16 +++++++----- requirements/test.txt | 1 + tests/callbacks/test_device_stats_monitor.py | 26 ++++++++++++++++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 189d71e1e1277..429e377b821e4 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -25,8 +25,8 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.memory import get_cpu_process_metrics +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.types import STEP_OUTPUT -from pytorch_lightning.utilities.warnings import rank_zero_deprecation, rank_zero_warn class DeviceStatsMonitor(Callback): @@ -65,6 +65,15 @@ def setup( raise MisconfigurationException( "Cannot use `DeviceStatsMonitor` callback with `Trainer` that has no logger." ) + device = trainer.strategy.root_device + + if self.cpu_stats is None: + if device.type == "cpu" and not _PSUTIL_AVAILABLE: + rank_zero_warn( + "`DeviceStatsMonitor` will not log CPU stats as `psutil` is not installed." + " To install `psutil`, run `pip install psutil`." + " It will raise an exception if `psutil` is not installed post v1.9.0." + ) def _get_and_log_device_stats( self, @@ -85,11 +94,6 @@ def _get_and_log_device_stats( if self.cpu_stats is None: if device.type == "cpu" and not _PSUTIL_AVAILABLE: - rank_zero_warn( - "`psutil` is not installed. `DeviceStatsMonitor` will not log CPU stats." - " To install `psutil`, run `pip install psutil`." - " It will raise an exception if `psutil` is not installed post version 1.9.0." - ) return elif not self.cpu_stats and device.type == "cpu": return diff --git a/requirements/test.txt b/requirements/test.txt index 51d9ecf71db44..4373baec7a388 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -9,6 +9,7 @@ pre-commit>=1.0 pytest-forked sklearn jsonargparse +psutil # needed in tests cloudpickle>=1.3 diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index c9d987d43b72f..50aa6e3b1fa49 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -102,7 +102,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> @pytest.mark.parametrize("cpu_stats", [None, True, False]) -@RunIf(min_gpus=1, min_torch="1.8") +@RunIf(min_gpus=1, min_torch="1.8", psutil=True) def test_device_stats_gpu_from_torch_toggle_cpu(tmpdir, cpu_stats): """Test only CPU stats can be enabled/disabled when using GPU.""" model = BoringModel() @@ -143,6 +143,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) +@RunIf(psutil=True) @pytest.mark.parametrize("cpu_stats", [None, True, False]) def test_device_stats_cpu(tmpdir, cpu_stats): """Test CPU stats are logged when no accelerator is used.""" @@ -175,6 +176,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) +@RunIf(psutil=True) @mock.patch("pytorch_lightning.accelerators.cpu.get_cpu_process_metrics") @mock.patch("pytorch_lightning.callbacks.device_stats_monitor.get_cpu_process_metrics") def test_device_stats_cpu_queried_once(cpu_metrics_device_stats_mock, cpu_metrics_cpu_accelerator_mock, tmpdir): @@ -255,3 +257,25 @@ def test_prefix_metric_keys(tmpdir): separator = "." converted_metrics = _prefix_metric_keys(metrics, prefix, separator) assert converted_metrics == {"foo.1": 1.0, "foo.2": 2.0, "foo.3": 3.0} + + +@mock.patch("pytorch_lightning.utilities.imports._PSUTIL_AVAILABLE", return_value=False) +def test_warning_being_raised_when_psutil_is_not_available_with_device_stats(mock_psutil_available): + """Test that warning is raised when psutil is not available.""" + + model = BoringModel() + + class DebugStatsMonitor(DeviceStatsMonitor): + def setup(self, trainer, pl_module, stage): + super().setup(trainer, pl_module, stage) + raise SystemExit + + trainer = Trainer( + callbacks=DebugStatsMonitor(), + max_epochs=1, + log_every_n_steps=1, + enable_checkpointing=False, + enable_progress_bar=False, + ) + with pytest.raises(SystemExit), pytest.warns(UserWarning, match="psutil is not available"): + trainer.fit(model) From 33c2bf493d539df4d7cf7acdcf207fb2c81f4ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Apr 2022 15:59:26 +0200 Subject: [PATCH 35/56] Self review --- .../callbacks/device_stats_monitor.py | 44 +++++++------------ pytorch_lightning/utilities/memory.py | 7 +-- requirements/test.txt | 2 +- 3 files changed, 20 insertions(+), 33 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 429e377b821e4..c89017fc0dc11 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -65,37 +65,27 @@ def setup( raise MisconfigurationException( "Cannot use `DeviceStatsMonitor` callback with `Trainer` that has no logger." ) + # warn in setup to warn once device = trainer.strategy.root_device + if self.cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: + rank_zero_warn( + "`DeviceStatsMonitor` will not log CPU stats as `psutil` is not installed." + " To install `psutil`, run `pip install psutil`." + " It will raise an exception if `psutil` is not installed post v1.9.0." + ) - if self.cpu_stats is None: - if device.type == "cpu" and not _PSUTIL_AVAILABLE: - rank_zero_warn( - "`DeviceStatsMonitor` will not log CPU stats as `psutil` is not installed." - " To install `psutil`, run `pip install psutil`." - " It will raise an exception if `psutil` is not installed post v1.9.0." - ) - - def _get_and_log_device_stats( - self, - trainer: "pl.Trainer", - pl_module: "pl.LightningModule", - batch: Any, - batch_idx: int, - key: str, - ) -> None: - if not trainer.loggers: - raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") - + def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None: if not trainer._logger_connector.should_update_logs: return + if not trainer.loggers: + raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") - device_stats = {} device = trainer.strategy.root_device - - if self.cpu_stats is None: - if device.type == "cpu" and not _PSUTIL_AVAILABLE: - return - elif not self.cpu_stats and device.type == "cpu": + if self.cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: + # we just warn for now + return + if self.cpu_stats is False and device.type == "cpu": + # cpu stats are disabled return device_stats = trainer.accelerator.get_device_stats(device) @@ -117,7 +107,7 @@ def on_train_batch_start( batch_idx: int, unused: Optional[int] = 0, ) -> None: - self._get_and_log_device_stats(trainer, pl_module, batch, batch_idx, "on_train_batch_start") + self._get_and_log_device_stats(trainer, "on_train_batch_start") def on_train_batch_end( self, @@ -128,7 +118,7 @@ def on_train_batch_end( batch_idx: int, unused: int = 0, ) -> None: - self._get_and_log_device_stats(trainer, pl_module, batch, batch_idx, "on_train_batch_end") + self._get_and_log_device_stats(trainer, "on_train_batch_end") def _prefix_metric_keys(metrics_dict: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 5930b738de909..7302a30867fd2 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -26,9 +26,6 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE -if _PSUTIL_AVAILABLE: - import psutil - def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any: """Detach all tensors in `in_dict`. @@ -159,10 +156,10 @@ def get_cpu_process_metrics() -> Dict[str, float]: "Fetching CPU device stats requires `psutil` to be installed." " Install it by running `pip install -U psutil`." ) + import psutil - metrics = { + return { _CPU_VM_PERCENT: psutil.virtual_memory().percent, _CPU_PERCENT: psutil.cpu_percent(), _CPU_SWAP_PERCENT: psutil.swap_memory().percent, } - return metrics diff --git a/requirements/test.txt b/requirements/test.txt index 4373baec7a388..4c8a112e3a3e3 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -9,10 +9,10 @@ pre-commit>=1.0 pytest-forked sklearn jsonargparse -psutil # needed in tests cloudpickle>=1.3 scikit-learn>0.22.1 onnxruntime +psutil # for `DeviceStatsMonitor` pandas # needed in benchmarks From 26b43c4b294d497e74f8ae4857b75047a979e697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Apr 2022 16:09:50 +0200 Subject: [PATCH 36/56] Refactor test --- .../callbacks/device_stats_monitor.py | 2 +- pytorch_lightning/utilities/__init__.py | 1 - tests/callbacks/test_device_stats_monitor.py | 30 +++++++------------ tests/helpers/runif.py | 2 +- 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index c89017fc0dc11..677210b7ddbca 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -97,7 +97,7 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None: for logger in trainer.loggers: separator = logger.group_separator prefixed_device_stats = _prefix_metric_keys(device_stats, key, separator) - logger.log_metrics(prefixed_device_stats, step=trainer.global_step) + logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def on_train_batch_start( self, diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 414e8d04962ff..289b7faa431e2 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -46,7 +46,6 @@ _module_available, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, - _PSUTIL_AVAILABLE, _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 50aa6e3b1fa49..54b2b90676867 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -13,8 +13,10 @@ # limitations under the License. from typing import Dict, Optional from unittest import mock +from unittest.mock import Mock import pytest +import torch from pytorch_lightning import Trainer from pytorch_lightning.callbacks import DeviceStatsMonitor @@ -259,23 +261,13 @@ def test_prefix_metric_keys(tmpdir): assert converted_metrics == {"foo.1": 1.0, "foo.2": 2.0, "foo.3": 3.0} -@mock.patch("pytorch_lightning.utilities.imports._PSUTIL_AVAILABLE", return_value=False) -def test_warning_being_raised_when_psutil_is_not_available_with_device_stats(mock_psutil_available): +def test_warning_being_raised_when_psutil_is_not_available_with_device_stats(monkeypatch): """Test that warning is raised when psutil is not available.""" - - model = BoringModel() - - class DebugStatsMonitor(DeviceStatsMonitor): - def setup(self, trainer, pl_module, stage): - super().setup(trainer, pl_module, stage) - raise SystemExit - - trainer = Trainer( - callbacks=DebugStatsMonitor(), - max_epochs=1, - log_every_n_steps=1, - enable_checkpointing=False, - enable_progress_bar=False, - ) - with pytest.raises(SystemExit), pytest.warns(UserWarning, match="psutil is not available"): - trainer.fit(model) + import pytorch_lightning.callbacks.device_stats_monitor as imports + + monkeypatch.setattr(imports, "_PSUTIL_AVAILABLE", False) + monitor = DeviceStatsMonitor() + trainer = Trainer() + assert trainer.strategy.root_device == torch.device("cpu") + with pytest.warns(UserWarning, match="psutil` is not installed"): + monitor.setup(trainer, Mock()) diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index cc07ca9006fe6..2fb822e6afec1 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -20,7 +20,7 @@ from packaging.version import Version from pkg_resources import get_distribution -from pytorch_lightning.utilities import ( +from pytorch_lightning.utilities.imports import ( _APEX_AVAILABLE, _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, From 56afcfbcfafa6cca62e526524ddb008c0d5e59a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Apr 2022 19:32:34 +0200 Subject: [PATCH 37/56] Simplify tests --- tests/callbacks/test_device_stats_monitor.py | 102 ++----------------- 1 file changed, 10 insertions(+), 92 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 54b2b90676867..9c6b9d745a359 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -68,113 +68,32 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer.fit(model) -@RunIf(max_torch="1.7") -@RunIf(min_gpus=1) -def test_device_stats_gpu_from_nvidia(tmpdir): - """Test GPU stats are logged using a logger with Pytorch < 1.8.0.""" - model = BoringModel() - device_stats = DeviceStatsMonitor(cpu_stats=False) - - class DebugLogger(CSVLogger): - @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = [ - "utilization.gpu", - "memory.used", - "memory.free", - "utilization.memory", - ] - for f in fields: - assert any(f in h for h in metrics) - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - log_every_n_steps=1, - accelerator="gpu", - devices=1, - callbacks=[device_stats], - logger=DebugLogger(tmpdir), - enable_checkpointing=False, - enable_progress_bar=False, - ) - - trainer.fit(model) - - -@pytest.mark.parametrize("cpu_stats", [None, True, False]) -@RunIf(min_gpus=1, min_torch="1.8", psutil=True) -def test_device_stats_gpu_from_torch_toggle_cpu(tmpdir, cpu_stats): - """Test only CPU stats can be enabled/disabled when using GPU.""" - model = BoringModel() - device_stats = DeviceStatsMonitor(cpu_stats=cpu_stats) - - class DebugLogger(CSVLogger): - @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - # Just need to check one of the GPU metrics to make sure - # the GPU metrics are logged - fields = [ - "active.all.allocated", - ] - - # If cpu stats, also check CPU metric keys are logged - fields = CPU_METRIC_KEYS if cpu_stats or cpu_stats is None else [] - for f in fields: - assert any(f in h for h in metrics) - - # If not cpu stats, make sure CPU metric keys aren't logged - if cpu_stats is not None and not cpu_stats: - assert not any(f in h for h in metrics), "CPU Stats should not be included" - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - log_every_n_steps=1, - accelerator="gpu", - devices=1, - callbacks=[device_stats], - logger=DebugLogger(tmpdir), - enable_checkpointing=False, - enable_progress_bar=False, - ) - - trainer.fit(model) - - @RunIf(psutil=True) -@pytest.mark.parametrize("cpu_stats", [None, True, False]) +@pytest.mark.parametrize("cpu_stats", (None, True, False)) def test_device_stats_cpu(tmpdir, cpu_stats): """Test CPU stats are logged when no accelerator is used.""" model = BoringModel() class DebugLogger(CSVLogger): - @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - fields = CPU_METRIC_KEYS if cpu_stats or cpu_stats is None else [] - for f in fields: - assert any(f in h for h in metrics) - - # If not cpu stats, make sure CPU metric keys aren't logged - if cpu_stats is not None and not cpu_stats: - assert not any(f in h for h in metrics), "CPU Stats should not be included" + enabled = cpu_stats or cpu_stats is None + for f in CPU_METRIC_KEYS: + has_cpu_metrics = any(f in h for h in metrics) + assert has_cpu_metrics if enabled else not has_cpu_metrics device_stats = DeviceStatsMonitor(cpu_stats=cpu_stats) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, limit_train_batches=2, - limit_val_batches=2, + limit_val_batches=0, log_every_n_steps=1, - callbacks=[device_stats], + callbacks=device_stats, logger=DebugLogger(tmpdir), enable_checkpointing=False, enable_progress_bar=False, + accelerator="auto", ) - trainer.fit(model) @@ -184,19 +103,18 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> def test_device_stats_cpu_queried_once(cpu_metrics_device_stats_mock, cpu_metrics_cpu_accelerator_mock, tmpdir): """Make sure that get_cpu_process_metrics is only queried once if the accelerator is CPU and cpu_stats=True.""" model = BoringModel() - device_stats = DeviceStatsMonitor(cpu_stats=True) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, limit_train_batches=1, + limit_val_batches=0, log_every_n_steps=1, - callbacks=[device_stats], + callbacks=device_stats, logger=True, enable_checkpointing=False, enable_progress_bar=False, ) - trainer.fit(model) assert cpu_metrics_device_stats_mock.call_count == 0 # called inside DeviceStatsMonitor From b3f0b81171138567e780d40ae8c12e09a94fdeef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Apr 2022 19:34:30 +0200 Subject: [PATCH 38/56] Convert global to local --- tests/callbacks/test_device_stats_monitor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 9c6b9d745a359..a9b2d4ffbe4e8 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -28,12 +28,6 @@ from tests.helpers import BoringModel from tests.helpers.runif import RunIf -CPU_METRIC_KEYS = [ - _CPU_VM_PERCENT, - _CPU_SWAP_PERCENT, - _CPU_PERCENT, -] - @RunIf(min_gpus=1) def test_device_stats_gpu_from_torch(tmpdir): @@ -73,6 +67,11 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> def test_device_stats_cpu(tmpdir, cpu_stats): """Test CPU stats are logged when no accelerator is used.""" model = BoringModel() + CPU_METRIC_KEYS = [ + _CPU_VM_PERCENT, + _CPU_SWAP_PERCENT, + _CPU_PERCENT, + ] class DebugLogger(CSVLogger): def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: From d540a19d12dcebc59c3cc3d8e56de48bea18b1e9 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 29 Apr 2022 10:48:02 +0530 Subject: [PATCH 39/56] Fix tests --- tests/callbacks/test_device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index a9b2d4ffbe4e8..961b6fd8cb292 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -91,7 +91,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> logger=DebugLogger(tmpdir), enable_checkpointing=False, enable_progress_bar=False, - accelerator="auto", + accelerator="cpu", ) trainer.fit(model) From 9c4bb1f9db09314a2eb7a68748f99e136a52b5c5 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 4 May 2022 13:01:56 +0530 Subject: [PATCH 40/56] Update tests/callbacks/test_device_stats_monitor.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- tests/callbacks/test_device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 961b6fd8cb292..5521eb3f020b7 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -178,7 +178,7 @@ def test_prefix_metric_keys(tmpdir): assert converted_metrics == {"foo.1": 1.0, "foo.2": 2.0, "foo.3": 3.0} -def test_warning_being_raised_when_psutil_is_not_available_with_device_stats(monkeypatch): +def test_device_stats_monitor_warning_when_psutil_not_available(monkeypatch): """Test that warning is raised when psutil is not available.""" import pytorch_lightning.callbacks.device_stats_monitor as imports From ae4c99ff4ae9864fcedb85bfa729b3afb204bc9b Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 4 May 2022 13:02:05 +0530 Subject: [PATCH 41/56] Update pytorch_lightning/callbacks/device_stats_monitor.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/callbacks/device_stats_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 6ffb6c3634dce..160d169500dac 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -37,9 +37,9 @@ class DeviceStatsMonitor(Callback): Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. It will raise a warning if `psutil` is not installed till version 1.9.0. - if ``True``, it will log CPU stats regardless of the accelerator, and it will + If ``True``, it will log CPU stats regardless of the accelerator, and it will raise an exception if `psutil` is not installed. - if ``False``, it will not log CPU stats regardless of the accelerator. + If ``False``, it will not log CPU stats regardless of the accelerator. Raises: MisconfigurationException: From f50d0a786fdea4afd71f8a0ae2e1e96e6ada11f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 May 2022 07:33:06 +0000 Subject: [PATCH 42/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/callbacks/device_stats_monitor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 160d169500dac..1860ef4c5bf85 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -96,9 +96,7 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None: for logger in trainer.loggers: separator = logger.group_separator - prefixed_device_stats = _prefix_metric_keys( - device_stats, f"{self.__class__.__qualname__}.{key}", separator - ) + prefixed_device_stats = _prefix_metric_keys(device_stats, f"{self.__class__.__qualname__}.{key}", separator) logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def on_train_batch_start( From 5a7160055da56e8fb681ebfeff1282309ce703b9 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 4 May 2022 13:06:25 +0530 Subject: [PATCH 43/56] Address reviews --- pytorch_lightning/callbacks/device_stats_monitor.py | 9 +++------ tests/callbacks/test_device_stats_monitor.py | 3 ++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 160d169500dac..5eb847340e0dc 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -62,12 +62,11 @@ def setup( stage: Optional[str] = None, ) -> None: if not trainer.loggers: - raise MisconfigurationException( - "Cannot use `DeviceStatsMonitor` callback with `Trainer` that has no logger." - ) + raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") # warn in setup to warn once device = trainer.strategy.root_device if self.cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: + # TODO: raise an exception from v1.9 rank_zero_warn( "`DeviceStatsMonitor` will not log CPU stats as `psutil` is not installed." " To install `psutil`, run `pip install psutil`." @@ -96,9 +95,7 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None: for logger in trainer.loggers: separator = logger.group_separator - prefixed_device_stats = _prefix_metric_keys( - device_stats, f"{self.__class__.__qualname__}.{key}", separator - ) + prefixed_device_stats = _prefix_metric_keys(device_stats, f"{self.__class__.__qualname__}.{key}", separator) logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def on_train_batch_start( diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 5521eb3f020b7..be940f2c371c6 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -165,7 +165,7 @@ def test_device_stats_monitor_no_logger(tmpdir): enable_progress_bar=False, ) - with pytest.raises(MisconfigurationException, match="`Trainer` that has no logger."): + with pytest.raises(MisconfigurationException, match="Cannot use `DeviceStatsMonitor` callback."): trainer.fit(model) @@ -186,5 +186,6 @@ def test_device_stats_monitor_warning_when_psutil_not_available(monkeypatch): monitor = DeviceStatsMonitor() trainer = Trainer() assert trainer.strategy.root_device == torch.device("cpu") + # TODO: raise an exception from v1.9 with pytest.warns(UserWarning, match="psutil` is not installed"): monitor.setup(trainer, Mock()) From a3aa1f5d048e41695f22fbae0afdce8de9d1af3b Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 4 May 2022 20:16:35 +0530 Subject: [PATCH 44/56] Update pytorch_lightning/callbacks/device_stats_monitor.py Co-authored-by: Rohit Gupta --- pytorch_lightning/callbacks/device_stats_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 5eb847340e0dc..66d74d260f032 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -36,9 +36,9 @@ class DeviceStatsMonitor(Callback): Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. - It will raise a warning if `psutil` is not installed till version 1.9.0. + It will raise a warning if ``psutil`` is not installed till version 1.9.0. If ``True``, it will log CPU stats regardless of the accelerator, and it will - raise an exception if `psutil` is not installed. + raise an exception if ``psutil`` is not installed. If ``False``, it will not log CPU stats regardless of the accelerator. Raises: From 6d136fb83e941ed847926876c860f96445705990 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 4 May 2022 20:17:01 +0530 Subject: [PATCH 45/56] Update docs/source/tuning/profiler_basic.rst Co-authored-by: Rohit Gupta --- docs/source/tuning/profiler_basic.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tuning/profiler_basic.rst b/docs/source/tuning/profiler_basic.rst index 6dc365b4afd1b..c3ddc114dce9a 100644 --- a/docs/source/tuning/profiler_basic.rst +++ b/docs/source/tuning/profiler_basic.rst @@ -120,5 +120,5 @@ This can be measured with the :class:`~pytorch_lightning.callbacks.device_stats_ trainer = Trainer(callbacks=[DeviceStatsMonitor()]) -CPU metrics will be tracked by default regardless of the accelerator used. To disable logging +CPU metrics will be tracked by default on the CPU accelerator. To enable it for other accelerators set ``DeviceStatsMonitor(cpu_stats=True)``. To disable logging CPU metrics, you can specify ``DeviceStatsMonitor(cpu_stats=False)``. From 941c630a4109cbbe8e8eac8c9fb3f88c5398159c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 4 May 2022 18:50:57 +0200 Subject: [PATCH 46/56] Address Rohit's comments --- pytorch_lightning/accelerators/cpu.py | 25 +++++++++++- .../callbacks/device_stats_monitor.py | 5 ++- pytorch_lightning/utilities/memory.py | 22 ----------- tests/callbacks/test_device_stats_monitor.py | 38 ++++--------------- 4 files changed, 33 insertions(+), 57 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 2654c440b7491..0577fd2f529b3 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -18,7 +18,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.memory import get_cpu_process_metrics +from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE from pytorch_lightning.utilities.types import _DEVICE @@ -37,7 +37,7 @@ def setup_environment(self, root_device: torch.device) -> None: def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Get CPU stats from psutil.""" - return get_cpu_process_metrics() + return get_cpu_stats() @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> int: @@ -68,3 +68,24 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) + + +# CPU device metrics +_CPU_VM_PERCENT = "cpu_vm_percent" +_CPU_PERCENT = "cpu_percent" +_CPU_SWAP_PERCENT = "cpu_swap_percent" + + +def get_cpu_stats() -> Dict[str, float]: + if not _PSUTIL_AVAILABLE: + raise ModuleNotFoundError( + "Fetching CPU device stats requires `psutil` to be installed." + " Install it by running `pip install -U psutil`." + ) + import psutil + + return { + _CPU_VM_PERCENT: psutil.virtual_memory().percent, + _CPU_PERCENT: psutil.cpu_percent(), + _CPU_SWAP_PERCENT: psutil.swap_memory().percent, + } diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 66d74d260f032..18b5988c0df97 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -24,7 +24,6 @@ from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE -from pytorch_lightning.utilities.memory import get_cpu_process_metrics from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -91,7 +90,9 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None: if self.cpu_stats and device.type != "cpu": # Don't query CPU stats twice if CPU is accelerator - device_stats.update(get_cpu_process_metrics()) + from pytorch_lightning.accelerators.cpu import get_cpu_stats + + device_stats.update(get_cpu_stats()) for logger in trainer.loggers: separator = logger.group_separator diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 7302a30867fd2..f42f662d2fc65 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -24,7 +24,6 @@ from torch.nn import Module from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any: @@ -142,24 +141,3 @@ def get_model_size_mb(model: Module) -> float: torch.save(model.state_dict(), model_size) size_mb = model_size.getbuffer().nbytes / 1e6 return size_mb - - -# CPU device metrics -_CPU_VM_PERCENT = "cpu_vm_percent" -_CPU_PERCENT = "cpu_percent" -_CPU_SWAP_PERCENT = "cpu_swap_percent" - - -def get_cpu_process_metrics() -> Dict[str, float]: - if not _PSUTIL_AVAILABLE: - raise ModuleNotFoundError( - "Fetching CPU device stats requires `psutil` to be installed." - " Install it by running `pip install -U psutil`." - ) - import psutil - - return { - _CPU_VM_PERCENT: psutil.virtual_memory().percent, - _CPU_PERCENT: psutil.cpu_percent(), - _CPU_SWAP_PERCENT: psutil.swap_memory().percent, - } diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index be940f2c371c6..23db60063a3d0 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -19,11 +19,11 @@ import torch from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.cpu import _CPU_PERCENT, _CPU_SWAP_PERCENT, _CPU_VM_PERCENT, get_cpu_stats from pytorch_lightning.callbacks import DeviceStatsMonitor from pytorch_lightning.callbacks.device_stats_monitor import _prefix_metric_keys from pytorch_lightning.loggers import CSVLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.memory import _CPU_PERCENT, _CPU_SWAP_PERCENT, _CPU_VM_PERCENT from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -64,18 +64,15 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> @RunIf(psutil=True) @pytest.mark.parametrize("cpu_stats", (None, True, False)) -def test_device_stats_cpu(tmpdir, cpu_stats): +@mock.patch("pytorch_lightning.accelerators.cpu.get_cpu_stats", side_effect=get_cpu_stats) +def test_device_stats_cpu(cpu_stats_mock, tmpdir, cpu_stats): """Test CPU stats are logged when no accelerator is used.""" model = BoringModel() - CPU_METRIC_KEYS = [ - _CPU_VM_PERCENT, - _CPU_SWAP_PERCENT, - _CPU_PERCENT, - ] + CPU_METRIC_KEYS = (_CPU_VM_PERCENT, _CPU_SWAP_PERCENT, _CPU_PERCENT) class DebugLogger(CSVLogger): def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - enabled = cpu_stats or cpu_stats is None + enabled = cpu_stats is not False for f in CPU_METRIC_KEYS: has_cpu_metrics = any(f in h for h in metrics) assert has_cpu_metrics if enabled else not has_cpu_metrics @@ -95,29 +92,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> ) trainer.fit(model) - -@RunIf(psutil=True) -@mock.patch("pytorch_lightning.accelerators.cpu.get_cpu_process_metrics") -@mock.patch("pytorch_lightning.callbacks.device_stats_monitor.get_cpu_process_metrics") -def test_device_stats_cpu_queried_once(cpu_metrics_device_stats_mock, cpu_metrics_cpu_accelerator_mock, tmpdir): - """Make sure that get_cpu_process_metrics is only queried once if the accelerator is CPU and cpu_stats=True.""" - model = BoringModel() - device_stats = DeviceStatsMonitor(cpu_stats=True) - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=1, - limit_val_batches=0, - log_every_n_steps=1, - callbacks=device_stats, - logger=True, - enable_checkpointing=False, - enable_progress_bar=False, - ) - trainer.fit(model) - - assert cpu_metrics_device_stats_mock.call_count == 0 # called inside DeviceStatsMonitor - assert cpu_metrics_cpu_accelerator_mock.call_count == 2 # called inside CPUAccelerator + expected = 4 if cpu_stats is not False else 0 # (batch_start + batch_end) * train_batches + assert cpu_stats_mock.call_count == expected @RunIf(tpu=True) From fd45c8d827af19bf92568e12ae02ee636cbb1489 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 5 May 2022 13:51:41 +0530 Subject: [PATCH 47/56] Address reviews --- .../callbacks/device_stats_monitor.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 18b5988c0df97..ca1fd528f8616 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -52,7 +52,7 @@ class DeviceStatsMonitor(Callback): """ def __init__(self, cpu_stats: Optional[bool] = None) -> None: - self.cpu_stats = cpu_stats + self._cpu_stats = cpu_stats def setup( self, @@ -64,31 +64,27 @@ def setup( raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") # warn in setup to warn once device = trainer.strategy.root_device - if self.cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: + if self._cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: # TODO: raise an exception from v1.9 rank_zero_warn( "`DeviceStatsMonitor` will not log CPU stats as `psutil` is not installed." " To install `psutil`, run `pip install psutil`." " It will raise an exception if `psutil` is not installed post v1.9.0." ) + self._cpu_stats = False def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None: if not trainer._logger_connector.should_update_logs: return - if not trainer.loggers: - raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") device = trainer.strategy.root_device - if self.cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: - # we just warn for now - return - if self.cpu_stats is False and device.type == "cpu": + if self._cpu_stats is False and device.type == "cpu": # cpu stats are disabled return device_stats = trainer.accelerator.get_device_stats(device) - if self.cpu_stats and device.type != "cpu": + if self._cpu_stats and device.type != "cpu": # Don't query CPU stats twice if CPU is accelerator from pytorch_lightning.accelerators.cpu import get_cpu_stats From 941315e68f32ec5cb0a7bf461b56c7a1cfc0c01f Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 5 May 2022 16:11:07 +0530 Subject: [PATCH 48/56] Update pytorch_lightning/callbacks/device_stats_monitor.py Co-authored-by: Rohit Gupta --- pytorch_lightning/callbacks/device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index ca1fd528f8616..4d5280e67d578 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -35,7 +35,7 @@ class DeviceStatsMonitor(Callback): Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. - It will raise a warning if ``psutil`` is not installed till version 1.9.0. + It will raise a warning if ``psutil`` is not installed till v1.9.0. If ``True``, it will log CPU stats regardless of the accelerator, and it will raise an exception if ``psutil`` is not installed. If ``False``, it will not log CPU stats regardless of the accelerator. From 25b8c5edaa70b5282566e30fe438443d0017b978 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 5 May 2022 16:11:15 +0530 Subject: [PATCH 49/56] Update pytorch_lightning/accelerators/cpu.py Co-authored-by: Rohit Gupta --- pytorch_lightning/accelerators/cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 0577fd2f529b3..fea8ee70d17df 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -36,7 +36,7 @@ def setup_environment(self, root_device: torch.device) -> None: raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: - """Get CPU stats from psutil.""" + """Get CPU stats from ``psutil`` package.""" return get_cpu_stats() @staticmethod From b392415efba8fde8b7ef844f9b0ee43139234467 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 5 May 2022 17:47:18 +0530 Subject: [PATCH 50/56] FIx tpu tests --- tests/callbacks/test_device_stats_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 23db60063a3d0..c96c115162430 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -113,9 +113,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - limit_train_batches=1, + limit_train_batches=2, accelerator="tpu", - devices=8, + devices=1, log_every_n_steps=1, callbacks=[device_stats], logger=DebugLogger(tmpdir), From 7f64889bc3359ecc5e91bf3770ec2e6059150ee4 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 6 May 2022 14:21:20 +0530 Subject: [PATCH 51/56] TPU test --- tests/callbacks/test_device_stats_monitor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index c96c115162430..1287c5807ce6d 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -96,6 +96,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> assert cpu_stats_mock.call_count == expected +@pytest.mark.skipif(True, reason="TODO (@kaushikb11): fix this test, timeout") @RunIf(tpu=True) def test_device_stats_monitor_tpu(tmpdir): """Test TPU stats are logged using a logger.""" From f06cc077172a19c06ebbf5fda51cb3f2c86e700b Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 9 May 2022 14:11:29 +0530 Subject: [PATCH 52/56] only on fit --- pytorch_lightning/callbacks/device_stats_monitor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index e05a722091837..00fd79d0f7ccf 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -60,8 +60,12 @@ def setup( pl_module: "pl.LightningModule", stage: Optional[str] = None, ) -> None: + if stage != "fit": + return + if not trainer.loggers: raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.") + # warn in setup to warn once device = trainer.strategy.root_device if self._cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE: From 2ef65557d6d58bf2196255ea6b7a5a9672a81919 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 9 May 2022 15:53:09 +0530 Subject: [PATCH 53/56] Fix test --- tests/callbacks/test_device_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 1287c5807ce6d..35fa91698b591 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -165,4 +165,4 @@ def test_device_stats_monitor_warning_when_psutil_not_available(monkeypatch): assert trainer.strategy.root_device == torch.device("cpu") # TODO: raise an exception from v1.9 with pytest.warns(UserWarning, match="psutil` is not installed"): - monitor.setup(trainer, Mock()) + monitor.setup(trainer, Mock(), "fit") From f394a1e4a43b13080c60957dba6a99f41a7dc58e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 9 May 2022 16:26:37 +0530 Subject: [PATCH 54/56] Fix tpu tests --- pytorch_lightning/trainer/trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index cd32003229a30..7561587c2c635 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1319,7 +1319,7 @@ def _run_evaluate(self) -> _EVALUATE_OUTPUT: # reset trainer on this loop and all child loops in case user connected a custom loop self._evaluation_loop.trainer = self - with self.profiler.profile(f"run_{self.state.stage}_evaluation"), _evaluation_context(): + with self.profiler.profile(f"run_{self.state.stage}_evaluation"), _evaluation_context(self.accelerator): eval_loop_results = self._evaluation_loop.run() # remove the tensors from the eval results @@ -1335,7 +1335,7 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]: self.reset_predict_dataloader(self.lightning_module) # reset trainer on this loop and all child loops in case user connected a custom loop self.predict_loop.trainer = self - with _evaluation_context(): + with _evaluation_context(self.accelerator): return self.predict_loop.run() def _run_sanity_check(self) -> None: @@ -2801,11 +2801,13 @@ def configure_optimizers(self): @contextmanager -def _evaluation_context() -> Generator: +def _evaluation_context(accelerator: Accelerator) -> Generator: # inference mode is not supported with gloo backend (#9431) context_manager_class = ( torch.inference_mode - if _TORCH_GREATER_EQUAL_1_9 and not (dist.is_initialized() and dist.get_backend() == "gloo") + if _TORCH_GREATER_EQUAL_1_9 + and not (dist.is_initialized() and dist.get_backend() == "gloo") + and not isinstance(accelerator, TPUAccelerator) else torch.no_grad ) with context_manager_class(): From c17bf699a4bcb537a1913e2301aaedca001739a7 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 10 May 2022 06:31:38 +0530 Subject: [PATCH 55/56] Fix tests --- tests/callbacks/test_device_stats_monitor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 35fa91698b591..27bc6c1b6b289 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -96,8 +96,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> assert cpu_stats_mock.call_count == expected -@pytest.mark.skipif(True, reason="TODO (@kaushikb11): fix this test, timeout") -@RunIf(tpu=True) +@pytest.mark.skipif(True, reason="TODO (@kaushikb11): fix this test for TPUs, timeout") def test_device_stats_monitor_tpu(tmpdir): """Test TPU stats are logged using a logger.""" From a052a384f8a16f343f7e008ebd9e1ea1796dc513 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 10 May 2022 06:35:47 +0530 Subject: [PATCH 56/56] Fix tests --- dockers/tpu-tests/tpu_test_cases.jsonnet | 2 +- tests/callbacks/test_device_stats_monitor.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index ea7d5b6e09a97..c474ea6641076 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -32,12 +32,12 @@ local tputests = base.BaseTest { pip install -e . echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" + # TODO (@kaushikb11): Add device stats tests here coverage run --source=pytorch_lightning -m pytest -v --capture=no \ tests/strategies/test_tpu_spawn.py \ tests/profiler/test_xla_profiler.py \ pytorch_lightning/utilities/xla_device.py \ tests/accelerators/test_tpu.py \ - tests/callbacks/test_device_stats_monitor.py \ tests/models/test_tpu.py test_exit_code=$? echo "\n||| END PYTEST LOGS |||\n" diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 27bc6c1b6b289..35fa91698b591 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -96,7 +96,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> assert cpu_stats_mock.call_count == expected -@pytest.mark.skipif(True, reason="TODO (@kaushikb11): fix this test for TPUs, timeout") +@pytest.mark.skipif(True, reason="TODO (@kaushikb11): fix this test, timeout") +@RunIf(tpu=True) def test_device_stats_monitor_tpu(tmpdir): """Test TPU stats are logged using a logger."""