Merged
Changes from 11 commits
1 change: 0 additions & 1 deletion docs/source-pytorch/accelerators/hpu_basic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,4 @@ Known limitations
-----------------

* `Habana dataloader <https://docs.habana.ai/en/latest/PyTorch_User_Guide/PyTorch_User_Guide.html#habana-data-loader>`__ is not supported.
* :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is not supported.
* :func:`torch.inference_mode` is not supported.
31 changes: 31 additions & 0 deletions docs/source-pytorch/accelerators/hpu_intermediate.rst
Expand Up @@ -66,3 +66,34 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins
trainer.fit(model, datamodule=dm)

For more details, please refer to `PyTorch Mixed Precision Training on Gaudi <https://docs.habana.ai/en/latest/PyTorch_User_Guide/PyTorch_User_Guide.html#pytorch-mixed-precision-training-on-gaudi>`__.

----

Enabling DeviceStatsMonitor with HPUs
----------------------------------------

:class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during training.
This callback can be passed to the ``Trainer`` when training with HPUs. It returns a mapping of the following metrics to their values:

* **Limit**: total amount of memory on the HPU device.
* **InUse**: amount of memory currently allocated.
* **MaxInUse**: maximum amount of active memory allocated.
* **NumAllocs**: number of allocations.
* **NumFrees**: number of freed chunks.
* **ActiveAllocs**: number of active allocations.
* **MaxAllocSize**: maximum allocated size.
* **TotalSystemAllocs**: total number of system allocations.
* **TotalSystemFrees**: total number of system frees.
* **TotalActiveAllocs**: total number of active allocations.

The snippet below shows how ``DeviceStatsMonitor`` can be enabled:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import DeviceStatsMonitor

    device_stats = DeviceStatsMonitor()
    trainer = Trainer(accelerator="hpu", callbacks=[device_stats])

For more details, please refer to `Memory Stats APIs <https://docs.habana.ai/en/v1.5.0/PyTorch/PyTorch_User_Guide/Python_Packages.html#memory-stats-apis>`__.
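To make the metric names above concrete, the sketch below iterates over a stand-in stats mapping shaped like the one ``get_device_stats`` returns. The dictionary and every value in it are invented purely for illustration; on real hardware you would call ``HPUAccelerator().get_device_stats("hpu")`` instead:

```python
# Illustrative only: a placeholder mapping resembling what
# HPUAccelerator.get_device_stats("hpu") returns on real hardware.
# All values below are invented for this example.
sample_stats = {
    "Limit": 32 * 1024**3,    # total device memory in bytes (hypothetical)
    "InUse": 4 * 1024**3,     # memory currently allocated (hypothetical)
    "MaxInUse": 6 * 1024**3,  # peak allocated memory (hypothetical)
    "NumAllocs": 120,
    "NumFrees": 80,
    "ActiveAllocs": 40,       # NumAllocs - NumFrees
    "MaxAllocSize": 1024**2,
    "TotalSystemAllocs": 150,
    "TotalSystemFrees": 90,
    "TotalActiveAllocs": 60,  # TotalSystemAllocs - TotalSystemFrees
}

# A stats-monitoring callback would log each metric under its key:
for name, value in sample_stats.items():
    print(f"{name}: {value}")
```

A logger attached to the ``Trainer`` would record these key/value pairs each time the callback fires during training.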
3 changes: 3 additions & 0 deletions src/pytorch_lightning/CHANGELOG.md
Expand Up @@ -111,6 +111,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added support for async checkpointing ([#13658](https://github.com/Lightning-AI/lightning/pull/13658))


- Added support for HPU Device stats monitor ([#13819](https://github.com/Lightning-AI/lightning/pull/13819))


### Changed

- `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642))
Expand Down
21 changes: 18 additions & 3 deletions src/pytorch_lightning/accelerators/hpu.py
Expand Up @@ -39,9 +39,24 @@ def setup_environment(self, root_device: torch.device) -> None:
raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.")

    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
        """HPU device stats aren't supported yet."""
        rank_zero_debug("HPU device stats aren't supported yet.")
        return {}
        """Returns a map of the following metrics with their values:

        Limit,
        InUse,
        MaxInUse,
        NumAllocs,
        NumFrees,
        ActiveAllocs,
        MaxAllocSize,
        TotalSystemAllocs,
        TotalSystemFrees,
        TotalActiveAllocs
        """
        try:
            return torch_hpu.hpu.memory_stats(device)
        except (AttributeError, NameError):
            rank_zero_debug("HPU `get_device_stats` failed")
            return {}

@staticmethod
def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]:
Expand Down
20 changes: 20 additions & 0 deletions tests/tests_pytorch/accelerators/test_hpu.py
Expand Up @@ -303,3 +303,23 @@ def training_epoch_end(self, outputs) -> None:
trainer.fit(model)

assert all(model.optims)


@RunIf(hpu=True)
def test_hpu_device_stats_monitor(tmpdir):
    hpu_stats = HPUAccelerator().get_device_stats("hpu")
    fields = [
        "Limit",
        "InUse",
        "MaxInUse",
        "NumAllocs",
        "NumFrees",
        "ActiveAllocs",
        "MaxAllocSize",
        "TotalSystemAllocs",
        "TotalSystemFrees",
        "TotalActiveAllocs",
    ]
    for f in fields:
        assert any(f in h for h in hpu_stats.keys())