
Commit 00d2ddf

qgallouedec authored and zaristei committed
[Trackio] Allow single-gpu training and monitor power (huggingface#39595)
Allow non-distributed (single-GPU) runs and monitor GPU power draw
1 parent ed2e52f commit 00d2ddf
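What the change does, as a minimal standalone sketch (this is not the TrackioCallback code itself; the function name collect_gpu_logs is illustrative): per-GPU memory and power stats are collected whenever CUDA is available, and the cross-rank gather only happens when a torch.distributed process group is actually initialized, so plain single-GPU runs are no longer excluded.

import torch
import torch.distributed as dist


def collect_gpu_logs() -> dict:
    """Collect memory and power stats for the current GPU; gather across ranks if distributed."""
    if not torch.cuda.is_available():
        return {}

    device_idx = torch.cuda.current_device()
    total_memory = torch.cuda.get_device_properties(device_idx).total_memory
    memory_allocated = torch.cuda.memory_allocated(device_idx)
    power = torch.cuda.power_draw(device_idx)  # reported in milliwatts

    gpu_logs = {
        f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
        f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
        f"gpu/{device_idx}/power": power / 1000,  # Watts
    }

    # Gather only when a process group exists, so single-GPU (non-distributed)
    # runs still produce logs instead of being skipped.
    if dist.is_available() and dist.is_initialized():
        gathered = [None] * dist.get_world_size()
        dist.all_gather_object(gathered, gpu_logs)
        gpu_logs = {k: v for d in gathered for k, v in d.items()}
    return gpu_logs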

File tree

1 file changed (+9, −6 lines changed)


src/transformers/integrations/integration_utils.py

Lines changed: 9 additions & 6 deletions

@@ -1115,19 +1115,22 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs):
             "total_flos",
         ]

-        if is_torch_available() and torch.cuda.is_available() and dist.is_available() and dist.is_initialized():
+        if is_torch_available() and torch.cuda.is_available():
             device_idx = torch.cuda.current_device()
             total_memory = torch.cuda.get_device_properties(device_idx).total_memory
             memory_allocated = torch.cuda.memory_allocated(device_idx)
-
+            power = torch.cuda.power_draw(device_idx)
             gpu_memory_logs = {
                 f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
                 f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
+                f"gpu/{device_idx}/power": power / 1000,  # Watts
             }
-
-            gathered_logs = [None] * dist.get_world_size()
-            dist.all_gather_object(gathered_logs, gpu_memory_logs)
-            gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}
+            if dist.is_available() and dist.is_initialized():
+                gathered_logs = [None] * dist.get_world_size()
+                dist.all_gather_object(gathered_logs, gpu_memory_logs)
+                gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}
+        else:
+            gpu_memory_logs = {}

         if not self._initialized:
             self.setup(args, state, model)
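For illustration only (the values below are made up), here is how the final dict comprehension flattens the list produced by dist.all_gather_object — one dict per rank — into the single mapping that gets logged:

gathered_logs = [
    {"gpu/0/allocated_memory": 12.3, "gpu/0/power": 215.0},  # hypothetical rank 0
    {"gpu/1/allocated_memory": 11.8, "gpu/1/power": 203.0},  # hypothetical rank 1
]
gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}
print(gpu_memory_logs)
# {'gpu/0/allocated_memory': 12.3, 'gpu/0/power': 215.0,
#  'gpu/1/allocated_memory': 11.8, 'gpu/1/power': 203.0}

In the non-distributed case the gather step is skipped and the single rank's dict is logged as-is.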
