Skip to content

Commit 904536b

Browse files
findkim
authored and jubick1337 committed
Fix race condition when executing with multi-node where some ranks do not wait for setup (#7016)
Signed-off-by: Kim Ngo <[email protected]>
Signed-off-by: jubick1337 <[email protected]>
1 parent f7e33fc commit 904536b

File tree

1 file changed

+10
-11
lines changed

1 file changed

+10
-11
lines changed

nemo/collections/nlp/modules/common/megatron/megatron_utils.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@
1414
# limitations under the License.
1515

1616
import os
17+
import shutil
1718
from typing import Dict, List
1819

1920
import torch
2021
import wget
2122
from torch.hub import _get_torch_home
2223

23-
from nemo.utils import logging
24+
from nemo.utils import get_rank, logging
2425

2526
__all__ = [
2627
"get_megatron_lm_model",
@@ -202,16 +203,14 @@ def _download(path: str, url: str):
202203
if url is None:
203204
return None
204205

205-
if not os.path.exists(path):
206-
master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
207-
if not os.path.exists(path):
208-
if master_device:
209-
os.makedirs(MEGATRON_CACHE, exist_ok=True)
210-
logging.info(f"Downloading from {url}")
211-
wget.download(url, path)
212-
# wait until the master process downloads the file and writes it to the cache dir
213-
if torch.distributed.is_initialized():
214-
torch.distributed.barrier()
206+
if get_rank.is_global_rank_zero() and not os.path.exists(path):
207+
os.makedirs(MEGATRON_CACHE, exist_ok=True)
208+
logging.info(f"Downloading from {url} to {path}")
209+
downloaded_path = wget.download(url)
210+
shutil.move(downloaded_path, path)
211+
# wait until the master process downloads the file and writes it to the cache dir
212+
if torch.distributed.is_initialized():
213+
torch.distributed.barrier()
215214

216215
return path
217216

0 commit comments

Comments
 (0)