
Commit cb83655

Renamed parallelize_plan to tensor_parallel_plan (#2387)
1 parent: f67ccda


9 files changed (+15, -13 lines)


recipes/configs/llama3/70B_full.yaml

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ output_dir: /tmp/torchtune/llama3_70B/full # /tmp may be deleted by your system.
 
 # Parallelism
 tensor_parallel_dim: 1
-parallelize_plan:
+tensor_parallel_plan:
   _component_: torchtune.models.llama3.base_llama_tp_plan
 
 # Tokenizer
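
For context on what the renamed key points at: `torchtune.models.llama3.base_llama_tp_plan` resolves to a tensor parallel plan, i.e. a mapping from module names inside the model to `ParallelStyle` objects from torch.distributed. The sketch below is illustrative only; the module names are placeholders, not the ones defined by `base_llama_tp_plan`.

```python
# Illustrative sketch of what a tensor parallel plan is: a dict mapping
# module names to ParallelStyle objects. The names below are placeholders,
# not the ones used by torchtune's base_llama_tp_plan.
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel


def example_tp_plan() -> dict:
    return {
        "tok_embeddings": RowwiseParallel(),  # shard the embedding table
        "output": ColwiseParallel(),          # shard the final projection
        # ...attention and MLP projections would be listed here as well
    }
```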

recipes/configs/llama3/70B_generation_distributed.yaml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ output_dir: ./
 model:
   _component_: torchtune.models.llama3.llama3_70b
 
-parallelize_plan:
+tensor_parallel_plan:
   _component_: torchtune.models.llama3.base_llama_tp_plan
 
 # Transform arguments

recipes/configs/llama3_1/70B_full.yaml

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ output_dir: /tmp/torchtune/llama3_1_70B/full # /tmp may be deleted by your system.
 
 # Parallelism
 tensor_parallel_dim: 1
-parallelize_plan:
+tensor_parallel_plan:
   _component_: torchtune.models.llama3.base_llama_tp_plan
 
 # Tokenizer

recipes/configs/llama3_1/70B_generation_distributed.yaml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ output_dir: ./
 model:
   _component_: torchtune.models.llama3_1.llama3_1_70b
 
-parallelize_plan:
+tensor_parallel_plan:
   _component_: torchtune.models.llama3.base_llama_tp_plan
 
 # Transform arguments

recipes/configs/llama3_3/70B_full.yaml

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ output_dir: /tmp/torchtune/llama3_3_70B/full # /tmp may be deleted by your system.
 
 # Parallelism
 tensor_parallel_dim: 1
-parallelize_plan:
+tensor_parallel_plan:
   _component_: torchtune.models.llama3.base_llama_tp_plan
 
 # Tokenizer

recipes/configs/llama3_3/70B_generation_distributed.yaml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ output_dir: ./
 model:
   _component_: torchtune.models.llama3_3.llama3_3_70b
 
-parallelize_plan:
+tensor_parallel_plan:
   _component_: torchtune.models.llama3.base_llama_tp_plan
 
 # Transform arguments

recipes/dev/generate_v2_distributed.py

Lines changed: 1 addition & 1 deletion

@@ -111,7 +111,7 @@ def setup(self, cfg: DictConfig) -> None:
         parallelize_module(
             model,
             tp_device_mesh,
-            parallelize_plan=config.instantiate(cfg.parallelize_plan),
+            parallelize_plan=config.instantiate(cfg.tensor_parallel_plan),
         )
 
         with training.set_default_dtype(self._dtype), self._device:
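
The call above follows the standard torch.distributed pattern: instantiate the plan from the config and hand it to `parallelize_module` over a "tp" device mesh. A minimal standalone sketch (not the recipe itself), assuming torch.distributed is already initialized, e.g. by torchrun:

```python
# Minimal sketch of the pattern used above. Assumes torch.distributed is
# already initialized and `tp_plan` is the instantiated tensor_parallel_plan.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import parallelize_module


def apply_tensor_parallel(
    model: torch.nn.Module, tp_size: int, tp_plan: dict
) -> torch.nn.Module:
    # Build a 1-D tensor-parallel mesh and shard the model according to the plan.
    tp_mesh = init_device_mesh("cuda", (tp_size,), mesh_dim_names=("tp",))
    return parallelize_module(model, tp_mesh, parallelize_plan=tp_plan)
```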

recipes/full_finetune_distributed.py

Lines changed: 6 additions & 4 deletions

@@ -145,11 +145,13 @@ def __init__(self, cfg: DictConfig) -> None:
         # Initialize distributed variables
         self.world_size, self.rank = utils.get_world_size_and_rank()
         self._is_rank_zero = self.rank == 0
-        self.parallelize_plan = config.instantiate(cfg.get("parallelize_plan", None))
+        self.tensor_parallel_plan = config.instantiate(
+            cfg.get("tensor_parallel_plan", None)
+        )
         self.tensor_parallel_dim = cfg.get("tensor_parallel_dim", 1)
-        if self.tensor_parallel_dim > 1 and self.parallelize_plan is None:
+        if self.tensor_parallel_dim > 1 and self.tensor_parallel_plan is None:
             raise ValueError(
-                "Parallelism plan need to be provided when tensor parallel is enabled."
+                "Tensor Parallel plan needs to be provided when tensor parallel is enabled."
             )
         if self.world_size % self.tensor_parallel_dim != 0:
             raise ValueError(

@@ -549,7 +551,7 @@ def _setup_model(
         parallelize_module(
             model,
             device_mesh["tp"],
-            parallelize_plan=self.parallelize_plan,
+            parallelize_plan=self.tensor_parallel_plan,
         )
 
         # We currently have two versions of activation checkpointing in this recipe
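
The constructor change also tightens the guard: a plan is required whenever `tensor_parallel_dim > 1`, and the world size must divide evenly by the TP dimension. A standalone sketch of that validation, assuming an OmegaConf-style config like the YAML files above; `instantiate_plan` is a hypothetical stand-in for torchtune's `config.instantiate`, and the second error message is illustrative:

```python
# Standalone sketch of the guard performed in __init__. `instantiate_plan`
# stands in for torchtune's config.instantiate (hypothetical here); the
# second error message is illustrative, not the recipe's exact wording.
from omegaconf import DictConfig


def validate_tp_settings(cfg: DictConfig, world_size: int, instantiate_plan) -> None:
    tp_plan = instantiate_plan(cfg.get("tensor_parallel_plan", None))
    tp_dim = cfg.get("tensor_parallel_dim", 1)
    if tp_dim > 1 and tp_plan is None:
        raise ValueError(
            "Tensor Parallel plan needs to be provided when tensor parallel is enabled."
        )
    if world_size % tp_dim != 0:
        raise ValueError("world_size must be divisible by tensor_parallel_dim")
```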

tests/recipes/test_full_finetune_distributed.py

Lines changed: 2 additions & 2 deletions

@@ -156,7 +156,7 @@ def test_loss_2d_parallel(
         tokenizer_path = Path(TOKENIZER_PATHS[model_type])
         ckpt_dir = ckpt_path.parent
         log_file = gen_log_file_name(tmpdir)
-        parallelize_plan = "torchtune.models.llama3.base_llama_tp_plan"
+        tp_plan = "torchtune.models.llama3.base_llama_tp_plan"
 
         # Config file needed for model conversion.
         write_hf_ckpt_config(ckpt_dir)

@@ -175,7 +175,7 @@
            tokenizer.path='{tokenizer_path}' \
            tokenizer.prompt_template=null \
            tensor_parallel_dim={tensor_parallel_dim} \
-           parallelize_plan._component_={parallelize_plan} \
+           tensor_parallel_plan._component_={tp_plan} \
            metric_logger.filename={log_file} \
        """.split()
        model_config = MODEL_TEST_CONFIGS[model_type]
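
The test exercises the rename through a dotted command-line override, `tensor_parallel_plan._component_=...`. The snippet below only illustrates how such dotted keys map onto a nested config, using OmegaConf's `from_dotlist`; torchtune's own CLI parsing may differ.

```python
# Illustration of the dotted-key convention used in the test override,
# via OmegaConf.from_dotlist. torchtune's actual CLI parsing may differ.
from omegaconf import OmegaConf

overrides = OmegaConf.from_dotlist(
    [
        "tensor_parallel_dim=2",
        "tensor_parallel_plan._component_=torchtune.models.llama3.base_llama_tp_plan",
    ]
)
print(OmegaConf.to_yaml(overrides))
# tensor_parallel_dim: 2
# tensor_parallel_plan:
#   _component_: torchtune.models.llama3.base_llama_tp_plan
```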
