Skip to content

Commit 72d52c2

Browse files
committed
feat: support new tp refactor for training
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
1 parent d7c741a commit 72d52c2

File tree

10 files changed

+23
-67
lines changed

10 files changed

+23
-67
lines changed

src/accelerate/accelerator.py

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -374,9 +374,7 @@ def __init__(
374374
if not is_torch_version(">=", FSDP_PYTORCH_VERSION):
375375
raise ValueError(f"FSDP requires PyTorch >= {FSDP_PYTORCH_VERSION}")
376376

377-
if os.environ.get("ACCELERATE_USE_TP", "false") == "true" or isinstance(
378-
torch_tp_plugin, TorchTensorParallelPlugin
379-
):
377+
if isinstance(torch_tp_plugin, TorchTensorParallelPlugin):
380378
if not is_torch_version(">=", BETA_TP_AVAILABLE_PYTORCH_VERSION):
381379
raise ValueError(f"TP requires PyTorch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}")
382380

@@ -396,14 +394,8 @@ def __init__(
396394
if not is_torch_version(">=", FSDP2_PYTORCH_VERSION):
397395
raise ImportError(f"FSDP2 requires PyTorch >= {FSDP2_PYTORCH_VERSION}")
398396

399-
if torch_tp_plugin is None:
400-
torch_tp_plugin = (
401-
TorchTensorParallelPlugin() if os.environ.get("ACCELERATE_USE_TP", "false") == "true" else None
402-
)
403-
else:
404-
if not isinstance(torch_tp_plugin, TorchTensorParallelPlugin):
405-
raise TypeError("`torch_tp_plugin` must be a TorchTensorParallelPlugin object.")
406-
os.environ["ACCELERATE_USE_TP"] = "true"
397+
if torch_tp_plugin is not None and not isinstance(torch_tp_plugin, TorchTensorParallelPlugin):
398+
raise TypeError("`torch_tp_plugin` must be a TorchTensorParallelPlugin object.")
407399

408400
if megatron_lm_plugin is None: # init from env variables
409401
megatron_lm_plugin = (
@@ -1600,15 +1592,14 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
16001592
if self.ddp_handler is not None:
16011593
self.ddp_handler.register_comm_hook(model)
16021594
elif self.distributed_type == DistributedType.TP:
1595+
if not compare_versions("transformers", ">=", BETA_TP_AVAILABLE_TRANSFORMERS_VERSION):
1596+
raise ValueError(f"TP requires transformers >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}")
16031597
if hasattr(model, "supports_tp_plan") and not model.supports_tp_plan:
1604-
if not compare_versions("transformers", ">=", BETA_TP_AVAILABLE_TRANSFORMERS_VERSION):
1605-
raise ValueError(f"TP requires transformers >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}")
16061598
raise NotImplementedError(
16071599
"Provided model does not support tensor parallelism. \
16081600
Tensor parallelism plan can be added as base_model_tp_plan to model config class \
16091601
and _tp_plan attribute to model class."
16101602
)
1611-
model.tensor_parallel(self.state.torch_tp_plugin.torch_device_mesh["tp"])
16121603
elif self.is_fsdp2:
16131604
model = fsdp2_prepare_model(self, model)
16141605

@@ -2225,8 +2216,7 @@ def _prepare_device_mesh(self):
22252216
return self.state.torch_tp_plugin.torch_device_mesh
22262217
elif self.distributed_type == DistributedType.DEEPSPEED and hasattr(self.state, "ds_device_mesh"):
22272218
return self.state.ds_device_mesh
2228-
else:
2229-
return None
2219+
return None
22302220

22312221
def _prepare_msamp(self, *args, device_placement):
22322222
if not is_msamp_available():

src/accelerate/commands/config/cluster.py

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ def get_cluster_input():
382382
)
383383

384384
fsdp_config = {}
385-
tp_config = {}
385+
386386
if distributed_type in [
387387
DistributedType.MULTI_GPU,
388388
DistributedType.MULTI_NPU,
@@ -510,21 +510,7 @@ def get_cluster_input():
510510
default=False,
511511
error_message="Please enter yes or no.",
512512
)
513-
if not use_fsdp:
514-
use_tp = _ask_field(
515-
"Do you want to use TensorParallel? [yes/NO]: ",
516-
_convert_yes_no_to_bool,
517-
default=False,
518-
error_message="Please enter yes or no.",
519-
)
520-
if use_tp:
521-
distributed_type = DistributedType.TP
522-
if distributed_type == DistributedType.TP:
523-
tp_config["tp_size"] = _ask_field(
524-
"What should be your Tensor Parallel degree? [1]: ",
525-
int,
526-
default=1,
527-
)
513+
528514
megatron_lm_config = {}
529515
if distributed_type in [DistributedType.MULTI_GPU]:
530516
use_megatron_lm = _ask_field(
@@ -863,7 +849,6 @@ def get_cluster_input():
863849
fp8_config=fp8_config,
864850
deepspeed_config=deepspeed_config,
865851
fsdp_config=fsdp_config,
866-
tp_config=tp_config,
867852
megatron_lm_config=megatron_lm_config,
868853
ipex_config=ipex_config,
869854
mpirun_config=mpirun_config,

src/accelerate/commands/config/config_args.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,6 @@ class ClusterConfig(BaseConfig):
194194
deepspeed_config: dict = None
195195
# args for fsdp
196196
fsdp_config: dict = None
197-
# args for tp
198-
tp_config: dict = None
199197
# args for megatron_lm
200198
megatron_lm_config: dict = None
201199
# args for ipex

src/accelerate/commands/launch.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@
7575
"tpu": "TPU",
7676
"use_deepspeed": "DeepSpeed Arguments",
7777
"use_fsdp": "FSDP Arguments",
78-
"use_tp": "PyTorch TP Arguments",
7978
"use_megatron_lm": "Megatron-LM Arguments",
8079
"fp8_backend": "FP8 Arguments",
8180
}
@@ -264,12 +263,6 @@ def launch_command_parser(subparsers=None):
264263
action="store_true",
265264
help="Whether to use fsdp.",
266265
)
267-
paradigm_args.add_argument(
268-
"--use_tp",
269-
default=False,
270-
action="store_true",
271-
help="Whether to use PyTorch TP.",
272-
)
273266
paradigm_args.add_argument(
274267
"--use_megatron_lm",
275268
default=False,
@@ -611,15 +604,6 @@ def launch_command_parser(subparsers=None):
611604
help="Decides Whether (true|false) intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. (useful only when `use_fsdp` flag is passed).",
612605
)
613606

614-
# tp args
615-
tp_args = parser.add_argument_group("TP Arguments", "Arguments related to Tensor Parallelism using PyTorch.")
616-
tp_args.add_argument(
617-
"--tp_size",
618-
default=1,
619-
type=int,
620-
help="PyTorch Tensor Parallelism (TP) degree. Set a value greater than 1 to activate. (useful only when `use_tp` flag is passed)",
621-
)
622-
623607
# megatron_lm args
624608
megatron_lm_args = parser.add_argument_group("Megatron-LM Arguments", "Arguments related to Megatron-LM.")
625609
megatron_lm_args.add_argument(
@@ -1001,9 +985,9 @@ def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
1001985

1002986
def _validate_launch_command(args):
1003987
# Sanity checks
1004-
if sum([args.multi_gpu, args.cpu, args.tpu, args.use_deepspeed, args.use_fsdp, args.use_tp]) > 1:
988+
if sum([args.multi_gpu, args.cpu, args.tpu, args.use_deepspeed, args.use_fsdp]) > 1:
1005989
raise ValueError(
1006-
"You can only use one of `--cpu`, `--multi_gpu`, `--tpu`, `--use_deepspeed`, `--use_fsdp`, `--use_tp` at a time."
990+
"You can only use one of `--cpu`, `--multi_gpu`, `--tpu`, `--use_deepspeed`, `--use_fsdp` at a time."
1007991
)
1008992
if args.multi_gpu and (args.num_processes is not None) and (args.num_processes < 2):
1009993
raise ValueError("You need to use at least 2 processes to use `--multi_gpu`.")
@@ -1020,7 +1004,6 @@ def _validate_launch_command(args):
10201004
and not args.tpu_use_cluster
10211005
and not args.use_deepspeed
10221006
and not args.use_fsdp
1023-
and not args.use_tp
10241007
and not args.use_megatron_lm
10251008
):
10261009
args.use_deepspeed = defaults.distributed_type == DistributedType.DEEPSPEED
@@ -1040,7 +1023,6 @@ def _validate_launch_command(args):
10401023
)
10411024
args.tpu = defaults.distributed_type == DistributedType.XLA
10421025
args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
1043-
args.use_tp = defaults.distributed_type == DistributedType.TP
10441026
args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
10451027
args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
10461028
if args.gpu_ids is None:
@@ -1191,8 +1173,6 @@ def launch_command(args):
11911173
deepspeed_launcher(args)
11921174
elif args.use_fsdp and not args.cpu:
11931175
multi_gpu_launcher(args)
1194-
elif args.use_tp and not args.cpu:
1195-
multi_gpu_launcher(args)
11961176
elif args.use_megatron_lm and not args.cpu:
11971177
multi_gpu_launcher(args)
11981178
elif args.multi_gpu and not args.cpu:

src/accelerate/state.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ def __init__(
966966
self.distributed_type = DistributedType.MEGATRON_LM
967967
megatron_lm_plugin.set_mixed_precision(self._mixed_precision)
968968
self.megatron_lm_plugin = megatron_lm_plugin
969-
if os.environ.get("ACCELERATE_USE_TP", "false") == "true" or self.torch_tp_plugin is not None:
969+
if self.torch_tp_plugin is not None:
970970
self.distributed_type = DistributedType.TP
971971
elif self.distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_XPU, DistributedType.NO]:
972972
if is_ipex_available():

src/accelerate/test_utils/scripts/external_deps/test_performance.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,8 @@ def training_function(config, args):
9191

9292
set_seed(seed)
9393
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)
94-
9594
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
96-
model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)
95+
model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, tp_plan=args.tp_plan)
9796

9897
if args.add_pad_token:
9998
if model.config.pad_token_id is None:
@@ -255,6 +254,12 @@ def main():
255254
default=False,
256255
help="To add pad token if not exists.",
257256
)
257+
parser.add_argument(
258+
"--tp_plan",
259+
type=str,
260+
default=None,
261+
help="To use TP or not",
262+
)
258263
args = parser.parse_args()
259264
config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
260265
training_function(config, args)

src/accelerate/utils/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
XPU_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.4.0"
5050
MITA_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.1.0"
5151
BETA_TP_AVAILABLE_PYTORCH_VERSION = "2.3.0"
52-
BETA_TP_AVAILABLE_TRANSFORMERS_VERSION = "4.47.0"
52+
BETA_TP_AVAILABLE_TRANSFORMERS_VERSION = "4.50.0"
5353

5454
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
5555

src/accelerate/utils/dataclasses.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2028,7 +2028,6 @@ class TorchTensorParallelPlugin:
20282028
torch_device_mesh: Optional["torch.distributed.DeviceMesh"] = field(default=None)
20292029

20302030
def __post_init__(self):
2031-
self.tp_size = self.tp_size if os.environ.get("TP_SIZE", "1") == "1" else int(os.environ.get("TP_SIZE", "1"))
20322031
if self.tp_size == 1:
20332032
raise ValueError("Provide TP degree > 1.")
20342033

@@ -2046,6 +2045,8 @@ def __post_init__(self):
20462045

20472046
mesh_dim_name = "tp"
20482047

2048+
# device mesh is not used for model sharding
2049+
# it is only used for preparing data loader
20492050
self.torch_device_mesh = init_device_mesh(device, (self.tp_size,), mesh_dim_names=(mesh_dim_name,))
20502051

20512052

src/accelerate/utils/launch.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -306,10 +306,6 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
306306
current_env["FSDP_SYNC_MODULE_STATES"] = str(args.fsdp_sync_module_states).lower()
307307
current_env["FSDP_ACTIVATION_CHECKPOINTING"] = str(args.fsdp_activation_checkpointing).lower()
308308

309-
if args.use_tp:
310-
current_env["ACCELERATE_USE_TP"] = "true"
311-
current_env["TP_SIZE"] = str(args.tp_size)
312-
313309
if args.use_megatron_lm:
314310
prefix = "MEGATRON_LM_"
315311
current_env["ACCELERATE_USE_MEGATRON_LM"] = "true"

tests/tp/test_tp.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,15 @@ def setUp(self):
4949
def test_working_of_tp(self):
5050
self.test_file_path = self.test_scripts_folder / "test_performance.py"
5151
cmd = get_launch_command(
52-
num_processes=self.test_tp_size, num_machines=1, machine_rank=0, use_tp=True, tp_size=self.test_tp_size
52+
num_processes=self.test_tp_size, num_machines=1, machine_rank=0, tp_size=self.test_tp_size
5353
)
5454
cmd.extend(
5555
[
5656
self.test_file_path,
5757
f"--output_dir={self.tmpdir}",
5858
f"--model_name_or_path={self.model_name_or_path}",
5959
"--add_pad_token=true",
60+
"--tp_plan='auto'",
6061
]
6162
)
6263
with patch_environment(omp_num_threads=1):

0 commit comments

Comments
 (0)