Revert pretrain_llama3_8b.py

shjwudp · shjwudp · commit f8a59d8ce465 · 2025-08-28T08:30:40.000+08:00
Format code

Signed-off-by: jianbinc &lt;shjwudp@gmail.com&gt;
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -371,6 +371,8 @@ def __init__(
                 "Setting FSDP option to megatron"
             )
             fsdp = 'megatron'
+            if self.save_ckpt_format != "fsdp_dtensor":
+                raise NotImplementedError(f"FSDP checkpointing is not supported with {self.save_ckpt_format}.")
 
         if fsdp == "pytorch":
             raise NotImplementedError("PyTorch FSDP2 is not supported with MegatronParallel.")
@@ -936,28 +938,24 @@ def _get_fsdp_dtensor_state_dict(
         model_key="model",
         optimizer_key="optimizer_states",
     ):
+        from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import (
+            preprocess_state_dict_for_uneven_dtensor,
+        )
         from megatron.core.transformer.fsdp_dtensor_checkpoint import (
             handle_fp8_extra_state_case,
             handle_swiglu_in_state_dict,
         )
-        from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import (
-            preprocess_state_dict_for_uneven_dtensor,
-        )
 
         state_dict = raw_state_dict.copy()
         handle_fp8_extra_state_case(state_dict[model_key])
         module = self.model[0].module
-        if torch.distributed.get_rank() == 0:
-            print(self.model, module)
         if getattr(module.config, "gated_linear_unit", False):
             model_state_dict = state_dict[model_key].copy()
             if optimizer_key in state_dict:
                 optimizer_state_dict = state_dict[optimizer_key].copy()
             else:
                 optimizer_state_dict = {}
-            handle_swiglu_in_state_dict(
-                module.module, model_state_dict, optimizer_state_dict
-            )
+            handle_swiglu_in_state_dict(module.module, model_state_dict, optimizer_state_dict)
             state_dict[model_key] = model_state_dict
             if optimizer_key in state_dict:
                 state_dict[optimizer_key] = optimizer_state_dict
@@ -1060,9 +1058,7 @@ def _load_fsdp_dtensor_checkpoint(self, path, sharded_state_dict, strict):
             checkpoint_id=path,
             planner=planner,
         )
-        sharded_state_dict.update(
-            self._load_fsdp_dtensor_common_state(ckpt_dir=path)
-        )
+        sharded_state_dict.update(self._load_fsdp_dtensor_common_state(ckpt_dir=path))
         if "loops" in sharded_state_dict:
             sharded_state_dict["fit_loop"] = sharded_state_dict["loops"]["fit_loop"]
 
diff --git a/scripts/performance/llm/pretrain_llama3_8b.py b/scripts/performance/llm/pretrain_llama3_8b.py
@@ -84,49 +84,6 @@ def override_recipe_configs(
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
     )
-    # for saving checkpoints
-    ckpt_path = "/lustre/fsw/coreai_devtech_all/jianbinc/playground/nemo_nvfsdp_update/NeMo/checkpoints"
-    recipe.log.log_dir = ckpt_path
-    import nemo.lightning as nl
-    import nemo_run as run
-
-    recipe.log.ckpt = run.Config(
-        nl.ModelCheckpoint,
-        train_time_interval=None,
-        save_last=True,
-        every_n_train_steps=100,
-        save_top_k=1,
-        save_on_train_epoch_end=True,
-        save_optim_on_train_end=True,
-        always_save_context=False,
-        filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
-    )
-    
-    # nl.ModelCheckpoint(
-    #     train_time_interval=None,
-    # )
-    # # recipe.log.ckpt.train_time_interval = None
-    # recipe.log.ckpt.save_last = True
-    # recipe.log.ckpt.every_n_train_steps = 100
-    # recipe.log.ckpt.save_top_k = 1
-    # recipe.log.ckpt.save_on_train_epoch_end = True
-    # recipe.log.ckpt.save_optim_on_train_end = True
-    # recipe.log.ckpt.always_save_context = False
-
-    # for loading checkpoints
-    recipe.resume.resume_if_exists = True
-    recipe.resume.resume_ignore_no_checkpoint = True
-    # recipe.resume.restore_config = RestoreConfig(
-    #     path=ckpt_path,
-    #     load_model_state=True,
-    #     load_optim_state=True,
-    # )
-
-    recipe.trainer.strategy.save_ckpt_format = "fsdp_dtensor"
-    recipe.trainer.strategy.ddp.average_in_collective = False
-    # recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = "optim"
-
-    recipe.optim.config.use_precision_aware_optimizer = False
 
     # data module configs
     if args.use_hf_tokenizer: