Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/legacy/seq2seq/seq2seq_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:

return (
RandomSampler(self.train_dataset)
- if self.args.local_rank == -1
+ if self.args.local_process_index == -1
else DistributedSampler(self.train_dataset)
)

Expand Down
15 changes: 5 additions & 10 deletions tests/deepspeed/test_deepspeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,6 @@ def test_hf_ds_config_mismatch(self):

with mockenv_context(**self.dist_env_1_gpu):
trainer = get_regression_trainer(
- local_rank=0,
fp16=fp16,
deepspeed=ds_config,
per_device_train_batch_size=per_device_train_batch_size,
Expand Down Expand Up @@ -552,7 +551,7 @@ def test_hf_scheduler_hf_optimizer(self):
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(
- a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+ a=a, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
)
trainer.train()
new_a = trainer.model.a.item()
Expand All @@ -566,7 +565,7 @@ def test_ds_scheduler_hf_optimizer(self):
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(
- a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+ a=a, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
)
trainer.train()
new_a = trainer.model.a.item()
Expand All @@ -580,7 +579,7 @@ def test_hf_scheduler_ds_optimizer(self):
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(
- a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+ a=a, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
)
trainer.train()
new_a = trainer.model.a.item()
Expand All @@ -598,7 +597,7 @@ def test_stage3_nvme_offload(self):
ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
trainer = get_regression_trainer(
- local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
+ fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
)
with CaptureLogger(deepspeed_logger) as cl:
trainer.train()
Expand All @@ -616,7 +615,6 @@ def model_init():
return model

trainer = get_regression_trainer(
- local_rank=0,
fp16=True,
model_init=model_init,
deepspeed=ds_config_zero3_dict,
Expand All @@ -642,7 +640,7 @@ def test_hf_optimizer_with_offload(self, stage, dtype):
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
ds_config_dict["zero_force_ds_cpu_optimizer"] = False # offload is not efficient w/o CPUAdam
with mockenv_context(**self.dist_env_1_gpu):
- kwargs = {"local_rank": 0, "deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
+ kwargs = {"deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
with CaptureLogger(deepspeed_logger) as cl:
Expand All @@ -659,7 +657,6 @@ def test_fake_notebook_no_launcher(self, stage, dtype):
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
with mockenv_context(**self.dist_env_1_gpu):
kwargs = {
- "local_rank": 0,
"deepspeed": self.get_config_dict(stage),
"output_dir": self.get_auto_remove_tmp_dir(),
}
Expand All @@ -683,7 +680,6 @@ def test_early_get_last_lr(self, stage, dtype):
kwargs = {
"a": a,
"b": b,
- "local_rank": 0,
"train_len": 8,
"deepspeed": self.get_config_dict(stage),
"per_device_train_batch_size": 8,
Expand Down Expand Up @@ -729,7 +725,6 @@ def test_gradient_accumulation(self, stage, dtype):
kwargs = {
"a": a,
"b": b,
- "local_rank": 0,
"train_len": train_len,
"deepspeed": self.get_config_dict(stage),
"output_dir": self.get_auto_remove_tmp_dir(),
Expand Down
8 changes: 1 addition & 7 deletions tests/trainer/test_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1437,9 +1437,7 @@ def test_training_arguments_are_left_untouched(self):
args = TrainingArguments(tmp_dir, report_to=[])
dict1, dict2 = args.to_dict(), trainer.args.to_dict()
for key in dict1:
- # Logging dir can be slightly different as they default to something with the time.
- if key != "logging_dir":
- self.assertEqual(dict1[key], dict2[key])
+ self.assertEqual(dict1[key], dict2[key])

def test_number_of_steps_in_training(self):
# Regular training has n_epochs * len(train_dl) steps
Expand Down Expand Up @@ -5433,7 +5431,6 @@ def hp_name(trial):
num_train_epochs=4,
disable_tqdm=True,
load_best_model_at_end=True,
- logging_dir="runs",
run_name="test",
model_init=model_init,
)
Expand Down Expand Up @@ -5482,7 +5479,6 @@ def compute_objective(metrics: dict[str, float]) -> list[float]:
num_train_epochs=10,
disable_tqdm=True,
load_best_model_at_end=True,
- logging_dir="runs",
run_name="test",
model_init=model_init,
compute_metrics=AlmostAccuracy(),
Expand Down Expand Up @@ -5572,7 +5568,6 @@ def hp_name(params):
num_train_epochs=4,
disable_tqdm=True,
load_best_model_at_end=True,
- logging_dir="runs",
run_name="test",
model_init=model_init,
)
Expand Down Expand Up @@ -6170,7 +6165,6 @@ def model_init(config):
num_train_epochs=4,
disable_tqdm=True,
load_best_model_at_end=True,
- logging_dir="runs",
run_name="test",
model_init=model_init,
)
Expand Down