2 changes: 1 addition & 1 deletion src/transformers/testing_utils.py
@@ -2039,7 +2039,7 @@ def get_env(self):

"""
env = os.environ.copy()
- paths = [self.src_dir_str]
+ paths = [self.repo_root_dir_str, self.src_dir_str]
Contributor Author

There is an implicit issue here.

  1. In an env where PYTHONPATH is empty, the code at line 2049 joins the src and tests paths into PYTHONPATH with an extra ":" (because `paths` contains an empty string from `paths.append(env.get("PYTHONPATH", ""))`). The final PYTHONPATH looks like `/transformers/src:/transformers/tests:` with a trailing `:`, and that empty trailing entry is treated as the current folder, which is `/transformers`.
  2. But in an env with a non-empty PYTHONPATH, `env["PYTHONPATH"] = ":".join(paths)` produces something like `/transformers/src:/transformers/tests:A:B` without a trailing `:`, which means the current folder is not included in PYTHONPATH.

So, in situation 2), test cases like `tests/trainer/test_trainer_distributed_loss.py` fail: under torchrun, `from tests.trainer.test_trainer import StoreLossCallback` cannot be resolved, because only `/transformers/src:/transformers/tests` is on PYTHONPATH while the import actually needs `/transformers/`.

What I do here is make the repo root explicit. This only covers the cases I can see; I don't know the rest of the background.
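
A minimal sketch of the joining behavior described above (the `build_pythonpath` helper and the example paths are illustrative, not the actual `get_env` code):

```python
def build_pythonpath(paths, existing_pythonpath=""):
    # Mirrors the pattern in get_env(): the current PYTHONPATH (possibly "")
    # is appended to the list before joining with ":".
    parts = list(paths)
    parts.append(existing_pythonpath)
    return ":".join(parts)

# Case 1: PYTHONPATH is empty -> the trailing "" leaves a dangling ":",
# and an empty PYTHONPATH entry is interpreted as the current directory.
print(build_pythonpath(["/transformers/src", "/transformers/tests"]))
# /transformers/src:/transformers/tests:

# Case 2: PYTHONPATH is preset (e.g. "A:B") -> no trailing ":",
# so the current directory is no longer implicitly importable.
print(build_pythonpath(["/transformers/src", "/transformers/tests"], "A:B"))
# /transformers/src:/transformers/tests:A:B
```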

Collaborator

SGTM. But just to understand better:

> in an env with a non-empty PYTHONPATH

So in your environment, this is the case? Do you set it somehow?

In our report, I see

tests/trainer/test_trainer_distributed_loss.py::TestTrainerDistributedLoss::test_trainer
(line 675) AssertionError: 0.8057947486768737 not less than 0.1

So it seems there is no import issue in our runner env.

Contributor Author

@ydshieh Yes, in my CUDA env there is no such issue either, since PYTHONPATH is empty there. But in my XPU env, Intel oneAPI presets some paths into PYTHONPATH, so the import doesn't work. That's the background: I looked into that failure and found this issue. :)

if "/examples" in self.test_file_dir_str:
paths.append(self.examples_dir_str)
else:
3 changes: 1 addition & 2 deletions tests/trainer/test_trainer.py
@@ -97,7 +97,6 @@
require_torch_fp16,
require_torch_gpu,
require_torch_multi_accelerator,
- require_torch_multi_gpu,
require_torch_non_multi_accelerator,
require_torch_non_multi_gpu,
require_torch_tensorrt_fx,
@@ -3766,7 +3765,7 @@ def test_num_train_epochs_in_training(self):
train_output = trainer.train()
self.assertEqual(train_output.global_step, int(self.n_epochs))

- @require_torch_multi_gpu
+ @require_torch_multi_accelerator
def test_num_batches_in_training_with_gradient_accumulation(self):
with tempfile.TemporaryDirectory() as tmp_dir:
for num_train_epochs in [1, 2]:
9 changes: 5 additions & 4 deletions tests/trainer/test_trainer_distributed_loss.py
@@ -1,7 +1,6 @@
import json

import datasets
- import torch

from tests.trainer.test_trainer import StoreLossCallback
from transformers import (
@@ -15,16 +14,18 @@
)
from transformers.testing_utils import (
TestCasePlus,
+ backend_device_count,
execute_subprocess_async,
get_torch_dist_unique_port,
- require_torch_multi_gpu,
+ require_torch_multi_accelerator,
+ torch_device,
)


class TestTrainerDistributedLoss(TestCasePlus):
- @require_torch_multi_gpu
+ @require_torch_multi_accelerator
def test_trainer(self):
- device_count = torch.cuda.device_count()
+ device_count = backend_device_count(torch_device)
min_bs = 1
output_dir = self.get_auto_remove_tmp_dir()
for gpu_num, enable, bs, name in (
10 changes: 6 additions & 4 deletions tests/trainer/test_trainer_distributed_worker_seed.py
@@ -14,9 +14,11 @@
)
from transformers.testing_utils import (
TestCasePlus,
+ backend_device_count,
execute_subprocess_async,
get_torch_dist_unique_port,
- require_torch_multi_gpu,
+ require_torch_multi_accelerator,
+ torch_device,
)


@@ -47,17 +49,17 @@ def __init__(self):
self.fc = nn.Linear(3, 1)

def forward(self, x):
- local_tensor = torch.tensor(x, device="cuda")
+ local_tensor = torch.tensor(x, device=torch_device)
gathered = gather_from_all_gpus(local_tensor, dist.get_world_size())
assert not all(torch.allclose(t, gathered[0]) for t in gathered[1:])
y = self.fc(x)
return (y.mean(), y)


class TestTrainerDistributedWorkerSeed(TestCasePlus):
- @require_torch_multi_gpu
+ @require_torch_multi_accelerator
def test_trainer(self):
- device_count = torch.cuda.device_count()
+ device_count = backend_device_count(torch_device)
output_dir = self.get_auto_remove_tmp_dir()
distributed_args = f"""--nproc_per_node={device_count}
--master_port={get_torch_dist_unique_port()}
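
For reference, a minimal sketch of the accelerator-agnostic pattern these diffs adopt, using only the `torch_device` and `backend_device_count` helpers already imported in the tests above:

```python
import torch

from transformers.testing_utils import backend_device_count, torch_device

# torch_device resolves to whichever backend is available ("cuda", "xpu", "cpu", ...),
# so the same test code runs on NVIDIA and Intel hardware.
device_count = backend_device_count(torch_device)   # instead of torch.cuda.device_count()
local_tensor = torch.ones(3, device=torch_device)   # instead of device="cuda"
print(device_count, local_tensor.device)
```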