Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name = 'NCCL'
version = '2.18.3'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '12.2.0'}

github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this one also need NCCL-2.18.3_fix-cudaMemcpyAsync.patch like NCCL-2.18.3-GCCcore-12.3.0-CUDA-12.1.1.eb

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense I guess, added

checksums = [
('6477d83c9edbb34a0ebce6d751a1b32962bc6415d75d04972b676c6894ceaef9',
'b4f5d7d9eea2c12e32e7a06fe138b2cfc75969c6d5c473aa6f819a792db2fc96'),
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.39')]

dependencies = [
('CUDA', '12.0.0', '', SYSTEM),
('UCX-CUDA', '1.13.1', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
name = 'PyTorch'
version = '2.1.2'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_no-cuda-stubs-rpath.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_disable-gcc12-warning.patch',
'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch',
'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch',
'PyTorch-2.1.0_fix-validationError-output-test.patch',
'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch',
'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch',
'PyTorch-2.1.0_skip-test_wrap_bad.patch',
'PyTorch-2.1.2_add-cuda-skip-markers.patch',
'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
'PyTorch-2.1.2_fix-device-mesh-check.patch',
'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch',
'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch',
'PyTorch-2.1.2_fix-test_cuda-non-x86.patch',
'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
'PyTorch-2.1.2_fix-test_memory_profiler.patch',
'PyTorch-2.1.2_fix-test_parallelize_api.patch',
'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch',
'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch',
'PyTorch-2.1.2_relax-cuda-tolerances.patch',
'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch',
'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch',
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
]
checksums = [
{'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch':
'1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
{'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'},
{'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch':
'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'},
{'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch':
'84bb51a719abc677031a7a3dfe4382ff098b0cbd8b39b8bed2a7fa03f80ac1e9'},
{'PyTorch-2.1.0_fix-validationError-output-test.patch':
'7eba0942afb121ed92fac30d1529447d892a89eb3d53c565f8e9d480e95f692b'},
{'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
'3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
{'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
{'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch':
'0ac36411e76506b3354c85a8a1260987f66af947ee52ffc64230aee1fa02ea8b'},
{'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
'35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
{'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
{'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
'6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
'5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
{'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch':
'5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'},
{'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'},
{'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'},
{'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
{'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
{'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch':
'f583532c59f35f36998851957d501b3ac8c883884efd61bbaa308db55cb6bdcd'},
{'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch':
'f7adafb4e4d3b724b93237a259797b6ed6f535f83be0e34a7b759c71c6a8ddf2'},
{'PyTorch-2.1.2_fix-test_cuda-non-x86.patch': '1ed76fcc87e6c50606ac286487292a3d534707068c94af74c3a5de8153fa2c2c'},
{'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
{'PyTorch-2.1.2_fix-test_memory_profiler.patch':
'30b0c9355636c0ab3dedae02399789053825dc3835b4d7dac6e696767772b1ce'},
{'PyTorch-2.1.2_fix-test_parallelize_api.patch':
'f8387a1693af344099c806981ca38df1306d7f4847d7d44713306338384b1cfd'},
{'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch':
'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch':
'90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'},
{'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch':
'07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'},
{'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
{'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
'6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
{'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch':
'943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'},
{'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch':
'7f5befddcb006b6ab5377de6ee3c29df375c5f8ef5e42b998d35113585b983f3'},
{'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.24.3'),
('hypothesis', '6.68.2'),
# For tests
('pytest-flakefinder', '1.1.0'),
('pytest-rerunfailures', '12.0'),
('pytest-shard', '0.1.2'),
('unittest-xml-reporting', '3.1.0'),
]

dependencies = [
('CUDA', '12.0.0', '', SYSTEM),
('cuDNN', '8.8.0.121', '-CUDA-%(cudaver)s', SYSTEM),
('magma', '2.7.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.18.3', '-CUDA-%(cudaver)s'),
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.10.8'),
('protobuf', '23.0'),
('protobuf-python', '4.23.0'),
('pybind11', '2.10.3'),
('SciPy-bundle', '2023.02'),
('PyYAML', '6.0'),
('MPFR', '4.2.0'),
('GMP', '6.2.1'),
('numactl', '2.0.16'),
('FFmpeg', '5.1.2'),
('Pillow', '9.4.0'),
('expecttest', '0.1.3'),
('networkx', '3.0'),
('sympy', '1.12'),
('Z3', '4.12.2', '-Python-%(pyver)s'),
]

buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
# failing on broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
# Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184
'distributed/tensor/parallel/test_tp_random_state',
# failures on OmniPath systems, which don't support some optional InfiniBand features
# See https://github.com/pytorch/tensorpipe/issues/413
'distributed/pipeline/sync/skip/test_gpipe',
'distributed/pipeline/sync/skip/test_leak',
'distributed/pipeline/sync/test_bugs',
'distributed/pipeline/sync/test_inplace',
'distributed/pipeline/sync/test_pipe',
'distributed/pipeline/sync/test_transparency',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# test_nn is also prone to spurious failures: https://github.com/pytorch/pytorch/issues/118294
# So allow a low number of tests to fail as the tests "usually" succeed
max_failed_tests = 10

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/122318 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
This test seems to expect at most 4 GPUs.
Especially when the number of GPUs is not a power of 2 (e.g. 6) test_fsdp_tp_integration_tensor_parallel_size_2_cpu_offload_CPUOffload(offload_params=False) fails:

torch.testing._internal.common_distributed: [ERROR] File "/dev/shm//pytorch/test/distributed/fsdp/test_fsdp_tp_integration.py", line 157, in _sync_tp_grads
torch.testing._internal.common_distributed: [ERROR] per_param_masks = unsharded_zeros.split(splits)
torch.testing._internal.common_distributed: [ERROR] File "/tmp/easybuild-install/lib/python3.10/site-packages/torch/_tensor.py", line 864, in split
torch.testing._internal.common_distributed: [ERROR] return torch._VF.split_with_sizes(self, split_size, dim)
torch.testing._internal.common_distributed: [ERROR] RuntimeError: split_with_sizes expects split_sizes to sum exactly to 105 (input tensor's size at dimension 0), but got split_sizes=[20, 4, 16, 4, 48, 12]

See https://github.com/pytorch/pytorch/issues/141237

Limitting to 4 GPUs seems to work.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py
index bc7a4aef4a3..61eb13162f2 100644
--- a/test/distributed/fsdp/test_fsdp_tp_integration.py
+++ b/test/distributed/fsdp/test_fsdp_tp_integration.py
@@ -71,6 +71,10 @@ class SimpleModel(torch.nn.Module):


class TestTPFSDPIntegration(FSDPTest):
+ @property
+ def world_size(self):
+ return min(4, super().world_size)
+
def _get_params_and_sharding_info(
self,
model: SimpleModel,
Loading