Skip to content

Commit da1d0f7

Browse files
authored
Merge pull request #18853 from Flamefire/20230922150734_new_pr_PyTorch1131
{lib}[GCCcore/12.2.0,foss/2022b] PyTorch v1.13.1, cuDNN v8.5.0.96, magma v2.7.1, ... w/ CUDA 11.7.0
2 parents fd25a84 + cb0e091 commit da1d0f7

7 files changed

+429
-0
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
name = 'cuDNN'
version = '8.5.0.96'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/cudnn'
description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
a GPU-accelerated library of primitives for deep neural networks."""

# binary-only vendor download, installed outside any compiler toolchain
toolchain = SYSTEM

# note: cuDNN is tied to specific CUDA versions,
# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions

# download URL uses only the first three version components (e.g. 8.5.0 for 8.5.0.96)
local_short_ver = '.'.join(version.split('.')[:3])
# %% defers resolution of the %(cudashortver)s template to EasyBuild,
# while %s is filled in right here with the short cuDNN version
source_urls = ['https://developer.download.nvidia.com/compute/redist/cudnn/'
               'v%s/local_installers/%%(cudashortver)s/' % local_short_ver]
sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz']
# per-architecture checksums, keyed by the resolved source filename
checksums = [
    {
        '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
            '5454a6fd94f008728caae9adad993c4e85ef36302e26bce43bea7d458a5e7b6d',
        '%(namelower)s-linux-ppc64le-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
            '00373c3d5e0b536a5557d0d0eb50706777f213a222b4030e1b71b1bec43d205f',
        '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
            '86780abbecd4634e7363fad1d000ae23b7905a5f8383bddbf7332c6934791dde',
    }
]

dependencies = [('CUDA', '11.7.0')]

sanity_check_paths = {
    'files': [
        'include/cudnn.h', 'lib64/libcudnn_adv_infer_static.a', 'lib64/libcudnn_adv_train_static.a',
        'lib64/libcudnn_cnn_infer_static.a', 'lib64/libcudnn_cnn_train_static.a',
        'lib64/libcudnn_ops_infer_static.a', 'lib64/libcudnn_ops_train_static.a',
        'lib64/libcudnn.%s' % SHLIB_EXT
    ],
    'dirs': ['include', 'lib64'],
}

moduleclass = 'numlib'
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
easyblock = 'CMakeMake'

name = 'magma'
version = '2.7.1'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://icl.cs.utk.edu/magma/'
description = """The MAGMA project aims to develop a dense linear algebra library similar to
LAPACK but for heterogeneous/hybrid architectures, starting with current Multicore+GPU systems."""

toolchain = {'name': 'foss', 'version': '2022b'}
toolchainopts = {'pic': True, 'openmp': True}

source_urls = ['https://icl.cs.utk.edu/projectsfiles/%(name)s/downloads/']
sources = [SOURCE_TAR_GZ]
checksums = ['d9c8711c047a38cae16efde74bee2eb3333217fd2711e1e9b8606cbbb4ae1a50']

builddependencies = [
    ('CMake', '3.24.3'),
]
dependencies = [
    ('CUDA', '11.7.0', '', SYSTEM),
    ('UCX-CUDA', '1.13.1', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '5.0', '6.0', '7.0', '7.5', '8.0', '8.6']

# make sure both static and shared libs are built;
# -allow-unsupported-compiler is needed since GCC 12 is not officially supported by CUDA 11.7,
# %% keeps %(cuda_sm_space_sep)s intact through the %-formatting in configopts below
local_common_opts = '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DGPU_TARGET="%%(cuda_sm_space_sep)s"'
configopts = [
    (local_common_opts + ' -DBUILD_SHARED_LIBS=%s ') % local_shared for local_shared in ('ON', 'OFF')
]

sanity_check_paths = {
    'files': ['lib/libmagma.%s' % SHLIB_EXT, 'lib/libmagma.a'],
    'dirs': ['include'],
}

moduleclass = 'math'
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
name = 'NCCL'
version = '2.16.2'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '12.2.0'}

github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
    {'v2.16.2-1.tar.gz': '7f7c738511a8876403fc574d13d48e7c250d934d755598d82e14bab12236fc64'},
    {'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.39')]

dependencies = [
    ('CUDA', '11.7.0', '', SYSTEM),
    ('UCX-CUDA', '1.13.1', versionsuffix),
]

# GCC 12 is not officially supported by CUDA 11.7, so inject -allow-unsupported-compiler
# into the NVCC flags before building
prebuildopts = "sed -i 's/NVCUFLAGS := /NVCUFLAGS := -allow-unsupported-compiler /' makefiles/common.mk && "
buildopts = "VERBOSE=1"

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '5.0', '6.0', '7.0', '7.5', '8.0', '8.6']

moduleclass = 'lib'
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
name = 'PyTorch'
version = '1.13.1'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
    'PyTorch-1.7.0_disable-dev-shm-test.patch',
    'PyTorch-1.10.0_fix-kineto-crash.patch',
    'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch',
    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
    'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch',
    'PyTorch-1.12.1_add-hypothesis-suppression.patch',
    'PyTorch-1.12.1_fix-skip-decorators.patch',
    'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
    'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
    'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
    'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
    'PyTorch-1.12.1_fix-vsx-loadu.patch',
    'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
    'PyTorch-1.12.1_skip-test_round_robin.patch',
    'PyTorch-1.13.1_allow-GCC-12-for-CUDA-11.7.patch',
    'PyTorch-1.13.1_disable-test-sharding.patch',
    'PyTorch-1.13.1_fix-duplicate-kDefaultTimeout-definition.patch',
    'PyTorch-1.13.1_fix-flaky-jit-test.patch',
    'PyTorch-1.13.1_fix-fsdp-fp16-test.patch',
    'PyTorch-1.13.1_fix-fsdp-tp-integration-test.patch',
    'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch',
    'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
    'PyTorch-1.13.1_fix-kineto-crash-on-exit.patch',
    'PyTorch-1.13.1_fix-numpy-deprecations.patch',
    'PyTorch-1.13.1_fix-protobuf-dependency.patch',
    'PyTorch-1.13.1_fix-pytest-args.patch',
    'PyTorch-1.13.1_fix-python-3.11-compat.patch',
    'PyTorch-1.13.1_fix-test-ops-conf.patch',
    'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
    'PyTorch-1.13.1_fix-wrong-check-in-fsdp-tests.patch',
    'PyTorch-1.13.1_increase-tolerance-test_jit.patch',
    'PyTorch-1.13.1_increase-tolerance-test_ops.patch',
    'PyTorch-1.13.1_increase-tolerance-test_optim.patch',
    'PyTorch-1.13.1_install-vsx-vec-headers.patch',
    'PyTorch-1.13.1_no-cuda-stubs-rpath.patch',
    'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch',
    'PyTorch-1.13.1_skip-failing-grad-test.patch',
    'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
    'PyTorch-1.13.1_skip-test_find_unused_parameters-detail.patch',
    'PyTorch-1.13.1_skip-test-requiring-online-access.patch',
    'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
    'PyTorch-1.13.1_workaround-gcc12-destructor-exception-bug.patch',
    'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
]
checksums = [
    {'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
    {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
    {'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'},
    {'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch':
     'cc526130b6446bbbf5f0f7372d3aeee3e7d4c4d6e471524dff028b430b152934'},
    {'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
     '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
    {'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch':
     '090598592283e3fc46ee08a68b6a6afe07be41b26514afba51834408bf1c98ed'},
    {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
     'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
    {'PyTorch-1.12.1_fix-skip-decorators.patch': 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c'},
    {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
     '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
    {'PyTorch-1.12.1_fix-test_wishart_log_prob.patch':
     'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45'},
    {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
    {'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch':
     '0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d'},
    {'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'},
    {'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
    {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
    {'PyTorch-1.13.1_allow-GCC-12-for-CUDA-11.7.patch':
     '4c9a4247dcf6e0f62fda2e7283f7de6f7c801d5e61c39d27a91a287f9d363d68'},
    {'PyTorch-1.13.1_disable-test-sharding.patch': 'df2074adeba47998ce2993d99ca64eb6f1c79ab7057f553b436efdec264d3572'},
    {'PyTorch-1.13.1_fix-duplicate-kDefaultTimeout-definition.patch':
     '882f8cfaf33490a4372928fb6673cbbfa40e5be1b64bf7e0cc2924d73cf872e8'},
    {'PyTorch-1.13.1_fix-flaky-jit-test.patch': '71efdeb29b5e5b4982c9f5cb2182733654a34d52f85bb5487bc4d7d99b86101b'},
    {'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'},
    {'PyTorch-1.13.1_fix-fsdp-tp-integration-test.patch':
     '31e2d63b54ae1a8c554575f46db79bf8bbda851b6ca0ffe623c4911207a3c2bc'},
    {'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch':
     '18df8c61ecaa9fb659346c1e172828bca6b069f0145bb8f6a36b0a23b7bef0a6'},
    {'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
     '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
    {'PyTorch-1.13.1_fix-kineto-crash-on-exit.patch':
     'f1e6808ee8d91a2ad76e0caedb4685e5aec3008d5e2e3c3c3e88cbb25cbd71b4'},
    {'PyTorch-1.13.1_fix-numpy-deprecations.patch': 'f461b570efe0434ddd806bf2fa7020eb213e3ed89d0eb4403e076f4276ba2a46'},
    {'PyTorch-1.13.1_fix-protobuf-dependency.patch':
     '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
    {'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'},
    {'PyTorch-1.13.1_fix-python-3.11-compat.patch': 'fa4eb0e27e00a90bb217b77c0023089c4659c03f37d781ab4a681bdcb4f0432f'},
    {'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'},
    {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
     'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
    {'PyTorch-1.13.1_fix-wrong-check-in-fsdp-tests.patch':
     'cbb5ca9ad668a504a456a2cc02d7254b79ddfd9a971a1648f0508fb103a9fc89'},
    {'PyTorch-1.13.1_increase-tolerance-test_jit.patch':
     'b97913754a0ae0887b8137db0b0d57caff8c3d7bd96fe555ea27ea01ff14527a'},
    {'PyTorch-1.13.1_increase-tolerance-test_ops.patch':
     'c909fdfc2b12df457e1eb5514265ffec3eab653994949416f3f048668421e223'},
    {'PyTorch-1.13.1_increase-tolerance-test_optim.patch':
     'a079d824085eab89794f5ecfc67792f735ed8cfd3fe7db52e4dea62e583cfe06'},
    {'PyTorch-1.13.1_install-vsx-vec-headers.patch':
     '7b678f54bb947afd4767f5877ac424b4b94ce5db609ea20f5a869ccf4027035f'},
    {'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'},
    {'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch':
     'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'},
    {'PyTorch-1.13.1_skip-failing-grad-test.patch': '6681200f9509893cb9231b5c93ac9bc5e6d9d9ae4febefca52e7cbc843ba8f51'},
    {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
     '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
    {'PyTorch-1.13.1_skip-test_find_unused_parameters-detail.patch':
     'c71a3385ce5fc447f908a3df78ade2143d97e2538cf03b530db4f6cc8b32c22b'},
    {'PyTorch-1.13.1_skip-test-requiring-online-access.patch':
     '61c3b7859dc06a9969981b07aa2789630de110d6d1d3633d27364be47af74712'},
    {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
     '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
    {'PyTorch-1.13.1_workaround-gcc12-destructor-exception-bug.patch':
     'a09a2d7ebd428c65988729578bb3fa372565ba176ab9ed7abf11f6fcb15e903e'},
    {'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
     '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
    ('CMake', '3.24.3'),
    ('hypothesis', '6.68.2'),
    # For tests
    ('pytest-rerunfailures', '12.0'),
    ('pytest-shard', '0.1.2'),
]

dependencies = [
    ('CUDA', '11.7.0', '', SYSTEM),
    ('Ninja', '1.11.1'),  # Required for JIT compilation of C++ extensions
    ('Python', '3.10.8'),
    ('protobuf', '23.0'),
    ('protobuf-python', '4.23.0'),
    ('pybind11', '2.10.3'),
    ('SciPy-bundle', '2023.02'),
    ('PyYAML', '6.0'),
    ('MPFR', '4.2.0'),
    ('GMP', '6.2.1'),
    ('numactl', '2.0.16'),
    ('FFmpeg', '5.1.2'),
    ('Pillow', '9.4.0'),
    ('cuDNN', '8.5.0.96', '-CUDA-%(cudaver)s', SYSTEM),
    ('magma', '2.7.1', '-CUDA-%(cudaver)s'),
    ('NCCL', '2.16.2', '-CUDA-%(cudaver)s'),
    ('expecttest', '0.1.3'),
]

# GCC 12 is not officially supported by CUDA 11.7
custom_opts = ['CMAKE_CUDA_FLAGS=-allow-unsupported-compiler']

excluded_tests = {
    '': [
        # This test seems to take too long on NVIDIA Ampere at least.
        'distributed/test_distributed_spawn',
        # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
        'distributions/test_constraints',
        # no xdoctest
        'doctests',
        # failing on broadwell
        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
        'test_native_mha',
        # intermittent failures on various systems
        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
        'distributed/rpc/test_tensorpipe_agent',
    ]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/87593 is accepted, since it is then checked as part of the PyTorch test suite
# (%% keeps %(pyshortver)s as an EasyBuild template; %s is filled with the shared-library extension here)
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
    "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# So allow a low number of tests to fail as the tests "usually" succeed
max_failed_tests = 2

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
The JIT generator for CUDA fails as GCC 12 isn't officially compatible with CUDA 11.7.
We can make it compatible by passing `-allow-unsupported-compiler`
but also need to tell the PyTorch code about the raised maximum compiler version.

Author: Alexander Grund (TU Dresden)

diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 9ab43e5ccdd..15da97619be 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -56,7 +56,7 @@ CUDA_GCC_VERSIONS = {
     '11.4': ((6, 0, 0), (11, 5, 0)),
     '11.5': ((6, 0, 0), (11, 5, 0)),
     '11.6': ((6, 0, 0), (11, 5, 0)),
-    '11.7': ((6, 0, 0), (11, 5, 0)),
+    '11.7': ((6, 0, 0), (12, 3, 0)),
 }
 
 CUDA_CLANG_VERSIONS = {
@@ -227,7 +227,8 @@ COMMON_NVCC_FLAGS = [
     '-D__CUDA_NO_HALF_CONVERSIONS__',
     '-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
     '-D__CUDA_NO_HALF2_OPERATORS__',
-    '--expt-relaxed-constexpr'
+    '--expt-relaxed-constexpr',
+    '-allow-unsupported-compiler',
 ]
 
 COMMON_HIP_FLAGS = [
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
In distributed/test_c10d_nccl the tests
test_find_unused_parameters_kwarg_debug_detail and
test_find_unused_parameters_kwarg_grad_is_view_debug_detail
are failing often on some systems with the root error seemingly being
> terminate called after throwing an instance of 'c10::Error'
>   what():  CUDA error: driver shutting down

Stacktrace:
frame #0: c10::Error::Error(<snip>) + 0x8d (0x2ae861eff2cd in <snip>/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(<snip>) + 0xd0 (0x2ae861ec64d1 in <snip>/torch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(<snip>) + 0x352 (0x2ae861e948c2 in <snip>/torch/lib/libc10_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::startedGPUExecutionInternal() const + 0x140 (0x2ae848587e80 in <snip>/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::WorkNCCL::isStarted() + 0x58 (0x2ae84858a1b8 in <snip>/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x3c8 (0x2ae84858ee18 in <snip>/torch/lib/libtorch_cuda.so)

Just skip the tests to avoid failing the testsuite.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 6a0858eebf8..7340a89db82 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -12,7 +12,7 @@ import time
 from contextlib import contextmanager
 from datetime import timedelta
 from itertools import product
-from unittest import mock
+from unittest import mock, skip
 
 import torch
 import torch.distributed as c10d
@@ -1460,6 +1460,7 @@ class DistributedDataParallelTest(
 
     # TODO: Combine the following tests once https://github.com/pytorch/pytorch/issues/55967
     # is resolved.
+    @skip("Debug level DETAIL fails on some systems/CUDA versions")
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     @with_dist_debug_levels(levels=["DETAIL"])
@@ -1478,6 +1479,7 @@ class DistributedDataParallelTest(
     def test_find_unused_parameters_kwarg_debug_off(self):
         self._test_find_unused_parameters_kwarg()
 
+    @skip("Debug level DETAIL fails on some systems/CUDA versions")
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     @with_dist_debug_levels(levels=["DETAIL"])

0 commit comments

Comments
 (0)