-
Notifications
You must be signed in to change notification settings - Fork 522
[pytorch][tensorflow][build][test] Add RDMAV_FORK_SAFE #1090
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
238c347
ba3413a
cad43a0
c1d6266
4f35834
7a491cd
7922237
bd07c92
c6fd4e5
162eeaa
0a9d4b1
7afa1a5
79fb5f0
d5f0113
857664d
69bcc9d
2363faf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,9 @@ LABEL dlc_major_version="1" | |
| ARG PYTHON=python3 | ||
| ARG PYTHON_VERSION=3.6.13 | ||
| ARG CUBLAS_VERSION=11.3.0.106 | ||
| ARG OPEN_MPI_PATH=/opt/amazon/openmpi/ | ||
| ARG OPEN_MPI_PATH=/opt/amazon/openmpi | ||
| ARG EFA_PATH=/opt/amazon/efa | ||
|
|
||
| ARG CUDA_HOME=/usr/local/cuda | ||
| ARG CONDA_PREFIX=/opt/conda | ||
| ARG METIS=metis-5.1.0 | ||
|
|
@@ -37,6 +39,7 @@ ENV DGLBACKEND=pytorch | |
| ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" | ||
| ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main | ||
| ENV MANUAL_BUILD=0 | ||
| ENV RDMAV_FORK_SAFE=1 | ||
|
|
||
| ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl | ||
| ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl | ||
|
|
@@ -100,8 +103,9 @@ RUN mkdir /tmp/efa \ | |
| && rm -rf /tmp/efa \ | ||
| && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz | ||
|
|
||
| RUN echo "pml = ob1" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf | ||
|
||
| ENV PATH="$OPEN_MPI_PATH/bin:$PATH" | ||
| ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH" | ||
| ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH | ||
|
|
||
| RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ | ||
| && curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,12 +21,14 @@ ENV KMP_AFFINITY=granularity=fine,compact,1,0 | |
| ENV KMP_BLOCKTIME=1 | ||
| ENV KMP_SETTINGS=0 | ||
| ENV MANUAL_BUILD=0 | ||
| ENV RDMAV_FORK_SAFE=1 | ||
|
|
||
| ARG PYTHON=python3.7 | ||
| ARG PYTHON_PIP=python3-pip | ||
| ARG PIP=pip3 | ||
| ARG PYTHON_VERSION=3.7.10 | ||
| ARG OPEN_MPI_PATH=/opt/amazon/openmpi/ | ||
| ARG OPEN_MPI_PATH=/opt/amazon/openmpi | ||
| ARG EFA_PATH=/opt/amazon/efa | ||
| ARG NCCL_VERSION=2.7.8 | ||
| ARG EFA_VERSION=1.11.2 | ||
| ARG BRANCH_OFI=1.1.1 | ||
|
|
@@ -139,8 +141,8 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa | |
|
|
||
| # Set default NCCL parameters | ||
| RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf | ||
|
|
||
| ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib:$LD_LIBRARY_PATH | ||
| RUN echo "pml = ob1" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf | ||
|
||
| ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH | ||
| # /usr/local/lib/libpython* needs to be accessible for dynamic linking | ||
| ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH | ||
| ENV PATH=$OPEN_MPI_PATH/bin/:$PATH | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,8 +42,7 @@ | |
| "custom_mpi_options": ( | ||
| "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 " | ||
| "-x HOROVOD_FUSION_THRESHOLD=16777216 " | ||
| "-x TF_CPP_MIN_LOG_LEVEL=3 " | ||
| "-x RDMAV_FORK_SAFE=1" | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removing RDMAV_FORK_SAFE from the tests. |
||
| "-x TF_CPP_MIN_LOG_LEVEL=3" | ||
| ), | ||
| } | ||
| }, | ||
|
|
@@ -69,8 +68,7 @@ | |
| "custom_mpi_options": ( | ||
| "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 " | ||
| "-x HOROVOD_FUSION_THRESHOLD=16777216 " | ||
| "-x TF_CPP_MIN_LOG_LEVEL=3 " | ||
| "-x RDMAV_FORK_SAFE=1" | ||
| "-x TF_CPP_MIN_LOG_LEVEL=3" | ||
| ), | ||
| } | ||
| }, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,8 +31,6 @@ spec: | |
| - -x | ||
| - LD_LIBRARY_PATH | ||
| - -x | ||
| - RDMAV_FORK_SAFE=1 | ||
| - -x | ||
| - PATH | ||
| - -x | ||
| - NCCL_SOCKET_IFNAME=eth0 | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| set -ex | ||
|
|
||
| export SM_HP_MP_PARAMETERS=\{\"ddp\":true,\"microbatches\":4,\"partitions\":2,\"pipeline\":\"interleaved\"\} | ||
| mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training | ||
| mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,6 +52,8 @@ def pytest_runtest_setup(item): | |
| efa_tests = [mark for mark in item.iter_markers("efa")] | ||
| if not efa_tests: | ||
| pytest.skip("Skipping non-efa tests") | ||
| if efa_tests and are_efa_tests_disabled(): | ||
|
||
| pytest.skip('Skipping EFA tests as EFA tests are disabled.') | ||
|
||
|
|
||
|
|
||
| def pytest_collection_modifyitems(session, config, items): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -123,6 +123,13 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): | |
| else: | ||
| integration_path = os.path.join("integration", sagemaker_test_type) | ||
|
|
||
| if job_type == "training": | ||
| if framework == "tensorflow": | ||
| if framework_major_version == "2": | ||
| integration_path = f"integration/sagemaker/test_mnist.py::test_smdataparallel_smmodelparallel_mnist" | ||
| else: | ||
| integration_path = f"integration/sagemaker/test_tuning_model_dir.py" | ||
|
||
|
|
||
| # Conditions for modifying tensorflow SageMaker pytest commands | ||
| if framework == "tensorflow" and sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE: | ||
| if job_type == "inference": | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -198,7 +198,7 @@ def setup_eks_cluster(framework_name, is_neuron): | |
| long_name = framework_name | ||
| short_name = frameworks[long_name] | ||
| codebuild_version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")[0:7] | ||
| num_nodes = 1 if is_pr_context() else 3 if long_name != "pytorch" else 4 | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will be reverted; it is needed to run multi-node EKS on PRs. |
||
| num_nodes = 4 | ||
| cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}" | ||
| # default volume size | ||
| volume_size = 80 | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It will be reverted; skipping inference because training has the fix.